gallery-dl/gallery_dl/extractor/gelbooru.py

# -*- coding: utf-8 -*-

# Copyright 2014-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://gelbooru.com/"""

from . import booru
from .. import text, exception


class GelbooruBase():
    """Base class for gelbooru extractors"""
    category = "gelbooru"
    root = "https://gelbooru.com"

    @staticmethod
    def _file_url(post):
        """Return the file URL to use for 'post'

        'post' is a mapping with a "file_url" entry. When that URL
        starts with one of the video host prefixes below, it is replaced
        by a .webm URL on img2.gelbooru.com built from the "md5" entry;
        "md5" is only read in that case.
        """
        file_url = post["file_url"]
        video_prefixes = ("https://mp4.gelbooru.com/", "https://video-cdn")

        # anything not served from a video host is returned unchanged
        if not file_url.startswith(video_prefixes):
            return file_url

        # directory layout: /images/<md5[0:2]>/<md5[2:4]>/<md5>.webm
        digest = post["md5"]
        return "https://img2.gelbooru.com/images/{}/{}/{}.webm".format(
            digest[:2], digest[2:4], digest)


class GelbooruTagExtractor(GelbooruBase, booru.BooruTagExtractor):
    """Extractor for images from gelbooru.com based on search-tags"""
    # Post-list URLs; the search tags are captured in the 'tags' group,
    # which ends at the next '&' or '#'.
    pattern = (
        r"(?:https?://)?(?:www\.)?gelbooru\.com"
        r"/(?:index\.php)?\?page=post&s=list"
        r"&tags=(?P<tags>[^&#]+)"
    )
    test = (
        # without any extra options
        (
            "https://gelbooru.com/index.php?page=post&s=list&tags=bonocho",
            {"count": 5},
        ),
        # same URL with the "api" option set to False
        (
            "https://gelbooru.com/index.php?page=post&s=list&tags=bonocho",
            {"options": (("api", False),), "count": 5},
        ),
    )


class GelbooruPoolExtractor(GelbooruBase, booru.BooruPoolExtractor):
    """Extractor for image-pools from gelbooru.com"""
    # Pool URLs; the numeric pool ID is captured in the 'pool' group.
    pattern = (
        r"(?:https?://)?(?:www\.)?gelbooru\.com"
        r"/(?:index\.php)?\?page=pool&s=show"
        r"&id=(?P<pool>\d+)"
    )
    test = (
        # without any extra options
        (
            "https://gelbooru.com/index.php?page=pool&s=show&id=761",
            {"count": 6},
        ),
        # same URL with the "api" option set to False
        (
            "https://gelbooru.com/index.php?page=pool&s=show&id=761",
            {"options": (("api", False),), "count": 6},
        ),
    )

    def metadata(self):
        """Request the pool page and return the pool's metadata

        Returns a dict with the keys "pool" and "pool_name".
        As a side effect, 'self.post_ids' is set to the result of
        scanning the page for post ID markers.

        Raises exception.NotFoundError if the page contains no
        "Now Viewing: " heading.
        """
        pool_url = "{}/index.php?page=pool&s=show&id={}".format(
            self.root, self.pool_id)
        html = self.request(pool_url).text

        title, offset = text.extract(html, "<h3>Now Viewing: ", "</h3>")
        if not title:
            raise exception.NotFoundError("pool")

        # the search for post IDs starts at the position
        # returned together with the pool title
        self.post_ids = text.extract_iter(
            html, 'class="" id="p', '"', offset)

        return {
            "pool": text.parse_int(self.pool_id),
            "pool_name": text.unescape(title),
        }


class GelbooruPostExtractor(GelbooruBase, booru.BooruPostExtractor):
    """Extractor for single images from gelbooru.com"""
    # Post-view URLs; the numeric post ID is captured in the 'post' group.
    pattern = (
        r"(?:https?://)?(?:www\.)?gelbooru\.com"
        r"/(?:index\.php)?\?page=post&s=view"
        r"&id=(?P<post>\d+)"
    )
    # single test case: (URL, expected results)
    test = (
        "https://gelbooru.com/index.php?page=post&s=view&id=313638",
        {
            "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
            "count": 1,
        },
    )
[gelbooru] update to new extractor interface 10 years ago			`# -*- coding: utf-8 -*-`
initial commit 10 years ago
[gelbooru] simplify and fix pool extraction use 'pool:<pool id>' as search tag to get pool posts 4 years ago			`# Copyright 2014-2020 Mike Fährmann`
[gelbooru] update to new extractor interface 10 years ago			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

[booru] add generalized extractors for *booru sites similar to cc15fbe7 4 years ago			`"""Extractors for https://gelbooru.com/"""`
[gelbooru] update to new extractor interface 10 years ago
[gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. 6 years ago			`from . import booru`
[booru] add generalized extractors for *booru sites similar to cc15fbe7 4 years ago			`from .. import text, exception`
[gelbooru] update to new extractor interface 10 years ago
code adjustments according to pep8 nr2 8 years ago
[booru] add generalized extractors for *booru sites similar to cc15fbe7 4 years ago			`class GelbooruBase():`
[gelbooru] update to new format 9 years ago			`"""Base class for gelbooru extractors"""`
			`category = "gelbooru"`
[booru] add generalized extractors for *booru sites similar to cc15fbe7 4 years ago			`root = "https://gelbooru.com"`
[gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. 6 years ago
[booru] split '_prepare_post()' 4 years ago			`@staticmethod`
			`def _file_url(post):`
			`url = post["file_url"]`
			`if url.startswith(("https://mp4.gelbooru.com/", "https://video-cdn")):`
[booru] add generalized extractors for *booru sites similar to cc15fbe7 4 years ago			`md5 = post["md5"]`
[booru] split '_prepare_post()' 4 years ago			`url = "https://img2.gelbooru.com/images/{}/{}/{}.webm".format(`
[booru] add generalized extractors for *booru sites similar to cc15fbe7 4 years ago			`md5[0:2], md5[2:4], md5)`
[gelbooru] rewrite mp4 video URLs (fixes #1048) 4 years ago			`return url`

[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago
[booru] add generalized extractors for *booru sites similar to cc15fbe7 4 years ago			`class GelbooruTagExtractor(GelbooruBase, booru.BooruTagExtractor):`
consistent extractor naming scheme + docstrings 8 years ago			`"""Extractor for images from gelbooru.com based on search-tags"""`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"`
			`r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")`
			`test = (`
[gelbooru] re-enable API use (closes #56) Gelbooru's API allows access to all images and is not restricted to the first 20000. This also adds an option to select between API use and manual information extraction in case their API gets disabled again. 7 years ago			`("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {`
			`"count": 5,`
			`}),`
			`("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {`
			`"options": (("api", False),),`
			`"count": 5,`
			`}),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`)`
[gelbooru] update to new format 9 years ago

[booru] add generalized extractors for *booru sites similar to cc15fbe7 4 years ago			`class GelbooruPoolExtractor(GelbooruBase, booru.BooruPoolExtractor):`
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago			`"""Extractor for image-pools from gelbooru.com"""`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"`
			`r"\?page=pool&s=show&id=(?P<pool>\d+)")`
[gelbooru] simplify and fix pool extraction use 'pool:<pool id>' as search tag to get pool posts 4 years ago			`test = (`
			`("https://gelbooru.com/index.php?page=pool&s=show&id=761", {`
			`"count": 6,`
			`}),`
			`("https://gelbooru.com/index.php?page=pool&s=show&id=761", {`
			`"options": (("api", False),),`
			`"count": 6,`
			`}),`
			`)`
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago
[booru] add generalized extractors for *booru sites similar to cc15fbe7 4 years ago			`def metadata(self):`
			`url = "{}/index.php?page=pool&s=show&id={}".format(`
			`self.root, self.pool_id)`
			`page = self.request(url).text`

			`name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")`
			`if not name:`
			`raise exception.NotFoundError("pool")`
			`self.post_ids = text.extract_iter(page, 'class="" id="p', '"', pos)`

			`return {`
			`"pool": text.parse_int(self.pool_id),`
			`"pool_name": text.unescape(name),`
			`}`

[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago
[booru] add generalized extractors for *booru sites similar to cc15fbe7 4 years ago			`class GelbooruPostExtractor(GelbooruBase, booru.BooruPostExtractor):`
consistent extractor naming scheme + docstrings 8 years ago			`"""Extractor for single images from gelbooru.com"""`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"`
			`r"\?page=post&s=view&id=(?P<post>\d+)")`
			`test = ("https://gelbooru.com/index.php?page=post&s=view&id=313638", {`
update booru testdata 9 years ago			`"content": "5e255713cbf0a8e0801dc423563c34d896bb9229",`
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago			`"count": 1,`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`})`