gallery-dl/gallery_dl/extractor/gelbooru.py

# -*- coding: utf-8 -*-

# Copyright 2014-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://gelbooru.com/"""

from . import booru
from .common import Message
from .. import text


class GelbooruExtractor(booru.XmlParserMixin,
                        booru.GelbooruPageMixin,
                        booru.BooruExtractor):
    """Common base for all gelbooru.com extractors

    Two operating modes are supported, selected by the "api" config
    option (default: True):

    - API mode: only the request parameters are adjusted here; the
      rest of the work is left to the inherited classes.
    - non-API mode: 'items' is replaced by 'items_noapi', which walks
      the HTML listing and scrapes each post page individually.
    """
    category = "gelbooru"
    api_url = "https://gelbooru.com/index.php"
    post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}"
    pool_url = "https://gelbooru.com/index.php?page=pool&s=show&id={}"

    def __init__(self, match):
        """Initialize the extractor and choose API or non-API mode"""
        super().__init__(match)

        use_api = self.config("api", True)
        self.use_api = use_api

        if not use_api:
            # HTML scraping mode: swap the item generator on this instance
            self.items = self.items_noapi
            # NOTE(review): per the commit history this cookie enables
            # all content when the API is not used
            self.session.cookies["fringeBenefits"] = "yup"
            # presumably the number of posts on one HTML listing page;
            # 'get_posts' relies on it for paging -- TODO confirm
            self.per_page = 42
            return

        api_params = {"page": "dapi", "s": "post", "q": "index"}
        self.params.update(api_params)

    @staticmethod
    def get_file_url(image):
        """Return the download URL for 'image'

        URLs on the 'mp4.gelbooru.com' host are replaced by a '.webm'
        URL on 'img2.gelbooru.com' built from the post's "md5" value;
        every other URL is returned unchanged.
        """
        file_url = image["file_url"]
        if not file_url.startswith("https://mp4.gelbooru.com/"):
            return file_url

        md5 = image["md5"]
        template = "https://img2.gelbooru.com/images/{}/{}/{}.webm"
        return template.format(md5[:2], md5[2:4], md5)

    def items_noapi(self):
        """Yield extractor messages for every post (non-API mode)"""
        yield Message.Version, 1
        metadata = self.get_metadata()

        for post_id in self.get_posts():
            info = self.get_post_data(post_id)
            # read the URL before merging 'metadata' into the post
            file_url = info["file_url"]
            info.update(metadata)
            text.nameext_from_url(file_url, info)
            yield Message.Directory, info
            yield Message.Url, file_url, info

    def get_posts(self):
        """Yield the IDs of all posts found on the HTML listing pages"""
        list_url = "https://gelbooru.com/index.php?page=post&s=list"
        query = {
            "tags": self.params["tags"],
            "pid": self.page_start * self.per_page,
        }

        while True:
            html = self.request(list_url, params=query).text
            post_ids = list(text.extract_iter(html, '<span id="s', '"'))
            yield from post_ids

            # fewer results than a full page: nothing more to fetch
            if len(post_ids) < self.per_page:
                break
            query["pid"] += self.per_page

    def get_post_data(self, post_id):
        """Scrape a post's HTML page and return its metadata as dict"""
        html = self.request(self.post_url.format(post_id)).text

        # (key, begin, end) rules; entries with a 'None' key are not
        # stored in the result
        rules = (
            (None, '<meta name="keywords"', ''),
            ("tags", ' imageboard- ', '"'),
            ("id", '<li>Id: ', '<'),
            ("created_at", '<li>Posted: ', '<'),
            ("width", '<li>Size: ', 'x'),
            ("height", '', '<'),
            ("source", '<li>Source: <a href="', '"'),
            ("rating", '<li>Rating: ', '<'),
            (None, '<li>Score: ', ''),
            ("score", '>', '<'),
            ("file_url", '<li><a href="http', '"'),
            ("change", ' id="lupdated" value="', '"'),
        )
        data = text.extract_all(html, rules)[0]

        # the "http" prefix is part of the search marker and has to be
        # restored; the first "m//" is collapsed to "m/"
        file_url = "http" + data["file_url"].replace("m//", "m/", 1)
        filename = file_url.rpartition("/")[2]
        data["file_url"] = file_url
        data["md5"] = filename.partition(".")[0]

        # first letter of the rating in lowercase, "?" if it is missing
        rating = data["rating"] or "?"
        data["rating"] = rating[0].lower()

        # turn the ", "-separated tag list into space-separated tags
        # with underscores in place of inner spaces
        tags = data["tags"].split(", ")
        data["tags"] = " ".join(tag.replace(" ", "_") for tag in tags)

        if self.extags:
            self.extended_tags(data, html)
        return data


class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor):
    """Extractor for gelbooru.com tag-search results"""
    # matches post-list URLs and captures the search tags as 'tags'
    pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com"
               r"/(?:index\.php)?\?page=post&s=list"
               r"&tags=(?P<tags>[^&#]+)")
    test = (
        # default settings
        (
            "https://gelbooru.com/index.php?page=post&s=list&tags=bonocho",
            {"count": 5},
        ),
        # same search with the "api" option disabled
        (
            "https://gelbooru.com/index.php?page=post&s=list&tags=bonocho",
            {"options": (("api", False),), "count": 5},
        ),
    )


class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor):
    """Extractor for gelbooru.com image pools"""
    # matches pool URLs and captures the numeric pool ID as 'pool'
    pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com"
               r"/(?:index\.php)?\?page=pool&s=show"
               r"&id=(?P<pool>\d+)")
    test = (
        # default settings
        (
            "https://gelbooru.com/index.php?page=pool&s=show&id=761",
            {"count": 6},
        ),
        # same pool with the "api" option disabled
        (
            "https://gelbooru.com/index.php?page=pool&s=show&id=761",
            {"options": (("api", False),), "count": 6},
        ),
    )


class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
    """Extractor for a single gelbooru.com post"""
    # matches post-view URLs and captures the numeric post ID as 'post'
    pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com"
               r"/(?:index\.php)?\?page=post&s=view"
               r"&id=(?P<post>\d+)")
    test = (
        "https://gelbooru.com/index.php?page=post&s=view&id=313638",
        {
            "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
            "count": 1,
        },
    )

    def get_posts(self):
        """Return a 1-tuple containing only 'self.post'"""
        # 'self.post' is not assigned in this class; presumably it is
        # provided by an inherited class -- TODO confirm
        return (self.post,)
[gelbooru] update to new extractor interface 10 years ago			`# -- coding: utf-8 --`
initial commit 10 years ago
[gelbooru] simplify and fix pool extraction use 'pool:<pool id>' as search tag to get pool posts 4 years ago			`# Copyright 2014-2020 Mike Fährmann`
[gelbooru] update to new extractor interface 10 years ago			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

update some extractors to use https 8 years ago			`"""Extract images from https://gelbooru.com/"""`
[gelbooru] update to new extractor interface 10 years ago
[gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. 6 years ago			`from . import booru`
			`from .common import Message`
[gelbooru] simplify and fix pool extraction use 'pool:<pool id>' as search tag to get pool posts 4 years ago			`from .. import text`
[gelbooru] update to new extractor interface 10 years ago
code adjustments according to pep8 nr2 8 years ago
[gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. 6 years ago			`class GelbooruExtractor(booru.XmlParserMixin,`
			`booru.GelbooruPageMixin,`
			`booru.BooruExtractor):`
[gelbooru] update to new format 9 years ago			`"""Base class for gelbooru extractors"""`
			`category = "gelbooru"`
[gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. 6 years ago			`api_url = "https://gelbooru.com/index.php"`
			`post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}"`
[gelbooru] restore pool functionality 6 years ago			`pool_url = "https://gelbooru.com/index.php?page=pool&s=show&id={}"`
[gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. 6 years ago
			`def __init__(self, match):`
			`super().__init__(match)`
[gelbooru] update to new format 9 years ago
[gelbooru] re-enable API use (closes #56) Gelbooru's API allows access to all images and is not restricted to the first 20000. This also adds an option to select between API use and manual information extraction in case their API gets disabled again. 7 years ago			`self.use_api = self.config("api", True)`
			`if self.use_api:`
[gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. 6 years ago			`self.params.update({"page": "dapi", "s": "post", "q": "index"})`
			`else:`
			`self.items = self.items_noapi`
[gelbooru] enable all content when not using API 5 years ago			`self.session.cookies["fringeBenefits"] = "yup"`
[gelbooru] simplify and fix pool extraction use 'pool:<pool id>' as search tag to get pool posts 4 years ago			`self.per_page = 42`
[gelbooru] various improvements - better metadata for pools - map ratings to s/q/e like other boorus do - skip() support 7 years ago
[gelbooru] rewrite mp4 video URLs (fixes #1048) 4 years ago			`@staticmethod`
			`def get_file_url(image):`
			`url = image["file_url"]`
			`if url.startswith("https://mp4.gelbooru.com/"):`
			`ihash = image["md5"]`
			`return "https://img2.gelbooru.com/images/{}/{}/{}.webm".format(`
			`ihash[0:2], ihash[2:4], ihash)`
			`return url`

[gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. 6 years ago			`def items_noapi(self):`
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago			`yield Message.Version, 1`
[booru] build directory path for each file (#385) 5 years ago			`data = self.get_metadata()`
initial commit 10 years ago
[gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. 6 years ago			`for post in self.get_posts():`
			`post = self.get_post_data(post)`
[gelbooru] re-enable API use (closes #56) Gelbooru's API allows access to all images and is not restricted to the first 20000. This also adds an option to select between API use and manual information extraction in case their API gets disabled again. 7 years ago			`url = post["file_url"]`
update archive IDs ... to behave in a more straightforward way when dealing with bookmarks/favourites/etc. specific IDs are now grouped by their owner, album-id, ... to allow for duplicates when it would be expected. 7 years ago			`post.update(data)`
[booru] build directory path for each file (#385) 5 years ago			`text.nameext_from_url(url, post)`
			`yield Message.Directory, post`
			`yield Message.Url, url, post`
code adjustments according to pep8 nr2 8 years ago
[gelbooru] various improvements - better metadata for pools - map ratings to s/q/e like other boorus do - skip() support 7 years ago			`def get_posts(self):`
[gelbooru] re-enable API use (closes #56) Gelbooru's API allows access to all images and is not restricted to the first 20000. This also adds an option to select between API use and manual information extraction in case their API gets disabled again. 7 years ago			`"""Return an iterable containing all relevant post objects"""`
[gelbooru] simplify and fix pool extraction use 'pool:<pool id>' as search tag to get pool posts 4 years ago			`url = "https://gelbooru.com/index.php?page=post&s=list"`
			`params = {`
			`"tags": self.params["tags"],`
			`"pid" : self.page_start * self.per_page`
			`}`

			`while True:`
			`page = self.request(url, params=params).text`
[gelbooru] fix extraction without API 4 years ago			`ids = list(text.extract_iter(page, '<span id="s', '"'))`
[gelbooru] simplify and fix pool extraction use 'pool:<pool id>' as search tag to get pool posts 4 years ago			`yield from ids`
			`if len(ids) < self.per_page:`
			`return`
			`params["pid"] += self.per_page`
[gelbooru] various improvements - better metadata for pools - map ratings to s/q/e like other boorus do - skip() support 7 years ago
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago			`def get_post_data(self, post_id):`
			`"""Extract metadata of a single post"""`
[gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. 6 years ago			`page = self.request(self.post_url.format(post_id)).text`
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago			`data = text.extract_all(page, (`
			`(None , '<meta name="keywords"', ''),`
[gelbooru] fix non-API tag extraction 5 years ago			`("tags" , ' imageboard- ', '"'),`
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago			`("id" , '<li>Id: ', '<'),`
			`("created_at", '<li>Posted: ', '<'),`
			`("width" , '<li>Size: ', 'x'),`
			`("height" , '', '<'),`
			`("source" , '<li>Source: <a href="', '"'),`
			`("rating" , '<li>Rating: ', '<'),`
			`(None , '<li>Score: ', ''),`
			`("score" , '>', '<'),`
			`("file_url" , '<li><a href="http', '"'),`
[gelbooru] re-enable API use (closes #56) Gelbooru's API allows access to all images and is not restricted to the first 20000. This also adds an option to select between API use and manual information extraction in case their API gets disabled again. 7 years ago			`("change" , ' id="lupdated" value="', '"'),`
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago			`))[0]`
[gelbooru] re-enable API use (closes #56) Gelbooru's API allows access to all images and is not restricted to the first 20000. This also adds an option to select between API use and manual information extraction in case their API gets disabled again. 7 years ago			`data["file_url"] = "http" + data["file_url"].replace("m//", "m/", 1)`
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago			`data["md5"] = data["file_url"].rpartition("/")[2].partition(".")[0]`
various smaller changes/additions 7 years ago			`data["rating"] = (data["rating"] or "?")[0].lower()`
[gelbooru] re-enable API use (closes #56) Gelbooru's API allows access to all images and is not restricted to the first 20000. This also adds an option to select between API use and manual information extraction in case their API gets disabled again. 7 years ago			`data["tags"] = " ".join(`
			`[tag.replace(" ", "_") for tag in data["tags"].split(", ")])`
[gelbooru] tag-splitting for non-api mode 6 years ago			`if self.extags:`
			`self.extended_tags(data, page)`
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago			`return data`

[gelbooru] re-enable API use (closes #56) Gelbooru's API allows access to all images and is not restricted to the first 20000. This also adds an option to select between API use and manual information extraction in case their API gets disabled again. 7 years ago
[gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. 6 years ago			`class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor):`
consistent extractor naming scheme + docstrings 8 years ago			`"""Extractor for images from gelbooru.com based on search-tags"""`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"`
			`r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")`
			`test = (`
[gelbooru] re-enable API use (closes #56) Gelbooru's API allows access to all images and is not restricted to the first 20000. This also adds an option to select between API use and manual information extraction in case their API gets disabled again. 7 years ago			`("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {`
			`"count": 5,`
			`}),`
			`("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {`
			`"options": (("api", False),),`
			`"count": 5,`
			`}),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`)`
[gelbooru] update to new format 9 years ago

[gelbooru] simplify and fix pool extraction use 'pool:<pool id>' as search tag to get pool posts 4 years ago			`class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor):`
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago			`"""Extractor for image-pools from gelbooru.com"""`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"`
			`r"\?page=pool&s=show&id=(?P<pool>\d+)")`
[gelbooru] simplify and fix pool extraction use 'pool:<pool id>' as search tag to get pool posts 4 years ago			`test = (`
			`("https://gelbooru.com/index.php?page=pool&s=show&id=761", {`
			`"count": 6,`
			`}),`
			`("https://gelbooru.com/index.php?page=pool&s=show&id=761", {`
			`"options": (("api", False),),`
			`"count": 6,`
			`}),`
			`)`
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago

[gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. 6 years ago			`class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):`
consistent extractor naming scheme + docstrings 8 years ago			`"""Extractor for single images from gelbooru.com"""`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"`
			`r"\?page=post&s=view&id=(?P<post>\d+)")`
			`test = ("https://gelbooru.com/index.php?page=post&s=view&id=313638", {`
update booru testdata 9 years ago			`"content": "5e255713cbf0a8e0801dc423563c34d896bb9229",`
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago			`"count": 1,`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`})`
[gelbooru] use manual extraction ... to compensate for their disabled API. (https://gelbooru.com/index.php?page=forum&s=view&id=3875) This also adds an extractor for image-pools. 7 years ago
[gelbooru] various improvements - better metadata for pools - map ratings to s/q/e like other boorus do - skip() support 7 years ago			`def get_posts(self):`
[gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. 6 years ago			`return (self.post,)`