gallery-dl/gallery_dl/extractor/danbooru.py

# -*- coding: utf-8 -*-

# Copyright 2014-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://danbooru.donmai.us/"""

from .common import Extractor, Message, SharedConfigMixin
from .. import text
import datetime


# URL prefix shared by all extractor patterns in this module:
# optional scheme, then one of the supported subdomains (capture group 1),
# then the common ".donmai.us" domain
BASE_PATTERN = (
    r"(?:https?://)?(danbooru|hijiribe|sonohara|safebooru)"
    r"\.donmai\.us"
)


class DanbooruExtractor(SharedConfigMixin, Extractor):
    """Base class for danbooru extractors

    Subclasses choose their posts by filling 'self.params' and/or
    overriding posts(), and attach extra metadata to every post by
    overriding metadata().
    """
    basecategory = "booru"
    category = "danbooru"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    # highest page number skip() will select as starting page
    page_limit = 1000
    # initial value of the 'page' request parameter
    # (None until skip() or a subclass sets it)
    page_start = None
    # value sent as 'limit' request parameter, i.e. posts per API call
    per_page = 200

    def __init__(self, match):
        super().__init__(match)
        # group 1 of the matched URL pattern is the subdomain
        self.root = "https://{}.donmai.us".format(match.group(1))
        self.ugoira = self.config("ugoira", True)
        # query parameters sent with every paginated request
        self.params = {}

        # the API key takes the place of the password for HTTP Basic Auth
        username, api_key = self._get_auth_info()
        if username:
            self.log.debug("Using HTTP Basic Auth for user '%s'", username)
            self.session.auth = (username, api_key)

    def skip(self, num):
        """Skip up to 'num' posts by moving the start page forward

        Only whole pages are skipped, and never past 'page_limit'.
        Returns the number of posts contained in the skipped pages.
        """
        pages = num // self.per_page
        if pages >= self.page_limit:
            pages = self.page_limit - 1
        self.page_start = pages + 1
        return pages * self.per_page

    def items(self):
        """Yield a Directory and a Url message for each downloadable post"""
        data = self.metadata()
        for post in self.posts():
            # posts without a "file_url" entry cannot be downloaded
            try:
                url = post["file_url"]
            except KeyError:
                continue

            # NOTE(review): presumably stores filename and extension
            # derived from 'url' in 'post' - "extension" is read right below
            text.nameext_from_url(url, post)
            if post["extension"] == "zip":
                if self.ugoira:
                    # keep the ZIP archive and fetch its frame data
                    # with an additional API request
                    post["frames"] = self.request(
                        "{}/posts/{}.json?only=pixiv_ugoira_frame_data".format(
                            self.root, post["id"])
                    ).json()["pixiv_ugoira_frame_data"]["data"]
                else:
                    # download the file behind "large_file_url" as WebM
                    url = post["large_file_url"]
                    post["extension"] = "webm"

            post.update(data)
            yield Message.Directory, post
            yield Message.Url, url, post

    def metadata(self):
        """Return a dict of metadata that gets merged into every post"""
        return {}

    def posts(self):
        """Return an iterable of post objects"""
        return self._pagination(self.root + "/posts.json")

    def _pagination(self, url, pagenum=False):
        """Yield all posts from the paginated JSON endpoint 'url'

        With 'pagenum' set, consecutive pages are requested by
        incrementing a page number. Otherwise each following page is
        requested as "b<ID>", using the ID of the last post received.
        """
        params = self.params.copy()
        params["limit"] = self.per_page
        params["page"] = self.page_start

        # numeric pagination increments 'page', so it needs an integer
        # to start from even when no start page has been set
        if pagenum and params["page"] is None:
            params["page"] = 1

        while True:
            posts = self.request(url, params=params).json()
            # unwrap the post list if it is nested under a "posts" key
            if "posts" in posts:
                posts = posts["posts"]
            yield from posts

            # a page with fewer posts than requested is the last one
            if len(posts) < self.per_page:
                return

            if pagenum:
                params["page"] += 1
            else:
                params["page"] = "b{}".format(posts[-1]["id"])


class DanbooruTagExtractor(DanbooruExtractor):
    """Extractor for danbooru posts from tag searches"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    archive_fmt = "t_{search_tags}_{id}"
    # group 2 captures the raw value of the 'tags' query parameter
    pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=([^&#]+)"
    test = (
        ("https://danbooru.donmai.us/posts?tags=bonocho", {
            "content": "b196fb9f1668109d7774a0a82efea3ffdda07746",
        }),
        # test page transitions
        ("https://danbooru.donmai.us/posts?tags=mushishi", {
            "count": ">= 300",
        }),
        ("https://hijiribe.donmai.us/posts?tags=bonocho"),
        ("https://sonohara.donmai.us/posts?tags=bonocho"),
        ("https://safebooru.donmai.us/posts?tags=bonocho"),
    )

    def __init__(self, match):
        super().__init__(match)
        # turn '+' into spaces first, then resolve percent-escapes
        raw_tags = match.group(2).replace("+", " ")
        self.params["tags"] = text.unquote(raw_tags)

    def metadata(self):
        """Return the search tags as 'search_tags' metadata"""
        tags = self.params["tags"]
        return {"search_tags": tags}


class DanbooruPoolExtractor(DanbooruExtractor):
    """Extractor for posts from danbooru pools"""
    subcategory = "pool"
    directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}")
    archive_fmt = "p_{pool[id]}_{id}"
    # group 2 captures the numeric pool ID
    pattern = BASE_PATTERN + r"/pools/(\d+)"
    test = ("https://danbooru.donmai.us/pools/7659", {
        "content": "b16bab12bea5f7ea9e0a836bf8045f280e113d99",
    })

    def __init__(self, match):
        super().__init__(match)
        pool_id = match.group(2)
        self.pool_id = pool_id
        # posts are looked up through a "pool:<ID>" tag search
        self.params["tags"] = "pool:" + pool_id

    def metadata(self):
        """Fetch the pool's JSON object and return it as 'pool' metadata"""
        response = self.request(
            "{}/pools/{}.json".format(self.root, self.pool_id))
        pool = response.json()
        # underscores in the pool name are shown as spaces
        name = pool["name"]
        pool["name"] = name.replace("_", " ")
        # drop the post ID list from the metadata
        del pool["post_ids"]
        return {"pool": pool}


class DanbooruPostExtractor(DanbooruExtractor):
    """Extractor for single danbooru posts"""
    subcategory = "post"
    archive_fmt = "{id}"
    # group 2 captures the numeric post ID
    pattern = BASE_PATTERN + r"/posts/(\d+)"
    test = (
        ("https://danbooru.donmai.us/posts/294929", {
            "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
        }),
        ("https://danbooru.donmai.us/posts/3613024", {
            "pattern": r"https?://.+\.webm$",
            "options": (("ugoira", False),)
        })
    )

    def __init__(self, match):
        super().__init__(match)
        self.post_id = match.group(2)

    def posts(self):
        """Return a 1-tuple containing the requested post object"""
        response = self.request(
            "{}/posts/{}.json".format(self.root, self.post_id))
        result = response.json()
        # unwrap the post if it is nested under a "post" key
        if "post" in result:
            result = result["post"]
        return (result,)


class DanbooruPopularExtractor(DanbooruExtractor):
    """Extractor for popular images from danbooru"""
    subcategory = "popular"
    directory_fmt = ("{category}", "popular", "{scale}", "{date}")
    archive_fmt = "P_{scale[0]}_{date}_{id}"
    # group 2 captures the optional query string
    pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?"
    test = (
        ("https://danbooru.donmai.us/explore/posts/popular"),
        (("https://danbooru.donmai.us/explore/posts/popular"
          "?date=2013-06-06&scale=week"), {
            "range": "1-120",
            "count": 120,
        }),
    )

    def __init__(self, match):
        super().__init__(match)
        # forward the URL's query parameters to the API requests
        self.params.update(text.parse_query(match.group(2)))

    def metadata(self):
        """Return 'date' and 'scale' metadata for the requested listing

        'scale' defaults to "day" and 'date' to today's date.
        The returned date is the listing's date for "day", the Monday
        of its week for "week", and 'YYYY-MM' for "month".
        """
        # posts() paginates with page numbers, which need an integer start
        self.page_start = self.page_start or 1
        scale = self.params.get("scale", "day")
        date = self.params.get("date") or datetime.date.today().isoformat()

        # only the leading 'YYYY-MM-DD' part is relevant; a trailing time
        # component would make fromisoformat() raise and would corrupt
        # the month slice below
        date = date[:10]

        if scale == "week":
            day = datetime.date.fromisoformat(date)
            # weekday() is 0 for Monday, so this moves back to Monday
            monday = day - datetime.timedelta(days=day.weekday())
            date = monday.isoformat()
        elif scale == "month":
            # 'YYYY-MM'
            date = date[:7]

        return {"date": date, "scale": scale}

    def posts(self):
        """Return the listing's posts, paginated by page number"""
        url = self.root + "/explore/posts/popular.json"
        return self._pagination(url, True)
[danbooru] update to new extractor interface 10 years ago			`# -- coding: utf-8 --`
added extractor 'danbooru' + split BooruExtractor to handle XML and JSON 10 years ago
[danbooru] move extractor logic from booru.py 5 years ago			`# Copyright 2014-2020 Mike Fährmann`
[danbooru] update to new extractor interface 10 years ago			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`
added extractor 'danbooru' + split BooruExtractor to handle XML and JSON 10 years ago
[danbooru] move extractor logic from booru.py 5 years ago			`"""Extractors for https://danbooru.donmai.us/"""`
added extractor 'danbooru' + split BooruExtractor to handle XML and JSON 10 years ago
[danbooru] move extractor logic from booru.py 5 years ago			`from .common import Extractor, Message, SharedConfigMixin`
			`from .. import text`
[danbooru] restore 'popular' functionality 5 years ago			`import datetime`
added extractor 'danbooru' + split BooruExtractor to handle XML and JSON 10 years ago
code adjustments according to pep8 nr2 8 years ago
[danbooru] use alternate subdomains; support safebooru 7 years ago			`BASE_PATTERN = (`
			`r"(?:https?://)?"`
[danbooru] move extractor logic from booru.py 5 years ago			`r"(danbooru\|hijiribe\|sonohara\|safebooru)"`
			`r"\.donmai\.us"`
			`)`
[danbooru] use alternate subdomains; support safebooru 7 years ago

[danbooru] move extractor logic from booru.py 5 years ago			`class DanbooruExtractor(SharedConfigMixin, Extractor):`
[danbooru] update to new format 9 years ago			`"""Base class for danbooru extractors"""`
[danbooru] move extractor logic from booru.py 5 years ago			`basecategory = "booru"`
[danbooru] update to new format 9 years ago			`category = "danbooru"`
[danbooru] move extractor logic from booru.py 5 years ago			`filename_fmt = "{category}_{id}_{md5}.{extension}"`
[booru] rewrite using Mixin classes (#59) - improved code structure - improved URL patterns - better pagination to work around page limits on - Danbooru - e621 - 3dbooru 7 years ago			`page_limit = 1000`
[danbooru] move extractor logic from booru.py 5 years ago			`page_start = None`
[danbooru][e621] increase page limits 5 years ago			`per_page = 200`
added extractor 'danbooru' + split BooruExtractor to handle XML and JSON 10 years ago
[danbooru] use alternate subdomains; support safebooru 7 years ago			`def __init__(self, match):`
[e621] derive from Danbooru extractors (#651) - use extractor implementations from 'danbooru' - use "page": "b[ID]" to paginate over results instead of "tags": "id:<[ID]", avoiding infinite loops with certain post orders - bump User-Agent version 5 years ago			`super().__init__(match)`
[danbooru] move extractor logic from booru.py 5 years ago			`self.root = "https://{}.donmai.us".format(match.group(1))`
[danbooru] add 'ugoira' option (#406) to choose between ZIP archives or converted video files for Ugoira posts 5 years ago			`self.ugoira = self.config("ugoira", True)`
[danbooru] move extractor logic from booru.py 5 years ago			`self.params = {}`
[danbooru] use alternate subdomains; support safebooru 7 years ago
[danbooru] add authentication support (closes #151) ... via HTTP Basic Auth with username and "password". The password value in this case is not the account password itself, but the"api_key" found in your user profile. 6 years ago			`username, api_key = self._get_auth_info()`
			`if username:`
			`self.log.debug("Using HTTP Basic Auth for user '%s'", username)`
			`self.session.auth = (username, api_key)`

[danbooru] move extractor logic from booru.py 5 years ago			`def skip(self, num):`
			`pages = num // self.per_page`
			`if pages >= self.page_limit:`
			`pages = self.page_limit - 1`
			`self.page_start = pages + 1`
			`return pages * self.per_page`

			`def items(self):`
			`data = self.metadata()`
			`for post in self.posts():`
			`try:`
			`url = post["file_url"]`
			`except KeyError:`
			`continue`

			`text.nameext_from_url(url, post)`
			`if post["extension"] == "zip":`
			`if self.ugoira:`
			`post["frames"] = self.request(`
			`"{}/posts/{}.json?only=pixiv_ugoira_frame_data".format(`
			`self.root, post["id"])`
			`).json()["pixiv_ugoira_frame_data"]["data"]`
			`else:`
			`url = post["large_file_url"]`
			`post["extension"] = "webm"`

			`post.update(data)`
			`yield Message.Directory, post`
			`yield Message.Url, url, post`

			`def metadata(self):`
			`return {}`

			`def posts(self):`
[danbooru] restore 'popular' functionality 5 years ago			`return self._pagination(self.root + "/posts.json")`

			`def _pagination(self, url, pagenum=False):`
[danbooru] move extractor logic from booru.py 5 years ago			`params = self.params.copy()`
			`params["limit"] = self.per_page`
			`params["page"] = self.page_start`

			`while True:`
[danbooru] restore 'popular' functionality 5 years ago			`posts = self.request(url, params=params).json()`
[e621] derive from Danbooru extractors (#651) - use extractor implementations from 'danbooru' - use "page": "b[ID]" to paginate over results instead of "tags": "id:<[ID]", avoiding infinite loops with certain post orders - bump User-Agent version 5 years ago			`if "posts" in posts:`
			`posts = posts["posts"]`
[danbooru] move extractor logic from booru.py 5 years ago			`yield from posts`

			`if len(posts) < self.per_page:`
			`return`
[danbooru] restore 'popular' functionality 5 years ago
			`if pagenum:`
			`params["page"] += 1`
			`else:`
			`params["page"] = "b{}".format(posts[-1]["id"])`
[danbooru] move extractor logic from booru.py 5 years ago

			`class DanbooruTagExtractor(DanbooruExtractor):`
			`"""Extractor for danbooru posts from tag searches"""`
			`subcategory = "tag"`
			`directory_fmt = ("{category}", "{search_tags}")`
			`archive_fmt = "t_{search_tags}_{id}"`
[danbooru] restore 'popular' functionality 5 years ago			`pattern = BASE_PATTERN + r"/posts\?(?:[^&#]&)tags=([^&#]+)"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`test = (`
[danbooru] extend and improve URL regex - add support for danbooru mirrors: - hijiribe.donmai.us - sonohara.donmai.us - todo: actually use these domains instead of redirecting everything to danbooru itself - improve handling of query string parameters 7 years ago			`("https://danbooru.donmai.us/posts?tags=bonocho", {`
			`"content": "b196fb9f1668109d7774a0a82efea3ffdda07746",`
			`}),`
[booru] call update_page() with correct dict (closes #82) 7 years ago			`# test page transitions`
[danbooru] restore 'popular' functionality 5 years ago			`("https://danbooru.donmai.us/posts?tags=mushishi", {`
			`"count": ">= 300",`
[booru] call update_page() with correct dict (closes #82) 7 years ago			`}),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`("https://hijiribe.donmai.us/posts?tags=bonocho"),`
			`("https://sonohara.donmai.us/posts?tags=bonocho"),`
			`("https://safebooru.donmai.us/posts?tags=bonocho"),`
			`)`
added extractor 'danbooru' + split BooruExtractor to handle XML and JSON 10 years ago
[danbooru] move extractor logic from booru.py 5 years ago			`def __init__(self, match):`
[e621] derive from Danbooru extractors (#651) - use extractor implementations from 'danbooru' - use "page": "b[ID]" to paginate over results instead of "tags": "id:<[ID]", avoiding infinite loops with certain post orders - bump User-Agent version 5 years ago			`super().__init__(match)`
[danbooru] move extractor logic from booru.py 5 years ago			`self.params["tags"] = text.unquote(match.group(2).replace("+", " "))`

			`def metadata(self):`
			`return {"search_tags": self.params["tags"]}`

code adjustments according to pep8 nr2 8 years ago
[danbooru] move extractor logic from booru.py 5 years ago			`class DanbooruPoolExtractor(DanbooruExtractor):`
			`"""Extractor for posts from danbooru pools"""`
			`subcategory = "pool"`
			`directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}")`
			`archive_fmt = "p_{pool[id]}_{id}"`
			`pattern = BASE_PATTERN + r"/pools/(\d+)"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`test = ("https://danbooru.donmai.us/pools/7659", {`
update booru testdata 9 years ago			`"content": "b16bab12bea5f7ea9e0a836bf8045f280e113d99",`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`})`
[danbooru] rewrite to use multiple extractors 9 years ago
[danbooru] move extractor logic from booru.py 5 years ago			`def __init__(self, match):`
[e621] derive from Danbooru extractors (#651) - use extractor implementations from 'danbooru' - use "page": "b[ID]" to paginate over results instead of "tags": "id:<[ID]", avoiding infinite loops with certain post orders - bump User-Agent version 5 years ago			`super().__init__(match)`
[danbooru] move extractor logic from booru.py 5 years ago			`self.pool_id = match.group(2)`
			`self.params["tags"] = "pool:" + self.pool_id`

			`def metadata(self):`
			`url = "{}/pools/{}.json".format(self.root, self.pool_id)`
			`pool = self.request(url).json()`
[danbooru] restore 'popular' functionality 5 years ago			`pool["name"] = pool["name"].replace("_", " ")`
[danbooru] move extractor logic from booru.py 5 years ago			`del pool["post_ids"]`
			`return {"pool": pool}`


			`class DanbooruPostExtractor(DanbooruExtractor):`
			`"""Extractor for single danbooru posts"""`
			`subcategory = "post"`
			`archive_fmt = "{id}"`
			`pattern = BASE_PATTERN + r"/posts/(\d+)"`
[danbooru] add 'ugoira' option (#406) to choose between ZIP archives or converted video files for Ugoira posts 5 years ago			`test = (`
			`("https://danbooru.donmai.us/posts/294929", {`
			`"content": "5e255713cbf0a8e0801dc423563c34d896bb9229",`
			`}),`
			`("https://danbooru.donmai.us/posts/3613024", {`
			`"pattern": r"https?://.+\.webm$",`
			`"options": (("ugoira", False),)`
			`})`
			`)`
[booru] add extractors for "Popular" images 7 years ago
[danbooru] move extractor logic from booru.py 5 years ago			`def __init__(self, match):`
[e621] derive from Danbooru extractors (#651) - use extractor implementations from 'danbooru' - use "page": "b[ID]" to paginate over results instead of "tags": "id:<[ID]", avoiding infinite loops with certain post orders - bump User-Agent version 5 years ago			`super().__init__(match)`
[danbooru] move extractor logic from booru.py 5 years ago			`self.post_id = match.group(2)`

			`def posts(self):`
			`url = "{}/posts/{}.json".format(self.root, self.post_id)`
[e621] derive from Danbooru extractors (#651) - use extractor implementations from 'danbooru' - use "page": "b[ID]" to paginate over results instead of "tags": "id:<[ID]", avoiding infinite loops with certain post orders - bump User-Agent version 5 years ago			`post = self.request(url).json()`
			`return (post["post"] if "post" in post else post,)`
[danbooru] move extractor logic from booru.py 5 years ago
[booru] add extractors for "Popular" images 7 years ago
[danbooru] restore 'popular' functionality 5 years ago			`class DanbooruPopularExtractor(DanbooruExtractor):`
[booru] add extractors for "Popular" images 7 years ago			`"""Extractor for popular images from danbooru"""`
[danbooru] restore 'popular' functionality 5 years ago			`subcategory = "popular"`
			`directory_fmt = ("{category}", "popular", "{scale}", "{date}")`
			`archive_fmt = "P_{scale[0]}_{date}_{id}"`
			`pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`test = (`
			`("https://danbooru.donmai.us/explore/posts/popular"),`
[booru] add extractors for "Popular" images 7 years ago			`(("https://danbooru.donmai.us/explore/posts/popular"`
[danbooru] restore 'popular' functionality 5 years ago			`"?date=2013-06-06&scale=week"), {`
			`"range": "1-120",`
			`"count": 120,`
[booru] add extractors for "Popular" images 7 years ago			`}),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`)`
[danbooru] use alternate subdomains; support safebooru 7 years ago
			`def __init__(self, match):`
[e621] derive from Danbooru extractors (#651) - use extractor implementations from 'danbooru' - use "page": "b[ID]" to paginate over results instead of "tags": "id:<[ID]", avoiding infinite loops with certain post orders - bump User-Agent version 5 years ago			`super().__init__(match)`
[danbooru] restore 'popular' functionality 5 years ago			`self.params.update(text.parse_query(match.group(2)))`

			`def metadata(self):`
			`self.page_start = self.page_start or 1`
			`scale = self.params.get("scale", "day")`
			`date = self.params.get("date") or datetime.date.today().isoformat()`

			`if scale == "week":`
			`date = datetime.date.fromisoformat(date)`
			`date = (date - datetime.timedelta(days=date.weekday())).isoformat()`
			`elif scale == "month":`
			`date = date[:-3]`

			`return {"date": date, "scale": scale}`

			`def posts(self):`
			`url = self.root + "/explore/posts/popular.json"`
			`return self._pagination(url, True)`