From 86c00f9e6694def9db92e5b6e3fc2ea7e03a3928 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 28 Feb 2020 22:53:45 +0100 Subject: [PATCH] [danbooru] move extractor logic from booru.py --- gallery_dl/extractor/booru.py | 13 +--- gallery_dl/extractor/danbooru.py | 129 ++++++++++++++++++++++++++----- 2 files changed, 109 insertions(+), 33 deletions(-) diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index ac45e0bc..a48b48b1 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -27,7 +27,6 @@ class BooruExtractor(SharedConfigMixin, Extractor): page_start = 1 page_limit = None sort = False - ugoira = True def __init__(self, match): super().__init__(match) @@ -52,11 +51,7 @@ class BooruExtractor(SharedConfigMixin, Extractor): for image in images: try: - if "pixiv_ugoira_frame_data" in image and \ - "large_file_url" in image and not self.ugoira: - url = image["large_file_url"] - else: - url = image["file_url"] + url = image["file_url"] except KeyError: continue if url.startswith("/"): @@ -112,12 +107,6 @@ class XmlParserMixin(): return [post.attrib for post in root] -class DanbooruPageMixin(): - """Pagination for Danbooru v2""" - def update_page(self, data): - self.params["page"] = "b{}".format(data["id"]) - - class MoebooruPageMixin(): """Pagination for Moebooru and Danbooru v1""" def update_page(self, data): diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index e8d3abf9..3d7f78f2 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -1,43 +1,97 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2019 Mike Fährmann +# Copyright 2014-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-"""Extract images from https://danbooru.donmai.us/""" +"""Extractors for https://danbooru.donmai.us/""" -from . import booru +from .common import Extractor, Message, SharedConfigMixin +from .. import text BASE_PATTERN = ( r"(?:https?://)?" - r"(?Pdanbooru|hijiribe|sonohara|safebooru)" - r"\.donmai\.us") + r"(danbooru|hijiribe|sonohara|safebooru)" + r"\.donmai\.us" +) -class DanbooruExtractor(booru.DanbooruPageMixin, booru.BooruExtractor): +class DanbooruExtractor(SharedConfigMixin, Extractor): """Base class for danbooru extractors""" + basecategory = "booru" category = "danbooru" + filename_fmt = "{category}_{id}_{md5}.{extension}" page_limit = 1000 + page_start = None + per_page = 100 def __init__(self, match): - super().__init__(match) - self.subdomain = match.group("subdomain") - self.scheme = "https" if self.subdomain == "danbooru" else "http" - self.api_url = "{scheme}://{subdomain}.donmai.us/posts.json".format( - scheme=self.scheme, subdomain=self.subdomain) + Extractor.__init__(self, match) + self.root = "https://{}.donmai.us".format(match.group(1)) + self.api_url = self.root + "/posts.json" self.ugoira = self.config("ugoira", True) + self.params = {} username, api_key = self._get_auth_info() if username: self.log.debug("Using HTTP Basic Auth for user '%s'", username) self.session.auth = (username, api_key) - -class DanbooruTagExtractor(booru.TagMixin, DanbooruExtractor): - """Extractor for images from danbooru based on search-tags""" + def skip(self, num): + pages = num // self.per_page + if pages >= self.page_limit: + pages = self.page_limit - 1 + self.page_start = pages + 1 + return pages * self.per_page + + def items(self): + data = self.metadata() + for post in self.posts(): + try: + url = post["file_url"] + except KeyError: + continue + + text.nameext_from_url(url, post) + if post["extension"] == "zip": + if self.ugoira: + post["frames"] = self.request( + "{}/posts/{}.json?only=pixiv_ugoira_frame_data".format( + self.root, post["id"]) + 
).json()["pixiv_ugoira_frame_data"]["data"] + else: + url = post["large_file_url"] + post["extension"] = "webm" + + post.update(data) + yield Message.Directory, post + yield Message.Url, url, post + + def metadata(self): + return {} + + def posts(self): + params = self.params.copy() + params["limit"] = self.per_page + params["page"] = self.page_start + + while True: + posts = self.request(self.api_url, params=params).json() + yield from posts + + if len(posts) < self.per_page: + return + params["page"] = "b{}".format(posts[-1]["id"]) + + +class DanbooruTagExtractor(DanbooruExtractor): + """Extractor for danbooru posts from tag searches""" + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=(?P[^&#]+)" test = ( ("https://danbooru.donmai.us/posts?tags=bonocho", { @@ -52,18 +106,41 @@ class DanbooruTagExtractor(booru.TagMixin, DanbooruExtractor): ("https://safebooru.donmai.us/posts?tags=bonocho"), ) + def __init__(self, match): + DanbooruExtractor.__init__(self, match) + self.params["tags"] = text.unquote(match.group(2).replace("+", " ")) + + def metadata(self): + return {"search_tags": self.params["tags"]} + -class DanbooruPoolExtractor(booru.PoolMixin, DanbooruExtractor): - """Extractor for image-pools from danbooru""" - pattern = BASE_PATTERN + r"/pools/(?P\d+)" +class DanbooruPoolExtractor(DanbooruExtractor): + """Extractor for posts from danbooru pools""" + subcategory = "pool" + directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}") + archive_fmt = "p_{pool[id]}_{id}" + pattern = BASE_PATTERN + r"/pools/(\d+)" test = ("https://danbooru.donmai.us/pools/7659", { "content": "b16bab12bea5f7ea9e0a836bf8045f280e113d99", }) - -class DanbooruPostExtractor(booru.PostMixin, DanbooruExtractor): - """Extractor for single images from danbooru""" - pattern = BASE_PATTERN + r"/posts/(?P\d+)" + def __init__(self, match): + DanbooruExtractor.__init__(self, 
match) + self.pool_id = match.group(2) + self.params["tags"] = "pool:" + self.pool_id + + def metadata(self): + url = "{}/pools/{}.json".format(self.root, self.pool_id) + pool = self.request(url).json() + del pool["post_ids"] + return {"pool": pool} + + +class DanbooruPostExtractor(DanbooruExtractor): + """Extractor for single danbooru posts""" + subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/posts/(\d+)" test = ( ("https://danbooru.donmai.us/posts/294929", { "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", @@ -74,7 +151,16 @@ class DanbooruPostExtractor(booru.PostMixin, DanbooruExtractor): }) ) + def __init__(self, match): + DanbooruExtractor.__init__(self, match) + self.post_id = match.group(2) + + def posts(self): + url = "{}/posts/{}.json".format(self.root, self.post_id) + return (self.request(url).json(),) + +r''' class DanbooruPopularExtractor(booru.PopularMixin, DanbooruExtractor): """Extractor for popular images from danbooru""" pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?(?P<query>[^#]*))?" @@ -91,3 +177,4 @@ class DanbooruPopularExtractor(booru.PopularMixin, DanbooruExtractor): urlfmt = "{scheme}://{subdomain}.donmai.us/explore/posts/popular.json" self.api_url = urlfmt.format( scheme=self.scheme, subdomain=self.subdomain) +'''