From cc6b9e4c186bba827863a84c86a98fc83d7600ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 25 Feb 2024 00:36:14 +0100 Subject: [PATCH] [zerochan] use API by default (#3669) add 'pagination' option --- docs/configuration.rst | 15 +++++++++ gallery_dl/extractor/zerochan.py | 58 ++++++++++++++++++++++++++++---- test/results/zerochan.py | 52 ++++++++++++++++++++++++++-- 3 files changed, 117 insertions(+), 8 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 1b61a4c9..26a8e2b7 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -4131,6 +4131,21 @@ Description Note: This requires 1-2 additional HTTP requests per post. +extractor.zerochan.pagination +----------------------------- +Type + ``string`` +Default + ``"api"`` +Description + Controls how to paginate over tag search results. + + * ``"api"``: Use the `JSON API <https://www.zerochan.net/api>`__ + (no ``extension`` metadata) + * ``"html"``: Parse HTML pages + (limited to 100 pages * 24 posts) + + extractor.[booru].tags ---------------------- Type diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 6ee96e67..fc61dffc 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -10,7 +10,7 @@ from .booru import BooruExtractor from ..cache import cache -from .. import text, exception +from .. 
import text, util, exception BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net" @@ -21,8 +21,11 @@ class ZerochanExtractor(BooruExtractor): root = "https://www.zerochan.net" filename_fmt = "{id}.{extension}" archive_fmt = "{id}" + page_start = 1 + per_page = 250 cookies_domain = ".zerochan.net" cookies_names = ("z_id", "z_hash") + request_interval = (0.5, 1.5) def login(self): self._logged_in = True @@ -86,7 +89,7 @@ class ZerochanExtractor(BooruExtractor): return data - def _parse_entry_json(self, entry_id): + def _parse_entry_api(self, entry_id): url = "{}/{}?json".format(self.root, entry_id) item = self.request(url).json() @@ -117,14 +120,22 @@ class ZerochanTagExtractor(ZerochanExtractor): ZerochanExtractor.__init__(self, match) self.search_tag, self.query = match.groups() + def _init(self): + if self.config("pagination") == "html": + self.posts = self.posts_html + self.per_page = 24 + else: + self.posts = self.posts_api + self.session.headers["User-Agent"] = util.USERAGENT + def metadata(self): return {"search_tags": text.unquote( self.search_tag.replace("+", " "))} - def posts(self): + def posts_html(self): url = self.root + "/" + self.search_tag params = text.parse_query(self.query) - params["p"] = text.parse_int(params.get("p"), 1) + params["p"] = text.parse_int(params.get("p"), self.page_start) metadata = self.config("metadata") while True: @@ -140,7 +151,7 @@ class ZerochanTagExtractor(ZerochanExtractor): if metadata: entry_id = extr('href="/', '"') post = self._parse_entry_html(entry_id) - post.update(self._parse_entry_json(entry_id)) + post.update(self._parse_entry_api(entry_id)) yield post else: yield { @@ -157,6 +168,41 @@ class ZerochanTagExtractor(ZerochanExtractor): break params["p"] += 1 + def posts_api(self): + url = self.root + "/" + self.search_tag + metadata = self.config("metadata") + params = { + "json": "1", + "l" : self.per_page, + "p" : self.page_start, + } + + static = "https://static.zerochan.net/.full." 
+ + while True: + data = self.request(url, params=params).json() + try: + posts = data["items"] + except (KeyError, TypeError): + return + + if metadata: + for post in posts: + post_id = post["id"] + post.update(self._parse_entry_html(post_id)) + post.update(self._parse_entry_api(post_id)) + else: + for post in posts: + base = static + str(post["id"]) + post["file_url"] = base + ".jpg" + post["_fallback"] = (base + ".png",) + + yield from posts + + if not data.get("next"): + return + params["p"] += 1 + class ZerochanImageExtractor(ZerochanExtractor): subcategory = "image" @@ -170,5 +216,5 @@ class ZerochanImageExtractor(ZerochanExtractor): def posts(self): post = self._parse_entry_html(self.image_id) if self.config("metadata"): - post.update(self._parse_entry_json(self.image_id)) + post.update(self._parse_entry_api(self.image_id)) return (post,) diff --git a/test/results/zerochan.py b/test/results/zerochan.py index dac7890b..e56812fe 100644 --- a/test/results/zerochan.py +++ b/test/results/zerochan.py @@ -12,8 +12,27 @@ __tests__ = ( "#url" : "https://www.zerochan.net/Perth+%28Kantai+Collection%29", "#category": ("booru", "zerochan", "tag"), "#class" : zerochan.ZerochanTagExtractor, + "#pattern" : r"https://static\.zerochan\.net/\.full\.\d+\.jpg", + "#count" : "> 50", + + "extension" : r"jpg", + "file_url" : r"re:https://static\.zerochan\.net/\.full\.\d+\.jpg", + "filename" : r"re:\.full\.\d+", + "height" : int, + "id" : int, + "search_tags": "Perth (Kantai Collection)", + "tag" : r"re:(Perth \(Kantai Collection\)|Kantai Collection)", + "tags" : list, + "width" : int, +}, + +{ + "#url" : "https://www.zerochan.net/Perth+%28Kantai+Collection%29", + "#category": ("booru", "zerochan", "tag"), + "#class" : zerochan.ZerochanTagExtractor, + "#options" : {"pagination": "html"}, "#pattern" : r"https://static\.zerochan\.net/.+\.full\.\d+\.(jpg|png)", - "#count" : "> 24", + "#count" : "> 45", "extension" : r"re:jpg|png", "file_url" : 
r"re:https://static\.zerochan\.net/.+\.full\.\d+\.(jpg|png)", @@ -58,8 +77,37 @@ __tests__ = ( "Theme:Personification", "Theme:Pins", "Theme:Ribbon", - "Theme:Shirt", "Theme:Short Hair", + "Theme:Top", + ], + "uploader": "YukinoTokisaki", + "width" : 1920, +}, + +{ + "#url" : "https://www.zerochan.net/2920445", + "#category": ("booru", "zerochan", "image"), + "#class" : zerochan.ZerochanImageExtractor, + "#pattern" : r"https://static\.zerochan\.net/Perth\.%28Kantai\.Collection%29\.full.2920445\.jpg", + "#auth" : False, + + "author" : "YeFan 葉凡", + "date" : "dt:2020-04-24 21:33:44", + "file_url": "https://static.zerochan.net/Perth.%28Kantai.Collection%29.full.2920445.jpg", + "filename": "Perth.(Kantai.Collection).full.2920445", + "height" : 1366, + "id" : 2920445, + "path" : [ + "Kantai Collection", + "Perth (Kantai Collection)", + ], + "size" : 1975296, + "tags" : [ + "Mangaka:YeFan 葉凡", + "Game:Kantai Collection", + "Character:Perth (Kantai Collection)", + "Theme:Firefighter Outfit", + "Theme:Pins", ], "uploader": "YukinoTokisaki", "width" : 1920,