From 3a8addfe458ae933885fae253f0b159f3f786130 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 27 Jul 2022 22:58:23 +0200 Subject: [PATCH] [zerochan] add 'tag' and 'image' extractors (#1434) --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/zerochan.py | 120 +++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 gallery_dl/extractor/zerochan.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c37fc4ae..30e74ded 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -895,6 +895,12 @@ Consider all sites to be NSFW unless otherwise known. Galleries, User Profiles + + Zerochan + https://www.zerochan.net/ + individual Images, Tag Searches + + かべうち https://kabe-uchiroom.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 70cebb37..118000b7 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -150,6 +150,7 @@ modules = [ "wikieat", "xhamster", "xvideos", + "zerochan", "booru", "moebooru", "foolfuuka", diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py new file mode 100644 index 00000000..256eb8e4 --- /dev/null +++ b/gallery_dl/extractor/zerochan.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.zerochan.net/""" + +from .booru import BooruExtractor +from .. import text + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net" + + +class ZerochanExtractor(BooruExtractor): + """Base class for zerochan extractors""" + category = "zerochan" + root = "https://www.zerochan.net" + filename_fmt = "{id}.{extension}" + archive_fmt = "{id}" + + def _parse_entry_page(self, entry_id): + url = "{}/{}".format(self.root, entry_id) + extr = text.extract_from(self.request(url).text) + + return { + "id" : entry_id, + "author": extr('"author": "', '"'), + "file_url": extr('"contentUrl": "', '"'), + "date" : text.parse_datetime(extr( + '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"), + "width" : extr('"width": "', ' '), + "height": extr('"height": "', ' '), + "size" : extr('"contentSize": "', 'B'), + } + + +class ZerochanTagExtractor(ZerochanExtractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + pattern = BASE_PATTERN + r"/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?" + test = ("https://www.zerochan.net/Perth+%28Kantai+Collection%29", { + "pattern": r"https://static\.zerochan\.net/.+\.full\.\d+\.(jpg|png)", + "count": "> 24", + "keywords": { + "extension": r"re:jpg|png", + "file_url": "", + "filename": r"re:Perth.\(Kantai.Collection\).full.\d+", + "height": r"re:^\d+$", + "id": r"re:^\d+$", + "name": "Perth (Kantai Collection)", + "search_tags": "Perth (Kantai Collection)", + "size": r"re:^\d+k$", + "width": r"re:^\d+$", + }, + }) + + def __init__(self, match): + ZerochanExtractor.__init__(self, match) + self.search_tag, self.query = match.groups() + + def metadata(self): + return {"search_tags": text.unquote( + self.search_tag.replace("+", " "))} + + def posts(self): + url = self.root + "/" + self.search_tag + params = text.parse_query(self.query) + params["p"] = text.parse_int(params.get("p"), 1) + + while True: + page = self.request(url, params=params).text + thumbs = text.extract(page, '