[zerochan] add 'tag' and 'image' extractors (#1434)

2 years ago · 3a8addfe45
parent e660e48a60
commit 3a8addfe45
3 changed files with 127 additions and 0 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -895,6 +895,12 @@ Consider all sites to be NSFW unless otherwise known.
    <td>Galleries, User Profiles</td>
    <td></td>
 </tr>
+<tr>
+    <td>Zerochan</td>
+    <td>https://www.zerochan.net/</td>
+    <td>individual Images, Tag Searches</td>
+    <td></td>
+</tr>
 <tr>
    <td>かべうち</td>
    <td>https://kabe-uchiroom.com/</td>
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -150,6 +150,7 @@ modules = [
    "wikieat",
    "xhamster",
    "xvideos",
+    "zerochan",
    "booru",
    "moebooru",
    "foolfuuka",
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.zerochan.net/"""
+
+from .booru import BooruExtractor
+from .. import text
+
+# Regex prefix matching the site's host, with optional scheme and "www.";
+# the extractor classes below append their path patterns to it.
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
+
+
+class ZerochanExtractor(BooruExtractor):
+    """Base class for zerochan extractors"""
+    category = "zerochan"
+    root = "https://www.zerochan.net"
+    filename_fmt = "{id}.{extension}"
+    archive_fmt = "{id}"
+
+    def _parse_entry_page(self, entry_id):
+        url = "{}/{}".format(self.root, entry_id)
+        extr = text.extract_from(self.request(url).text)
+
+        return {
+            "id"    : entry_id,
+            "author": extr('"author": "', '"'),
+            "file_url": extr('"contentUrl": "', '"'),
+            "date"  : text.parse_datetime(extr(
+                '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"),
+            "width" : extr('"width": "', ' '),
+            "height": extr('"height": "', ' '),
+            "size"  : extr('"contentSize": "', 'B'),
+        }
+
+
+class ZerochanTagExtractor(ZerochanExtractor):
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    pattern = BASE_PATTERN + r"/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?"
+    test = ("https://www.zerochan.net/Perth+%28Kantai+Collection%29", {
+        "pattern": r"https://static\.zerochan\.net/.+\.full\.\d+\.(jpg|png)",
+        "count": "> 24",
+        "keywords": {
+            "extension": r"re:jpg|png",
+            "file_url": "",
+            "filename": r"re:Perth.\(Kantai.Collection\).full.\d+",
+            "height": r"re:^\d+$",
+            "id": r"re:^\d+$",
+            "name": "Perth (Kantai Collection)",
+            "search_tags": "Perth (Kantai Collection)",
+            "size": r"re:^\d+k$",
+            "width": r"re:^\d+$",
+        },
+    })
+
+    def __init__(self, match):
+        ZerochanExtractor.__init__(self, match)
+        self.search_tag, self.query = match.groups()
+
+    def metadata(self):
+        return {"search_tags": text.unquote(
+            self.search_tag.replace("+", " "))}
+
+    def posts(self):
+        url = self.root + "/" + self.search_tag
+        params = text.parse_query(self.query)
+        params["p"] = text.parse_int(params.get("p"), 1)
+
+        while True:
+            page = self.request(url, params=params).text
+            thumbs = text.extract(page, '<ul id="thumbs', '</ul>')[0]
+            extr = text.extract_from(thumbs)
+
+            while True:
+                post = extr('<li class="', '>')
+                if not post:
+                    break
+                yield {
+                    "id"    : extr('href="/', '"'),
+                    "name"  : extr('alt="', '"'),
+                    "width" : extr('title="', 'x'),
+                    "height": extr('', ' '),
+                    "size"  : extr('', 'B'),
+                    "file_url": "https://static." + extr(
+                        '<a href="https://static.', '"'),
+                }
+
+            if 'rel="next"' not in page:
+                break
+            params["p"] += 1
+
+
+class ZerochanImageExtractor(ZerochanExtractor):
+    subcategory = "image"
+    pattern = BASE_PATTERN + r"/(\d+)"
+    test = ("https://www.zerochan.net/2920445", {
+        "pattern": r"https://static\.zerochan\.net/"
+                   r"Perth\.%28Kantai\.Collection%29\.full.2920445\.jpg",
+        "keyword": {
+            "author": "YukinoTokisaki",
+            "date": "dt:2020-04-24 21:33:44",
+            "file_url": str,
+            "filename": "Perth.(Kantai.Collection).full.2920445",
+            "height": "1366",
+            "id": "2920445",
+            "size": "1929k",
+            "width": "1920",
+        },
+    })
+
+    def __init__(self, match):
+        ZerochanExtractor.__init__(self, match)
+        self.image_id = match.group(1)
+
+    def posts(self):
+        return (self._parse_entry_page(self.image_id),)