generalize tag-splitting option (#92)

- extend functionality to other booru sites:
  - http://behoimi.org/
  - https://konachan.com/
  - https://e621.net/
  - https://rule34.xxx/
  - https://safebooru.org/
  - https://yande.re/
6 years ago · 4a57509392
parent 188e956c4e
commit 4a57509392
8 changed files with 92 additions and 56 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@ -617,20 +617,23 @@ Description A (comma-separated) list of post types to extract images, etc. from.
 =========== =====


+extractor.3dbooru.tags
+----------------------
+extractor.e621.tags
+-------------------
+extractor.konachan.tags
+-----------------------
+extractor.rule34.tags
+---------------------
+extractor.safebooru.tags
+------------------------
 extractor.yandere.tags
 ----------------------
 =========== =====
 Type        ``bool``
 Default     ``false``
-Description Split tags into different categories
-            and provide the following additional metadata-entries:
-
-            - ``tags_artist``
-            - ``tags_character``
-            - ``tags_circle``
-            - ``tags_copyright``
-            - ``tags_faults``
-            - ``tags_general``
+Description Categorize tags by their respective types
+            and provide them as ``tags_<type>`` metadata fields.

            Note: This requires 1 additional HTTP request for each post.
 =========== =====
--- a/gallery_dl/extractor/3dbooru.py
+++ b/gallery_dl/extractor/3dbooru.py
@ -15,6 +15,7 @@ class ThreedeebooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
    """Base class for 3dbooru extractors"""
    category = "3dbooru"
    api_url = "http://behoimi.org/post/index.json"
+    post_url = "http://behoimi.org/post/show/{}"
    page_limit = 1000

    def __init__(self, match):
@ -53,6 +54,13 @@ class ThreedeebooruPostExtractor(booru.PostMixin,
    test = [("http://behoimi.org/post/show/140852", {
        "url": "ce874ea26f01d6c94795f3cc3aaaaa9bc325f2f6",
        "content": "26549d55b82aa9a6c1686b96af8bfcfa50805cd4",
+        "options": (("tags", True),),
+        "keyword": {
+            "tags_character": "furude_rika",
+            "tags_copyright": "higurashi_no_naku_koro_ni",
+            "tags_model": "himekawa_azuru",
+            "tags_general": str,
+        },
    })]


--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@ -11,8 +11,10 @@
 from .common import SharedConfigExtractor, Message
 from .. import text
 from xml.etree import ElementTree
+import collections
 import datetime
 import operator
+import re


 class BooruExtractor(SharedConfigExtractor):
@ -20,6 +22,7 @@ class BooruExtractor(SharedConfigExtractor):
    basecategory = "booru"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    api_url = ""
+    post_url = ""
    per_page = 50
    page_start = 1
    page_limit = None
@ -28,6 +31,10 @@ class BooruExtractor(SharedConfigExtractor):
    def __init__(self, match):
        super().__init__()
        self.params = {}
+        self.prepare = None
+
+        if self.post_url and self.config("tags", False):
+            self.prepare = self._extended_tags

    def skip(self, num):
        pages = num // self.per_page
@ -50,17 +57,18 @@ class BooruExtractor(SharedConfigExtractor):
            for image in images:
                try:
                    url = image["file_url"]
-                    if url.startswith("/"):
-                        url = text.urljoin(self.api_url, url)
-                    image.update(data)
-                    self.prepare(image)
-                    yield Message.Url, url, text.nameext_from_url(url, image)
                except KeyError:
                    continue
+                if url.startswith("/"):
+                    url = text.urljoin(self.api_url, url)
+                image.update(data)
+                if self.prepare:
+                    self.prepare(image)
+                yield Message.Url, url, text.nameext_from_url(url, image)

            if len(images) < self.per_page:
                return
-            self.update_page(images[-1])
+            self.update_page(image)

    def reset_page(self):
        """Initialize params to point to the first page"""
@ -81,8 +89,19 @@ class BooruExtractor(SharedConfigExtractor):
        """Collect metadata for extractor-job"""
        return {}

-    def prepare(self, image):
-        """Prepare and modify an 'image' object"""
+    def _extended_tags(self, image):
+        """Retrieve extended tag information"""
+        url = self.post_url.format(image["id"])
+        page = self.request(url).text
+        tag_html = text.extract(page, '<ul id="tag-', '</ul>')[0]
+
+        tags = collections.defaultdict(list)
+        pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S)
+        for tag_type, tag_name in pattern.findall(tag_html):
+            tags[tag_type].append(text.unquote(tag_name))
+
+        for key, value in tags.items():
+            image["tags_" + key] = " ".join(value)


 class XmlParserMixin():
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@ -15,6 +15,7 @@ class E621Extractor(booru.MoebooruPageMixin, booru.BooruExtractor):
    """Base class for e621 extractors"""
    category = "e621"
    api_url = "https://e621.net/post/index.json"
+    post_url = "https://e621.net/post/show/{}"
    page_limit = 750


@ -48,6 +49,12 @@ class E621PostExtractor(booru.PostMixin, E621Extractor):
    test = [("https://e621.net/post/show/535", {
        "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
        "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
+        "options": (("tags", True),),
+        "keyword": {
+            "tags_artist": "anry",
+            "tags_general": str,
+            "tags_species": str,
+        },
    })]


--- a/gallery_dl/extractor/konachan.py
+++ b/gallery_dl/extractor/konachan.py
@ -16,9 +16,10 @@ class KonachanExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
    category = "konachan"

    def __init__(self, match):
+        root = "https://konachan." + match.group("tld")
+        self.api_url = root + "/post.json"
+        self.post_url = root + "/post/show/{}"
        super().__init__(match)
-        self.api_url = "https://konachan.{tld}/post.json".format(
-            tld=match.group("tld"))


 class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
@ -26,10 +27,10 @@ class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
    pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
               r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"]
    test = [
-        ("http://konachan.com/post?tags=patata", {
+        ("https://konachan.com/post?tags=patata", {
            "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d",
        }),
-        ("http://konachan.net/post?tags=patata", None),
+        ("https://konachan.net/post?tags=patata", None),
    ]


@ -38,10 +39,10 @@ class KonachanPoolExtractor(booru.PoolMixin, KonachanExtractor):
    pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
               r"/pool/show/(?P<pool>\d+)"]
    test = [
-        ("http://konachan.com/pool/show/95", {
+        ("https://konachan.com/pool/show/95", {
            "content": "cf0546e38a93c2c510a478f8744e60687b7a8426",
        }),
-        ("http://konachan.net/pool/show/95", None),
+        ("https://konachan.net/pool/show/95", None),
    ]


@ -50,10 +51,17 @@ class KonachanPostExtractor(booru.PostMixin, KonachanExtractor):
    pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
               r"/post/show/(?P<post>\d+)"]
    test = [
-        ("http://konachan.com/post/show/205189", {
+        ("https://konachan.com/post/show/205189", {
            "content": "674e75a753df82f5ad80803f575818b8e46e4b65",
+            "options": (("tags", True),),
+            "keyword": {
+                "tags_artist": "patata",
+                "tags_character": "clownpiece",
+                "tags_copyright": "touhou",
+                "tags_general": str,
+            },
        }),
-        ("http://konachan.net/post/show/205189", None),
+        ("https://konachan.net/post/show/205189", None),
    ]


--- a/gallery_dl/extractor/rule34.py
+++ b/gallery_dl/extractor/rule34.py
@ -17,6 +17,7 @@ class Rule34Extractor(booru.XmlParserMixin,
    """Base class for rule34 extractors"""
    category = "rule34"
    api_url = "https://rule34.xxx/index.php"
+    post_url = "https://rule34.xxx/index.php?page=post&s=view&id={}"
    page_limit = 4000

    def __init__(self, match):
@ -28,7 +29,7 @@ class Rule34TagExtractor(booru.TagMixin, Rule34Extractor):
    """Extractor for images from rule34.xxx based on search-tags"""
    pattern = [(r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
                r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")]
-    test = [("http://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
+    test = [("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
        "content": "a01768c6f86f32eb7ebbdeb87c30b0d9968d7f97",
        "pattern": r"https?://(.?img\.)?rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
        "count": 2,
@ -39,6 +40,14 @@ class Rule34PostExtractor(booru.PostMixin, Rule34Extractor):
    """Extractor for single images from rule34.xxx"""
    pattern = [(r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
                r"\?page=post&s=view&id=(?P<post>\d+)")]
-    test = [("http://rule34.xxx/index.php?page=post&s=view&id=1974854", {
+    test = [("https://rule34.xxx/index.php?page=post&s=view&id=1974854", {
        "content": "fd2820df78fb937532da0a46f7af6cefc4dc94be",
+        "options": (("tags", True),),
+        "keyword": {
+            "tags_artist": "danraku",
+            "tags_character": "io_(pso2)",
+            "tags_copyright": "phantasy_star phantasy_star_online_2",
+            "tags_general": "blue_hair female",
+            "tags_metadata": "absurdres highres",
+        },
    })]
--- a/gallery_dl/extractor/safebooru.py
+++ b/gallery_dl/extractor/safebooru.py
@ -17,6 +17,7 @@ class SafebooruExtractor(booru.XmlParserMixin,
    """Base class for safebooru extractors"""
    category = "safebooru"
    api_url = "https://safebooru.org/index.php"
+    post_url = "https://safebooru.org/index.php?page=post&s=view&id={}"

    def __init__(self, match):
        super().__init__(match)
@ -27,7 +28,7 @@ class SafebooruTagExtractor(booru.TagMixin, SafebooruExtractor):
    """Extractor for images from safebooru.org based on search-tags"""
    pattern = [(r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
                r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")]
-    test = [("http://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
+    test = [("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
        "url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
        "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
    })]
@ -37,7 +38,14 @@ class SafebooruPostExtractor(booru.PostMixin, SafebooruExtractor):
    """Extractor for single images from safebooru.org"""
    pattern = [(r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
                r"\?page=post&s=view&id=(?P<post>\d+)")]
-    test = [("http://safebooru.org/index.php?page=post&s=view&id=1169132", {
+    test = [("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
        "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
        "content": "93b293b27dabd198afafabbaf87c49863ac82f27",
+        "options": (("tags", True),),
+        "keyword": {
+            "tags_artist": "kawanakajima",
+            "tags_character": "heath_ledger ronald_mcdonald the_joker",
+            "tags_copyright": "dc_comics mcdonald's the_dark_knight",
+            "tags_general": str,
+        },
    })]
--- a/gallery_dl/extractor/yandere.py
+++ b/gallery_dl/extractor/yandere.py
@ -9,37 +9,13 @@
 """Extract images from https://yande.re/"""

 from . import booru
-from .. import text


 class YandereExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
    """Base class for yandere extractors"""
    category = "yandere"
    api_url = "https://yande.re/post.json"
-
-    def __init__(self, match):
-        super().__init__(match)
-        if self.config("tags", False):
-            self.prepare = self._categorize_tags
-
-    def _categorize_tags(self, image):
-        url = "https://yande.re/post/show/{}".format(image["id"])
-        page = self.request(url).text
-        taghtml = text.extract(page, '<ul id="tag-sidebar">', '</ul>')[0]
-
-        pos = 0
-        tags = {"artist": [], "copyright": [], "character": [],
-                "circle": [], "faults": [], "general": []}
-
-        while True:
-            tagtype, pos = text.extract(taghtml, "tag-type-", '"', pos)
-            if not tagtype:
-                break
-            tagname, pos = text.extract(taghtml, "?tags=", '"', pos)
-            tags[tagtype].append(text.unquote(tagname))
-
-        for key, value in tags.items():
-            image["tags_" + key] = " ".join(value)
+    post_url = "https://yande.re/post/show/{}"


 class YandereTagExtractor(booru.TagMixin, YandereExtractor):
@ -69,8 +45,6 @@ class YanderePostExtractor(booru.PostMixin, YandereExtractor):
            "tags_artist": "sasaki_tamaru",
            "tags_circle": "softhouse_chara",
            "tags_copyright": "ouzoku",
-            "tags_character": str,
-            "tags_faults": str,
            "tags_general": str,
        },
    })]