From 4a57509392d992d9ae6d957b1ac8fb338028ec7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Sun, 1 Jul 2018 22:28:52 +0200
Subject: [PATCH] generalize tag-splitting option (#92)

- extend functionality to other booru sites:
  - http://behoimi.org/
  - https://konachan.com/
  - https://e621.net/
  - https://rule34.xxx/
  - https://safebooru.org/
  - https://yande.re/
---
 docs/configuration.rst            | 21 +++++++++++--------
 gallery_dl/extractor/3dbooru.py   |  8 +++++++
 gallery_dl/extractor/booru.py     | 35 ++++++++++++++++++++++++-------
 gallery_dl/extractor/e621.py      |  7 +++++++
 gallery_dl/extractor/konachan.py  | 24 ++++++++++++++-------
 gallery_dl/extractor/rule34.py    | 13 ++++++++++--
 gallery_dl/extractor/safebooru.py | 12 +++++++++--
 gallery_dl/extractor/yandere.py   | 28 +------------------------
 8 files changed, 92 insertions(+), 56 deletions(-)
diff --git a/docs/configuration.rst b/docs/configuration.rst
index 8b3a393f..2acdebed 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -617,20 +617,23 @@ Description A (comma-separated) list of post types to extract images, etc. from.
 =========== =====
 
 
+extractor.3dbooru.tags
+----------------------
+extractor.e621.tags
+-------------------
+extractor.konachan.tags
+-----------------------
+extractor.rule34.tags
+---------------------
+extractor.safebooru.tags
+------------------------
 extractor.yandere.tags
 ----------------------
 =========== =====
 Type        ``bool``
 Default     ``false``
-Description Split tags into different categories
-            and provide the following additional metadata-entries:
-
-            - ``tags_artist``
-            - ``tags_character``
-            - ``tags_circle``
-            - ``tags_copyright``
-            - ``tags_faults``
-            - ``tags_general``
+Description Categorize tags by their respective types
+            and provide them as ``tags_<type>`` metadata fields.
 
             Note: This requires 1 additional HTTP request for each post.
 =========== =====
diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py
index c47036eb..d6ac50aa 100644
--- a/gallery_dl/extractor/3dbooru.py
+++ b/gallery_dl/extractor/3dbooru.py
@@ -15,6 +15,7 @@ class ThreedeebooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
     """Base class for 3dbooru extractors"""
     category = "3dbooru"
     api_url = "http://behoimi.org/post/index.json"
+    post_url = "http://behoimi.org/post/show/{}"
     page_limit = 1000
 
     def __init__(self, match):
@@ -53,6 +54,13 @@ class ThreedeebooruPostExtractor(booru.PostMixin,
     test = [("http://behoimi.org/post/show/140852", {
         "url": "ce874ea26f01d6c94795f3cc3aaaaa9bc325f2f6",
         "content": "26549d55b82aa9a6c1686b96af8bfcfa50805cd4",
+        "options": (("tags", True),),
+        "keyword": {
+            "tags_character": "furude_rika",
+            "tags_copyright": "higurashi_no_naku_koro_ni",
+            "tags_model": "himekawa_azuru",
+            "tags_general": str,
+        },
     })]
 
 
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index 922d8201..f702b8d7 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -11,8 +11,10 @@
 from .common import SharedConfigExtractor, Message
 from .. import text
 from xml.etree import ElementTree
+import collections
 import datetime
 import operator
+import re
 
 
 class BooruExtractor(SharedConfigExtractor):
@@ -20,6 +22,7 @@ class BooruExtractor(SharedConfigExtractor):
     basecategory = "booru"
     filename_fmt = "{category}_{id}_{md5}.{extension}"
     api_url = ""
+    post_url = ""
     per_page = 50
     page_start = 1
     page_limit = None
@@ -28,6 +31,10 @@ class BooruExtractor(SharedConfigExtractor):
     def __init__(self, match):
         super().__init__()
         self.params = {}
+        self.prepare = None
+
+        if self.post_url and self.config("tags", False):
+            self.prepare = self._extended_tags
 
     def skip(self, num):
         pages = num // self.per_page
@@ -50,17 +57,18 @@ class BooruExtractor(SharedConfigExtractor):
             for image in images:
                 try:
                     url = image["file_url"]
-                    if url.startswith("/"):
-                        url = text.urljoin(self.api_url, url)
-                    image.update(data)
-                    self.prepare(image)
-                    yield Message.Url, url, text.nameext_from_url(url, image)
                 except KeyError:
                     continue
+                if url.startswith("/"):
+                    url = text.urljoin(self.api_url, url)
+                image.update(data)
+                if self.prepare:
+                    self.prepare(image)
+                yield Message.Url, url, text.nameext_from_url(url, image)
 
             if len(images) < self.per_page:
                 return
-            self.update_page(images[-1])
+            self.update_page(image)
 
     def reset_page(self):
         """Initialize params to point to the first page"""
@@ -81,8 +89,19 @@ class BooruExtractor(SharedConfigExtractor):
         """Collect metadata for extractor-job"""
         return {}
 
-    def prepare(self, image):
-        """Prepare and modify an 'image' object"""
+    def _extended_tags(self, image):
+        """Retrieve extended tag information"""
+        url = self.post_url.format(image["id"])
+        page = self.request(url).text
+        tag_html = text.extract(page, '<ul id="tag-', '</ul>')[0]
+
+        tags = collections.defaultdict(list)
+        pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S)
+        for tag_type, tag_name in pattern.findall(tag_html):
+            tags[tag_type].append(text.unquote(tag_name))
+
+        for key, value in tags.items():
+            image["tags_" + key] = " ".join(value)
 
 
 class XmlParserMixin():
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index 770e51e7..726f6c36 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -15,6 +15,7 @@ class E621Extractor(booru.MoebooruPageMixin, booru.BooruExtractor):
     """Base class for e621 extractors"""
     category = "e621"
     api_url = "https://e621.net/post/index.json"
+    post_url = "https://e621.net/post/show/{}"
     page_limit = 750
 
 
@@ -48,6 +49,12 @@ class E621PostExtractor(booru.PostMixin, E621Extractor):
     test = [("https://e621.net/post/show/535", {
         "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
         "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
+        "options": (("tags", True),),
+        "keyword": {
+            "tags_artist": "anry",
+            "tags_general": str,
+            "tags_species": str,
+        },
     })]
 
 
diff --git a/gallery_dl/extractor/konachan.py b/gallery_dl/extractor/konachan.py
index 7b537976..7b8e771e 100644
--- a/gallery_dl/extractor/konachan.py
+++ b/gallery_dl/extractor/konachan.py
@@ -16,9 +16,10 @@ class KonachanExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
     category = "konachan"
 
     def __init__(self, match):
+        root = "https://konachan." + match.group("tld")
+        self.api_url = root + "/post.json"
+        self.post_url = root + "/post/show/{}"
         super().__init__(match)
-        self.api_url = "https://konachan.{tld}/post.json".format(
-            tld=match.group("tld"))
 
 
 class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
@@ -26,10 +27,10 @@ class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
     pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
                r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"]
     test = [
-        ("http://konachan.com/post?tags=patata", {
+        ("https://konachan.com/post?tags=patata", {
             "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d",
         }),
-        ("http://konachan.net/post?tags=patata", None),
+        ("https://konachan.net/post?tags=patata", None),
     ]
 
 
@@ -38,10 +39,10 @@ class KonachanPoolExtractor(booru.PoolMixin, KonachanExtractor):
     pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
                r"/pool/show/(?P<pool>\d+)"]
     test = [
-        ("http://konachan.com/pool/show/95", {
+        ("https://konachan.com/pool/show/95", {
             "content": "cf0546e38a93c2c510a478f8744e60687b7a8426",
         }),
-        ("http://konachan.net/pool/show/95", None),
+        ("https://konachan.net/pool/show/95", None),
     ]
 
 
@@ -50,10 +51,17 @@ class KonachanPostExtractor(booru.PostMixin, KonachanExtractor):
     pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
                r"/post/show/(?P<post>\d+)"]
     test = [
-        ("http://konachan.com/post/show/205189", {
+        ("https://konachan.com/post/show/205189", {
             "content": "674e75a753df82f5ad80803f575818b8e46e4b65",
+            "options": (("tags", True),),
+            "keyword": {
+                "tags_artist": "patata",
+                "tags_character": "clownpiece",
+                "tags_copyright": "touhou",
+                "tags_general": str,
+            },
         }),
-        ("http://konachan.net/post/show/205189", None),
+        ("https://konachan.net/post/show/205189", None),
     ]
 
 
diff --git a/gallery_dl/extractor/rule34.py b/gallery_dl/extractor/rule34.py
index d07e2264..fcf03643 100644
--- a/gallery_dl/extractor/rule34.py
+++ b/gallery_dl/extractor/rule34.py
@@ -17,6 +17,7 @@ class Rule34Extractor(booru.XmlParserMixin,
     """Base class for rule34 extractors"""
     category = "rule34"
     api_url = "https://rule34.xxx/index.php"
+    post_url = "https://rule34.xxx/index.php?page=post&s=view&id={}"
     page_limit = 4000
 
     def __init__(self, match):
@@ -28,7 +29,7 @@ class Rule34TagExtractor(booru.TagMixin, Rule34Extractor):
     """Extractor for images from rule34.xxx based on search-tags"""
     pattern = [(r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
                 r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")]
-    test = [("http://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
+    test = [("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
         "content": "a01768c6f86f32eb7ebbdeb87c30b0d9968d7f97",
         "pattern": r"https?://(.?img\.)?rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
         "count": 2,
@@ -39,6 +40,14 @@ class Rule34PostExtractor(booru.PostMixin, Rule34Extractor):
     """Extractor for single images from rule34.xxx"""
     pattern = [(r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
                 r"\?page=post&s=view&id=(?P<post>\d+)")]
-    test = [("http://rule34.xxx/index.php?page=post&s=view&id=1974854", {
+    test = [("https://rule34.xxx/index.php?page=post&s=view&id=1974854", {
         "content": "fd2820df78fb937532da0a46f7af6cefc4dc94be",
+        "options": (("tags", True),),
+        "keyword": {
+            "tags_artist": "danraku",
+            "tags_character": "io_(pso2)",
+            "tags_copyright": "phantasy_star phantasy_star_online_2",
+            "tags_general": "blue_hair female",
+            "tags_metadata": "absurdres highres",
+        },
     })]
diff --git a/gallery_dl/extractor/safebooru.py b/gallery_dl/extractor/safebooru.py
index 442748a4..13f9aff5 100644
--- a/gallery_dl/extractor/safebooru.py
+++ b/gallery_dl/extractor/safebooru.py
@@ -17,6 +17,7 @@ class SafebooruExtractor(booru.XmlParserMixin,
     """Base class for safebooru extractors"""
     category = "safebooru"
     api_url = "https://safebooru.org/index.php"
+    post_url = "https://safebooru.org/index.php?page=post&s=view&id={}"
 
     def __init__(self, match):
         super().__init__(match)
@@ -27,7 +28,7 @@ class SafebooruTagExtractor(booru.TagMixin, SafebooruExtractor):
     """Extractor for images from safebooru.org based on search-tags"""
     pattern = [(r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
                 r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")]
-    test = [("http://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
+    test = [("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
         "url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
         "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
     })]
@@ -37,7 +38,14 @@ class SafebooruPostExtractor(booru.PostMixin, SafebooruExtractor):
     """Extractor for single images from safebooru.org"""
     pattern = [(r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
                 r"\?page=post&s=view&id=(?P<post>\d+)")]
-    test = [("http://safebooru.org/index.php?page=post&s=view&id=1169132", {
+    test = [("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
         "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
         "content": "93b293b27dabd198afafabbaf87c49863ac82f27",
+        "options": (("tags", True),),
+        "keyword": {
+            "tags_artist": "kawanakajima",
+            "tags_character": "heath_ledger ronald_mcdonald the_joker",
+            "tags_copyright": "dc_comics mcdonald's the_dark_knight",
+            "tags_general": str,
+        },
     })]
diff --git a/gallery_dl/extractor/yandere.py b/gallery_dl/extractor/yandere.py
index 62146ce3..508b3d9f 100644
--- a/gallery_dl/extractor/yandere.py
+++ b/gallery_dl/extractor/yandere.py
@@ -9,37 +9,13 @@
 """Extract images from https://yande.re/"""
 
 from . import booru
-from .. import text
 
 
 class YandereExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
     """Base class for yandere extractors"""
     category = "yandere"
     api_url = "https://yande.re/post.json"
-
-    def __init__(self, match):
-        super().__init__(match)
-        if self.config("tags", False):
-            self.prepare = self._categorize_tags
-
-    def _categorize_tags(self, image):
-        url = "https://yande.re/post/show/{}".format(image["id"])
-        page = self.request(url).text
-        taghtml = text.extract(page, '<ul id="tag-sidebar">', '</ul>')[0]
-
-        pos = 0
-        tags = {"artist": [], "copyright": [], "character": [],
-                "circle": [], "faults": [], "general": []}
-
-        while True:
-            tagtype, pos = text.extract(taghtml, "tag-type-", '"', pos)
-            if not tagtype:
-                break
-            tagname, pos = text.extract(taghtml, "?tags=", '"', pos)
-            tags[tagtype].append(text.unquote(tagname))
-
-        for key, value in tags.items():
-            image["tags_" + key] = " ".join(value)
+    post_url = "https://yande.re/post/show/{}"
 
 
 class YandereTagExtractor(booru.TagMixin, YandereExtractor):
@@ -69,8 +45,6 @@ class YanderePostExtractor(booru.PostMixin, YandereExtractor):
             "tags_artist": "sasaki_tamaru",
             "tags_circle": "softhouse_chara",
             "tags_copyright": "ouzoku",
-            "tags_character": str,
-            "tags_faults": str,
             "tags_general": str,
         },
     })]