From 4a57509392d992d9ae6d957b1ac8fb338028ec7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 1 Jul 2018 22:28:52 +0200 Subject: [PATCH] generalize tag-splitting option (#92) - extend functionality to other booru sites: - http://behoimi.org/ - https://konachan.com/ - https://e621.net/ - https://rule34.xxx/ - https://safebooru.org/ - https://yande.re/ --- docs/configuration.rst | 21 +++++++++++-------- gallery_dl/extractor/3dbooru.py | 8 +++++++ gallery_dl/extractor/booru.py | 35 ++++++++++++++++++++++++------- gallery_dl/extractor/e621.py | 7 +++++++ gallery_dl/extractor/konachan.py | 24 ++++++++++++++------- gallery_dl/extractor/rule34.py | 13 ++++++++++-- gallery_dl/extractor/safebooru.py | 12 +++++++++-- gallery_dl/extractor/yandere.py | 28 +------------------------ 8 files changed, 92 insertions(+), 56 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 8b3a393f..2acdebed 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -617,20 +617,23 @@ Description A (comma-separated) list of post types to extract images, etc. from. =========== ===== +extractor.3dbooru.tags +---------------------- +extractor.e621.tags +------------------- +extractor.konachan.tags +----------------------- +extractor.rule34.tags +--------------------- +extractor.safebooru.tags +------------------------ extractor.yandere.tags ---------------------- =========== ===== Type ``bool`` Default ``false`` -Description Split tags into different categories - and provide the following additional metadata-entries: - - - ``tags_artist`` - - ``tags_character`` - - ``tags_circle`` - - ``tags_copyright`` - - ``tags_faults`` - - ``tags_general`` +Description Categorize tags by their respective types + and provide them as ``tags_<type>`` metadata fields. Note: This requires 1 additional HTTP request for each post. 
=========== ===== diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py index c47036eb..d6ac50aa 100644 --- a/gallery_dl/extractor/3dbooru.py +++ b/gallery_dl/extractor/3dbooru.py @@ -15,6 +15,7 @@ class ThreedeebooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): """Base class for 3dbooru extractors""" category = "3dbooru" api_url = "http://behoimi.org/post/index.json" + post_url = "http://behoimi.org/post/show/{}" page_limit = 1000 def __init__(self, match): @@ -53,6 +54,13 @@ class ThreedeebooruPostExtractor(booru.PostMixin, test = [("http://behoimi.org/post/show/140852", { "url": "ce874ea26f01d6c94795f3cc3aaaaa9bc325f2f6", "content": "26549d55b82aa9a6c1686b96af8bfcfa50805cd4", + "options": (("tags", True),), + "keyword": { + "tags_character": "furude_rika", + "tags_copyright": "higurashi_no_naku_koro_ni", + "tags_model": "himekawa_azuru", + "tags_general": str, + }, })] diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 922d8201..f702b8d7 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -11,8 +11,10 @@ from .common import SharedConfigExtractor, Message from .. 
import text from xml.etree import ElementTree +import collections import datetime import operator +import re class BooruExtractor(SharedConfigExtractor): @@ -20,6 +22,7 @@ class BooruExtractor(SharedConfigExtractor): basecategory = "booru" filename_fmt = "{category}_{id}_{md5}.{extension}" api_url = "" + post_url = "" per_page = 50 page_start = 1 page_limit = None @@ -28,6 +31,10 @@ class BooruExtractor(SharedConfigExtractor): def __init__(self, match): super().__init__() self.params = {} + self.prepare = None + + if self.post_url and self.config("tags", False): + self.prepare = self._extended_tags def skip(self, num): pages = num // self.per_page @@ -50,17 +57,18 @@ class BooruExtractor(SharedConfigExtractor): for image in images: try: url = image["file_url"] - if url.startswith("/"): - url = text.urljoin(self.api_url, url) - image.update(data) - self.prepare(image) - yield Message.Url, url, text.nameext_from_url(url, image) except KeyError: continue + if url.startswith("/"): + url = text.urljoin(self.api_url, url) + image.update(data) + if self.prepare: + self.prepare(image) + yield Message.Url, url, text.nameext_from_url(url, image) if len(images) < self.per_page: return - self.update_page(images[-1]) + self.update_page(image) def reset_page(self): """Initialize params to point to the first page""" @@ -81,8 +89,19 @@ class BooruExtractor(SharedConfigExtractor): """Collect metadata for extractor-job""" return {} - def prepare(self, image): - """Prepare and modify an 'image' object""" + def _extended_tags(self, image): + """Rerieve extended tag information""" + url = self.post_url.format(image["id"]) + page = self.request(url).text + tag_html = text.extract(page, '