generalize tag-splitting option (#92)

- extend functionality to other booru sites:
  - http://behoimi.org/
  - https://konachan.com/
  - https://e621.net/
  - https://rule34.xxx/
  - https://safebooru.org/
  - https://yande.re/
pull/133/head
Mike Fährmann 6 years ago
parent 188e956c4e
commit 4a57509392
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -617,20 +617,23 @@ Description A (comma-separated) list of post types to extract images, etc. from.
=========== ===== =========== =====
extractor.3dbooru.tags
----------------------
extractor.e621.tags
-------------------
extractor.konachan.tags
-----------------------
extractor.rule34.tags
---------------------
extractor.safebooru.tags
------------------------
extractor.yandere.tags extractor.yandere.tags
---------------------- ----------------------
=========== ===== =========== =====
Type ``bool`` Type ``bool``
Default ``false`` Default ``false``
Description Split tags into different categories Description Categorize tags by their respective types
and provide the following additional metadata-entries: and provide them as ``tags_<type>`` metadata fields.
- ``tags_artist``
- ``tags_character``
- ``tags_circle``
- ``tags_copyright``
- ``tags_faults``
- ``tags_general``
Note: This requires 1 additional HTTP request for each post. Note: This requires 1 additional HTTP request for each post.
=========== ===== =========== =====

@ -15,6 +15,7 @@ class ThreedeebooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
"""Base class for 3dbooru extractors""" """Base class for 3dbooru extractors"""
category = "3dbooru" category = "3dbooru"
api_url = "http://behoimi.org/post/index.json" api_url = "http://behoimi.org/post/index.json"
post_url = "http://behoimi.org/post/show/{}"
page_limit = 1000 page_limit = 1000
def __init__(self, match): def __init__(self, match):
@ -53,6 +54,13 @@ class ThreedeebooruPostExtractor(booru.PostMixin,
test = [("http://behoimi.org/post/show/140852", { test = [("http://behoimi.org/post/show/140852", {
"url": "ce874ea26f01d6c94795f3cc3aaaaa9bc325f2f6", "url": "ce874ea26f01d6c94795f3cc3aaaaa9bc325f2f6",
"content": "26549d55b82aa9a6c1686b96af8bfcfa50805cd4", "content": "26549d55b82aa9a6c1686b96af8bfcfa50805cd4",
"options": (("tags", True),),
"keyword": {
"tags_character": "furude_rika",
"tags_copyright": "higurashi_no_naku_koro_ni",
"tags_model": "himekawa_azuru",
"tags_general": str,
},
})] })]

@ -11,8 +11,10 @@
from .common import SharedConfigExtractor, Message from .common import SharedConfigExtractor, Message
from .. import text from .. import text
from xml.etree import ElementTree from xml.etree import ElementTree
import collections
import datetime import datetime
import operator import operator
import re
class BooruExtractor(SharedConfigExtractor): class BooruExtractor(SharedConfigExtractor):
@ -20,6 +22,7 @@ class BooruExtractor(SharedConfigExtractor):
basecategory = "booru" basecategory = "booru"
filename_fmt = "{category}_{id}_{md5}.{extension}" filename_fmt = "{category}_{id}_{md5}.{extension}"
api_url = "" api_url = ""
post_url = ""
per_page = 50 per_page = 50
page_start = 1 page_start = 1
page_limit = None page_limit = None
@ -28,6 +31,10 @@ class BooruExtractor(SharedConfigExtractor):
def __init__(self, match): def __init__(self, match):
super().__init__() super().__init__()
self.params = {} self.params = {}
self.prepare = None
if self.post_url and self.config("tags", False):
self.prepare = self._extended_tags
def skip(self, num): def skip(self, num):
pages = num // self.per_page pages = num // self.per_page
@ -50,17 +57,18 @@ class BooruExtractor(SharedConfigExtractor):
for image in images: for image in images:
try: try:
url = image["file_url"] url = image["file_url"]
except KeyError:
continue
if url.startswith("/"): if url.startswith("/"):
url = text.urljoin(self.api_url, url) url = text.urljoin(self.api_url, url)
image.update(data) image.update(data)
if self.prepare:
self.prepare(image) self.prepare(image)
yield Message.Url, url, text.nameext_from_url(url, image) yield Message.Url, url, text.nameext_from_url(url, image)
except KeyError:
continue
if len(images) < self.per_page: if len(images) < self.per_page:
return return
self.update_page(images[-1]) self.update_page(image)
def reset_page(self): def reset_page(self):
"""Initialize params to point to the first page""" """Initialize params to point to the first page"""
@ -81,8 +89,19 @@ class BooruExtractor(SharedConfigExtractor):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
return {} return {}
def prepare(self, image): def _extended_tags(self, image):
"""Prepare and modify an 'image' object""" """Retrieve extended tag information"""
url = self.post_url.format(image["id"])
page = self.request(url).text
tag_html = text.extract(page, '<ul id="tag-', '</ul>')[0]
tags = collections.defaultdict(list)
pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S)
for tag_type, tag_name in pattern.findall(tag_html):
tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items():
image["tags_" + key] = " ".join(value)
class XmlParserMixin(): class XmlParserMixin():

@ -15,6 +15,7 @@ class E621Extractor(booru.MoebooruPageMixin, booru.BooruExtractor):
"""Base class for e621 extractors""" """Base class for e621 extractors"""
category = "e621" category = "e621"
api_url = "https://e621.net/post/index.json" api_url = "https://e621.net/post/index.json"
post_url = "https://e621.net/post/show/{}"
page_limit = 750 page_limit = 750
@ -48,6 +49,12 @@ class E621PostExtractor(booru.PostMixin, E621Extractor):
test = [("https://e621.net/post/show/535", { test = [("https://e621.net/post/show/535", {
"url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529", "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
"content": "66f46e96a893fba8e694c4e049b23c2acc9af462", "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
"options": (("tags", True),),
"keyword": {
"tags_artist": "anry",
"tags_general": str,
"tags_species": str,
},
})] })]

@ -16,9 +16,10 @@ class KonachanExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
category = "konachan" category = "konachan"
def __init__(self, match): def __init__(self, match):
root = "https://konachan." + match.group("tld")
self.api_url = root + "/post.json"
self.post_url = root + "/post/show/{}"
super().__init__(match) super().__init__(match)
self.api_url = "https://konachan.{tld}/post.json".format(
tld=match.group("tld"))
class KonachanTagExtractor(booru.TagMixin, KonachanExtractor): class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
@ -26,10 +27,10 @@ class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)" pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"] r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"]
test = [ test = [
("http://konachan.com/post?tags=patata", { ("https://konachan.com/post?tags=patata", {
"content": "838cfb815e31f48160855435655ddf7bfc4ecb8d", "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d",
}), }),
("http://konachan.net/post?tags=patata", None), ("https://konachan.net/post?tags=patata", None),
] ]
@ -38,10 +39,10 @@ class KonachanPoolExtractor(booru.PoolMixin, KonachanExtractor):
pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)" pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
r"/pool/show/(?P<pool>\d+)"] r"/pool/show/(?P<pool>\d+)"]
test = [ test = [
("http://konachan.com/pool/show/95", { ("https://konachan.com/pool/show/95", {
"content": "cf0546e38a93c2c510a478f8744e60687b7a8426", "content": "cf0546e38a93c2c510a478f8744e60687b7a8426",
}), }),
("http://konachan.net/pool/show/95", None), ("https://konachan.net/pool/show/95", None),
] ]
@ -50,10 +51,17 @@ class KonachanPostExtractor(booru.PostMixin, KonachanExtractor):
pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)" pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
r"/post/show/(?P<post>\d+)"] r"/post/show/(?P<post>\d+)"]
test = [ test = [
("http://konachan.com/post/show/205189", { ("https://konachan.com/post/show/205189", {
"content": "674e75a753df82f5ad80803f575818b8e46e4b65", "content": "674e75a753df82f5ad80803f575818b8e46e4b65",
"options": (("tags", True),),
"keyword": {
"tags_artist": "patata",
"tags_character": "clownpiece",
"tags_copyright": "touhou",
"tags_general": str,
},
}), }),
("http://konachan.net/post/show/205189", None), ("https://konachan.net/post/show/205189", None),
] ]

@ -17,6 +17,7 @@ class Rule34Extractor(booru.XmlParserMixin,
"""Base class for rule34 extractors""" """Base class for rule34 extractors"""
category = "rule34" category = "rule34"
api_url = "https://rule34.xxx/index.php" api_url = "https://rule34.xxx/index.php"
post_url = "https://rule34.xxx/index.php?page=post&s=view&id={}"
page_limit = 4000 page_limit = 4000
def __init__(self, match): def __init__(self, match):
@ -28,7 +29,7 @@ class Rule34TagExtractor(booru.TagMixin, Rule34Extractor):
"""Extractor for images from rule34.xxx based on search-tags""" """Extractor for images from rule34.xxx based on search-tags"""
pattern = [(r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?" pattern = [(r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")] r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")]
test = [("http://rule34.xxx/index.php?page=post&s=list&tags=danraku", { test = [("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
"content": "a01768c6f86f32eb7ebbdeb87c30b0d9968d7f97", "content": "a01768c6f86f32eb7ebbdeb87c30b0d9968d7f97",
"pattern": r"https?://(.?img\.)?rule34\.xxx/images/\d+/[0-9a-f]+\.jpg", "pattern": r"https?://(.?img\.)?rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
"count": 2, "count": 2,
@ -39,6 +40,14 @@ class Rule34PostExtractor(booru.PostMixin, Rule34Extractor):
"""Extractor for single images from rule34.xxx""" """Extractor for single images from rule34.xxx"""
pattern = [(r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?" pattern = [(r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
r"\?page=post&s=view&id=(?P<post>\d+)")] r"\?page=post&s=view&id=(?P<post>\d+)")]
test = [("http://rule34.xxx/index.php?page=post&s=view&id=1974854", { test = [("https://rule34.xxx/index.php?page=post&s=view&id=1974854", {
"content": "fd2820df78fb937532da0a46f7af6cefc4dc94be", "content": "fd2820df78fb937532da0a46f7af6cefc4dc94be",
"options": (("tags", True),),
"keyword": {
"tags_artist": "danraku",
"tags_character": "io_(pso2)",
"tags_copyright": "phantasy_star phantasy_star_online_2",
"tags_general": "blue_hair female",
"tags_metadata": "absurdres highres",
},
})] })]

@ -17,6 +17,7 @@ class SafebooruExtractor(booru.XmlParserMixin,
"""Base class for safebooru extractors""" """Base class for safebooru extractors"""
category = "safebooru" category = "safebooru"
api_url = "https://safebooru.org/index.php" api_url = "https://safebooru.org/index.php"
post_url = "https://safebooru.org/index.php?page=post&s=view&id={}"
def __init__(self, match): def __init__(self, match):
super().__init__(match) super().__init__(match)
@ -27,7 +28,7 @@ class SafebooruTagExtractor(booru.TagMixin, SafebooruExtractor):
"""Extractor for images from safebooru.org based on search-tags""" """Extractor for images from safebooru.org based on search-tags"""
pattern = [(r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?" pattern = [(r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")] r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")]
test = [("http://safebooru.org/index.php?page=post&s=list&tags=bonocho", { test = [("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
"url": "17c61b386530cf4c30842c9f580d15ef1cd09586", "url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
"content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb", "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
})] })]
@ -37,7 +38,14 @@ class SafebooruPostExtractor(booru.PostMixin, SafebooruExtractor):
"""Extractor for single images from safebooru.org""" """Extractor for single images from safebooru.org"""
pattern = [(r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?" pattern = [(r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
r"\?page=post&s=view&id=(?P<post>\d+)")] r"\?page=post&s=view&id=(?P<post>\d+)")]
test = [("http://safebooru.org/index.php?page=post&s=view&id=1169132", { test = [("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
"url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b", "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
"content": "93b293b27dabd198afafabbaf87c49863ac82f27", "content": "93b293b27dabd198afafabbaf87c49863ac82f27",
"options": (("tags", True),),
"keyword": {
"tags_artist": "kawanakajima",
"tags_character": "heath_ledger ronald_mcdonald the_joker",
"tags_copyright": "dc_comics mcdonald's the_dark_knight",
"tags_general": str,
},
})] })]

@ -9,37 +9,13 @@
"""Extract images from https://yande.re/""" """Extract images from https://yande.re/"""
from . import booru from . import booru
from .. import text
class YandereExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): class YandereExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
"""Base class for yandere extractors""" """Base class for yandere extractors"""
category = "yandere" category = "yandere"
api_url = "https://yande.re/post.json" api_url = "https://yande.re/post.json"
post_url = "https://yande.re/post/show/{}"
def __init__(self, match):
super().__init__(match)
if self.config("tags", False):
self.prepare = self._categorize_tags
def _categorize_tags(self, image):
url = "https://yande.re/post/show/{}".format(image["id"])
page = self.request(url).text
taghtml = text.extract(page, '<ul id="tag-sidebar">', '</ul>')[0]
pos = 0
tags = {"artist": [], "copyright": [], "character": [],
"circle": [], "faults": [], "general": []}
while True:
tagtype, pos = text.extract(taghtml, "tag-type-", '"', pos)
if not tagtype:
break
tagname, pos = text.extract(taghtml, "?tags=", '"', pos)
tags[tagtype].append(text.unquote(tagname))
for key, value in tags.items():
image["tags_" + key] = " ".join(value)
class YandereTagExtractor(booru.TagMixin, YandereExtractor): class YandereTagExtractor(booru.TagMixin, YandereExtractor):
@ -69,8 +45,6 @@ class YanderePostExtractor(booru.PostMixin, YandereExtractor):
"tags_artist": "sasaki_tamaru", "tags_artist": "sasaki_tamaru",
"tags_circle": "softhouse_chara", "tags_circle": "softhouse_chara",
"tags_copyright": "ouzoku", "tags_copyright": "ouzoku",
"tags_character": str,
"tags_faults": str,
"tags_general": str, "tags_general": str,
}, },
})] })]

Loading…
Cancel
Save