generalize tag-splitting option (#92)

- extend functionality to other booru sites:
  - http://behoimi.org/
  - https://konachan.com/
  - https://e621.net/
  - https://rule34.xxx/
  - https://safebooru.org/
  - https://yande.re/
pull/133/head
Mike Fährmann 6 years ago
parent 188e956c4e
commit 4a57509392
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -617,20 +617,23 @@ Description A (comma-separated) list of post types to extract images, etc. from.
=========== =====
extractor.3dbooru.tags
----------------------
extractor.e621.tags
-------------------
extractor.konachan.tags
-----------------------
extractor.rule34.tags
---------------------
extractor.safebooru.tags
------------------------
extractor.yandere.tags
----------------------
=========== =====
Type ``bool``
Default ``false``
Description Split tags into different categories
and provide the following additional metadata-entries:
- ``tags_artist``
- ``tags_character``
- ``tags_circle``
- ``tags_copyright``
- ``tags_faults``
- ``tags_general``
Description Categorize tags by their respective types
and provide them as ``tags_<type>`` metadata fields.
Note: This requires 1 additional HTTP request for each post.
=========== =====

@ -15,6 +15,7 @@ class ThreedeebooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
"""Base class for 3dbooru extractors"""
category = "3dbooru"
api_url = "http://behoimi.org/post/index.json"
post_url = "http://behoimi.org/post/show/{}"
page_limit = 1000
def __init__(self, match):
@ -53,6 +54,13 @@ class ThreedeebooruPostExtractor(booru.PostMixin,
test = [("http://behoimi.org/post/show/140852", {
"url": "ce874ea26f01d6c94795f3cc3aaaaa9bc325f2f6",
"content": "26549d55b82aa9a6c1686b96af8bfcfa50805cd4",
"options": (("tags", True),),
"keyword": {
"tags_character": "furude_rika",
"tags_copyright": "higurashi_no_naku_koro_ni",
"tags_model": "himekawa_azuru",
"tags_general": str,
},
})]

@ -11,8 +11,10 @@
from .common import SharedConfigExtractor, Message
from .. import text
from xml.etree import ElementTree
import collections
import datetime
import operator
import re
class BooruExtractor(SharedConfigExtractor):
@ -20,6 +22,7 @@ class BooruExtractor(SharedConfigExtractor):
basecategory = "booru"
filename_fmt = "{category}_{id}_{md5}.{extension}"
api_url = ""
post_url = ""
per_page = 50
page_start = 1
page_limit = None
@ -28,6 +31,10 @@ class BooruExtractor(SharedConfigExtractor):
def __init__(self, match):
    """Set up request parameters and the optional per-image hook"""
    super().__init__()
    self.params = {}
    # per-image hook; when set, it is called on each image before it is yielded
    self.prepare = None
    # categorized tags are scraped from the post page, hence the post_url check
    if self.post_url and self.config("tags", False):
        self.prepare = self._extended_tags
def skip(self, num):
pages = num // self.per_page
@ -50,17 +57,18 @@ class BooruExtractor(SharedConfigExtractor):
for image in images:
try:
url = image["file_url"]
if url.startswith("/"):
url = text.urljoin(self.api_url, url)
image.update(data)
self.prepare(image)
yield Message.Url, url, text.nameext_from_url(url, image)
except KeyError:
continue
if url.startswith("/"):
url = text.urljoin(self.api_url, url)
image.update(data)
if self.prepare:
self.prepare(image)
yield Message.Url, url, text.nameext_from_url(url, image)
if len(images) < self.per_page:
return
self.update_page(images[-1])
self.update_page(image)
def reset_page(self):
"""Initialize params to point to the first page"""
@ -81,8 +89,19 @@ class BooruExtractor(SharedConfigExtractor):
"""Collect metadata for extractor-job"""
return {}
def prepare(self, image):
"""Prepare and modify an 'image' object"""
def _extended_tags(self, image):
    """Retrieve extended tag information

    Request the post page of 'image' (one additional HTTP request),
    group the tags listed there by their tag type and store each group
    as a space-separated string under 'tags_<type>' in 'image'.
    'image' is left unchanged if the page contains no tag list.
    """
    url = self.post_url.format(image["id"])
    page = self.request(url).text
    tag_html = text.extract(page, '<ul id="tag-', '</ul>')[0]
    if not tag_html:
        # No tag list in this page (e.g. unexpected layout or error page);
        # findall() would raise a TypeError on a non-string value.
        return

    tags = collections.defaultdict(list)
    # group 1: tag type taken from the 'tag-type-<type>' marker
    # group 2: URL-quoted tag name taken from the following 'tags=' query
    pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S)
    for tag_type, tag_name in pattern.findall(tag_html):
        tags[tag_type].append(text.unquote(tag_name))
    for key, value in tags.items():
        image["tags_" + key] = " ".join(value)
class XmlParserMixin():

@ -15,6 +15,7 @@ class E621Extractor(booru.MoebooruPageMixin, booru.BooruExtractor):
"""Base class for e621 extractors"""
category = "e621"
api_url = "https://e621.net/post/index.json"
post_url = "https://e621.net/post/show/{}"
page_limit = 750
@ -48,6 +49,12 @@ class E621PostExtractor(booru.PostMixin, E621Extractor):
test = [("https://e621.net/post/show/535", {
"url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
"content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
"options": (("tags", True),),
"keyword": {
"tags_artist": "anry",
"tags_general": str,
"tags_species": str,
},
})]

@ -16,9 +16,10 @@ class KonachanExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
category = "konachan"
def __init__(self, match):
root = "https://konachan." + match.group("tld")
self.api_url = root + "/post.json"
self.post_url = root + "/post/show/{}"
super().__init__(match)
self.api_url = "https://konachan.{tld}/post.json".format(
tld=match.group("tld"))
class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
@ -26,10 +27,10 @@ class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"]
test = [
("http://konachan.com/post?tags=patata", {
("https://konachan.com/post?tags=patata", {
"content": "838cfb815e31f48160855435655ddf7bfc4ecb8d",
}),
("http://konachan.net/post?tags=patata", None),
("https://konachan.net/post?tags=patata", None),
]
@ -38,10 +39,10 @@ class KonachanPoolExtractor(booru.PoolMixin, KonachanExtractor):
pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
r"/pool/show/(?P<pool>\d+)"]
test = [
("http://konachan.com/pool/show/95", {
("https://konachan.com/pool/show/95", {
"content": "cf0546e38a93c2c510a478f8744e60687b7a8426",
}),
("http://konachan.net/pool/show/95", None),
("https://konachan.net/pool/show/95", None),
]
@ -50,10 +51,17 @@ class KonachanPostExtractor(booru.PostMixin, KonachanExtractor):
pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
r"/post/show/(?P<post>\d+)"]
test = [
("http://konachan.com/post/show/205189", {
("https://konachan.com/post/show/205189", {
"content": "674e75a753df82f5ad80803f575818b8e46e4b65",
"options": (("tags", True),),
"keyword": {
"tags_artist": "patata",
"tags_character": "clownpiece",
"tags_copyright": "touhou",
"tags_general": str,
},
}),
("http://konachan.net/post/show/205189", None),
("https://konachan.net/post/show/205189", None),
]

@ -17,6 +17,7 @@ class Rule34Extractor(booru.XmlParserMixin,
"""Base class for rule34 extractors"""
category = "rule34"
api_url = "https://rule34.xxx/index.php"
post_url = "https://rule34.xxx/index.php?page=post&s=view&id={}"
page_limit = 4000
def __init__(self, match):
@ -28,7 +29,7 @@ class Rule34TagExtractor(booru.TagMixin, Rule34Extractor):
"""Extractor for images from rule34.xxx based on search-tags"""
pattern = [(r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")]
test = [("http://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
test = [("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
"content": "a01768c6f86f32eb7ebbdeb87c30b0d9968d7f97",
"pattern": r"https?://(.?img\.)?rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
"count": 2,
@ -39,6 +40,14 @@ class Rule34PostExtractor(booru.PostMixin, Rule34Extractor):
"""Extractor for single images from rule34.xxx"""
pattern = [(r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
r"\?page=post&s=view&id=(?P<post>\d+)")]
test = [("http://rule34.xxx/index.php?page=post&s=view&id=1974854", {
test = [("https://rule34.xxx/index.php?page=post&s=view&id=1974854", {
"content": "fd2820df78fb937532da0a46f7af6cefc4dc94be",
"options": (("tags", True),),
"keyword": {
"tags_artist": "danraku",
"tags_character": "io_(pso2)",
"tags_copyright": "phantasy_star phantasy_star_online_2",
"tags_general": "blue_hair female",
"tags_metadata": "absurdres highres",
},
})]

@ -17,6 +17,7 @@ class SafebooruExtractor(booru.XmlParserMixin,
"""Base class for safebooru extractors"""
category = "safebooru"
api_url = "https://safebooru.org/index.php"
post_url = "https://safebooru.org/index.php?page=post&s=view&id={}"
def __init__(self, match):
super().__init__(match)
@ -27,7 +28,7 @@ class SafebooruTagExtractor(booru.TagMixin, SafebooruExtractor):
"""Extractor for images from safebooru.org based on search-tags"""
pattern = [(r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")]
test = [("http://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
test = [("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
"url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
"content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
})]
@ -37,7 +38,14 @@ class SafebooruPostExtractor(booru.PostMixin, SafebooruExtractor):
"""Extractor for single images from safebooru.org"""
pattern = [(r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
r"\?page=post&s=view&id=(?P<post>\d+)")]
test = [("http://safebooru.org/index.php?page=post&s=view&id=1169132", {
test = [("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
"url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
"content": "93b293b27dabd198afafabbaf87c49863ac82f27",
"options": (("tags", True),),
"keyword": {
"tags_artist": "kawanakajima",
"tags_character": "heath_ledger ronald_mcdonald the_joker",
"tags_copyright": "dc_comics mcdonald's the_dark_knight",
"tags_general": str,
},
})]

@ -9,37 +9,13 @@
"""Extract images from https://yande.re/"""
from . import booru
from .. import text
class YandereExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
"""Base class for yandere extractors"""
category = "yandere"
api_url = "https://yande.re/post.json"
def __init__(self, match):
super().__init__(match)
if self.config("tags", False):
self.prepare = self._categorize_tags
def _categorize_tags(self, image):
url = "https://yande.re/post/show/{}".format(image["id"])
page = self.request(url).text
taghtml = text.extract(page, '<ul id="tag-sidebar">', '</ul>')[0]
pos = 0
tags = {"artist": [], "copyright": [], "character": [],
"circle": [], "faults": [], "general": []}
while True:
tagtype, pos = text.extract(taghtml, "tag-type-", '"', pos)
if not tagtype:
break
tagname, pos = text.extract(taghtml, "?tags=", '"', pos)
tags[tagtype].append(text.unquote(tagname))
for key, value in tags.items():
image["tags_" + key] = " ".join(value)
post_url = "https://yande.re/post/show/{}"
class YandereTagExtractor(booru.TagMixin, YandereExtractor):
@ -69,8 +45,6 @@ class YanderePostExtractor(booru.PostMixin, YandereExtractor):
"tags_artist": "sasaki_tamaru",
"tags_circle": "softhouse_chara",
"tags_copyright": "ouzoku",
"tags_character": str,
"tags_faults": str,
"tags_general": str,
},
})]

Loading…
Cancel
Save