[hentaifox] improve metadata extraction (fixes #1378)

pull/1405/head
Mike Fährmann 4 years ago
parent 72fe9ac0f3
commit 6be7df53da
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -22,27 +22,56 @@ class HentaifoxBase():
class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
"""Extractor for image galleries on hentaifox.com"""
pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
test = ("https://hentaifox.com/gallery/56622/", {
test = (
("https://hentaifox.com/gallery/56622/", {
"pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
"keyword": "bcd6b67284f378e5cc30b89b761140e3e60fcd92",
"count": 24,
})
}),
# 'split_tag' element (#1378)
("https://hentaifox.com/gallery/630/", {
"keyword": {
"artist": ["beti", "betty", "magi", "mimikaki"],
"characters": [
"aerith gainsborough",
"tifa lockhart",
"yuffie kisaragi"
],
"count": 32,
"gallery_id": 630,
"group": ["cu-little2"],
"parody": ["darkstalkers | vampire", "final fantasy vii"],
"tags": ["femdom", "fingering", "masturbation", "yuri"],
"title": "Cu-Little Bakanya~",
"type": "doujinshi",
},
}),
)
def __init__(self, match):
    """Initialize the extractor and remember the gallery ID.

    'match' is the regex match object for the gallery URL; its second
    capture group is stored as 'self.gallery_id'.
    """
    GalleryExtractor.__init__(self, match)
    gallery_id = match.group(2)
    self.gallery_id = gallery_id
def metadata(self, page, split=text.split_html):
@staticmethod
def _split(txt):
    """Return a list with one cleaned-up entry per tag button in 'txt'.

    Each fragment that text.extract_iter() yields between the
    "class='tag_btn" and "<span class='t_badge" markers is cut at its
    first '>' and the remainder is passed through text.remove_html().
    """
    names = []
    fragments = text.extract_iter(
        txt, "class='tag_btn", "<span class='t_badge")
    for fragment in fragments:
        # drop everything up to and including the first '>'
        # (presumably the rest of the opening tag -- confirm against
        # the site's markup)
        _, _, inner = fragment.partition(">")
        names.append(text.remove_html(inner, "", ""))
    return names
def metadata(self, page):
extr = text.extract_from(page)
split = self._split
return {
"gallery_id": text.parse_int(self.gallery_id),
"title" : text.unescape(extr("<h1>", "</h1>")),
"parody" : split(extr(">Parodies:" , "</ul>"))[::2],
"characters": split(extr(">Characters:", "</ul>"))[::2],
"tags" : split(extr(">Tags:" , "</ul>"))[::2],
"artist" : split(extr(">Artists:" , "</ul>"))[::2],
"group" : split(extr(">Groups:" , "</ul>"))[::2],
"parody" : split(extr(">Parodies:" , "</ul>")),
"characters": split(extr(">Characters:", "</ul>")),
"tags" : split(extr(">Tags:" , "</ul>")),
"artist" : split(extr(">Artists:" , "</ul>")),
"group" : split(extr(">Groups:" , "</ul>")),
"type" : text.remove_html(extr(">Category:", "<span")),
"language" : "English",
"lang" : "en",

Loading…
Cancel
Save