[hitomi] update metadata extraction (fixes #2444)

remove 'hitomi.metadata' option, as it is no longer necessary
to make additional HTTP requests to fetch all metadata.
pull/2474/head
Mike Fährmann 3 years ago
parent e718dd7b32
commit 37d584a9b2
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -1336,17 +1336,6 @@ Description
but is most likely going to fail with ``403 Forbidden`` errors.
extractor.hitomi.metadata
-------------------------
Type
``bool``
Default
``false``
Description
Try to extract
``artist``, ``group``, ``parody``, and ``characters`` metadata.
extractor.imgur.mp4
-------------------
Type

@ -28,8 +28,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
("https://hitomi.la/galleries/867789.html", { ("https://hitomi.la/galleries/867789.html", {
"pattern": r"https://[a-c]a\.hitomi\.la/webp/\d+/\d+" "pattern": r"https://[a-c]a\.hitomi\.la/webp/\d+/\d+"
r"/[0-9a-f]{64}\.webp", r"/[0-9a-f]{64}\.webp",
"keyword": "4b584d09d535694d7d757c47daf5c15d116420d2", "keyword": "86af5371f38117a07407f11af689bdd460b09710",
"options": (("metadata", True),),
"count": 16, "count": 16,
}), }),
# download test # download test
@ -77,23 +76,18 @@ class HitomiGalleryExtractor(GalleryExtractor):
def metadata(self, page): def metadata(self, page):
self.info = info = json.loads(page.partition("=")[2]) self.info = info = json.loads(page.partition("=")[2])
iget = info.get
data = self._data_from_gallery_info(info) language = iget("language")
if self.config("metadata", False):
data.update(self._data_from_gallery_page(info))
return data
def _data_from_gallery_info(self, info):
language = info.get("language")
if language: if language:
language = language.capitalize() language = language.capitalize()
date = info.get("date") date = iget("date")
if date: if date:
date += ":00" date += ":00"
tags = [] tags = []
for tinfo in info.get("tags") or (): for tinfo in iget("tags") or ():
tag = string.capwords(tinfo["tag"]) tag = string.capwords(tinfo["tag"])
if tinfo.get("female"): if tinfo.get("female"):
tag += "" tag += ""
@ -109,35 +103,10 @@ class HitomiGalleryExtractor(GalleryExtractor):
"lang" : util.language_to_code(language), "lang" : util.language_to_code(language),
"date" : text.parse_datetime(date, "%Y-%m-%d %H:%M:%S%z"), "date" : text.parse_datetime(date, "%Y-%m-%d %H:%M:%S%z"),
"tags" : tags, "tags" : tags,
} "artist" : [o["artist"] for o in iget("artists") or ()],
"group" : [o["group"] for o in iget("groups") or ()],
def _data_from_gallery_page(self, info): "parody" : [o["parody"] for o in iget("parodys") or ()],
url = "{}/galleries/{}.html".format(self.root, info["id"]) "characters": [o["character"] for o in iget("characters") or ()]
# follow redirects
while True:
response = self.request(url, fatal=False)
if b"<title>Redirect</title>" not in response.content:
break
url = text.extract(
response.text, 'http-equiv="refresh" content="', '"',
)[0].partition("=")[2]
if response.status_code >= 400:
return {}
def prep(value):
return [
text.unescape(string.capwords(v))
for v in text.extract_iter(value or "", '.html">', '<')
]
extr = text.extract_from(response.text)
return {
"artist" : prep(extr('<h2>', '</h2>')),
"group" : prep(extr('<td>Group</td><td>', '</td>')),
"parody" : prep(extr('<td>Series</td><td>', '</td>')),
"characters": prep(extr('<td>Characters</td><td>', '</td>')),
} }
def images(self, _): def images(self, _):

Loading…
Cancel
Save