|
|
@ -28,8 +28,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
|
|
|
|
("https://hitomi.la/galleries/867789.html", {
|
|
|
|
("https://hitomi.la/galleries/867789.html", {
|
|
|
|
"pattern": r"https://[a-c]a\.hitomi\.la/webp/\d+/\d+"
|
|
|
|
"pattern": r"https://[a-c]a\.hitomi\.la/webp/\d+/\d+"
|
|
|
|
r"/[0-9a-f]{64}\.webp",
|
|
|
|
r"/[0-9a-f]{64}\.webp",
|
|
|
|
"keyword": "4b584d09d535694d7d757c47daf5c15d116420d2",
|
|
|
|
"keyword": "86af5371f38117a07407f11af689bdd460b09710",
|
|
|
|
"options": (("metadata", True),),
|
|
|
|
|
|
|
|
"count": 16,
|
|
|
|
"count": 16,
|
|
|
|
}),
|
|
|
|
}),
|
|
|
|
# download test
|
|
|
|
# download test
|
|
|
@ -77,23 +76,18 @@ class HitomiGalleryExtractor(GalleryExtractor):
|
|
|
|
|
|
|
|
|
|
|
|
def metadata(self, page):
|
|
|
|
def metadata(self, page):
|
|
|
|
self.info = info = json.loads(page.partition("=")[2])
|
|
|
|
self.info = info = json.loads(page.partition("=")[2])
|
|
|
|
|
|
|
|
iget = info.get
|
|
|
|
|
|
|
|
|
|
|
|
data = self._data_from_gallery_info(info)
|
|
|
|
language = iget("language")
|
|
|
|
if self.config("metadata", False):
|
|
|
|
|
|
|
|
data.update(self._data_from_gallery_page(info))
|
|
|
|
|
|
|
|
return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _data_from_gallery_info(self, info):
|
|
|
|
|
|
|
|
language = info.get("language")
|
|
|
|
|
|
|
|
if language:
|
|
|
|
if language:
|
|
|
|
language = language.capitalize()
|
|
|
|
language = language.capitalize()
|
|
|
|
|
|
|
|
|
|
|
|
date = info.get("date")
|
|
|
|
date = iget("date")
|
|
|
|
if date:
|
|
|
|
if date:
|
|
|
|
date += ":00"
|
|
|
|
date += ":00"
|
|
|
|
|
|
|
|
|
|
|
|
tags = []
|
|
|
|
tags = []
|
|
|
|
for tinfo in info.get("tags") or ():
|
|
|
|
for tinfo in iget("tags") or ():
|
|
|
|
tag = string.capwords(tinfo["tag"])
|
|
|
|
tag = string.capwords(tinfo["tag"])
|
|
|
|
if tinfo.get("female"):
|
|
|
|
if tinfo.get("female"):
|
|
|
|
tag += " ♀"
|
|
|
|
tag += " ♀"
|
|
|
@ -109,35 +103,10 @@ class HitomiGalleryExtractor(GalleryExtractor):
|
|
|
|
"lang" : util.language_to_code(language),
|
|
|
|
"lang" : util.language_to_code(language),
|
|
|
|
"date" : text.parse_datetime(date, "%Y-%m-%d %H:%M:%S%z"),
|
|
|
|
"date" : text.parse_datetime(date, "%Y-%m-%d %H:%M:%S%z"),
|
|
|
|
"tags" : tags,
|
|
|
|
"tags" : tags,
|
|
|
|
}
|
|
|
|
"artist" : [o["artist"] for o in iget("artists") or ()],
|
|
|
|
|
|
|
|
"group" : [o["group"] for o in iget("groups") or ()],
|
|
|
|
def _data_from_gallery_page(self, info):
|
|
|
|
"parody" : [o["parody"] for o in iget("parodys") or ()],
|
|
|
|
url = "{}/galleries/{}.html".format(self.root, info["id"])
|
|
|
|
"characters": [o["character"] for o in iget("characters") or ()]
|
|
|
|
|
|
|
|
|
|
|
|
# follow redirects
|
|
|
|
|
|
|
|
while True:
|
|
|
|
|
|
|
|
response = self.request(url, fatal=False)
|
|
|
|
|
|
|
|
if b"<title>Redirect</title>" not in response.content:
|
|
|
|
|
|
|
|
break
|
|
|
|
|
|
|
|
url = text.extract(
|
|
|
|
|
|
|
|
response.text, 'http-equiv="refresh" content="', '"',
|
|
|
|
|
|
|
|
)[0].partition("=")[2]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if response.status_code >= 400:
|
|
|
|
|
|
|
|
return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def prep(value):
|
|
|
|
|
|
|
|
return [
|
|
|
|
|
|
|
|
text.unescape(string.capwords(v))
|
|
|
|
|
|
|
|
for v in text.extract_iter(value or "", '.html">', '<')
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extr = text.extract_from(response.text)
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
|
|
|
"artist" : prep(extr('<h2>', '</h2>')),
|
|
|
|
|
|
|
|
"group" : prep(extr('<td>Group</td><td>', '</td>')),
|
|
|
|
|
|
|
|
"parody" : prep(extr('<td>Series</td><td>', '</td>')),
|
|
|
|
|
|
|
|
"characters": prep(extr('<td>Characters</td><td>', '</td>')),
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
def images(self, _):
|
|
|
|
def images(self, _):
|
|
|
|