[exhentai] update data extraction code

- parse 'date' into a datetime object
- use 'text.extract_from()' instead of 'text.extract_all()'
pull/251/head
Mike Fährmann 5 years ago
parent 80fdb11508
commit 1f7fa9dc8e
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -110,7 +110,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
r"|/s/([\da-f]{10})/(\d+)-(\d+))") r"|/s/([\da-f]{10})/(\d+)-(\d+))")
test = ( test = (
("https://exhentai.org/g/960460/4f0e369d82/", { ("https://exhentai.org/g/960460/4f0e369d82/", {
"keyword": "993bfaf68b4823084fbd0d3339564666463b1432", "keyword": "1532ca4d0e4e0738dc994ca725a228af04a4e480",
"content": "493d759de534355c9f55f8e365565b62411de146", "content": "493d759de534355c9f55f8e365565b62411de146",
}), }),
("https://exhentai.org/g/960461/4f0e369d82/", { ("https://exhentai.org/g/960461/4f0e369d82/", {
@ -169,57 +169,55 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def get_metadata(self, page): def get_metadata(self, page):
"""Extract gallery metadata""" """Extract gallery metadata"""
data, pos = text.extract_all(page, ( extr = text.extract_from(page)
("title" , '<h1 id="gn">', '</h1>'), data = {
("title_jp" , '<h1 id="gj">', '</h1>'), "gallery_id" : self.gallery_id,
("date" , '>Posted:</td><td class="gdt2">', '</td>'), "gallery_token": self.gallery_token,
("parent" , '>Parent:</td><td class="gdt2"><a href="', '"'), "title" : text.unescape(extr('<h1 id="gn">', '</h1>')),
("visible" , '>Visible:</td><td class="gdt2">', '<'), "title_jp" : text.unescape(extr('<h1 id="gj">', '</h1>')),
("language" , '>Language:</td><td class="gdt2">', ' '), "date" : text.parse_datetime(extr(
("gallery_size", '>File Size:</td><td class="gdt2">', '<'), '>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"),
("count" , '>Length:</td><td class="gdt2">', ' '), "parent" : extr(
)) '>Parent:</td><td class="gdt2"><a href="', '"'),
"visible" : extr(
'>Visible:</td><td class="gdt2">', '<'),
"language" : extr(
'>Language:</td><td class="gdt2">', ' '),
"gallery_size" : text.parse_bytes(extr(
'>File Size:</td><td class="gdt2">', '<').rstrip("Bb")),
"count" : text.parse_int(extr(
'>Length:</td><td class="gdt2">', ' ')),
}
data["lang"] = util.language_to_code(data["language"]) data["lang"] = util.language_to_code(data["language"])
data["title"] = text.unescape(data["title"])
data["title_jp"] = text.unescape(data["title_jp"])
data["count"] = text.parse_int(data["count"])
data["gallery_id"] = self.gallery_id
data["gallery_token"] = self.gallery_token
data["gallery_size"] = text.parse_bytes(
data["gallery_size"].rstrip("Bb"))
data["tags"] = [ data["tags"] = [
text.unquote(tag) text.unquote(tag)
for tag in text.extract_iter(page, 'hentai.org/tag/', '"', pos) for tag in text.extract_iter(page, 'hentai.org/tag/', '"')
] ]
return data return data
def image_from_page(self, page): def image_from_page(self, page):
"""Get image url and data from webpage""" """Get image url and data from webpage"""
info = text.extract_all(page, ( pos = page.index('<div id="i3"><a onclick="return load_image(') + 26
(None , '<div id="i3"><a onclick="return load_image(', ''), extr = text.extract_from(page, pos)
("nextkey" , "'", "'"),
("url" , '<img id="img" src="', '"'), self.key["next"] = extr("'", "'")
("origurl" , 'hentai.org/fullimg.php', '"'), iurl = extr('<img id="img" src="', '"')
("originfo", 'ownload original', '<'), orig = extr('hentai.org/fullimg.php', '"')
("startkey", 'var startkey="', '";'),
("showkey" , 'var showkey="', '";'), if self.original and orig:
))[0] url = self.root + "/fullimg.php" + text.unescape(orig)
self.key["start"] = info["startkey"] data = self._parse_original_info(extr('ownload original', '<'))
self.key["show"] = info["showkey"]
self.key["next"] = info["nextkey"]
if self.original and info["origurl"]:
part = text.unescape(info["origurl"])
url = self.root + "/fullimg.php" + part
data = self._parse_original_info(info["originfo"])
else: else:
url = info["url"] url = iurl
data = self._parse_image_info(url) data = self._parse_image_info(url)
data["num"] = self.image_num data["num"] = self.image_num
data["image_token"] = info["startkey"] data["image_token"] = self.key["start"] = extr('var startkey="', '";')
return url, text.nameext_from_url(info["url"], data) self.key["show"] = extr('var showkey="', '";')
return url, text.nameext_from_url(iurl, data)
def images_from_api(self): def images_from_api(self):
"""Get image url and data from api calls""" """Get image url and data from api calls"""

Loading…
Cancel
Save