[exhentai] rename metadata fields to match API results (#1325)

- gallery_id    -> gid
- gallery_token -> token
- title_jp      -> title_jpn
- visible       -> expunged (now a boolean: true unless 'Visible' is "Yes")
- gallery_size  -> filesize
- count         -> filecount

Also changes the behavior of the 'metadata' option.
It is now a boolean: when enabled, extra data fields from the API are added
to the data extracted from HTML instead of completely replacing it.
pull/1352/head
Mike Fährmann 4 years ago
parent 996bfe4d4b
commit 61fbbd2dae
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -927,14 +927,15 @@ Description
extractor.exhentai.metadata extractor.exhentai.metadata
--------------------------- ---------------------------
Type Type
``string`` ``bool``
Default Default
``"html"`` ``false``
Description Description
Select the gallery metadata source. Load extended gallery metadata from the
`API <https://ehwiki.org/wiki/API#Gallery_Metadata>`_.
* ``"api"``: Get data from the `API <https://ehwiki.org/wiki/API>`_. Adds ``archiver_key``, ``posted``, and ``torrents``.
* ``"html"``: Extract data from HTML. Makes ``date`` and ``filesize`` more precise.
extractor.exhentai.original extractor.exhentai.original

@ -22,10 +22,10 @@ BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org"
class ExhentaiExtractor(Extractor): class ExhentaiExtractor(Extractor):
"""Base class for exhentai extractors""" """Base class for exhentai extractors"""
category = "exhentai" category = "exhentai"
directory_fmt = ("{category}", "{gallery_id} {title[:247]}") directory_fmt = ("{category}", "{gid} {title[:247]}")
filename_fmt = ( filename_fmt = (
"{gallery_id}_{num:>04}_{image_token}_{filename}.{extension}") "{gid}_{num:>04}_{image_token}_{filename}.{extension}")
archive_fmt = "{gallery_id}_{num}" archive_fmt = "{gid}_{num}"
cookienames = ("ipb_member_id", "ipb_pass_hash") cookienames = ("ipb_member_id", "ipb_pass_hash")
cookiedomain = ".exhentai.org" cookiedomain = ".exhentai.org"
root = "https://exhentai.org" root = "https://exhentai.org"
@ -131,7 +131,39 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
r"|/s/([\da-f]{10})/(\d+)-(\d+))") r"|/s/([\da-f]{10})/(\d+)-(\d+))")
test = ( test = (
("https://exhentai.org/g/1200119/d55c44d3d0/", { ("https://exhentai.org/g/1200119/d55c44d3d0/", {
"keyword": "199db053b4ccab94463b459e1cfe079df8cdcdd1", "keyword": {
"cost": int,
"date": "dt:2018-03-18 20:15:00",
"eh_category": "Non-H",
"expunged": False,
"favorites": "17",
"filecount": "4",
"filesize": 1488978,
"gid": 1200119,
"height": int,
"image_token": "re:[0-9a-f]{10}",
"lang": "jp",
"language": "Japanese",
"parent": "",
"rating": r"re:\d\.\d+",
"size": int,
"tags": [
"parody:komi-san wa komyushou desu.",
"character:shouko komi",
"group:seventh lowlife",
"sample",
],
"thumb": "https://exhentai.org/t/ce/0a/ce0a5bcb583229a9b07c0f8"
"3bcb1630ab1350640-624622-736-1036-jpg_250.jpg",
"title": "C93 [Seventh_Lowlife] Komi-san ha Tokidoki Daitan de"
"su (Komi-san wa Komyushou desu) [Sample]",
"title_jpn": "(C93) [Comiketjack (わ!)] 古見さんは、時々大胆"
"です。 (古見さんは、コミュ症です。) [見本]",
"token": "d55c44d3d0",
"torrentcount": "0",
"uploader": "klorpa",
"width": int,
},
"content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff", "content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff",
}), }),
("https://exhentai.org/g/960461/4f0e369d82/", { ("https://exhentai.org/g/960461/4f0e369d82/", {
@ -182,6 +214,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
gpage = self._gallery_page() gpage = self._gallery_page()
data = self.get_metadata(gpage) data = self.get_metadata(gpage)
self.count = text.parse_int(data["filecount"])
yield Message.Directory, data yield Message.Directory, data
images = itertools.chain( images = itertools.chain(
@ -197,37 +230,38 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def get_metadata(self, page): def get_metadata(self, page):
"""Extract gallery metadata""" """Extract gallery metadata"""
if self.config("metadata") == "api": data = self.metadata_from_page(page)
return self.metadata_from_api() if self.config("metadata", False):
return self.metadata_from_page(page) data.update(self.metadata_from_api())
data["date"] = text.parse_timestamp(data["posted"])
return data
def metadata_from_page(self, page): def metadata_from_page(self, page):
extr = text.extract_from(page) extr = text.extract_from(page)
data = { data = {
"gallery_id" : self.gallery_id, "gid" : self.gallery_id,
"gallery_token": self.gallery_token, "token" : self.gallery_token,
"thumb" : extr("background:transparent url(", ")"), "thumb" : extr("background:transparent url(", ")"),
"title" : text.unescape(extr('<h1 id="gn">', '</h1>')), "title" : text.unescape(extr('<h1 id="gn">', '</h1>')),
"title_jp" : text.unescape(extr('<h1 id="gj">', '</h1>')), "title_jpn" : text.unescape(extr('<h1 id="gj">', '</h1>')),
"_" : extr('<div id="gdc"><div class="cs ct', '"'),
"eh_category" : extr('>', '<'),
"uploader" : text.unquote(extr('/uploader/', '"')), "uploader" : text.unquote(extr('/uploader/', '"')),
"date" : text.parse_datetime(extr( "date" : text.parse_datetime(extr(
'>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"), '>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"),
"parent" : extr( "parent" : extr(
'>Parent:</td><td class="gdt2"><a href="', '"'), '>Parent:</td><td class="gdt2"><a href="', '"'),
"visible" : extr( "expunged" : "Yes" != extr(
'>Visible:</td><td class="gdt2">', '<'), '>Visible:</td><td class="gdt2">', '<'),
"language" : extr( "language" : extr('>Language:</td><td class="gdt2">', ' '),
'>Language:</td><td class="gdt2">', ' '), "filesize" : text.parse_bytes(extr(
"gallery_size" : text.parse_bytes(extr(
'>File Size:</td><td class="gdt2">', '<').rstrip("Bb")), '>File Size:</td><td class="gdt2">', '<').rstrip("Bb")),
"count" : text.parse_int(extr( "filecount" : extr('>Length:</td><td class="gdt2">', ' '),
'>Length:</td><td class="gdt2">', ' ')), "favorites" : extr('id="favcount">', ' '),
"favorites" : text.parse_int(extr('id="favcount">', ' ')), "rating" : extr(">Average: ", "<"),
"rating" : text.parse_float(extr(">Average: ", "<")), "torrentcount" : extr('>Torrent Download (', ')'),
"torrentcount" : text.parse_int(extr('>Torrent Download (', ')')),
} }
self.count = data["count"]
data["lang"] = util.language_to_code(data["language"]) data["lang"] = util.language_to_code(data["language"])
data["tags"] = [ data["tags"] = [
text.unquote(tag.replace("+", " ")) text.unquote(tag.replace("+", " "))
@ -248,12 +282,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
if "error" in data: if "error" in data:
raise exception.StopExtraction(data["error"]) raise exception.StopExtraction(data["error"])
data = data["gmetadata"][0] return data["gmetadata"][0]
data["eh_category"] = data["category"]
data["date"] = text.parse_timestamp(data["posted"])
self.count = data["filecount"]
return data
def image_from_page(self, page): def image_from_page(self, page):
"""Get image url and data from webpage""" """Get image url and data from webpage"""

Loading…
Cancel
Save