update extractor class hierarchies

- let the GalleryExtractor class inherit directly from Extractor
- make ChapterExtractor a subclass of GalleryExtractor
- change enumeration field names of GalleryExtractors to 'num'
pull/465/head
Mike Fährmann 5 years ago
parent 7ebd984e8d
commit 1693d97bd3
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@@ -21,12 +21,12 @@ class AdultempireGalleryExtractor(GalleryExtractor):
test = ( test = (
("https://www.adultempire.com/5998/gallery.html", { ("https://www.adultempire.com/5998/gallery.html", {
"range": "1", "range": "1",
"keyword": "25c8171f5623678491a0d7bdf38a7a6ebfa4a361", "keyword": "5b3266e69801db0d78c22181da23bc102886e027",
"content": "5c6beb31e5e3cdc90ee5910d5c30f9aaec977b9e", "content": "5c6beb31e5e3cdc90ee5910d5c30f9aaec977b9e",
}), }),
("https://www.adultdvdempire.com/5683/gallery.html", { ("https://www.adultdvdempire.com/5683/gallery.html", {
"url": "b12cd1a65cae8019d837505adb4d6a2c1ed4d70d", "url": "b12cd1a65cae8019d837505adb4d6a2c1ed4d70d",
"keyword": "9634eb16cc6dbf347eb9dcdd9b2a499dfd04d167", "keyword": "8d448d79c4ac5f5b10a3019d5b5129ddb43655e5",
}), }),
) )
@@ -55,4 +55,4 @@ class AdultempireGalleryExtractor(GalleryExtractor):
if len(urls) < 24: if len(urls) < 24:
return return
params["page"] += 1 params["page"] += 1
page = self.request(self.chapter_url, params=params).text page = self.request(self.gallery_url, params=params).text

@@ -249,24 +249,21 @@ class Extractor():
yield test yield test
class ChapterExtractor(Extractor): class GalleryExtractor(Extractor):
subcategory = "chapter" subcategory = "gallery"
directory_fmt = ( filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
"{category}", "{manga}", directory_fmt = ("{category}", "{gallery_id} {title}")
"{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}") archive_fmt = "{gallery_id}_{num}"
filename_fmt = ( enum = "num"
"{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")
archive_fmt = (
"{manga}_{chapter}{chapter_minor}_{page}")
def __init__(self, match, url=None): def __init__(self, match, url=None):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.chapter_url = url or self.root + match.group(1) self.gallery_url = self.root + match.group(1) if url is None else url
def items(self): def items(self):
self.login() self.login()
page = self.request(self.chapter_url).text page = self.request(self.gallery_url).text
data = self.metadata(page) data = self.metadata(page)
imgs = self.images(page) imgs = self.images(page)
@@ -284,7 +281,7 @@ class ChapterExtractor(Extractor):
yield Message.Version, 1 yield Message.Version, 1
yield Message.Directory, data yield Message.Directory, data
for data["page"], (url, imgdata) in images: for data[self.enum], (url, imgdata) in images:
if imgdata: if imgdata:
data.update(imgdata) data.update(imgdata)
yield Message.Url, url, text.nameext_from_url(url, data) yield Message.Url, url, text.nameext_from_url(url, data)
@@ -299,6 +296,19 @@ class ChapterExtractor(Extractor):
"""Return a list of all (image-url, metadata)-tuples""" """Return a list of all (image-url, metadata)-tuples"""
class ChapterExtractor(GalleryExtractor):
subcategory = "chapter"
directory_fmt = (
"{category}", "{manga}",
"{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}")
filename_fmt = (
"{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")
archive_fmt = (
"{manga}_{chapter}{chapter_minor}_{page}")
enum = "page"
class MangaExtractor(Extractor): class MangaExtractor(Extractor):
subcategory = "manga" subcategory = "manga"
@@ -333,14 +343,6 @@ class MangaExtractor(Extractor):
"""Return a list of all (chapter-url, metadata)-tuples""" """Return a list of all (chapter-url, metadata)-tuples"""
class GalleryExtractor(ChapterExtractor):
subcategory = "gallery"
filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}"
directory_fmt = ("{category}", "{gallery_id} {title}")
archive_fmt = "{gallery_id}_{page}"
class AsynchronousMixin(): class AsynchronousMixin():
"""Run info extraction in a separate thread""" """Run info extraction in a separate thread"""

@@ -44,14 +44,13 @@ class FoolslideBase(SharedConfigMixin):
class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor): class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
"""Base class for chapter extractors for FoOlSlide based sites""" """Base class for chapter extractors for FoOlSlide based sites"""
directory_fmt = ( directory_fmt = ("{category}", "{manga}", "{chapter_string}")
"{category}", "{manga}", "{chapter_string}")
archive_fmt = "{id}" archive_fmt = "{id}"
pattern_fmt = r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" pattern_fmt = r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
decode = "default" decode = "default"
def items(self): def items(self):
page = self.request(self.chapter_url).text page = self.request(self.gallery_url).text
data = self.metadata(page) data = self.metadata(page)
imgs = self.images(page) imgs = self.images(page)
@@ -77,7 +76,7 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
def metadata(self, page): def metadata(self, page):
extr = text.extract_from(page) extr = text.extract_from(page)
extr('<h1 class="tbtitle dnone">', '') extr('<h1 class="tbtitle dnone">', '')
return self.parse_chapter_url(self.chapter_url, { return self.parse_chapter_url(self.gallery_url, {
"manga" : text.unescape(extr('title="', '"')).strip(), "manga" : text.unescape(extr('title="', '"')).strip(),
"chapter_string": text.unescape(extr('title="', '"')), "chapter_string": text.unescape(extr('title="', '"')),
}) })

@@ -42,7 +42,7 @@ class FuskatorGalleryExtractor(GalleryExtractor):
def metadata(self, page): def metadata(self, page):
headers = { headers = {
"Referer" : self.chapter_url, "Referer" : self.gallery_url,
"X-Requested-With": "XMLHttpRequest", "X-Requested-With": "XMLHttpRequest",
} }
auth = self.request( auth = self.request(

@@ -31,10 +31,10 @@ class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor):
info = text.unescape(text.extract(page, '<title>', '</title>')[0]) info = text.unescape(text.extract(page, '<title>', '</title>')[0])
manga, _, chapter_string = info.partition(" :: ") manga, _, chapter_string = info.partition(" :: ")
data = self._data(self.chapter_url.split("/")[5]) data = self._data(self.gallery_url.split("/")[5])
data["manga"] = manga data["manga"] = manga
data["chapter_string"] = chapter_string.rstrip(" :") data["chapter_string"] = chapter_string.rstrip(" :")
return self.parse_chapter_url(self.chapter_url, data) return self.parse_chapter_url(self.gallery_url, data)
@memcache(keyarg=1) @memcache(keyarg=1)
def _data(self, manga): def _data(self, manga):

@@ -24,7 +24,7 @@ class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
test = ("https://hentaifox.com/gallery/56622/", { test = ("https://hentaifox.com/gallery/56622/", {
"pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg", "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
"count": 24, "count": 24,
"keyword": "38f8517605feb6854d48833297da6b05c6541b69", "keyword": "903ebe227d85e484460382fc6cbab42be7a244d5",
}) })
def __init__(self, match): def __init__(self, match):

@@ -22,7 +22,7 @@ class HentainexusGalleryExtractor(GalleryExtractor):
test = ( test = (
("https://hentainexus.com/view/5688", { ("https://hentainexus.com/view/5688", {
"url": "746d0043e20030f1171aae5ea113176607302517", "url": "746d0043e20030f1171aae5ea113176607302517",
"keyword": "b05986369fbaf29cfa08b118960d92c49e59524b", "keyword": "9512cf5f258130e5f75de9954d7a13217c2405e7",
}), }),
("https://hentainexus.com/read/5688"), ("https://hentainexus.com/read/5688"),
) )

@@ -21,7 +21,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
test = ( test = (
("https://hitomi.la/galleries/867789.html", { ("https://hitomi.la/galleries/867789.html", {
"pattern": r"https://aa.hitomi.la/galleries/867789/\d+.jpg", "pattern": r"https://aa.hitomi.la/galleries/867789/\d+.jpg",
"keyword": "d097a8db8e810045131b4510c41714004f9eff3a", "keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2",
"count": 16, "count": 16,
}), }),
("https://hitomi.la/galleries/1401410.html", { ("https://hitomi.la/galleries/1401410.html", {
@@ -89,7 +89,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
base = "https://" + subdomain + ".hitomi.la/galleries/" base = "https://" + subdomain + ".hitomi.la/galleries/"
# set Referer header before image downloads (#239) # set Referer header before image downloads (#239)
self.session.headers["Referer"] = self.chapter_url self.session.headers["Referer"] = self.gallery_url
# handle Game CG galleries with scenes (#321) # handle Game CG galleries with scenes (#321)
scenes = text.extract(page, "var scene_indexes = [", "]")[0] scenes = text.extract(page, "var scene_indexes = [", "]")[0]

@@ -17,14 +17,14 @@ class NsfwalbumAlbumExtractor(GalleryExtractor):
category = "nsfwalbum" category = "nsfwalbum"
subcategory = "album" subcategory = "album"
root = "https://nsfwalbum.com" root = "https://nsfwalbum.com"
filename_fmt = "{album_id}_{page:>03}_{id}.{extension}" filename_fmt = "{album_id}_{num:>03}_{id}.{extension}"
directory_fmt = ("{category}", "{album_id} {title}") directory_fmt = ("{category}", "{album_id} {title}")
archive_fmt = "{id}" archive_fmt = "{id}"
pattern = r"(?:https?://)?(?:www\.)?nsfwalbum\.com(/album/(\d+))" pattern = r"(?:https?://)?(?:www\.)?nsfwalbum\.com(/album/(\d+))"
test = ("https://nsfwalbum.com/album/401611", { test = ("https://nsfwalbum.com/album/401611", {
"range": "1-5", "range": "1-5",
"url": "b0481fc7fad5982da397b6359fbed8421b8ba284", "url": "b0481fc7fad5982da397b6359fbed8421b8ba284",
"keyword": "fc1ad4ebcd6d4cf32da15203120112b8bcf12eec", "keyword": "e98f9b0d473c00000831618d0235863b1dd78294",
}) })
def __init__(self, match): def __init__(self, match):

@@ -23,7 +23,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
(("https://original-work.simply-hentai.com" (("https://original-work.simply-hentai.com"
"/amazon-no-hiyaku-amazon-elixir"), { "/amazon-no-hiyaku-amazon-elixir"), {
"url": "21613585ae5ec2f69ea579e9713f536fceab5bd5", "url": "21613585ae5ec2f69ea579e9713f536fceab5bd5",
"keyword": "bf75f9ff0fb60756b1b9b92403526a72d9178d23", "keyword": "9e87a0973553b2922ddee37958b8f5d87910af72",
}), }),
("https://www.simply-hentai.com/notfound", { ("https://www.simply-hentai.com/notfound", {
"exception": exception.GalleryDLException, "exception": exception.GalleryDLException,
@@ -43,7 +43,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
extr = text.extract_from(page) extr = text.extract_from(page)
split = text.split_html split = text.split_html
self.chapter_url = extr('<link rel="canonical" href="', '"') self.gallery_url = extr('<link rel="canonical" href="', '"')
title = extr('<meta property="og:title" content="', '"') title = extr('<meta property="og:title" content="', '"')
if not title: if not title:
raise exception.NotFoundError("gallery") raise exception.NotFoundError("gallery")
@@ -63,7 +63,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
return data return data
def images(self, _): def images(self, _):
url = self.chapter_url + "/all-pages" url = self.gallery_url + "/all-pages"
headers = {"Accept": "application/json"} headers = {"Accept": "application/json"}
images = self.request(url, headers=headers).json() images = self.request(url, headers=headers).json()
return [ return [

@@ -109,7 +109,7 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
def images(self, page): def images(self, page):
url = "{}/Read/Index/{}?page=1".format(self.root, self.gallery_id) url = "{}/Read/Index/{}?page=1".format(self.root, self.gallery_id)
headers = {"Referer": self.chapter_url} headers = {"Referer": self.gallery_url}
response = self.request(url, headers=headers, fatal=False) response = self.request(url, headers=headers, fatal=False)
if "/Auth/" in response.url: if "/Auth/" in response.url:

@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
__version__ = "1.10.7-dev" __version__ = "1.11.0-dev"

@@ -27,6 +27,8 @@ TRAVIS_SKIP = {
# temporary issues, etc. # temporary issues, etc.
BROKEN = { BROKEN = {
"8chan", "8chan",
"hentaifoundry",
"luscious",
"mangapark", "mangapark",
} }

Loading…
Cancel
Save