update extractor class hierarchies

- let the GalleryExtractor class inherit directly from Extractor
- make ChapterExtractor a subclass of GalleryExtractor
- change enumeration field names of GalleryExtractors to 'num'
pull/465/head
Mike Fährmann 5 years ago
parent 7ebd984e8d
commit 1693d97bd3
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@@ -21,12 +21,12 @@ class AdultempireGalleryExtractor(GalleryExtractor):
 test = (
 ("https://www.adultempire.com/5998/gallery.html", {
 "range": "1",
-"keyword": "25c8171f5623678491a0d7bdf38a7a6ebfa4a361",
+"keyword": "5b3266e69801db0d78c22181da23bc102886e027",
 "content": "5c6beb31e5e3cdc90ee5910d5c30f9aaec977b9e",
 }),
 ("https://www.adultdvdempire.com/5683/gallery.html", {
 "url": "b12cd1a65cae8019d837505adb4d6a2c1ed4d70d",
-"keyword": "9634eb16cc6dbf347eb9dcdd9b2a499dfd04d167",
+"keyword": "8d448d79c4ac5f5b10a3019d5b5129ddb43655e5",
 }),
 )
@@ -55,4 +55,4 @@ class AdultempireGalleryExtractor(GalleryExtractor):
 if len(urls) < 24:
 return
 params["page"] += 1
-page = self.request(self.chapter_url, params=params).text
+page = self.request(self.gallery_url, params=params).text

@@ -249,24 +249,21 @@ class Extractor():
 yield test
-class ChapterExtractor(Extractor):
+class GalleryExtractor(Extractor):
-subcategory = "chapter"
-directory_fmt = (
-"{category}", "{manga}",
-"{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}")
-filename_fmt = (
-"{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")
-archive_fmt = (
-"{manga}_{chapter}{chapter_minor}_{page}")
+subcategory = "gallery"
+filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
+directory_fmt = ("{category}", "{gallery_id} {title}")
+archive_fmt = "{gallery_id}_{num}"
+enum = "num"
 def __init__(self, match, url=None):
 Extractor.__init__(self, match)
-self.chapter_url = url or self.root + match.group(1)
+self.gallery_url = self.root + match.group(1) if url is None else url
 def items(self):
 self.login()
-page = self.request(self.chapter_url).text
+page = self.request(self.gallery_url).text
 data = self.metadata(page)
 imgs = self.images(page)
@@ -284,7 +281,7 @@ class ChapterExtractor(Extractor):
 yield Message.Version, 1
 yield Message.Directory, data
-for data["page"], (url, imgdata) in images:
+for data[self.enum], (url, imgdata) in images:
 if imgdata:
 data.update(imgdata)
 yield Message.Url, url, text.nameext_from_url(url, data)
@@ -299,6 +296,19 @@ class ChapterExtractor(Extractor):
 """Return a list of all (image-url, metadata)-tuples"""
+class ChapterExtractor(GalleryExtractor):
+subcategory = "chapter"
+directory_fmt = (
+"{category}", "{manga}",
+"{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}")
+filename_fmt = (
+"{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")
+archive_fmt = (
+"{manga}_{chapter}{chapter_minor}_{page}")
+enum = "page"
 class MangaExtractor(Extractor):
 subcategory = "manga"
@@ -333,14 +343,6 @@ class MangaExtractor(Extractor):
 """Return a list of all (chapter-url, metadata)-tuples"""
-class GalleryExtractor(ChapterExtractor):
-subcategory = "gallery"
-filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}"
-directory_fmt = ("{category}", "{gallery_id} {title}")
-archive_fmt = "{gallery_id}_{page}"
 class AsynchronousMixin():
 """Run info extraction in a separate thread"""

@@ -44,14 +44,13 @@ class FoolslideBase(SharedConfigMixin):
 class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
 """Base class for chapter extractors for FoOlSlide based sites"""
-directory_fmt = (
-"{category}", "{manga}", "{chapter_string}")
+directory_fmt = ("{category}", "{manga}", "{chapter_string}")
 archive_fmt = "{id}"
 pattern_fmt = r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
 decode = "default"
 def items(self):
-page = self.request(self.chapter_url).text
+page = self.request(self.gallery_url).text
 data = self.metadata(page)
 imgs = self.images(page)
@@ -77,7 +76,7 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
 def metadata(self, page):
 extr = text.extract_from(page)
 extr('<h1 class="tbtitle dnone">', '')
-return self.parse_chapter_url(self.chapter_url, {
+return self.parse_chapter_url(self.gallery_url, {
 "manga" : text.unescape(extr('title="', '"')).strip(),
 "chapter_string": text.unescape(extr('title="', '"')),
 })

@@ -42,7 +42,7 @@ class FuskatorGalleryExtractor(GalleryExtractor):
 def metadata(self, page):
 headers = {
-"Referer" : self.chapter_url,
+"Referer" : self.gallery_url,
 "X-Requested-With": "XMLHttpRequest",
 }
 auth = self.request(

@@ -31,10 +31,10 @@ class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor):
 info = text.unescape(text.extract(page, '<title>', '</title>')[0])
 manga, _, chapter_string = info.partition(" :: ")
-data = self._data(self.chapter_url.split("/")[5])
+data = self._data(self.gallery_url.split("/")[5])
 data["manga"] = manga
 data["chapter_string"] = chapter_string.rstrip(" :")
-return self.parse_chapter_url(self.chapter_url, data)
+return self.parse_chapter_url(self.gallery_url, data)
 @memcache(keyarg=1)
 def _data(self, manga):

@@ -24,7 +24,7 @@ class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
 test = ("https://hentaifox.com/gallery/56622/", {
 "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
 "count": 24,
-"keyword": "38f8517605feb6854d48833297da6b05c6541b69",
+"keyword": "903ebe227d85e484460382fc6cbab42be7a244d5",
 })
 def __init__(self, match):

@@ -22,7 +22,7 @@ class HentainexusGalleryExtractor(GalleryExtractor):
 test = (
 ("https://hentainexus.com/view/5688", {
 "url": "746d0043e20030f1171aae5ea113176607302517",
-"keyword": "b05986369fbaf29cfa08b118960d92c49e59524b",
+"keyword": "9512cf5f258130e5f75de9954d7a13217c2405e7",
 }),
 ("https://hentainexus.com/read/5688"),
 )

@@ -21,7 +21,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
 test = (
 ("https://hitomi.la/galleries/867789.html", {
 "pattern": r"https://aa.hitomi.la/galleries/867789/\d+.jpg",
-"keyword": "d097a8db8e810045131b4510c41714004f9eff3a",
+"keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2",
 "count": 16,
 }),
 ("https://hitomi.la/galleries/1401410.html", {
@@ -89,7 +89,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
 base = "https://" + subdomain + ".hitomi.la/galleries/"
 # set Referer header before image downloads (#239)
-self.session.headers["Referer"] = self.chapter_url
+self.session.headers["Referer"] = self.gallery_url
 # handle Game CG galleries with scenes (#321)
 scenes = text.extract(page, "var scene_indexes = [", "]")[0]

@@ -17,14 +17,14 @@ class NsfwalbumAlbumExtractor(GalleryExtractor):
 category = "nsfwalbum"
 subcategory = "album"
 root = "https://nsfwalbum.com"
-filename_fmt = "{album_id}_{page:>03}_{id}.{extension}"
+filename_fmt = "{album_id}_{num:>03}_{id}.{extension}"
 directory_fmt = ("{category}", "{album_id} {title}")
 archive_fmt = "{id}"
 pattern = r"(?:https?://)?(?:www\.)?nsfwalbum\.com(/album/(\d+))"
 test = ("https://nsfwalbum.com/album/401611", {
 "range": "1-5",
 "url": "b0481fc7fad5982da397b6359fbed8421b8ba284",
-"keyword": "fc1ad4ebcd6d4cf32da15203120112b8bcf12eec",
+"keyword": "e98f9b0d473c00000831618d0235863b1dd78294",
 })
 def __init__(self, match):

@@ -23,7 +23,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
 (("https://original-work.simply-hentai.com"
 "/amazon-no-hiyaku-amazon-elixir"), {
 "url": "21613585ae5ec2f69ea579e9713f536fceab5bd5",
-"keyword": "bf75f9ff0fb60756b1b9b92403526a72d9178d23",
+"keyword": "9e87a0973553b2922ddee37958b8f5d87910af72",
 }),
 ("https://www.simply-hentai.com/notfound", {
 "exception": exception.GalleryDLException,
@@ -43,7 +43,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
 extr = text.extract_from(page)
 split = text.split_html
-self.chapter_url = extr('<link rel="canonical" href="', '"')
+self.gallery_url = extr('<link rel="canonical" href="', '"')
 title = extr('<meta property="og:title" content="', '"')
 if not title:
 raise exception.NotFoundError("gallery")
@@ -63,7 +63,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
 return data
 def images(self, _):
-url = self.chapter_url + "/all-pages"
+url = self.gallery_url + "/all-pages"
 headers = {"Accept": "application/json"}
 images = self.request(url, headers=headers).json()
 return [

@@ -109,7 +109,7 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
 def images(self, page):
 url = "{}/Read/Index/{}?page=1".format(self.root, self.gallery_id)
-headers = {"Referer": self.chapter_url}
+headers = {"Referer": self.gallery_url}
 response = self.request(url, headers=headers, fatal=False)
 if "/Auth/" in response.url:

@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
-__version__ = "1.10.7-dev"
+__version__ = "1.11.0-dev"

@@ -27,6 +27,8 @@ TRAVIS_SKIP = {
# temporary issues, etc.
BROKEN = {
"8chan",
"hentaifoundry",
"luscious",
"mangapark",
}

Loading…
Cancel
Save