From 5b3c34aa9621df43c455c7cc0e0011ed2f02cd28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 7 Feb 2018 11:22:47 +0100 Subject: [PATCH] use generic chapter-extractor in more modules --- gallery_dl/extractor/common.py | 9 ++-- gallery_dl/extractor/hentai2read.py | 36 +++++++------- gallery_dl/extractor/hentaicdn.py | 41 ---------------- gallery_dl/extractor/hentaihere.py | 36 ++++++++------ gallery_dl/extractor/kissmanga.py | 51 +++++++------------- gallery_dl/extractor/mangafox.py | 63 +++++++++--------------- gallery_dl/extractor/mangahere.py | 38 ++++----------- gallery_dl/extractor/mangapanda.py | 4 +- gallery_dl/extractor/mangareader.py | 42 +++++++--------- gallery_dl/extractor/mangastream.py | 45 ++++++----------- gallery_dl/extractor/powermanga.py | 4 +- gallery_dl/extractor/readcomiconline.py | 48 +++++++++++-------- gallery_dl/extractor/spectrumnexus.py | 64 ++++++++----------------- 13 files changed, 178 insertions(+), 303 deletions(-) delete mode 100644 gallery_dl/extractor/hentaicdn.py diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index b7c550f6..5914aa67 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -168,9 +168,9 @@ class ChapterExtractor(Extractor): subcategory = "chapter" directory_fmt = [ "{category}", "{manga}", - "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"] + "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}"] filename_fmt = ( - "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}") + "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}") def __init__(self, url): Extractor.__init__(self) @@ -182,7 +182,10 @@ class ChapterExtractor(Extractor): imgs = self.get_images(page) if "count" in data: - images = zip(range(1, data["count"]+1), imgs) + images = zip( + range(1, data["count"]+1), + imgs + ) else: try: data["count"] = len(imgs) diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py index 5b369e47..c960daa4 100644 --- a/gallery_dl/extractor/hentai2read.py +++ b/gallery_dl/extractor/hentai2read.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2017 Mike Fährmann +# Copyright 2016-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,9 +8,8 @@ """Extract hentai-manga from https://hentai2read.com/""" -from .common import MangaExtractor +from .common import ChapterExtractor, MangaExtractor from .. import text, util -from . import hentaicdn import re import json @@ -53,42 +52,43 @@ class Hentai2readMangaExtractor(MangaExtractor): return results -class Hentai2readChapterExtractor(hentaicdn.HentaicdnChapterExtractor): +class Hentai2readChapterExtractor(ChapterExtractor): """Extractor for a single manga chapter from hentai2read.com""" category = "hentai2read" pattern = [r"(?:https?://)?(?:www\.)?hentai2read\.com/([^/]+)/(\d+)"] test = [("http://hentai2read.com/amazon_elixir/1/", { "url": "964b942cf492b3a129d2fe2608abfc475bc99e71", - "keyword": "0f6408d462a14bfe58030117dc295b84666843d0", + "keyword": "9845105898d28c6a540cffdea60a1a20fab52431", })] def __init__(self, match): - hentaicdn.HentaicdnChapterExtractor.__init__(self) url_title, self.chapter = match.groups() - self.url = "https://hentai2read.com/{}/{}/".format( - url_title, self.chapter - ) + url = "https://hentai2read.com/{}/{}/".format(url_title, self.chapter) + ChapterExtractor.__init__(self, url) - def get_job_metadata(self, page, images): - title = text.extract(page, "", "")[0] - chapter_id = text.extract(page, 'data-cid="', '"')[0] + def get_metadata(self, page): + title, pos = text.extract(page, "", "") + manga_id, pos = text.extract(page, 'data-mid="', '"', pos) + chapter_id, pos = text.extract(page, 'data-cid="', '"', pos) match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.+))? - " r"(\d+): (.+) . Page 1 ", title) return { - "manga_id": images[0].split("/")[-3], "manga": match.group(1), + "manga_id": util.safe_int(manga_id), + "chapter": util.safe_int(self.chapter), + "chapter_id": util.safe_int(chapter_id), "type": match.group(2), - "chapter_id": chapter_id, - "chapter": self.chapter, "author": match.group(3), "title": match.group(5), - "count": len(images), "lang": "en", "language": "English", } @staticmethod - def get_image_urls(page): + def get_images(page): """Extract and return a list of all image-urls""" images = text.extract(page, "'images' : ", ",\n")[0] - return json.loads(images) + return [ + ("https://hentaicdn.com/hentai" + part, None) + for part in json.loads(images) + ] diff --git a/gallery_dl/extractor/hentaicdn.py b/gallery_dl/extractor/hentaicdn.py deleted file mode 100644 index 0ef55456..00000000 --- a/gallery_dl/extractor/hentaicdn.py +++ /dev/null @@ -1,41 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2016-2017 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Base classes for extractors from sites based on hentaicdn""" - -from .common import Extractor, Message -from .. import text -import json - - -class HentaicdnChapterExtractor(Extractor): - """Base class for extractors for a single manga chapter""" - subcategory = "chapter" - directory_fmt = ["{category}", "{manga_id} {title}"] - filename_fmt = ("{category}_{manga_id}_{chapter:>02}_" - "{num:>03}.{extension}") - url = "" - - def items(self): - page = self.request(self.url).text - images = self.get_image_urls(page) - data = self.get_job_metadata(page, images) - yield Message.Version, 1 - yield Message.Directory, data - for data["num"], part in enumerate(images, 1): - url = "https://hentaicdn.com/hentai" + part - yield Message.Url, url, text.nameext_from_url(url, data) - - def get_job_metadata(self, page, images): - """Collect metadata for extractor-job""" - - @staticmethod - def get_image_urls(page): - """Extract and return a list of all image-urls""" - images = text.extract(page, "var rff_imageList = ", ";")[0] - return json.loads(images) diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py index 48ebb2ea..9937cf8b 100644 --- a/gallery_dl/extractor/hentaihere.py +++ b/gallery_dl/extractor/hentaihere.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2017 Mike Fährmann +# Copyright 2016-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,10 +8,10 @@ """Extract hentai-manga from https://hentaihere.com/""" -from .common import MangaExtractor +from .common import ChapterExtractor, MangaExtractor from .. import text, util -from . import hentaicdn import re +import json class HentaihereMangaExtractor(MangaExtractor): @@ -56,36 +56,42 @@ class HentaihereMangaExtractor(MangaExtractor): })) -class HentaihereChapterExtractor(hentaicdn.HentaicdnChapterExtractor): +class HentaihereChapterExtractor(ChapterExtractor): """Extractor for a single manga chapter from hentaihere.com""" category = "hentaihere" pattern = [r"(?:https?://)?(?:www\.)?hentaihere\.com/m/S(\d+)/(\d+)"] test = [("https://hentaihere.com/m/S13812/1/1/", { "url": "964b942cf492b3a129d2fe2608abfc475bc99e71", - "keyword": "a07753f655210525a80ff62607261715746f3273", + "keyword": "e9382a9be337abce3db2b1132e85751379dc05c5", })] def __init__(self, match): - hentaicdn.HentaicdnChapterExtractor.__init__(self) - self.gid, self.chapter = match.groups() - self.url = "https://hentaihere.com/m/S{}/{}/1".format( - self.gid, self.chapter - ) + self.manga_id, self.chapter = match.groups() + url = "https://hentaihere.com/m/S{}/{}/1".format( + self.manga_id, self.chapter) + ChapterExtractor.__init__(self, url) - def get_job_metadata(self, page, images): + def get_metadata(self, page): title = text.extract(page, "", "")[0] chapter_id = text.extract(page, 'report/C', '"')[0] pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at " match = re.match(pattern, title) return { - "manga_id": self.gid, "manga": match.group(1), + "manga_id": util.safe_int(self.manga_id), + "chapter": util.safe_int(self.chapter), + "chapter_id": util.safe_int(chapter_id), "type": match.group(2), - "chapter_id": chapter_id, - "chapter": self.chapter, "title": match.group(3), "author": match.group(4), - "count": len(images), "lang": "en", "language": "English", } + + @staticmethod + def get_images(page): + images = text.extract(page, "var rff_imageList = ", ";")[0] + return [ + ("https://hentaicdn.com/hentai" + part, None) + for part in json.loads(images) + ] diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py index fe9f1ae6..69a6aaeb 100644 --- a/gallery_dl/extractor/kissmanga.py +++ b/gallery_dl/extractor/kissmanga.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2017 Mike Fährmann +# Copyright 2015-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,7 @@ """Extract manga-chapters and entire manga from http://kissmanga.com/""" -from .common import Extractor, MangaExtractor, Message +from .common import ChapterExtractor, MangaExtractor from .. import text, util, cloudflare, aes, exception from ..cache import cache import re @@ -21,21 +21,11 @@ IV = [ ] -class KissmangaExtractor(Extractor): +class KissmangaBase(): """Base class for kissmanga extractors""" category = "kissmanga" - directory_fmt = [ - "{category}", "{manga}", - "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"] - filename_fmt = ( - "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}") root = "http://kissmanga.com" - def __init__(self, match): - Extractor.__init__(self) - self.url = match.group(0) - self.session.headers["Referer"] = self.root - def request(self, url): response = cloudflare.request_func(self, url) if response.history and "/Message/AreYouHuman?" in response.url: @@ -72,10 +62,10 @@ class KissmangaExtractor(Extractor): return data -class KissmangaMangaExtractor(KissmangaExtractor, MangaExtractor): +class KissmangaMangaExtractor(KissmangaBase, MangaExtractor): """Extractor for manga from kissmanga.com""" - pattern = [r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com/" - r"Manga/[^/?&#]+/?$"] + pattern = [r"(?i)(?:https?://)?(?:www\.)?(kissmanga\.com" + r"/Manga/[^/?&#]+/?)$"] test = [ ("http://kissmanga.com/Manga/Dropout", { "url": "992befdd64e178fe5af67de53f8b510860d968ca", @@ -105,11 +95,10 @@ class KissmangaMangaExtractor(KissmangaExtractor, MangaExtractor): return results -class KissmangaChapterExtractor(KissmangaExtractor): +class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor): """Extractor for manga-chapters from kissmanga.com""" - subcategory = "chapter" - pattern = [r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com/" - r"Manga/[^/?&#]+/[^/?&#]+\?id=\d+"] + pattern = [r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com" + r"/Manga/[^/?&#]+/[^/?&#]+\?id=\d+"] test = [ ("http://kissmanga.com/Manga/Dropout/Ch-000---Oneshot-?id=145847", { "url": "4136bcd1c6cecbca8cc2bc965d54f33ef0a97cc0", @@ -126,18 +115,11 @@ class KissmangaChapterExtractor(KissmangaExtractor): ("http://kissmanga.com/mAnGa/mOnStEr/Monster-79?id=7608", None), ] - def items(self): - page = self.request(self.url).text - data = self.get_job_metadata(page) - imgs = self.get_image_urls(page) - data["count"] = len(imgs) - yield Message.Version, 1 - yield Message.Directory, data - for data["page"], url in enumerate(imgs, 1): - yield Message.Url, url, text.nameext_from_url(url, data) - - def get_job_metadata(self, page): - """Collect metadata for extractor-job""" + def __init__(self, match): + ChapterExtractor.__init__(self, match.group(0)) + self.session.headers["Referer"] = self.root + + def get_metadata(self, page): title = text.extract(page, "", "")[0].strip() manga, cinfo = title.split("\n")[1:3] data = { @@ -148,12 +130,11 @@ class KissmangaChapterExtractor(KissmangaExtractor): } return self.parse_chapter_string(data) - def get_image_urls(self, page): - """Extract list of all image-urls for a manga chapter""" + def get_images(self, page): try: key = self.build_aes_key(page) return [ - aes.aes_cbc_decrypt_text(data, key, IV) + (aes.aes_cbc_decrypt_text(data, key, IV), None) for data in text.extract_iter( page, 'lstImages.push(wrapKA("', '"' ) diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py index ab681e74..906c2372 100644 --- a/gallery_dl/extractor/mangafox.py +++ b/gallery_dl/extractor/mangafox.py @@ -1,56 +1,39 @@ # -*- coding: utf-8 -*- -# Copyright 2017 Mike Fährmann +# Copyright 2017-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract manga-chapters and entire manga from http://www.mangafox.me/""" +"""Extract manga-chapters and entire manga from http://fanfox.net/""" -from .common import AsynchronousExtractor, Message +from .common import ChapterExtractor from .. import text, util, exception import re -class MangafoxChapterExtractor(AsynchronousExtractor): - """Extractor for manga-chapters from mangafox.me""" +class MangafoxChapterExtractor(ChapterExtractor): + """Extractor for manga-chapters from fanfox.net""" category = "mangafox" - subcategory = "chapter" - directory_fmt = [ - "{category}", "{manga}", - "{volume:?v/ />02}c{chapter:>03}{chapter_minor}"] - filename_fmt = ( - "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}") - pattern = [(r"(?:https?://)?(?:www\.)?(mangafox\.me/manga/" - r"[^/]+/(v\d+/)?c\d+[^/]*)")] - test = [(("http://mangafox.me/manga/kidou_keisatsu_patlabor/" - "v05/c006.2/1.html"), { - "keyword": "36b570e9ef11b4748407324fe08bebbe4856e6fd", - "content": "5c50c252dcf12ffecf68801f4db8a2167265f66c", - })] + pattern = [(r"(?:https?://)?(?:www\.)?(?:mangafox\.me|fanfox\.net)" + r"(/manga/[^/]+/(?:v\d+/)?c\d+[^/?&#]*)")] + test = [ + ("http://fanfox.net/manga/kidou_keisatsu_patlabor/v05/c006.2/1.html", { + "keyword": "36b570e9ef11b4748407324fe08bebbe4856e6fd", + "content": "5c50c252dcf12ffecf68801f4db8a2167265f66c", + }), + ("http://mangafox.me/manga/kidou_keisatsu_patlabor/v05/c006.2/", None), + ] + root = "http://fanfox.net" def __init__(self, match): - AsynchronousExtractor.__init__(self) - self.url = "http://" + match.group(1) + self.urlbase = self.root + match.group(1) + ChapterExtractor.__init__(self, self.urlbase + "/1.html") - def items(self): - page = self.request(self.url + "/1.html").text + def get_metadata(self, page): if "Sorry, its licensed, and not available." in page: raise exception.AuthorizationError() - data = self.get_metadata(page) - urls = zip( - range(1, data["count"]+1), - self.get_image_urls(page), - ) - yield Message.Version, 1 - yield Message.Directory, data.copy() - for data["page"], url in urls: - text.nameext_from_url(url, data) - yield Message.Url, url, data.copy() - - def get_metadata(self, page): - """Collect metadata for extractor-job""" data = text.extract_all(page, ( ("manga" , " - Read ", " Manga Scans "), ("sid" , "var sid=", ";"), @@ -67,14 +50,14 @@ class MangafoxChapterExtractor(AsynchronousExtractor): data[key] = util.safe_int(data[key]) return data - def get_image_urls(self, page): - """Yield all image-urls for this chapter""" + def get_images(self, page): pnum = 1 while True: url, pos = text.extract(page, '02}c{chapter:>03}{chapter_minor}"] - filename_fmt = ( - "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}") pattern = [(r"(?:https?://)?(?:www\.|m\.)?mangahere\.c[co]/manga/" r"([^/]+(?:/v0*(\d+))?/c0*(\d+)(\.\d+)?)")] test = [ @@ -82,27 +76,12 @@ class MangahereChapterExtractor(AsynchronousExtractor): url_fmt = "http://www.mangahere.cc/manga/{}/{}.html" def __init__(self, match): - AsynchronousExtractor.__init__(self) self.part, self.volume, self.chapter, self.chminor = match.groups() - - def items(self): # remove ".html" for the first chapter page to avoid redirects url = self.url_fmt.format(self.part, "")[:-5] + ChapterExtractor.__init__(self, url) - page = self.request(url).text - data = self.get_job_metadata(page) - urls = zip( - range(1, data["count"]+1), - self.get_image_urls(page), - ) - - yield Message.Version, 1 - yield Message.Directory, data.copy() - for data["page"], url in urls: - text.nameext_from_url(url, data) - yield Message.Url, url, data.copy() - - def get_job_metadata(self, page): + def get_metadata(self, page): """Collect metadata for extractor-job""" manga, pos = text.extract(page, '', '') chid , pos = text.extract(page, '.net/store/manga/', '/', pos) @@ -122,15 +101,16 @@ class MangahereChapterExtractor(AsynchronousExtractor): "language": "English", } - def get_image_urls(self, page): + def get_images(self, page): """Yield all image-urls for this chapter""" pnum = 1 while True: url, pos = text.extract(page, '03}{title:?: //}"] - filename_fmt = "{manga}_c{chapter:>03}_{page:>03}.{extension}" pattern = [ (r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"), (r"(?:https?://)?(?:www\.)?mangareader\.net" @@ -70,26 +67,14 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor): test = [(("http://www.mangareader.net/" "karate-shoukoushi-kohinata-minoru/11"), { "url": "84ffaab4c027ef9022695c53163c3aeabd07ca58", - "keyword": "2038e6a780a0028eee0067985b55debb1d4a6aab", + "keyword": "2893cfcd1916859fb498f3345f1929f868fe667f", })] def __init__(self, match): - AsynchronousExtractor.__init__(self) self.part, self.url_title, self.chapter = match.groups() + ChapterExtractor.__init__(self, self.root + self.part) - def items(self): - page = self.request(self.root + self.part).text - data = self.get_job_metadata(page) - yield Message.Version, 1 - yield Message.Directory, data - for data["page"] in range(1, data["count"]+1): - next_url, image_url, image_data = self.get_page_metadata(page) - image_data.update(data) - yield Message.Url, image_url, image_data - if next_url: - page = self.request(next_url).text - - def get_job_metadata(self, chapter_page): + def get_metadata(self, chapter_page): """Collect metadata for extractor-job""" page = self.request(self.root + self.url_title).text data = self.parse_page(page, { @@ -106,7 +91,16 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor): ) return data - def get_page_metadata(self, page): + def get_images(self, page): + while True: + next_url, image_url, image_data = self.get_image_metadata(page) + yield image_url, image_data + + if not next_url: + return + page = self.request(next_url).text + + def get_image_metadata(self, page): """Collect next url, image-url and metadata for one manga-page""" extr = text.extract width = None @@ -122,7 +116,7 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor): width , pos = extr(page, '', "<") pos = page.find(self.part, pos) @@ -59,9 +40,11 @@ class MangastreamChapterExtractor(AsynchronousExtractor): "language": "English", } - @staticmethod - def get_page_metadata(page): - """Collect next url, image-url and metadata for one manga-page""" - nurl, pos = text.extract(page, '
\n\n03}"] filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}" root = "http://readcomiconline.to" + useragent = "Wget/1.19.2 (linux-gnu)" - def __init__(self, match): - kissmanga.KissmangaExtractor.__init__(self, match) - self.session.headers["User-Agent"] = "Wget/1.19.2 (linux-gnu)" + request = cloudflare.request_func -class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor, - kissmanga.KissmangaMangaExtractor): +class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): """Extractor for comics from readcomiconline.to""" subcategory = "comic" - pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to/" - r"Comic/[^/?&#]+/?$"] + pattern = [r"(?i)(?:https?://)?(?:www\.)?(readcomiconline\.to" + r"/Comic/[^/?&#]+/?)$"] test = [ ("http://readcomiconline.to/Comic/W-i-t-c-h", { "url": "c5a530538a30b176916e30cbe223a93d83cb2691", @@ -42,6 +40,10 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor, }), ] + def __init__(self, match): + MangaExtractor.__init__(self, match) + self.session.headers["User-Agent"] = self.useragent + def chapters(self, page): results = [] comic, pos = text.extract(page, '

', '<') @@ -58,19 +60,21 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor, return results -class ReadcomiconlineIssueExtractor(ReadcomiconlineExtractor, - kissmanga.KissmangaChapterExtractor): +class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): """Extractor for comic-issues from readcomiconline.to""" subcategory = "issue" - pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to/" - r"Comic/[^/?&#]+/[^/?&#]+\?id=\d+"] + pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to" + r"/Comic/[^/?&#]+/[^/?&#]+\?id=\d+"] test = [("http://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", { "url": "a45c77f8fbde66091fe2346d6341f9cf3c6b1bc5", "keyword": "dee8a8a44659825afe1d69e1d809a48b03e98c68", })] - def get_job_metadata(self, page): - """Collect metadata for extractor-job""" + def __init__(self, match): + ChapterExtractor.__init__(self, match.group(0)) + self.session.headers["User-Agent"] = self.useragent + + def get_metadata(self, page): comic, pos = text.extract(page, " - Read\r\n ", "\r\n") iinfo, pos = text.extract(page, " ", "\r\n", pos) match = re.match(r"(?:Issue )?#(\d+)|(.+)", iinfo) @@ -82,6 +86,10 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineExtractor, } @staticmethod - def get_image_urls(page): - """Extract list of all image-urls for a manga chapter""" - return list(text.extract_iter(page, 'lstImages.push("', '"')) + def get_images(page): + return [ + (url, None) + for url in text.extract_iter( + page, 'lstImages.push("', '"' + ) + ] diff --git a/gallery_dl/extractor/spectrumnexus.py b/gallery_dl/extractor/spectrumnexus.py index 2cfc1d99..93630196 100644 --- a/gallery_dl/extractor/spectrumnexus.py +++ b/gallery_dl/extractor/spectrumnexus.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2017 Mike Fährmann +# Copyright 2015-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,7 @@ """Extract manga pages from http://www.thespectrum.net/manga_scans/""" -from .common import MangaExtractor, AsynchronousExtractor, Message +from .common import ChapterExtractor, MangaExtractor from .. import text, util @@ -33,55 +33,31 @@ class SpectrumnexusMangaExtractor(MangaExtractor): return results -class SpectrumnexusChapterExtractor(AsynchronousExtractor): +class SpectrumnexusChapterExtractor(ChapterExtractor): """Extractor for manga-chapters or -volumes from thespectrum.net""" category = "spectrumnexus" - subcategory = "chapter" - directory_fmt = ["{category}", "{manga}", "{identifier}"] - filename_fmt = "{manga} {identifier} {page:>03}.{extension}" - pattern = [ - (r"(?:https?://)?(view\.thespectrum\.net/series/" - r"[^\.]+\.html)\?ch=(Chapter\+(\d+)|Volume\+(\d+))"), - (r"(?:https?://)?(view\.thespectrum\.net/series/" - r"[^/]+-chapter-(\d+)\.html)"), - ] + directory_fmt = ["{category}", "{manga}", "{chapter_string}"] + filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}" + + pattern = [r"(?:https?://)?view\.thespectrum\.net/series/" + r"([^\.]+\.html)\?ch=(Chapter\+(\d+)|Volume\+(\d+))"] test = [(("http://view.thespectrum.net/series/" "toriko.html?ch=Chapter+343&page=1"), { "url": "c0fc7dc594841217cc622a67edd79f06e9900333", - "keyword": "3d0cb57b6b1c2cbecc7aed33f83c24891a4ff53f", + "keyword": "a8abe126cbc5fc798148b0b155242a470c1ba9d1", })] def __init__(self, match): - AsynchronousExtractor.__init__(self) - self.url = "http://" + match.group(1) - self.identifier = match.group(2) - self.chapter = match.group(3) - self.volume = match.group(4) - - def items(self): - params = { - "ch": self.identifier, - "page": 1, - } - page = self.request(self.url, params=params).text - data = self.get_job_metadata(page) - yield Message.Version, 1 - yield Message.Directory, data.copy() - for i in range(1, data["count"]+1): - url = self.get_image_url(page) - text.nameext_from_url(url, data) - data["page"] = i - yield Message.Url, url, data.copy() - if i < data["count"]: - params["page"] += 1 - page = self.request(self.url, params=params).text + path, self.chapter_string, self.chapter, self.volume = match.groups() + url = "http://view.thespectrum.net/series/{}?ch={}".format( + path, self.chapter_string) + ChapterExtractor.__init__(self, url) - def get_job_metadata(self, page): - """Collect metadata for extractor-job""" + def get_metadata(self, page): data = { "chapter": util.safe_int(self.chapter), + "chapter_string": self.chapter_string.replace("+", " "), "volume": util.safe_int(self.volume), - "identifier": self.identifier.replace("+", " "), } data = text.extract_all(page, ( ('manga', '', ' · SPECTRUM NEXUS '), @@ -90,7 +66,9 @@ class SpectrumnexusChapterExtractor(AsynchronousExtractor): data["count"] = util.safe_int(data["count"]) return data - @staticmethod - def get_image_url(page): - """Extract url of one manga page""" - return text.extract(page, '