diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 36037387..b7c550f6 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2017 Mike Fährmann +# Copyright 2014-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -18,7 +18,7 @@ import requests import threading import http.cookiejar from .message import Message -from .. import config, exception +from .. import config, text, exception class Extractor(): @@ -163,6 +163,47 @@ class AsynchronousExtractor(Extractor): put(None) +class ChapterExtractor(Extractor): + + subcategory = "chapter" + directory_fmt = [ + "{category}", "{manga}", + "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"] + filename_fmt = ( + "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}") + + def __init__(self, url): + Extractor.__init__(self) + self.url = url + + def items(self): + page = self.request(self.url).text + data = self.get_metadata(page) + imgs = self.get_images(page) + + if "count" in data: + images = zip(range(1, data["count"]+1), imgs) + else: + try: + data["count"] = len(imgs) + except TypeError: + pass + images = enumerate(imgs, 1) + + yield Message.Version, 1 + yield Message.Directory, data + for data["page"], (url, imgdata) in images: + if imgdata: + data.update(imgdata) + yield Message.Url, url, text.nameext_from_url(url, data) + + def get_metadata(self, page): + """Return a dict with general metadata""" + + def get_images(self, page): + """Return a list of all (image-url, metadata)-tuples""" + + class MangaExtractor(Extractor): subcategory = "manga" @@ -176,7 +217,6 @@ class MangaExtractor(Extractor): self.url = url or self.scheme + "://" + match.group(1) def items(self): - self.login() page = self.request(self.url).text chapters = self.chapters(page) @@ -187,12 +227,8 @@ class 
MangaExtractor(Extractor): for chapter, data in chapters: yield Message.Queue, chapter, data - def login(self): - """Login and set necessary cookies""" - def chapters(self, page): - """Return a list of all (url, metadata)-tuples""" - return [] + """Return a list of all (chapter-url, metadata)-tuples""" class SharedConfigExtractor(Extractor): diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index e822a502..bd9107ac 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2017 Mike Fährmann +# Copyright 2015-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,54 +8,36 @@ """Extract manga-chapters from https://dynasty-scans.com/""" -from .common import Extractor, Message +from .common import ChapterExtractor from .. import text, util import re import json -class DynastyscansChapterExtractor(Extractor): +class DynastyscansChapterExtractor(ChapterExtractor): """Extractor for manga-chapters from dynasty-scans.com""" category = "dynastyscans" - subcategory = "chapter" - directory_fmt = [ - "{category}", "{manga}", "c{chapter:>03}{chapter_minor}{title:?: //}"] - filename_fmt = ( - "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}") pattern = [r"(?:https?://)?(?:www\.)?dynasty-scans\.com/chapters/([^/]+)"] test = [ (("http://dynasty-scans.com/chapters/" "hitoribocchi_no_oo_seikatsu_ch33"), { "url": "dce64e8c504118f1ab4135c00245ea12413896cb", - "keyword": "fb2f470b995df5b301ccede31ed9829a010236db", + "keyword": "ec5c56bbd5c97aa521d00f2598bba4663fb8ab9f", }), (("http://dynasty-scans.com/chapters/" "new_game_the_spinoff_special_13"), { "url": "dbe5bbb74da2edcfb1832895a484e2a40bc8b538", - "keyword": "281bbe0fb74b812ced595619ca5876983490dc0e", + "keyword": "1208a102d9a1bb0b0c740a67996d9b26a9357b64", }), ] root = 
"https://dynasty-scans.com" def __init__(self, match): - Extractor.__init__(self) self.chaptername = match.group(1) + url = self.root + "/chapters/" + self.chaptername + ChapterExtractor.__init__(self, url) - def items(self): - page = self.request(self.root + "/chapters/" + self.chaptername, - encoding="utf-8").text - data = self.get_job_metadata(page) - imgs = self.get_image_data(page) - data["count"] = len(imgs) - yield Message.Version, 1 - yield Message.Directory, data - for data["page"], img in enumerate(imgs, 1): - url = self.root + img["image"] - text.nameext_from_url(url, data) - data["name"] = img["name"] - yield Message.Url, url, data - - def get_job_metadata(self, page): + def get_metadata(self, page): """Collect metadata for extractor-job""" info , pos = text.extract(page, "

", "") author, pos = text.extract(page, " by ", "", pos) @@ -82,8 +64,10 @@ class DynastyscansChapterExtractor(Extractor): "language": "English", } - @staticmethod - def get_image_data(page): + def get_images(self, page): """Extract list of all image-urls for a manga chapter""" data = text.extract(page, "var pages = ", ";\n")[0] - return json.loads(data) + return [ + (self.root + img["image"], None) + for img in json.loads(data) + ] diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py index 235cdeab..8b319ff4 100644 --- a/gallery_dl/extractor/fallenangels.py +++ b/gallery_dl/extractor/fallenangels.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 Mike Fährmann +# Copyright 2017-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,21 +8,16 @@ """Extract manga-chapters from https://www.fascans.com/""" -from .common import Extractor, MangaExtractor, Message +from .common import ChapterExtractor, MangaExtractor from .. 
import text, util import json -class FallenangelsChapterExtractor(Extractor): +class FallenangelsChapterExtractor(ChapterExtractor): """Extractor for manga-chapters from fascans.com""" category = "fallenangels" - subcategory = "chapter" - directory_fmt = ["{category}", "{manga}", - "c{chapter:>03}{chapter_minor}{title:?: //}"] - filename_fmt = ( - "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}") - pattern = [(r"(?:https?://)?(manga|truyen)\.fascans\.com/" - r"manga/([^/]+)/(\d+)(\.[^/?&#]+)?")] + pattern = [(r"(?:https?://)?(manga|truyen)\.fascans\.com" + r"/manga/([^/]+)/(\d+)(\.[^/?&#]+)?")] test = [ ("https://manga.fascans.com/manga/chronos-ruler/20/1", { "url": "4604a7914566cc2da0ff789aa178e2d1c8c241e3", @@ -38,24 +33,12 @@ class FallenangelsChapterExtractor(Extractor): ] def __init__(self, match): - Extractor.__init__(self) self.version, self.manga, self.chapter, self.minor = match.groups() - - def items(self): url = "https://{}.fascans.com/manga/{}/{}/1".format( self.version, self.manga, self.chapter) - page = self.request(url).text - data = self.get_metadata(page) - imgs = self.get_images(page) - data["count"] = len(imgs) - yield Message.Version, 1 - yield Message.Directory, data - for data["page"], img in enumerate(imgs, 1): - url = img["page_image"] - yield Message.Url, url, text.nameext_from_url(url, data) + ChapterExtractor.__init__(self, url) def get_metadata(self, page): - """Collect metadata for extractor-job""" lang = "vi" if self.version == "truyen" else "en" data = { "chapter": self.chapter, @@ -70,8 +53,12 @@ class FallenangelsChapterExtractor(Extractor): @staticmethod def get_images(page): - """Return a list of all images in this chapter""" - return json.loads(text.extract(page, "var pages = ", ";")[0]) + return [ + (img["page_image"], None) + for img in json.loads( + text.extract(page, "var pages = ", ";")[0] + ) + ] class FallenangelsMangaExtractor(MangaExtractor): diff --git a/gallery_dl/extractor/hbrowse.py 
b/gallery_dl/extractor/hbrowse.py index ee847424..4f3f84fb 100644 --- a/gallery_dl/extractor/hbrowse.py +++ b/gallery_dl/extractor/hbrowse.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2017 Mike Fährmann +# Copyright 2015-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,13 +8,13 @@ """Extract images from http://www.hbrowse.com/""" -from .common import Extractor, MangaExtractor, Message +from .common import ChapterExtractor, MangaExtractor from .. import text, util from urllib.parse import urljoin import json -class HbrowseExtractor(Extractor): +class HbrowseExtractor(): """Base class for hbrowse extractors""" category = "hbrowse" root = "http://www.hbrowse.com" @@ -64,41 +64,30 @@ class HbrowseMangaExtractor(HbrowseExtractor, MangaExtractor): results.append((urljoin(self.root, url), data.copy())) -class HbrowseChapterExtractor(HbrowseExtractor): +class HbrowseChapterExtractor(HbrowseExtractor, ChapterExtractor): """Extractor for manga-chapters from hbrowse.com""" - subcategory = "chapter" directory_fmt = ["{category}", "{manga_id} {manga}", "c{chapter:>05}"] filename_fmt = ("{category}_{manga_id}_{chapter:>05}_" - "{num:>03}.{extension}") + "{page:>03}.{extension}") pattern = [r"(?:https?://)?(?:www\.)?hbrowse\.com/(\d+)/c(\d+)"] test = [("http://www.hbrowse.com/10363/c00000", { "url": "634f4800858913f097bc3b62a8fedaf74b5254bd", - "keyword": "730bd33de2a0a0fb4e0b6dcdafedcaeee1060047", + "keyword": "f37cafef404696312f5db6ccaaaf72737d309e2d", "content": "44578ebbe176c2c27434966aef22945787e2781e", })] def __init__(self, match): - HbrowseExtractor.__init__(self) self.gid, self.chapter = match.groups() self.path = "/{}/c{}/".format(self.gid, self.chapter) + ChapterExtractor.__init__(self, self.root + self.path) - def items(self): - page = self.request(self.root + self.path).text - data = self.get_job_metadata(page) - yield Message.Version, 
1 - yield Message.Directory, data - for data["num"], url in enumerate(self.get_image_urls(page), 1): - yield Message.Url, url, text.nameext_from_url(url, data) - - def get_job_metadata(self, page): - """Collect metadata for extractor-job""" + def get_metadata(self, page): return self.parse_page(page, { "manga_id": util.safe_int(self.gid), "chapter": util.safe_int(self.chapter) }) - def get_image_urls(self, page): - """Yield all image-urls for a 'chapter'""" + def get_images(self, page): base = self.root + "/data" + self.path json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]" - return [base + name for name in json.loads(json_data)] + return [(base + name, None) for name in json.loads(json_data)] diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index ceedc604..690f8db4 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -8,12 +8,12 @@ """Extract manga-chapters and entire manga from https://mangapark.me/""" -from .common import Extractor, MangaExtractor, Message +from .common import ChapterExtractor, MangaExtractor from .. 
import text, util from urllib.parse import urljoin -class MangaparkExtractor(Extractor): +class MangaparkExtractor(): """Base class for mangapark extractors""" category = "mangapark" root = "https://mangapark.me" @@ -68,14 +68,8 @@ class MangaparkMangaExtractor(MangaparkExtractor, MangaExtractor): results.append((self.root + path, data.copy())) -class MangaparkChapterExtractor(MangaparkExtractor): +class MangaparkChapterExtractor(MangaparkExtractor, ChapterExtractor): """Extractor for manga-chapters from mangapark.me""" - subcategory = "chapter" - directory_fmt = [ - "{category}", "{manga}", - "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"] - filename_fmt = ( - "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}") pattern = [(r"(?:https?://)?(?:www\.)?mangapark\.me(/manga/[^/]+" r"/s\d+(?:/v\d+)?/c\d+[^/]*(?:/e\d+)?)")] test = [ @@ -95,20 +89,11 @@ class MangaparkChapterExtractor(MangaparkExtractor): ] def __init__(self, match): - MangaparkExtractor.__init__(self) self.path = match.group(1) + url = self.root + self.path + "?zoom=2" + ChapterExtractor.__init__(self, url) - def items(self): - page = self.request(self.root + self.path + "?zoom=2").text - data = self.get_job_metadata(page) - yield Message.Version, 1 - yield Message.Directory, data - for url, image in self.get_images(page): - data.update(image) - yield Message.Url, url, text.nameext_from_url(url, data) - - def get_job_metadata(self, page): - """Collect metadata for extractor-job""" + def get_metadata(self, page): data = {"lang": "en", "language": "English"} self.parse_chapter_path(self.path, data) text.extract_all(page, ( @@ -126,7 +111,6 @@ class MangaparkChapterExtractor(MangaparkExtractor): return data def get_images(self, page): - """Collect image-urls, -widths and -heights""" pos = 0 num = 0 while True: diff --git a/test/test_extractors.py b/test/test_extractors.py index 78f304b8..bc422cc6 100644 --- a/test/test_extractors.py +++ b/test/test_extractors.py @@ -20,7 +20,7 @@ 
SKIP = { # temporary issues "chronos", "coreimg", - "luscious", + "luscious", }