diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 36037387..b7c550f6 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2017 Mike Fährmann
+# Copyright 2014-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -18,7 +18,7 @@ import requests
import threading
import http.cookiejar
from .message import Message
-from .. import config, exception
+from .. import config, text, exception
class Extractor():
@@ -163,6 +163,47 @@ class AsynchronousExtractor(Extractor):
put(None)
+class ChapterExtractor(Extractor):
+
+ subcategory = "chapter"
+ directory_fmt = [
+ "{category}", "{manga}",
+ "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"]
+ filename_fmt = (
+ "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
+
+ def __init__(self, url):
+ Extractor.__init__(self)
+ self.url = url
+
+ def items(self):
+ page = self.request(self.url).text
+ data = self.get_metadata(page)
+ imgs = self.get_images(page)
+
+ if "count" in data:
+ images = zip(range(1, data["count"]+1), imgs)
+ else:
+ try:
+ data["count"] = len(imgs)
+ except TypeError:
+ pass
+ images = enumerate(imgs, 1)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["page"], (url, imgdata) in images:
+ if imgdata:
+ data.update(imgdata)
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def get_metadata(self, page):
+ """Return a dict with general metadata"""
+
+ def get_images(self, page):
+ """Return a list of all (image-url, metadata)-tuples"""
+
+
class MangaExtractor(Extractor):
subcategory = "manga"
@@ -176,7 +217,6 @@ class MangaExtractor(Extractor):
self.url = url or self.scheme + "://" + match.group(1)
def items(self):
- self.login()
page = self.request(self.url).text
chapters = self.chapters(page)
@@ -187,12 +227,8 @@ class MangaExtractor(Extractor):
for chapter, data in chapters:
yield Message.Queue, chapter, data
- def login(self):
- """Login and set necessary cookies"""
-
def chapters(self, page):
- """Return a list of all (url, metadata)-tuples"""
- return []
+ """Return a list of all (chapter-url, metadata)-tuples"""
class SharedConfigExtractor(Extractor):
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index e822a502..bd9107ac 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,54 +8,36 @@
"""Extract manga-chapters from https://dynasty-scans.com/"""
-from .common import Extractor, Message
+from .common import ChapterExtractor
from .. import text, util
import re
import json
-class DynastyscansChapterExtractor(Extractor):
+class DynastyscansChapterExtractor(ChapterExtractor):
"""Extractor for manga-chapters from dynasty-scans.com"""
category = "dynastyscans"
- subcategory = "chapter"
- directory_fmt = [
- "{category}", "{manga}", "c{chapter:>03}{chapter_minor}{title:?: //}"]
- filename_fmt = (
- "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
pattern = [r"(?:https?://)?(?:www\.)?dynasty-scans\.com/chapters/([^/]+)"]
test = [
(("http://dynasty-scans.com/chapters/"
"hitoribocchi_no_oo_seikatsu_ch33"), {
"url": "dce64e8c504118f1ab4135c00245ea12413896cb",
- "keyword": "fb2f470b995df5b301ccede31ed9829a010236db",
+ "keyword": "ec5c56bbd5c97aa521d00f2598bba4663fb8ab9f",
}),
(("http://dynasty-scans.com/chapters/"
"new_game_the_spinoff_special_13"), {
"url": "dbe5bbb74da2edcfb1832895a484e2a40bc8b538",
- "keyword": "281bbe0fb74b812ced595619ca5876983490dc0e",
+ "keyword": "1208a102d9a1bb0b0c740a67996d9b26a9357b64",
}),
]
root = "https://dynasty-scans.com"
def __init__(self, match):
- Extractor.__init__(self)
self.chaptername = match.group(1)
+ url = self.root + "/chapters/" + self.chaptername
+ ChapterExtractor.__init__(self, url)
- def items(self):
- page = self.request(self.root + "/chapters/" + self.chaptername,
- encoding="utf-8").text
- data = self.get_job_metadata(page)
- imgs = self.get_image_data(page)
- data["count"] = len(imgs)
- yield Message.Version, 1
- yield Message.Directory, data
- for data["page"], img in enumerate(imgs, 1):
- url = self.root + img["image"]
- text.nameext_from_url(url, data)
- data["name"] = img["name"]
- yield Message.Url, url, data
-
- def get_job_metadata(self, page):
+ def get_metadata(self, page):
"""Collect metadata for extractor-job"""
info , pos = text.extract(page, "<h3 id='chapter-title'><b>", "</b>")
author, pos = text.extract(page, " by ", "</a>", pos)
@@ -82,8 +64,10 @@ class DynastyscansChapterExtractor(Extractor):
"language": "English",
}
- @staticmethod
- def get_image_data(page):
+ def get_images(self, page):
"""Extract list of all image-urls for a manga chapter"""
data = text.extract(page, "var pages = ", ";\n")[0]
- return json.loads(data)
+ return [
+ (self.root + img["image"], None)
+ for img in json.loads(data)
+ ]
diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py
index 235cdeab..8b319ff4 100644
--- a/gallery_dl/extractor/fallenangels.py
+++ b/gallery_dl/extractor/fallenangels.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017 Mike Fährmann
+# Copyright 2017-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,21 +8,16 @@
"""Extract manga-chapters from https://www.fascans.com/"""
-from .common import Extractor, MangaExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
from .. import text, util
import json
-class FallenangelsChapterExtractor(Extractor):
+class FallenangelsChapterExtractor(ChapterExtractor):
"""Extractor for manga-chapters from fascans.com"""
category = "fallenangels"
- subcategory = "chapter"
- directory_fmt = ["{category}", "{manga}",
- "c{chapter:>03}{chapter_minor}{title:?: //}"]
- filename_fmt = (
- "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
- pattern = [(r"(?:https?://)?(manga|truyen)\.fascans\.com/"
- r"manga/([^/]+)/(\d+)(\.[^/?]+)?")]
+ pattern = [(r"(?:https?://)?(manga|truyen)\.fascans\.com"
+ r"/manga/([^/]+)/(\d+)(\.[^/?]+)?")]
test = [
("https://manga.fascans.com/manga/chronos-ruler/20/1", {
"url": "4604a7914566cc2da0ff789aa178e2d1c8c241e3",
@@ -38,24 +33,12 @@ class FallenangelsChapterExtractor(Extractor):
]
def __init__(self, match):
- Extractor.__init__(self)
self.version, self.manga, self.chapter, self.minor = match.groups()
-
- def items(self):
url = "https://{}.fascans.com/manga/{}/{}/1".format(
self.version, self.manga, self.chapter)
- page = self.request(url).text
- data = self.get_metadata(page)
- imgs = self.get_images(page)
- data["count"] = len(imgs)
- yield Message.Version, 1
- yield Message.Directory, data
- for data["page"], img in enumerate(imgs, 1):
- url = img["page_image"]
- yield Message.Url, url, text.nameext_from_url(url, data)
+ ChapterExtractor.__init__(self, url)
def get_metadata(self, page):
- """Collect metadata for extractor-job"""
lang = "vi" if self.version == "truyen" else "en"
data = {
"chapter": self.chapter,
@@ -70,8 +53,12 @@ class FallenangelsChapterExtractor(Extractor):
@staticmethod
def get_images(page):
- """Return a list of all images in this chapter"""
- return json.loads(text.extract(page, "var pages = ", ";")[0])
+ return [
+ (img["page_image"], None)
+ for img in json.loads(
+ text.extract(page, "var pages = ", ";")[0]
+ )
+ ]
class FallenangelsMangaExtractor(MangaExtractor):
diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py
index ee847424..4f3f84fb 100644
--- a/gallery_dl/extractor/hbrowse.py
+++ b/gallery_dl/extractor/hbrowse.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,13 +8,13 @@
"""Extract images from http://www.hbrowse.com/"""
-from .common import Extractor, MangaExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
from .. import text, util
from urllib.parse import urljoin
import json
-class HbrowseExtractor(Extractor):
+class HbrowseExtractor():
"""Base class for hbrowse extractors"""
category = "hbrowse"
root = "http://www.hbrowse.com"
@@ -64,41 +64,30 @@ class HbrowseMangaExtractor(HbrowseExtractor, MangaExtractor):
results.append((urljoin(self.root, url), data.copy()))
-class HbrowseChapterExtractor(HbrowseExtractor):
+class HbrowseChapterExtractor(HbrowseExtractor, ChapterExtractor):
"""Extractor for manga-chapters from hbrowse.com"""
- subcategory = "chapter"
directory_fmt = ["{category}", "{manga_id} {manga}", "c{chapter:>05}"]
filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
- "{num:>03}.{extension}")
+ "{page:>03}.{extension}")
pattern = [r"(?:https?://)?(?:www\.)?hbrowse\.com/(\d+)/c(\d+)"]
test = [("http://www.hbrowse.com/10363/c00000", {
"url": "634f4800858913f097bc3b62a8fedaf74b5254bd",
- "keyword": "730bd33de2a0a0fb4e0b6dcdafedcaeee1060047",
+ "keyword": "f37cafef404696312f5db6ccaaaf72737d309e2d",
"content": "44578ebbe176c2c27434966aef22945787e2781e",
})]
def __init__(self, match):
- HbrowseExtractor.__init__(self)
self.gid, self.chapter = match.groups()
self.path = "/{}/c{}/".format(self.gid, self.chapter)
+ ChapterExtractor.__init__(self, self.root + self.path)
- def items(self):
- page = self.request(self.root + self.path).text
- data = self.get_job_metadata(page)
- yield Message.Version, 1
- yield Message.Directory, data
- for data["num"], url in enumerate(self.get_image_urls(page), 1):
- yield Message.Url, url, text.nameext_from_url(url, data)
-
- def get_job_metadata(self, page):
- """Collect metadata for extractor-job"""
+ def get_metadata(self, page):
return self.parse_page(page, {
"manga_id": util.safe_int(self.gid),
"chapter": util.safe_int(self.chapter)
})
- def get_image_urls(self, page):
- """Yield all image-urls for a 'chapter'"""
+ def get_images(self, page):
base = self.root + "/data" + self.path
json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
- return [base + name for name in json.loads(json_data)]
+ return [(base + name, None) for name in json.loads(json_data)]
diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py
index ceedc604..690f8db4 100644
--- a/gallery_dl/extractor/mangapark.py
+++ b/gallery_dl/extractor/mangapark.py
@@ -8,12 +8,12 @@
"""Extract manga-chapters and entire manga from https://mangapark.me/"""
-from .common import Extractor, MangaExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
from .. import text, util
from urllib.parse import urljoin
-class MangaparkExtractor(Extractor):
+class MangaparkExtractor():
"""Base class for mangapark extractors"""
category = "mangapark"
root = "https://mangapark.me"
@@ -68,14 +68,8 @@ class MangaparkMangaExtractor(MangaparkExtractor, MangaExtractor):
results.append((self.root + path, data.copy()))
-class MangaparkChapterExtractor(MangaparkExtractor):
+class MangaparkChapterExtractor(MangaparkExtractor, ChapterExtractor):
"""Extractor for manga-chapters from mangapark.me"""
- subcategory = "chapter"
- directory_fmt = [
- "{category}", "{manga}",
- "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"]
- filename_fmt = (
- "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
pattern = [(r"(?:https?://)?(?:www\.)?mangapark\.me(/manga/[^/]+"
r"/s\d+(?:/v\d+)?/c\d+[^/]*(?:/e\d+)?)")]
test = [
@@ -95,20 +89,11 @@ class MangaparkChapterExtractor(MangaparkExtractor):
]
def __init__(self, match):
- MangaparkExtractor.__init__(self)
self.path = match.group(1)
+ url = self.root + self.path + "?zoom=2"
+ ChapterExtractor.__init__(self, url)
- def items(self):
- page = self.request(self.root + self.path + "?zoom=2").text
- data = self.get_job_metadata(page)
- yield Message.Version, 1
- yield Message.Directory, data
- for url, image in self.get_images(page):
- data.update(image)
- yield Message.Url, url, text.nameext_from_url(url, data)
-
- def get_job_metadata(self, page):
- """Collect metadata for extractor-job"""
+ def get_metadata(self, page):
data = {"lang": "en", "language": "English"}
self.parse_chapter_path(self.path, data)
text.extract_all(page, (
@@ -126,7 +111,6 @@ class MangaparkChapterExtractor(MangaparkExtractor):
return data
def get_images(self, page):
- """Collect image-urls, -widths and -heights"""
pos = 0
num = 0
while True:
diff --git a/test/test_extractors.py b/test/test_extractors.py
index 78f304b8..bc422cc6 100644
--- a/test/test_extractors.py
+++ b/test/test_extractors.py
@@ -20,7 +20,7 @@ SKIP = {
# temporary issues
"chronos",
"coreimg",
- "luscious",
+ "yeet",
}