implement generic manga-chapter extractor

pull/79/head
Mike Fährmann 7 years ago
parent aa38eab2be
commit 7a412f5c32

gallery_dl/extractor/common.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2014-2017 Mike Fährmann
+# Copyright 2014-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -18,7 +18,7 @@ import requests
 import threading
 import http.cookiejar
 from .message import Message
-from .. import config, exception
+from .. import config, text, exception
 
 
 class Extractor():
@@ -163,6 +163,47 @@ class AsynchronousExtractor(Extractor):
         put(None)
 
 
+class ChapterExtractor(Extractor):
+
+    subcategory = "chapter"
+    directory_fmt = [
+        "{category}", "{manga}",
+        "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"]
+    filename_fmt = (
+        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
+
+    def __init__(self, url):
+        Extractor.__init__(self)
+        self.url = url
+
+    def items(self):
+        page = self.request(self.url).text
+        data = self.get_metadata(page)
+        imgs = self.get_images(page)
+
+        if "count" in data:
+            images = zip(range(1, data["count"]+1), imgs)
+        else:
+            try:
+                data["count"] = len(imgs)
+            except TypeError:
+                pass
+            images = enumerate(imgs, 1)
+
+        yield Message.Version, 1
+        yield Message.Directory, data
+        for data["page"], (url, imgdata) in images:
+            if imgdata:
+                data.update(imgdata)
+            yield Message.Url, url, text.nameext_from_url(url, data)
+
+    def get_metadata(self, page):
+        """Return a dict with general metadata"""
+
+    def get_images(self, page):
+        """Return a list of all (image-url, metadata)-tuples"""
+
+
 class MangaExtractor(Extractor):
 
     subcategory = "manga"
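The format strings above rely on gallery-dl's conditional format spec: "{field:?before/after/spec}" renders before + value + after only when the field is set, then applies the remaining spec (e.g. ">02") as usual. A rough plain-Python illustration of what the new directory_fmt produces, assuming those semantics (cond() is illustrative, not gallery-dl's real formatter):

    def cond(value, before, after, spec=""):
        # emit nothing at all when the field is unset/empty
        return before + format(value, spec) + after if value else ""

    volume, chapter, chapter_minor, title = 3, 12, ".5", "Epilogue"
    name = (cond(volume, "v", " ", ">02")
            + "c" + format(chapter, ">03") + chapter_minor
            + cond(title, ": ", ""))
    # name == "v03 c012.5: Epilogue"; with volume=None and title="",
    # the same template yields just "c012.5"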
@@ -176,7 +217,6 @@ class MangaExtractor(Extractor):
         self.url = url or self.scheme + "://" + match.group(1)
 
     def items(self):
-        self.login()
         page = self.request(self.url).text
         chapters = self.chapters(page)
@@ -187,12 +227,8 @@ class MangaExtractor(Extractor):
         for chapter, data in chapters:
             yield Message.Queue, chapter, data
 
-    def login(self):
-        """Login and set necessary cookies"""
-
     def chapters(self, page):
-        """Return a list of all (url, metadata)-tuples"""
-        return []
+        """Return a list of all (chapter-url, metadata)-tuples"""
 
 
 class SharedConfigExtractor(Extractor):
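For orientation, a minimal sketch of how a site module plugs into the new base class; the module name, URL, and page markup below are hypothetical, and the real ports follow in the remaining files of this commit:

    from .common import ChapterExtractor
    from .. import text
    import json

    class ExampleChapterExtractor(ChapterExtractor):
        """Extractor for manga-chapters from example.org (hypothetical)"""
        category = "example"
        pattern = [r"(?:https?://)?example\.org/chapters/(\d+)"]
        root = "https://example.org"

        def __init__(self, match):
            url = self.root + "/chapters/" + match.group(1)
            ChapterExtractor.__init__(self, url)

        def get_metadata(self, page):
            # only parse the page; items() in the base class does the rest
            manga = text.extract(page, "<h1>", "</h1>")[0]
            return {"manga": manga, "chapter": 1, "chapter_minor": "",
                    "title": "", "lang": "en", "language": "English"}

        def get_images(self, page):
            # (image-url, metadata)-tuples; None means "no extra metadata"
            data = text.extract(page, "var pages = ", ";")[0]
            return [(url, None) for url in json.loads(data)]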

gallery_dl/extractor/dynastyscans.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,54 +8,36 @@
 
 """Extract manga-chapters from https://dynasty-scans.com/"""
 
-from .common import Extractor, Message
+from .common import ChapterExtractor
 from .. import text, util
 import re
 import json
 
 
-class DynastyscansChapterExtractor(Extractor):
+class DynastyscansChapterExtractor(ChapterExtractor):
     """Extractor for manga-chapters from dynasty-scans.com"""
     category = "dynastyscans"
-    subcategory = "chapter"
-    directory_fmt = [
-        "{category}", "{manga}", "c{chapter:>03}{chapter_minor}{title:?: //}"]
-    filename_fmt = (
-        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
     pattern = [r"(?:https?://)?(?:www\.)?dynasty-scans\.com/chapters/([^/]+)"]
     test = [
         (("http://dynasty-scans.com/chapters/"
           "hitoribocchi_no_oo_seikatsu_ch33"), {
             "url": "dce64e8c504118f1ab4135c00245ea12413896cb",
-            "keyword": "fb2f470b995df5b301ccede31ed9829a010236db",
+            "keyword": "ec5c56bbd5c97aa521d00f2598bba4663fb8ab9f",
         }),
         (("http://dynasty-scans.com/chapters/"
           "new_game_the_spinoff_special_13"), {
             "url": "dbe5bbb74da2edcfb1832895a484e2a40bc8b538",
-            "keyword": "281bbe0fb74b812ced595619ca5876983490dc0e",
+            "keyword": "1208a102d9a1bb0b0c740a67996d9b26a9357b64",
         }),
     ]
     root = "https://dynasty-scans.com"
 
     def __init__(self, match):
-        Extractor.__init__(self)
         self.chaptername = match.group(1)
+        url = self.root + "/chapters/" + self.chaptername
+        ChapterExtractor.__init__(self, url)
 
-    def items(self):
-        page = self.request(self.root + "/chapters/" + self.chaptername,
-                            encoding="utf-8").text
-        data = self.get_job_metadata(page)
-        imgs = self.get_image_data(page)
-        data["count"] = len(imgs)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for data["page"], img in enumerate(imgs, 1):
-            url = self.root + img["image"]
-            text.nameext_from_url(url, data)
-            data["name"] = img["name"]
-            yield Message.Url, url, data
-
-    def get_job_metadata(self, page):
+    def get_metadata(self, page):
         """Collect metadata for extractor-job"""
         info , pos = text.extract(page, "<h3 id='chapter-title'><b>", "</b>")
         author, pos = text.extract(page, " by ", "</a>", pos)
@@ -82,8 +64,10 @@ class DynastyscansChapterExtractor(Extractor):
             "language": "English",
         }
 
-    @staticmethod
-    def get_image_data(page):
+    def get_images(self, page):
         """Extract list of all image-urls for a manga chapter"""
         data = text.extract(page, "var pages = ", ";\n")[0]
-        return json.loads(data)
+        return [
+            (self.root + img["image"], None)
+            for img in json.loads(data)
+        ]
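The chapter page embeds its image list as a JavaScript array, so get_images() is a two-step round trip: cut the array literal out of the page, then json.loads() it. A small self-contained sketch, assuming a page fragment of that shape (text.extract() returns a (match, position) pair, hence the [0]):

    import json
    from gallery_dl import text

    page = 'var pages = [{"image": "/system/1.png", "name": "01"}];\n'
    data = text.extract(page, "var pages = ", ";\n")[0]
    imgs = json.loads(data)  # [{"image": "/system/1.png", "name": "01"}]
    urls = [("https://dynasty-scans.com" + img["image"], None) for img in imgs]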

gallery_dl/extractor/fallenangels.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2017 Mike Fährmann
+# Copyright 2017-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,21 +8,16 @@
 
 """Extract manga-chapters from https://www.fascans.com/"""
 
-from .common import Extractor, MangaExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
 import json
 
 
-class FallenangelsChapterExtractor(Extractor):
+class FallenangelsChapterExtractor(ChapterExtractor):
     """Extractor for manga-chapters from fascans.com"""
     category = "fallenangels"
-    subcategory = "chapter"
-    directory_fmt = ["{category}", "{manga}",
-                     "c{chapter:>03}{chapter_minor}{title:?: //}"]
-    filename_fmt = (
-        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
-    pattern = [(r"(?:https?://)?(manga|truyen)\.fascans\.com/"
-                r"manga/([^/]+)/(\d+)(\.[^/?&#]+)?")]
+    pattern = [(r"(?:https?://)?(manga|truyen)\.fascans\.com"
+                r"/manga/([^/]+)/(\d+)(\.[^/?&#]+)?")]
     test = [
         ("https://manga.fascans.com/manga/chronos-ruler/20/1", {
             "url": "4604a7914566cc2da0ff789aa178e2d1c8c241e3",
@@ -38,24 +33,12 @@ class FallenangelsChapterExtractor(Extractor):
     ]
 
     def __init__(self, match):
-        Extractor.__init__(self)
         self.version, self.manga, self.chapter, self.minor = match.groups()
-
-    def items(self):
         url = "https://{}.fascans.com/manga/{}/{}/1".format(
             self.version, self.manga, self.chapter)
-        page = self.request(url).text
-        data = self.get_metadata(page)
-        imgs = self.get_images(page)
-        data["count"] = len(imgs)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for data["page"], img in enumerate(imgs, 1):
-            url = img["page_image"]
-            yield Message.Url, url, text.nameext_from_url(url, data)
+        ChapterExtractor.__init__(self, url)
 
     def get_metadata(self, page):
-        """Collect metadata for extractor-job"""
         lang = "vi" if self.version == "truyen" else "en"
         data = {
             "chapter": self.chapter,
@@ -70,8 +53,12 @@ class FallenangelsChapterExtractor(Extractor):
     @staticmethod
     def get_images(page):
         """Return a list of all images in this chapter"""
-        return json.loads(text.extract(page, "var pages = ", ";")[0])
+        return [
+            (img["page_image"], None)
+            for img in json.loads(
+                text.extract(page, "var pages = ", ";")[0]
+            )
+        ]
 
 
 class FallenangelsMangaExtractor(MangaExtractor):
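get_images() deliberately builds a list instead of yielding: the generic items() in common.py calls len(imgs) to fill data["count"] up front, and a generator would instead hit the TypeError fallback and leave the count unset. A toy illustration of the two branches:

    data, imgs = {}, (x for x in range(3))  # generator: len() is undefined
    try:
        data["count"] = len(imgs)
    except TypeError:
        pass  # "count" stays unset; items() falls back to enumerate(imgs, 1)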

gallery_dl/extractor/hbrowse.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,13 +8,13 @@
 
 """Extract images from http://www.hbrowse.com/"""
 
-from .common import Extractor, MangaExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
 from urllib.parse import urljoin
 import json
 
 
-class HbrowseExtractor(Extractor):
+class HbrowseExtractor():
     """Base class for hbrowse extractors"""
     category = "hbrowse"
     root = "http://www.hbrowse.com"
@@ -64,41 +64,30 @@ class HbrowseMangaExtractor(HbrowseExtractor, MangaExtractor):
         results.append((urljoin(self.root, url), data.copy()))
 
 
-class HbrowseChapterExtractor(HbrowseExtractor):
+class HbrowseChapterExtractor(HbrowseExtractor, ChapterExtractor):
     """Extractor for manga-chapters from hbrowse.com"""
-    subcategory = "chapter"
     directory_fmt = ["{category}", "{manga_id} {manga}", "c{chapter:>05}"]
     filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
-                    "{num:>03}.{extension}")
+                    "{page:>03}.{extension}")
     pattern = [r"(?:https?://)?(?:www\.)?hbrowse\.com/(\d+)/c(\d+)"]
     test = [("http://www.hbrowse.com/10363/c00000", {
         "url": "634f4800858913f097bc3b62a8fedaf74b5254bd",
-        "keyword": "730bd33de2a0a0fb4e0b6dcdafedcaeee1060047",
+        "keyword": "f37cafef404696312f5db6ccaaaf72737d309e2d",
         "content": "44578ebbe176c2c27434966aef22945787e2781e",
     })]
 
     def __init__(self, match):
-        HbrowseExtractor.__init__(self)
         self.gid, self.chapter = match.groups()
         self.path = "/{}/c{}/".format(self.gid, self.chapter)
+        ChapterExtractor.__init__(self, self.root + self.path)
 
-    def items(self):
-        page = self.request(self.root + self.path).text
-        data = self.get_job_metadata(page)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for data["num"], url in enumerate(self.get_image_urls(page), 1):
-            yield Message.Url, url, text.nameext_from_url(url, data)
-
-    def get_job_metadata(self, page):
-        """Collect metadata for extractor-job"""
+    def get_metadata(self, page):
         return self.parse_page(page, {
             "manga_id": util.safe_int(self.gid),
             "chapter": util.safe_int(self.chapter)
         })
 
-    def get_image_urls(self, page):
-        """Yield all image-urls for a 'chapter'"""
+    def get_images(self, page):
         base = self.root + "/data" + self.path
         json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
-        return [base + name for name in json.loads(json_data)]
+        return [(base + name, None) for name in json.loads(json_data)]
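The hbrowse page stores its filename list with a trailing "zzz" sentinel, e.g. ;list = ["00001.jpg","00002.jpg","zzz"];. Cutting the page at ',"zzz"' drops the sentinel together with the closing bracket, which is why "]" is re-appended before json.loads(). A sketch of the round trip, assuming that page shape:

    import json
    from gallery_dl import text

    page = ';list = ["00001.jpg","00002.jpg","zzz"];'
    json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
    names = json.loads(json_data)  # ['00001.jpg', '00002.jpg']
    base = "http://www.hbrowse.com/data/10363/c00000/"
    images = [(base + name, None) for name in names]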

gallery_dl/extractor/mangapark.py
@@ -8,12 +8,12 @@
 
 """Extract manga-chapters and entire manga from https://mangapark.me/"""
 
-from .common import Extractor, MangaExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
 from urllib.parse import urljoin
 
 
-class MangaparkExtractor(Extractor):
+class MangaparkExtractor():
     """Base class for mangapark extractors"""
     category = "mangapark"
     root = "https://mangapark.me"
@@ -68,14 +68,8 @@ class MangaparkMangaExtractor(MangaparkExtractor, MangaExtractor):
         results.append((self.root + path, data.copy()))
 
 
-class MangaparkChapterExtractor(MangaparkExtractor):
+class MangaparkChapterExtractor(MangaparkExtractor, ChapterExtractor):
     """Extractor for manga-chapters from mangapark.me"""
-    subcategory = "chapter"
-    directory_fmt = [
-        "{category}", "{manga}",
-        "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"]
-    filename_fmt = (
-        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
     pattern = [(r"(?:https?://)?(?:www\.)?mangapark\.me(/manga/[^/]+"
                 r"/s\d+(?:/v\d+)?/c\d+[^/]*(?:/e\d+)?)")]
     test = [
@@ -95,20 +89,11 @@ class MangaparkChapterExtractor(MangaparkExtractor):
     ]
 
     def __init__(self, match):
-        MangaparkExtractor.__init__(self)
         self.path = match.group(1)
+        url = self.root + self.path + "?zoom=2"
+        ChapterExtractor.__init__(self, url)
 
-    def items(self):
-        page = self.request(self.root + self.path + "?zoom=2").text
-        data = self.get_job_metadata(page)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for url, image in self.get_images(page):
-            data.update(image)
-            yield Message.Url, url, text.nameext_from_url(url, data)
-
-    def get_job_metadata(self, page):
-        """Collect metadata for extractor-job"""
+    def get_metadata(self, page):
         data = {"lang": "en", "language": "English"}
         self.parse_chapter_path(self.path, data)
         text.extract_all(page, (
@@ -126,7 +111,6 @@ class MangaparkChapterExtractor(MangaparkExtractor):
         return data
 
     def get_images(self, page):
-        """Collect image-urls, -widths and -heights"""
         pos = 0
         num = 0
         while True:
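The hunk ends before the loop body, but the removed docstring ("Collect image-urls, -widths and -heights") and the pos/num counters show the shape: a position-driven scan with text.extract() that stops when no further match is found. An illustrative stand-in for such a loop (not the actual body, which this diff truncates):

    from gallery_dl import text

    def iter_images(page):
        pos = 0
        while True:
            url, pos = text.extract(page, '<img src="', '"', pos)
            if not url:
                return
            width, pos = text.extract(page, ' width="', '"', pos)
            height, pos = text.extract(page, ' height="', '"', pos)
            yield url, {"width": width, "height": height}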

@@ -20,7 +20,7 @@ SKIP = {
     # temporary issues
     "chronos",
     "coreimg",
-    "luscious",
+    "yeet",
 }
