use generic chapter-extractor in more modules

pull/79/head
Mike Fährmann 7 years ago
parent 347baf7ac5
commit 5b3c34aa96
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -168,9 +168,9 @@ class ChapterExtractor(Extractor):
subcategory = "chapter"
directory_fmt = [
"{category}", "{manga}",
"{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"]
"{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}"]
filename_fmt = (
"{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
"{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")
def __init__(self, url):
Extractor.__init__(self)
@ -182,7 +182,10 @@ class ChapterExtractor(Extractor):
imgs = self.get_images(page)
if "count" in data:
images = zip(range(1, data["count"]+1), imgs)
images = zip(
range(1, data["count"]+1),
imgs
)
else:
try:
data["count"] = len(imgs)

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2016-2017 Mike Fährmann
# Copyright 2016-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -8,9 +8,8 @@
"""Extract hentai-manga from https://hentai2read.com/"""
from .common import MangaExtractor
from .common import ChapterExtractor, MangaExtractor
from .. import text, util
from . import hentaicdn
import re
import json
@ -53,42 +52,43 @@ class Hentai2readMangaExtractor(MangaExtractor):
return results
class Hentai2readChapterExtractor(hentaicdn.HentaicdnChapterExtractor):
class Hentai2readChapterExtractor(ChapterExtractor):
"""Extractor for a single manga chapter from hentai2read.com"""
category = "hentai2read"
pattern = [r"(?:https?://)?(?:www\.)?hentai2read\.com/([^/]+)/(\d+)"]
test = [("http://hentai2read.com/amazon_elixir/1/", {
"url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
"keyword": "0f6408d462a14bfe58030117dc295b84666843d0",
"keyword": "9845105898d28c6a540cffdea60a1a20fab52431",
})]
def __init__(self, match):
hentaicdn.HentaicdnChapterExtractor.__init__(self)
url_title, self.chapter = match.groups()
self.url = "https://hentai2read.com/{}/{}/".format(
url_title, self.chapter
)
url = "https://hentai2read.com/{}/{}/".format(url_title, self.chapter)
ChapterExtractor.__init__(self, url)
def get_job_metadata(self, page, images):
title = text.extract(page, "<title>", "</title>")[0]
chapter_id = text.extract(page, 'data-cid="', '"')[0]
def get_metadata(self, page):
title, pos = text.extract(page, "<title>", "</title>")
manga_id, pos = text.extract(page, 'data-mid="', '"', pos)
chapter_id, pos = text.extract(page, 'data-cid="', '"', pos)
match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.+))? - "
r"(\d+): (.+) . Page 1 ", title)
return {
"manga_id": images[0].split("/")[-3],
"manga": match.group(1),
"manga_id": util.safe_int(manga_id),
"chapter": util.safe_int(self.chapter),
"chapter_id": util.safe_int(chapter_id),
"type": match.group(2),
"chapter_id": chapter_id,
"chapter": self.chapter,
"author": match.group(3),
"title": match.group(5),
"count": len(images),
"lang": "en",
"language": "English",
}
@staticmethod
def get_image_urls(page):
def get_images(page):
"""Extract and return a list of all image-urls"""
images = text.extract(page, "'images' : ", ",\n")[0]
return json.loads(images)
return [
("https://hentaicdn.com/hentai" + part, None)
for part in json.loads(images)
]

@ -1,41 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2016-2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Base classes for extractors from sites based on hentaicdn"""
from .common import Extractor, Message
from .. import text
import json
class HentaicdnChapterExtractor(Extractor):
"""Base class for extractors for a single manga chapter"""
subcategory = "chapter"
directory_fmt = ["{category}", "{manga_id} {title}"]
filename_fmt = ("{category}_{manga_id}_{chapter:>02}_"
"{num:>03}.{extension}")
url = ""
def items(self):
page = self.request(self.url).text
images = self.get_image_urls(page)
data = self.get_job_metadata(page, images)
yield Message.Version, 1
yield Message.Directory, data
for data["num"], part in enumerate(images, 1):
url = "https://hentaicdn.com/hentai" + part
yield Message.Url, url, text.nameext_from_url(url, data)
def get_job_metadata(self, page, images):
"""Collect metadata for extractor-job"""
@staticmethod
def get_image_urls(page):
"""Extract and return a list of all image-urls"""
images = text.extract(page, "var rff_imageList = ", ";")[0]
return json.loads(images)

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2016-2017 Mike Fährmann
# Copyright 2016-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -8,10 +8,10 @@
"""Extract hentai-manga from https://hentaihere.com/"""
from .common import MangaExtractor
from .common import ChapterExtractor, MangaExtractor
from .. import text, util
from . import hentaicdn
import re
import json
class HentaihereMangaExtractor(MangaExtractor):
@ -56,36 +56,42 @@ class HentaihereMangaExtractor(MangaExtractor):
}))
class HentaihereChapterExtractor(hentaicdn.HentaicdnChapterExtractor):
class HentaihereChapterExtractor(ChapterExtractor):
"""Extractor for a single manga chapter from hentaihere.com"""
category = "hentaihere"
pattern = [r"(?:https?://)?(?:www\.)?hentaihere\.com/m/S(\d+)/(\d+)"]
test = [("https://hentaihere.com/m/S13812/1/1/", {
"url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
"keyword": "a07753f655210525a80ff62607261715746f3273",
"keyword": "e9382a9be337abce3db2b1132e85751379dc05c5",
})]
def __init__(self, match):
hentaicdn.HentaicdnChapterExtractor.__init__(self)
self.gid, self.chapter = match.groups()
self.url = "https://hentaihere.com/m/S{}/{}/1".format(
self.gid, self.chapter
)
self.manga_id, self.chapter = match.groups()
url = "https://hentaihere.com/m/S{}/{}/1".format(
self.manga_id, self.chapter)
ChapterExtractor.__init__(self, url)
def get_job_metadata(self, page, images):
def get_metadata(self, page):
title = text.extract(page, "<title>", "</title>")[0]
chapter_id = text.extract(page, 'report/C', '"')[0]
pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at "
match = re.match(pattern, title)
return {
"manga_id": self.gid,
"manga": match.group(1),
"manga_id": util.safe_int(self.manga_id),
"chapter": util.safe_int(self.chapter),
"chapter_id": util.safe_int(chapter_id),
"type": match.group(2),
"chapter_id": chapter_id,
"chapter": self.chapter,
"title": match.group(3),
"author": match.group(4),
"count": len(images),
"lang": "en",
"language": "English",
}
@staticmethod
def get_images(page):
images = text.extract(page, "var rff_imageList = ", ";")[0]
return [
("https://hentaicdn.com/hentai" + part, None)
for part in json.loads(images)
]

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2017 Mike Fährmann
# Copyright 2015-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -8,7 +8,7 @@
"""Extract manga-chapters and entire manga from http://kissmanga.com/"""
from .common import Extractor, MangaExtractor, Message
from .common import ChapterExtractor, MangaExtractor
from .. import text, util, cloudflare, aes, exception
from ..cache import cache
import re
@ -21,21 +21,11 @@ IV = [
]
class KissmangaExtractor(Extractor):
class KissmangaBase():
"""Base class for kissmanga extractors"""
category = "kissmanga"
directory_fmt = [
"{category}", "{manga}",
"{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"]
filename_fmt = (
"{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
root = "http://kissmanga.com"
def __init__(self, match):
Extractor.__init__(self)
self.url = match.group(0)
self.session.headers["Referer"] = self.root
def request(self, url):
response = cloudflare.request_func(self, url)
if response.history and "/Message/AreYouHuman?" in response.url:
@ -72,10 +62,10 @@ class KissmangaExtractor(Extractor):
return data
class KissmangaMangaExtractor(KissmangaExtractor, MangaExtractor):
class KissmangaMangaExtractor(KissmangaBase, MangaExtractor):
"""Extractor for manga from kissmanga.com"""
pattern = [r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com/"
r"Manga/[^/?&#]+/?$"]
pattern = [r"(?i)(?:https?://)?(?:www\.)?(kissmanga\.com"
r"/Manga/[^/?&#]+/?)$"]
test = [
("http://kissmanga.com/Manga/Dropout", {
"url": "992befdd64e178fe5af67de53f8b510860d968ca",
@ -105,11 +95,10 @@ class KissmangaMangaExtractor(KissmangaExtractor, MangaExtractor):
return results
class KissmangaChapterExtractor(KissmangaExtractor):
class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor):
"""Extractor for manga-chapters from kissmanga.com"""
subcategory = "chapter"
pattern = [r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com/"
r"Manga/[^/?&#]+/[^/?&#]+\?id=\d+"]
pattern = [r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com"
r"/Manga/[^/?&#]+/[^/?&#]+\?id=\d+"]
test = [
("http://kissmanga.com/Manga/Dropout/Ch-000---Oneshot-?id=145847", {
"url": "4136bcd1c6cecbca8cc2bc965d54f33ef0a97cc0",
@ -126,18 +115,11 @@ class KissmangaChapterExtractor(KissmangaExtractor):
("http://kissmanga.com/mAnGa/mOnStEr/Monster-79?id=7608", None),
]
def items(self):
page = self.request(self.url).text
data = self.get_job_metadata(page)
imgs = self.get_image_urls(page)
data["count"] = len(imgs)
yield Message.Version, 1
yield Message.Directory, data
for data["page"], url in enumerate(imgs, 1):
yield Message.Url, url, text.nameext_from_url(url, data)
def get_job_metadata(self, page):
"""Collect metadata for extractor-job"""
def __init__(self, match):
ChapterExtractor.__init__(self, match.group(0))
self.session.headers["Referer"] = self.root
def get_metadata(self, page):
title = text.extract(page, "<title>", "</title>")[0].strip()
manga, cinfo = title.split("\n")[1:3]
data = {
@ -148,12 +130,11 @@ class KissmangaChapterExtractor(KissmangaExtractor):
}
return self.parse_chapter_string(data)
def get_image_urls(self, page):
"""Extract list of all image-urls for a manga chapter"""
def get_images(self, page):
try:
key = self.build_aes_key(page)
return [
aes.aes_cbc_decrypt_text(data, key, IV)
(aes.aes_cbc_decrypt_text(data, key, IV), None)
for data in text.extract_iter(
page, 'lstImages.push(wrapKA("', '"'
)

@ -1,56 +1,39 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
# Copyright 2017-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract manga-chapters and entire manga from http://www.mangafox.me/"""
"""Extract manga-chapters and entire manga from http://fanfox.net/"""
from .common import AsynchronousExtractor, Message
from .common import ChapterExtractor
from .. import text, util, exception
import re
class MangafoxChapterExtractor(AsynchronousExtractor):
"""Extractor for manga-chapters from mangafox.me"""
class MangafoxChapterExtractor(ChapterExtractor):
"""Extractor for manga-chapters from fanfox.net"""
category = "mangafox"
subcategory = "chapter"
directory_fmt = [
"{category}", "{manga}",
"{volume:?v/ />02}c{chapter:>03}{chapter_minor}"]
filename_fmt = (
"{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
pattern = [(r"(?:https?://)?(?:www\.)?(mangafox\.me/manga/"
r"[^/]+/(v\d+/)?c\d+[^/]*)")]
test = [(("http://mangafox.me/manga/kidou_keisatsu_patlabor/"
"v05/c006.2/1.html"), {
"keyword": "36b570e9ef11b4748407324fe08bebbe4856e6fd",
"content": "5c50c252dcf12ffecf68801f4db8a2167265f66c",
})]
pattern = [(r"(?:https?://)?(?:www\.)?(?:mangafox\.me|fanfox\.net)"
r"(/manga/[^/]+/(?:v\d+/)?c\d+[^/?&#]*)")]
test = [
("http://fanfox.net/manga/kidou_keisatsu_patlabor/v05/c006.2/1.html", {
"keyword": "36b570e9ef11b4748407324fe08bebbe4856e6fd",
"content": "5c50c252dcf12ffecf68801f4db8a2167265f66c",
}),
("http://mangafox.me/manga/kidou_keisatsu_patlabor/v05/c006.2/", None),
]
root = "http://fanfox.net"
def __init__(self, match):
AsynchronousExtractor.__init__(self)
self.url = "http://" + match.group(1)
self.urlbase = self.root + match.group(1)
ChapterExtractor.__init__(self, self.urlbase + "/1.html")
def items(self):
page = self.request(self.url + "/1.html").text
def get_metadata(self, page):
if "Sorry, its licensed, and not available." in page:
raise exception.AuthorizationError()
data = self.get_metadata(page)
urls = zip(
range(1, data["count"]+1),
self.get_image_urls(page),
)
yield Message.Version, 1
yield Message.Directory, data.copy()
for data["page"], url in urls:
text.nameext_from_url(url, data)
yield Message.Url, url, data.copy()
def get_metadata(self, page):
"""Collect metadata for extractor-job"""
data = text.extract_all(page, (
("manga" , " - Read ", " Manga Scans "),
("sid" , "var sid=", ";"),
@ -67,14 +50,14 @@ class MangafoxChapterExtractor(AsynchronousExtractor):
data[key] = util.safe_int(data[key])
return data
def get_image_urls(self, page):
"""Yield all image-urls for this chapter"""
def get_images(self, page):
pnum = 1
while True:
url, pos = text.extract(page, '<img src="', '"')
yield url
yield url, None
_ , pos = text.extract(page, '<img src="', '"', pos)
url, pos = text.extract(page, '<img src="', '"', pos)
yield url
yield url, None
pnum += 2
page = self.request(self.url + "/{}.html".format(pnum)).text
page = self.request("{}/{}.html".format(self.urlbase, pnum)).text

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2017 Mike Fährmann
# Copyright 2015-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -8,7 +8,7 @@
"""Extract manga-chapters and entire manga from http://www.mangahere.co/"""
from .common import MangaExtractor, AsynchronousExtractor, Message
from .common import ChapterExtractor, MangaExtractor
from .. import text, util
from urllib.parse import urljoin
import re
@ -60,15 +60,9 @@ class MangahereMangaExtractor(MangaExtractor):
}))
class MangahereChapterExtractor(AsynchronousExtractor):
class MangahereChapterExtractor(ChapterExtractor):
"""Extractor for manga-chapters from mangahere.co"""
category = "mangahere"
subcategory = "chapter"
directory_fmt = [
"{category}", "{manga}",
"{volume:?v/ />02}c{chapter:>03}{chapter_minor}"]
filename_fmt = (
"{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
pattern = [(r"(?:https?://)?(?:www\.|m\.)?mangahere\.c[co]/manga/"
r"([^/]+(?:/v0*(\d+))?/c0*(\d+)(\.\d+)?)")]
test = [
@ -82,27 +76,12 @@ class MangahereChapterExtractor(AsynchronousExtractor):
url_fmt = "http://www.mangahere.cc/manga/{}/{}.html"
def __init__(self, match):
AsynchronousExtractor.__init__(self)
self.part, self.volume, self.chapter, self.chminor = match.groups()
def items(self):
# remove ".html" for the first chapter page to avoid redirects
url = self.url_fmt.format(self.part, "")[:-5]
ChapterExtractor.__init__(self, url)
page = self.request(url).text
data = self.get_job_metadata(page)
urls = zip(
range(1, data["count"]+1),
self.get_image_urls(page),
)
yield Message.Version, 1
yield Message.Directory, data.copy()
for data["page"], url in urls:
text.nameext_from_url(url, data)
yield Message.Url, url, data.copy()
def get_job_metadata(self, page):
def get_metadata(self, page):
"""Collect metadata for extractor-job"""
manga, pos = text.extract(page, '<title>', '</title>')
chid , pos = text.extract(page, '.net/store/manga/', '/', pos)
@ -122,15 +101,16 @@ class MangahereChapterExtractor(AsynchronousExtractor):
"language": "English",
}
def get_image_urls(self, page):
def get_images(self, page):
"""Yield all image-urls for this chapter"""
pnum = 1
while True:
url, pos = text.extract(page, '<img src="', '"')
yield url
yield url, None
_ , pos = text.extract(page, '<img src="', '"', pos)
_ , pos = text.extract(page, '<img src="', '"', pos)
url, pos = text.extract(page, '<img src="', '"', pos)
yield url
yield url, None
pnum += 2
page = self.request(self.url_fmt.format(self.part, pnum)).text

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2017 Mike Fährmann
# Copyright 2015-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -35,5 +35,5 @@ class MangapandaChapterExtractor(MangapandaBase, MangareaderChapterExtractor):
]
test = [("http://www.mangapanda.com/red-storm/2", {
"url": "4bf4ddf6c50105ec8a37675495ab80c46608275d",
"keyword": "367d2694b49cc7cac82d68530d7d467a62453502",
"keyword": "32b5e84017c2bf5f122b339ecf40899e41f18cc9",
})]

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2017 Mike Fährmann
# Copyright 2015-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -8,7 +8,7 @@
"""Extract manga-chapters and entire manga from http://www.mangareader.net/"""
from .common import AsynchronousExtractor, MangaExtractor, Message
from .common import ChapterExtractor, MangaExtractor
from .. import text, util
@ -57,11 +57,8 @@ class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):
results.append((self.root + url, data.copy()))
class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
"""Extractor for manga-chapters from mangareader.net"""
subcategory = "chapter"
directory_fmt = ["{category}", "{manga}", "c{chapter:>03}{title:?: //}"]
filename_fmt = "{manga}_c{chapter:>03}_{page:>03}.{extension}"
pattern = [
(r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"),
(r"(?:https?://)?(?:www\.)?mangareader\.net"
@ -70,26 +67,14 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
test = [(("http://www.mangareader.net/"
"karate-shoukoushi-kohinata-minoru/11"), {
"url": "84ffaab4c027ef9022695c53163c3aeabd07ca58",
"keyword": "2038e6a780a0028eee0067985b55debb1d4a6aab",
"keyword": "2893cfcd1916859fb498f3345f1929f868fe667f",
})]
def __init__(self, match):
AsynchronousExtractor.__init__(self)
self.part, self.url_title, self.chapter = match.groups()
ChapterExtractor.__init__(self, self.root + self.part)
def items(self):
page = self.request(self.root + self.part).text
data = self.get_job_metadata(page)
yield Message.Version, 1
yield Message.Directory, data
for data["page"] in range(1, data["count"]+1):
next_url, image_url, image_data = self.get_page_metadata(page)
image_data.update(data)
yield Message.Url, image_url, image_data
if next_url:
page = self.request(next_url).text
def get_job_metadata(self, chapter_page):
def get_metadata(self, chapter_page):
"""Collect metadata for extractor-job"""
page = self.request(self.root + self.url_title).text
data = self.parse_page(page, {
@ -106,7 +91,16 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
)
return data
def get_page_metadata(self, page):
def get_images(self, page):
while True:
next_url, image_url, image_data = self.get_image_metadata(page)
yield image_url, image_data
if not next_url:
return
page = self.request(next_url).text
def get_image_metadata(self, page):
"""Collect next url, image-url and metadata for one manga-page"""
extr = text.extract
width = None
@ -122,7 +116,7 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
width , pos = extr(page, '<img id="img" width="', '"', pos)
height, pos = extr(page, ' height="', '"', pos)
image, pos = extr(page, ' src="', '"', pos)
return self.root + url, image, text.nameext_from_url(image, {
return self.root + url, image, {
"width": util.safe_int(width),
"height": util.safe_int(height),
})
}

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2017 Mike Fährmann
# Copyright 2015-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -8,42 +8,23 @@
"""Extract manga-chapters from https://mangastream.com/"""
from .common import AsynchronousExtractor, Message
from .common import ChapterExtractor
from .. import text, util
from urllib.parse import urljoin
class MangastreamChapterExtractor(AsynchronousExtractor):
class MangastreamChapterExtractor(ChapterExtractor):
"""Extractor for manga-chapters from mangastream.com"""
category = "mangastream"
subcategory = "chapter"
directory_fmt = ["{category}", "{manga}", "c{chapter}{title:?: //}"]
filename_fmt = "{manga}_c{chapter}_{page:>03}.{extension}"
pattern = [(r"(?:https?://)?(?:www\.)?(?:readms|mangastream)\.(?:com|net)/"
r"r(?:ead)?/([^/]*/([^/]+)/(\d+))")]
base_url = "https://mangastream.com/r/"
base_url = "https://readms.net/r/"
def __init__(self, match):
AsynchronousExtractor.__init__(self)
self.part, self.chapter, self.ch_id = match.groups()
ChapterExtractor.__init__(self, self.base_url + self.part)
def items(self):
page = self.request(self.base_url + self.part).text
data = self.get_job_metadata(page)
next_url = None
yield Message.Version, 1
yield Message.Directory, data.copy()
for data["page"] in range(1, data["count"]+1):
if next_url:
page = self.request(next_url).text
next_url, image_url = self.get_page_metadata(page)
text.nameext_from_url(image_url, data)
next_url = urljoin(self.base_url, next_url)
image_url = urljoin(self.base_url, image_url)
yield Message.Url, image_url, data.copy()
def get_job_metadata(self, page):
"""Collect metadata for extractor-job"""
def get_metadata(self, page):
manga, pos = text.extract(
page, '<span class="hidden-xs hidden-sm">', "<")
pos = page.find(self.part, pos)
@ -59,9 +40,11 @@ class MangastreamChapterExtractor(AsynchronousExtractor):
"language": "English",
}
@staticmethod
def get_page_metadata(page):
"""Collect next url, image-url and metadata for one manga-page"""
nurl, pos = text.extract(page, '<div class="page">\n<a href="', '"')
iurl, pos = text.extract(page, '<img id="manga-page" src="', '"', pos)
return nurl, iurl
def get_images(self, page):
while True:
next_url, pos = text.extract(
page, '<div class="page">\n<a href="', '"')
image_url, pos = text.extract(
page, '<img id="manga-page" src="', '"', pos)
yield urljoin(self.base_url, image_url), None
page = self.request(urljoin(self.base_url, next_url)).text

@ -26,6 +26,6 @@ class PowermangaMangaExtractor(foolslide.FoolslideMangaExtractor):
category = "powermanga"
pattern = foolslide.manga_pattern(r"read\.powermanga\.org")
test = [("https://read.powermanga.org/series/one_piece/", {
"url": "3b2037a9ffe30ea0da4e710a40863f0693f21afe",
"keyword": "e2a924b0924cba711e78b3585ad24a97dec70006",
"url": "e5e9a64c14ca51a170e14c4b711aaa88fdf7a7aa",
"keyword": "1245ab2a730f9129001a4589b1d8615a17dc4a7b",
})]

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2016-2017 Mike Fährmann
# Copyright 2016-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -8,29 +8,27 @@
"""Extract comic-issues and entire comics from http://readcomiconline.to/"""
from . import kissmanga
from .. import text
from .common import ChapterExtractor, MangaExtractor
from .. import text, cloudflare
import re
class ReadcomiconlineExtractor(kissmanga.KissmangaExtractor):
class ReadcomiconlineBase():
"""Base class for readcomiconline extractors"""
category = "readcomiconline"
directory_fmt = ["{category}", "{comic}", "{issue:>03}"]
filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
root = "http://readcomiconline.to"
useragent = "Wget/1.19.2 (linux-gnu)"
def __init__(self, match):
kissmanga.KissmangaExtractor.__init__(self, match)
self.session.headers["User-Agent"] = "Wget/1.19.2 (linux-gnu)"
request = cloudflare.request_func
class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor,
kissmanga.KissmangaMangaExtractor):
class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
"""Extractor for comics from readcomiconline.to"""
subcategory = "comic"
pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to/"
r"Comic/[^/?&#]+/?$"]
pattern = [r"(?i)(?:https?://)?(?:www\.)?(readcomiconline\.to"
r"/Comic/[^/?&#]+/?)$"]
test = [
("http://readcomiconline.to/Comic/W-i-t-c-h", {
"url": "c5a530538a30b176916e30cbe223a93d83cb2691",
@ -42,6 +40,10 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor,
}),
]
def __init__(self, match):
MangaExtractor.__init__(self, match)
self.session.headers["User-Agent"] = self.useragent
def chapters(self, page):
results = []
comic, pos = text.extract(page, '<div class="heading"><h3>', '<')
@ -58,19 +60,21 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor,
return results
class ReadcomiconlineIssueExtractor(ReadcomiconlineExtractor,
kissmanga.KissmangaChapterExtractor):
class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
"""Extractor for comic-issues from readcomiconline.to"""
subcategory = "issue"
pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to/"
r"Comic/[^/?&#]+/[^/?&#]+\?id=\d+"]
pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
r"/Comic/[^/?&#]+/[^/?&#]+\?id=\d+"]
test = [("http://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {
"url": "a45c77f8fbde66091fe2346d6341f9cf3c6b1bc5",
"keyword": "dee8a8a44659825afe1d69e1d809a48b03e98c68",
})]
def get_job_metadata(self, page):
"""Collect metadata for extractor-job"""
def __init__(self, match):
ChapterExtractor.__init__(self, match.group(0))
self.session.headers["User-Agent"] = self.useragent
def get_metadata(self, page):
comic, pos = text.extract(page, " - Read\r\n ", "\r\n")
iinfo, pos = text.extract(page, " ", "\r\n", pos)
match = re.match(r"(?:Issue )?#(\d+)|(.+)", iinfo)
@ -82,6 +86,10 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineExtractor,
}
@staticmethod
def get_image_urls(page):
"""Extract list of all image-urls for a manga chapter"""
return list(text.extract_iter(page, 'lstImages.push("', '"'))
def get_images(page):
return [
(url, None)
for url in text.extract_iter(
page, 'lstImages.push("', '"'
)
]

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2017 Mike Fährmann
# Copyright 2015-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -8,7 +8,7 @@
"""Extract manga pages from http://www.thespectrum.net/manga_scans/"""
from .common import MangaExtractor, AsynchronousExtractor, Message
from .common import ChapterExtractor, MangaExtractor
from .. import text, util
@ -33,55 +33,31 @@ class SpectrumnexusMangaExtractor(MangaExtractor):
return results
class SpectrumnexusChapterExtractor(AsynchronousExtractor):
class SpectrumnexusChapterExtractor(ChapterExtractor):
"""Extractor for manga-chapters or -volumes from thespectrum.net"""
category = "spectrumnexus"
subcategory = "chapter"
directory_fmt = ["{category}", "{manga}", "{identifier}"]
filename_fmt = "{manga} {identifier} {page:>03}.{extension}"
pattern = [
(r"(?:https?://)?(view\.thespectrum\.net/series/"
r"[^\.]+\.html)\?ch=(Chapter\+(\d+)|Volume\+(\d+))"),
(r"(?:https?://)?(view\.thespectrum\.net/series/"
r"[^/]+-chapter-(\d+)\.html)"),
]
directory_fmt = ["{category}", "{manga}", "{chapter_string}"]
filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}"
pattern = [r"(?:https?://)?view\.thespectrum\.net/series/"
r"([^\.]+\.html)\?ch=(Chapter\+(\d+)|Volume\+(\d+))"]
test = [(("http://view.thespectrum.net/series/"
"toriko.html?ch=Chapter+343&page=1"), {
"url": "c0fc7dc594841217cc622a67edd79f06e9900333",
"keyword": "3d0cb57b6b1c2cbecc7aed33f83c24891a4ff53f",
"keyword": "a8abe126cbc5fc798148b0b155242a470c1ba9d1",
})]
def __init__(self, match):
AsynchronousExtractor.__init__(self)
self.url = "http://" + match.group(1)
self.identifier = match.group(2)
self.chapter = match.group(3)
self.volume = match.group(4)
def items(self):
params = {
"ch": self.identifier,
"page": 1,
}
page = self.request(self.url, params=params).text
data = self.get_job_metadata(page)
yield Message.Version, 1
yield Message.Directory, data.copy()
for i in range(1, data["count"]+1):
url = self.get_image_url(page)
text.nameext_from_url(url, data)
data["page"] = i
yield Message.Url, url, data.copy()
if i < data["count"]:
params["page"] += 1
page = self.request(self.url, params=params).text
path, self.chapter_string, self.chapter, self.volume = match.groups()
url = "http://view.thespectrum.net/series/{}?ch={}".format(
path, self.chapter_string)
ChapterExtractor.__init__(self, url)
def get_job_metadata(self, page):
"""Collect metadata for extractor-job"""
def get_metadata(self, page):
data = {
"chapter": util.safe_int(self.chapter),
"chapter_string": self.chapter_string.replace("+", " "),
"volume": util.safe_int(self.volume),
"identifier": self.identifier.replace("+", " "),
}
data = text.extract_all(page, (
('manga', '<title>', ' &#183; SPECTRUM NEXUS </title>'),
@ -90,7 +66,9 @@ class SpectrumnexusChapterExtractor(AsynchronousExtractor):
data["count"] = util.safe_int(data["count"])
return data
@staticmethod
def get_image_url(page):
"""Extract url of one manga page"""
return text.extract(page, '<img id="mainimage" src="', '"')[0]
def get_images(self, page):
params = {"page": 1}
while True:
yield text.extract(page, '<img id="mainimage" src="', '"')[0], None
params["page"] += 1
page = self.request(self.url, params=params).text

Loading…
Cancel
Save