use generic chapter-extractor in more modules

pull/79/head
Mike Fährmann 7 years ago
parent 347baf7ac5
commit 5b3c34aa96

gallery_dl/extractor/common.py

@@ -168,9 +168,9 @@ class ChapterExtractor(Extractor):
     subcategory = "chapter"
     directory_fmt = [
         "{category}", "{manga}",
-        "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"]
+        "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}"]
     filename_fmt = (
-        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
+        "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")

     def __init__(self, url):
         Extractor.__init__(self)
@@ -182,7 +182,10 @@ class ChapterExtractor(Extractor):
         imgs = self.get_images(page)

         if "count" in data:
-            images = zip(range(1, data["count"]+1), imgs)
+            images = zip(
+                range(1, data["count"]+1),
+                imgs
+            )
         else:
             try:
                 data["count"] = len(imgs)

gallery_dl/extractor/hentai2read.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,9 +8,8 @@
 """Extract hentai-manga from https://hentai2read.com/"""

-from .common import MangaExtractor
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
-from . import hentaicdn
 import re
 import json
@@ -53,42 +52,43 @@ class Hentai2readMangaExtractor(MangaExtractor):
         return results


-class Hentai2readChapterExtractor(hentaicdn.HentaicdnChapterExtractor):
+class Hentai2readChapterExtractor(ChapterExtractor):
     """Extractor for a single manga chapter from hentai2read.com"""
     category = "hentai2read"
     pattern = [r"(?:https?://)?(?:www\.)?hentai2read\.com/([^/]+)/(\d+)"]
     test = [("http://hentai2read.com/amazon_elixir/1/", {
         "url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
-        "keyword": "0f6408d462a14bfe58030117dc295b84666843d0",
+        "keyword": "9845105898d28c6a540cffdea60a1a20fab52431",
     })]

     def __init__(self, match):
-        hentaicdn.HentaicdnChapterExtractor.__init__(self)
         url_title, self.chapter = match.groups()
-        self.url = "https://hentai2read.com/{}/{}/".format(
-            url_title, self.chapter
-        )
+        url = "https://hentai2read.com/{}/{}/".format(url_title, self.chapter)
+        ChapterExtractor.__init__(self, url)

-    def get_job_metadata(self, page, images):
-        title = text.extract(page, "<title>", "</title>")[0]
-        chapter_id = text.extract(page, 'data-cid="', '"')[0]
+    def get_metadata(self, page):
+        title, pos = text.extract(page, "<title>", "</title>")
+        manga_id, pos = text.extract(page, 'data-mid="', '"', pos)
+        chapter_id, pos = text.extract(page, 'data-cid="', '"', pos)
         match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.+))? - "
                          r"(\d+): (.+) . Page 1 ", title)
         return {
-            "manga_id": images[0].split("/")[-3],
             "manga": match.group(1),
+            "manga_id": util.safe_int(manga_id),
+            "chapter": util.safe_int(self.chapter),
+            "chapter_id": util.safe_int(chapter_id),
             "type": match.group(2),
-            "chapter_id": chapter_id,
-            "chapter": self.chapter,
             "author": match.group(3),
             "title": match.group(5),
-            "count": len(images),
             "lang": "en",
             "language": "English",
         }

     @staticmethod
-    def get_image_urls(page):
+    def get_images(page):
         """Extract and return a list of all image-urls"""
         images = text.extract(page, "'images' : ", ",\n")[0]
-        return json.loads(images)
+        return [
+            ("https://hentaicdn.com/hentai" + part, None)
+            for part in json.loads(images)
+        ]

gallery_dl/extractor/hentaicdn.py (deleted)

@@ -1,41 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2016-2017 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Base classes for extractors from sites based on hentaicdn"""
-
-from .common import Extractor, Message
-from .. import text
-import json
-
-
-class HentaicdnChapterExtractor(Extractor):
-    """Base class for extractors for a single manga chapter"""
-    subcategory = "chapter"
-    directory_fmt = ["{category}", "{manga_id} {title}"]
-    filename_fmt = ("{category}_{manga_id}_{chapter:>02}_"
-                    "{num:>03}.{extension}")
-    url = ""
-
-    def items(self):
-        page = self.request(self.url).text
-        images = self.get_image_urls(page)
-        data = self.get_job_metadata(page, images)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for data["num"], part in enumerate(images, 1):
-            url = "https://hentaicdn.com/hentai" + part
-            yield Message.Url, url, text.nameext_from_url(url, data)
-
-    def get_job_metadata(self, page, images):
-        """Collect metadata for extractor-job"""
-
-    @staticmethod
-    def get_image_urls(page):
-        """Extract and return a list of all image-urls"""
-        images = text.extract(page, "var rff_imageList = ", ";")[0]
-        return json.loads(images)

gallery_dl/extractor/hentaihere.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,10 +8,10 @@
 """Extract hentai-manga from https://hentaihere.com/"""

-from .common import MangaExtractor
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
-from . import hentaicdn
 import re
+import json


 class HentaihereMangaExtractor(MangaExtractor):
@@ -56,36 +56,42 @@ class HentaihereMangaExtractor(MangaExtractor):
         }))


-class HentaihereChapterExtractor(hentaicdn.HentaicdnChapterExtractor):
+class HentaihereChapterExtractor(ChapterExtractor):
     """Extractor for a single manga chapter from hentaihere.com"""
     category = "hentaihere"
     pattern = [r"(?:https?://)?(?:www\.)?hentaihere\.com/m/S(\d+)/(\d+)"]
     test = [("https://hentaihere.com/m/S13812/1/1/", {
         "url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
-        "keyword": "a07753f655210525a80ff62607261715746f3273",
+        "keyword": "e9382a9be337abce3db2b1132e85751379dc05c5",
     })]

     def __init__(self, match):
-        hentaicdn.HentaicdnChapterExtractor.__init__(self)
-        self.gid, self.chapter = match.groups()
-        self.url = "https://hentaihere.com/m/S{}/{}/1".format(
-            self.gid, self.chapter
-        )
+        self.manga_id, self.chapter = match.groups()
+        url = "https://hentaihere.com/m/S{}/{}/1".format(
+            self.manga_id, self.chapter)
+        ChapterExtractor.__init__(self, url)

-    def get_job_metadata(self, page, images):
+    def get_metadata(self, page):
         title = text.extract(page, "<title>", "</title>")[0]
         chapter_id = text.extract(page, 'report/C', '"')[0]
         pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at "
         match = re.match(pattern, title)
         return {
-            "manga_id": self.gid,
             "manga": match.group(1),
+            "manga_id": util.safe_int(self.manga_id),
+            "chapter": util.safe_int(self.chapter),
+            "chapter_id": util.safe_int(chapter_id),
             "type": match.group(2),
-            "chapter_id": chapter_id,
-            "chapter": self.chapter,
             "title": match.group(3),
             "author": match.group(4),
-            "count": len(images),
             "lang": "en",
             "language": "English",
         }
+
+    @staticmethod
+    def get_images(page):
+        images = text.extract(page, "var rff_imageList = ", ";")[0]
+        return [
+            ("https://hentaicdn.com/hentai" + part, None)
+            for part in json.loads(images)
+        ]

gallery_dl/extractor/kissmanga.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
 """Extract manga-chapters and entire manga from http://kissmanga.com/"""

-from .common import Extractor, MangaExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util, cloudflare, aes, exception
 from ..cache import cache
 import re
@@ -21,21 +21,11 @@ IV = [
 ]


-class KissmangaExtractor(Extractor):
+class KissmangaBase():
     """Base class for kissmanga extractors"""
     category = "kissmanga"
-    directory_fmt = [
-        "{category}", "{manga}",
-        "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"]
-    filename_fmt = (
-        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
     root = "http://kissmanga.com"

-    def __init__(self, match):
-        Extractor.__init__(self)
-        self.url = match.group(0)
-        self.session.headers["Referer"] = self.root
-
     def request(self, url):
         response = cloudflare.request_func(self, url)
         if response.history and "/Message/AreYouHuman?" in response.url:
@@ -72,10 +62,10 @@ class KissmangaExtractor(Extractor):
         return data


-class KissmangaMangaExtractor(KissmangaExtractor, MangaExtractor):
+class KissmangaMangaExtractor(KissmangaBase, MangaExtractor):
     """Extractor for manga from kissmanga.com"""
-    pattern = [r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com/"
-               r"Manga/[^/?&#]+/?$"]
+    pattern = [r"(?i)(?:https?://)?(?:www\.)?(kissmanga\.com"
+               r"/Manga/[^/?&#]+/?)$"]
     test = [
         ("http://kissmanga.com/Manga/Dropout", {
             "url": "992befdd64e178fe5af67de53f8b510860d968ca",
@@ -105,11 +95,10 @@ class KissmangaMangaExtractor(KissmangaExtractor, MangaExtractor):
         return results


-class KissmangaChapterExtractor(KissmangaExtractor):
+class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor):
     """Extractor for manga-chapters from kissmanga.com"""
-    subcategory = "chapter"
-    pattern = [r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com/"
-               r"Manga/[^/?&#]+/[^/?&#]+\?id=\d+"]
+    pattern = [r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com"
+               r"/Manga/[^/?&#]+/[^/?&#]+\?id=\d+"]
     test = [
         ("http://kissmanga.com/Manga/Dropout/Ch-000---Oneshot-?id=145847", {
             "url": "4136bcd1c6cecbca8cc2bc965d54f33ef0a97cc0",
@@ -126,18 +115,11 @@ class KissmangaChapterExtractor(KissmangaExtractor):
         ("http://kissmanga.com/mAnGa/mOnStEr/Monster-79?id=7608", None),
     ]

-    def items(self):
-        page = self.request(self.url).text
-        data = self.get_job_metadata(page)
-        imgs = self.get_image_urls(page)
-        data["count"] = len(imgs)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for data["page"], url in enumerate(imgs, 1):
-            yield Message.Url, url, text.nameext_from_url(url, data)
-
-    def get_job_metadata(self, page):
-        """Collect metadata for extractor-job"""
+    def __init__(self, match):
+        ChapterExtractor.__init__(self, match.group(0))
+        self.session.headers["Referer"] = self.root
+
+    def get_metadata(self, page):
         title = text.extract(page, "<title>", "</title>")[0].strip()
         manga, cinfo = title.split("\n")[1:3]
         data = {
@@ -148,12 +130,11 @@ class KissmangaChapterExtractor(KissmangaExtractor):
         }
         return self.parse_chapter_string(data)

-    def get_image_urls(self, page):
-        """Extract list of all image-urls for a manga chapter"""
+    def get_images(self, page):
         try:
             key = self.build_aes_key(page)
             return [
-                aes.aes_cbc_decrypt_text(data, key, IV)
+                (aes.aes_cbc_decrypt_text(data, key, IV), None)
                 for data in text.extract_iter(
                     page, 'lstImages.push(wrapKA("', '"'
                 )

gallery_dl/extractor/mangafox.py

@@ -1,56 +1,39 @@
 # -*- coding: utf-8 -*-

-# Copyright 2017 Mike Fährmann
+# Copyright 2017-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-"""Extract manga-chapters and entire manga from http://www.mangafox.me/"""
+"""Extract manga-chapters and entire manga from http://fanfox.net/"""

-from .common import AsynchronousExtractor, Message
+from .common import ChapterExtractor
 from .. import text, util, exception
 import re


-class MangafoxChapterExtractor(AsynchronousExtractor):
-    """Extractor for manga-chapters from mangafox.me"""
+class MangafoxChapterExtractor(ChapterExtractor):
+    """Extractor for manga-chapters from fanfox.net"""
     category = "mangafox"
-    subcategory = "chapter"
-    directory_fmt = [
-        "{category}", "{manga}",
-        "{volume:?v/ />02}c{chapter:>03}{chapter_minor}"]
-    filename_fmt = (
-        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
-    pattern = [(r"(?:https?://)?(?:www\.)?(mangafox\.me/manga/"
-                r"[^/]+/(v\d+/)?c\d+[^/]*)")]
-    test = [(("http://mangafox.me/manga/kidou_keisatsu_patlabor/"
-              "v05/c006.2/1.html"), {
-        "keyword": "36b570e9ef11b4748407324fe08bebbe4856e6fd",
-        "content": "5c50c252dcf12ffecf68801f4db8a2167265f66c",
-    })]
+    pattern = [(r"(?:https?://)?(?:www\.)?(?:mangafox\.me|fanfox\.net)"
+                r"(/manga/[^/]+/(?:v\d+/)?c\d+[^/?&#]*)")]
+    test = [
+        ("http://fanfox.net/manga/kidou_keisatsu_patlabor/v05/c006.2/1.html", {
+            "keyword": "36b570e9ef11b4748407324fe08bebbe4856e6fd",
+            "content": "5c50c252dcf12ffecf68801f4db8a2167265f66c",
+        }),
+        ("http://mangafox.me/manga/kidou_keisatsu_patlabor/v05/c006.2/", None),
+    ]
+    root = "http://fanfox.net"

     def __init__(self, match):
-        AsynchronousExtractor.__init__(self)
-        self.url = "http://" + match.group(1)
+        self.urlbase = self.root + match.group(1)
+        ChapterExtractor.__init__(self, self.urlbase + "/1.html")

-    def items(self):
-        page = self.request(self.url + "/1.html").text
+    def get_metadata(self, page):
         if "Sorry, its licensed, and not available." in page:
             raise exception.AuthorizationError()
-        data = self.get_metadata(page)
-        urls = zip(
-            range(1, data["count"]+1),
-            self.get_image_urls(page),
-        )
-        yield Message.Version, 1
-        yield Message.Directory, data.copy()
-        for data["page"], url in urls:
-            text.nameext_from_url(url, data)
-            yield Message.Url, url, data.copy()
-
-    def get_metadata(self, page):
-        """Collect metadata for extractor-job"""
         data = text.extract_all(page, (
             ("manga" , " - Read ", " Manga Scans "),
             ("sid"   , "var sid=", ";"),
@@ -67,14 +50,14 @@ class MangafoxChapterExtractor(AsynchronousExtractor):
             data[key] = util.safe_int(data[key])
         return data

-    def get_image_urls(self, page):
-        """Yield all image-urls for this chapter"""
+    def get_images(self, page):
         pnum = 1
         while True:
             url, pos = text.extract(page, '<img src="', '"')
-            yield url
+            yield url, None
             _  , pos = text.extract(page, '<img src="', '"', pos)
             url, pos = text.extract(page, '<img src="', '"', pos)
-            yield url
+            yield url, None
             pnum += 2
-            page = self.request(self.url + "/{}.html".format(pnum)).text
+            page = self.request("{}/{}.html".format(self.urlbase, pnum)).text

gallery_dl/extractor/mangahere.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
 """Extract manga-chapters and entire manga from http://www.mangahere.co/"""

-from .common import MangaExtractor, AsynchronousExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
 from urllib.parse import urljoin
 import re
@@ -60,15 +60,9 @@ class MangahereMangaExtractor(MangaExtractor):
         }))


-class MangahereChapterExtractor(AsynchronousExtractor):
+class MangahereChapterExtractor(ChapterExtractor):
     """Extractor for manga-chapters from mangahere.co"""
     category = "mangahere"
-    subcategory = "chapter"
-    directory_fmt = [
-        "{category}", "{manga}",
-        "{volume:?v/ />02}c{chapter:>03}{chapter_minor}"]
-    filename_fmt = (
-        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
     pattern = [(r"(?:https?://)?(?:www\.|m\.)?mangahere\.c[co]/manga/"
                 r"([^/]+(?:/v0*(\d+))?/c0*(\d+)(\.\d+)?)")]
     test = [
@@ -82,27 +76,12 @@ class MangahereChapterExtractor(AsynchronousExtractor):
     url_fmt = "http://www.mangahere.cc/manga/{}/{}.html"

     def __init__(self, match):
-        AsynchronousExtractor.__init__(self)
         self.part, self.volume, self.chapter, self.chminor = match.groups()
-
-    def items(self):
         # remove ".html" for the first chapter page to avoid redirects
         url = self.url_fmt.format(self.part, "")[:-5]
+        ChapterExtractor.__init__(self, url)

-        page = self.request(url).text
-        data = self.get_job_metadata(page)
-        urls = zip(
-            range(1, data["count"]+1),
-            self.get_image_urls(page),
-        )
-        yield Message.Version, 1
-        yield Message.Directory, data.copy()
-        for data["page"], url in urls:
-            text.nameext_from_url(url, data)
-            yield Message.Url, url, data.copy()
-
-    def get_job_metadata(self, page):
+    def get_metadata(self, page):
         """Collect metadata for extractor-job"""
         manga, pos = text.extract(page, '<title>', '</title>')
         chid , pos = text.extract(page, '.net/store/manga/', '/', pos)
@@ -122,15 +101,16 @@ class MangahereChapterExtractor(AsynchronousExtractor):
             "language": "English",
         }

-    def get_image_urls(self, page):
+    def get_images(self, page):
         """Yield all image-urls for this chapter"""
         pnum = 1
         while True:
             url, pos = text.extract(page, '<img src="', '"')
-            yield url
+            yield url, None
             _  , pos = text.extract(page, '<img src="', '"', pos)
             _  , pos = text.extract(page, '<img src="', '"', pos)
             url, pos = text.extract(page, '<img src="', '"', pos)
-            yield url
+            yield url, None
             pnum += 2
             page = self.request(self.url_fmt.format(self.part, pnum)).text

gallery_dl/extractor/mangapanda.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -35,5 +35,5 @@ class MangapandaChapterExtractor(MangapandaBase, MangareaderChapterExtractor):
     ]
     test = [("http://www.mangapanda.com/red-storm/2", {
         "url": "4bf4ddf6c50105ec8a37675495ab80c46608275d",
-        "keyword": "367d2694b49cc7cac82d68530d7d467a62453502",
+        "keyword": "32b5e84017c2bf5f122b339ecf40899e41f18cc9",
     })]

gallery_dl/extractor/mangareader.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
 """Extract manga-chapters and entire manga from http://www.mangareader.net/"""

-from .common import AsynchronousExtractor, MangaExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
@@ -57,11 +57,8 @@ class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):
             results.append((self.root + url, data.copy()))


-class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
+class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
     """Extractor for manga-chapters from mangareader.net"""
-    subcategory = "chapter"
-    directory_fmt = ["{category}", "{manga}", "c{chapter:>03}{title:?: //}"]
-    filename_fmt = "{manga}_c{chapter:>03}_{page:>03}.{extension}"
     pattern = [
         (r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"),
         (r"(?:https?://)?(?:www\.)?mangareader\.net"
@@ -70,26 +67,14 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
     test = [(("http://www.mangareader.net/"
               "karate-shoukoushi-kohinata-minoru/11"), {
         "url": "84ffaab4c027ef9022695c53163c3aeabd07ca58",
-        "keyword": "2038e6a780a0028eee0067985b55debb1d4a6aab",
+        "keyword": "2893cfcd1916859fb498f3345f1929f868fe667f",
     })]

     def __init__(self, match):
-        AsynchronousExtractor.__init__(self)
         self.part, self.url_title, self.chapter = match.groups()
+        ChapterExtractor.__init__(self, self.root + self.part)

-    def items(self):
-        page = self.request(self.root + self.part).text
-        data = self.get_job_metadata(page)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for data["page"] in range(1, data["count"]+1):
-            next_url, image_url, image_data = self.get_page_metadata(page)
-            image_data.update(data)
-            yield Message.Url, image_url, image_data
-            if next_url:
-                page = self.request(next_url).text
-
-    def get_job_metadata(self, chapter_page):
+    def get_metadata(self, chapter_page):
         """Collect metadata for extractor-job"""
         page = self.request(self.root + self.url_title).text
         data = self.parse_page(page, {
@@ -106,7 +91,16 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
         )
         return data

-    def get_page_metadata(self, page):
+    def get_images(self, page):
+        while True:
+            next_url, image_url, image_data = self.get_image_metadata(page)
+            yield image_url, image_data
+            if not next_url:
+                return
+            page = self.request(next_url).text
+
+    def get_image_metadata(self, page):
         """Collect next url, image-url and metadata for one manga-page"""
         extr = text.extract
         width = None
@@ -122,7 +116,7 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
         width , pos = extr(page, '<img id="img" width="', '"', pos)
         height, pos = extr(page, ' height="', '"', pos)
         image, pos = extr(page, ' src="', '"', pos)
-        return self.root + url, image, text.nameext_from_url(image, {
+        return self.root + url, image, {
             "width": util.safe_int(width),
             "height": util.safe_int(height),
-        })
+        }

gallery_dl/extractor/mangastream.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,42 +8,23 @@
 """Extract manga-chapters from https://mangastream.com/"""

-from .common import AsynchronousExtractor, Message
+from .common import ChapterExtractor
 from .. import text, util
 from urllib.parse import urljoin


-class MangastreamChapterExtractor(AsynchronousExtractor):
+class MangastreamChapterExtractor(ChapterExtractor):
     """Extractor for manga-chapters from mangastream.com"""
     category = "mangastream"
-    subcategory = "chapter"
-    directory_fmt = ["{category}", "{manga}", "c{chapter}{title:?: //}"]
-    filename_fmt = "{manga}_c{chapter}_{page:>03}.{extension}"
     pattern = [(r"(?:https?://)?(?:www\.)?(?:readms|mangastream)\.(?:com|net)/"
                 r"r(?:ead)?/([^/]*/([^/]+)/(\d+))")]
-    base_url = "https://mangastream.com/r/"
+    base_url = "https://readms.net/r/"

     def __init__(self, match):
-        AsynchronousExtractor.__init__(self)
         self.part, self.chapter, self.ch_id = match.groups()
+        ChapterExtractor.__init__(self, self.base_url + self.part)

-    def items(self):
-        page = self.request(self.base_url + self.part).text
-        data = self.get_job_metadata(page)
-        next_url = None
-        yield Message.Version, 1
-        yield Message.Directory, data.copy()
-        for data["page"] in range(1, data["count"]+1):
-            if next_url:
-                page = self.request(next_url).text
-            next_url, image_url = self.get_page_metadata(page)
-            text.nameext_from_url(image_url, data)
-            next_url = urljoin(self.base_url, next_url)
-            image_url = urljoin(self.base_url, image_url)
-            yield Message.Url, image_url, data.copy()
-
-    def get_job_metadata(self, page):
-        """Collect metadata for extractor-job"""
+    def get_metadata(self, page):
         manga, pos = text.extract(
             page, '<span class="hidden-xs hidden-sm">', "<")
         pos = page.find(self.part, pos)
@@ -59,9 +40,11 @@ class MangastreamChapterExtractor(AsynchronousExtractor):
             "language": "English",
         }

-    @staticmethod
-    def get_page_metadata(page):
-        """Collect next url, image-url and metadata for one manga-page"""
-        nurl, pos = text.extract(page, '<div class="page">\n<a href="', '"')
-        iurl, pos = text.extract(page, '<img id="manga-page" src="', '"', pos)
-        return nurl, iurl
+    def get_images(self, page):
+        while True:
+            next_url, pos = text.extract(
+                page, '<div class="page">\n<a href="', '"')
+            image_url, pos = text.extract(
+                page, '<img id="manga-page" src="', '"', pos)
+            yield urljoin(self.base_url, image_url), None
+            page = self.request(urljoin(self.base_url, next_url)).text

gallery_dl/extractor/powermanga.py

@@ -26,6 +26,6 @@ class PowermangaMangaExtractor(foolslide.FoolslideMangaExtractor):
     category = "powermanga"
     pattern = foolslide.manga_pattern(r"read\.powermanga\.org")
     test = [("https://read.powermanga.org/series/one_piece/", {
-        "url": "3b2037a9ffe30ea0da4e710a40863f0693f21afe",
-        "keyword": "e2a924b0924cba711e78b3585ad24a97dec70006",
+        "url": "e5e9a64c14ca51a170e14c4b711aaa88fdf7a7aa",
+        "keyword": "1245ab2a730f9129001a4589b1d8615a17dc4a7b",
     })]

gallery_dl/extractor/readcomiconline.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,29 +8,27 @@
 """Extract comic-issues and entire comics from http://readcomiconline.to/"""

-from . import kissmanga
-from .. import text
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, cloudflare
 import re


-class ReadcomiconlineExtractor(kissmanga.KissmangaExtractor):
+class ReadcomiconlineBase():
     """Base class for readcomiconline extractors"""
     category = "readcomiconline"
     directory_fmt = ["{category}", "{comic}", "{issue:>03}"]
     filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
     root = "http://readcomiconline.to"
+    useragent = "Wget/1.19.2 (linux-gnu)"

-    def __init__(self, match):
-        kissmanga.KissmangaExtractor.__init__(self, match)
-        self.session.headers["User-Agent"] = "Wget/1.19.2 (linux-gnu)"
+    request = cloudflare.request_func


-class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor,
-                                    kissmanga.KissmangaMangaExtractor):
+class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
     """Extractor for comics from readcomiconline.to"""
     subcategory = "comic"
-    pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to/"
-               r"Comic/[^/?&#]+/?$"]
+    pattern = [r"(?i)(?:https?://)?(?:www\.)?(readcomiconline\.to"
+               r"/Comic/[^/?&#]+/?)$"]
     test = [
         ("http://readcomiconline.to/Comic/W-i-t-c-h", {
             "url": "c5a530538a30b176916e30cbe223a93d83cb2691",
@@ -42,6 +40,10 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor,
         }),
     ]

+    def __init__(self, match):
+        MangaExtractor.__init__(self, match)
+        self.session.headers["User-Agent"] = self.useragent
+
     def chapters(self, page):
         results = []
         comic, pos = text.extract(page, '<div class="heading"><h3>', '<')
@@ -58,19 +60,21 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor,
         return results


-class ReadcomiconlineIssueExtractor(ReadcomiconlineExtractor,
-                                    kissmanga.KissmangaChapterExtractor):
+class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
     """Extractor for comic-issues from readcomiconline.to"""
     subcategory = "issue"
-    pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to/"
-               r"Comic/[^/?&#]+/[^/?&#]+\?id=\d+"]
+    pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
+               r"/Comic/[^/?&#]+/[^/?&#]+\?id=\d+"]
    test = [("http://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {
         "url": "a45c77f8fbde66091fe2346d6341f9cf3c6b1bc5",
         "keyword": "dee8a8a44659825afe1d69e1d809a48b03e98c68",
     })]

-    def get_job_metadata(self, page):
-        """Collect metadata for extractor-job"""
+    def __init__(self, match):
+        ChapterExtractor.__init__(self, match.group(0))
+        self.session.headers["User-Agent"] = self.useragent
+
+    def get_metadata(self, page):
         comic, pos = text.extract(page, " - Read\r\n ", "\r\n")
         iinfo, pos = text.extract(page, " ", "\r\n", pos)
         match = re.match(r"(?:Issue )?#(\d+)|(.+)", iinfo)
@@ -82,6 +86,10 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineExtractor,
         }

     @staticmethod
-    def get_image_urls(page):
-        """Extract list of all image-urls for a manga chapter"""
-        return list(text.extract_iter(page, 'lstImages.push("', '"'))
+    def get_images(page):
+        return [
+            (url, None)
+            for url in text.extract_iter(
+                page, 'lstImages.push("', '"'
+            )
+        ]
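
A side note on the readcomiconline hunk above: `request = cloudflare.request_func` binds an existing module-level function as a method through plain class-attribute assignment, replacing the old per-instance wrapper. A generic illustration of the idiom, with hypothetical names:

# Assigning a free function to a class attribute makes it a bound method.
def fetch(self, url):
    # 'self' is supplied by Python's normal method-binding machinery
    return "response for " + url

class Client:
    request = fetch

client = Client()
print(client.request("http://example.org"))  # -> response for http://example.org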

gallery_dl/extractor/spectrumnexus.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
 """Extract manga pages from http://www.thespectrum.net/manga_scans/"""

-from .common import MangaExtractor, AsynchronousExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
@@ -33,55 +33,31 @@ class SpectrumnexusMangaExtractor(MangaExtractor):
         return results


-class SpectrumnexusChapterExtractor(AsynchronousExtractor):
+class SpectrumnexusChapterExtractor(ChapterExtractor):
     """Extractor for manga-chapters or -volumes from thespectrum.net"""
     category = "spectrumnexus"
-    subcategory = "chapter"
-    directory_fmt = ["{category}", "{manga}", "{identifier}"]
-    filename_fmt = "{manga} {identifier} {page:>03}.{extension}"
-    pattern = [
-        (r"(?:https?://)?(view\.thespectrum\.net/series/"
-         r"[^\.]+\.html)\?ch=(Chapter\+(\d+)|Volume\+(\d+))"),
-        (r"(?:https?://)?(view\.thespectrum\.net/series/"
-         r"[^/]+-chapter-(\d+)\.html)"),
-    ]
+    directory_fmt = ["{category}", "{manga}", "{chapter_string}"]
+    filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}"
+    pattern = [r"(?:https?://)?view\.thespectrum\.net/series/"
+               r"([^\.]+\.html)\?ch=(Chapter\+(\d+)|Volume\+(\d+))"]
     test = [(("http://view.thespectrum.net/series/"
               "toriko.html?ch=Chapter+343&page=1"), {
         "url": "c0fc7dc594841217cc622a67edd79f06e9900333",
-        "keyword": "3d0cb57b6b1c2cbecc7aed33f83c24891a4ff53f",
+        "keyword": "a8abe126cbc5fc798148b0b155242a470c1ba9d1",
     })]

     def __init__(self, match):
-        AsynchronousExtractor.__init__(self)
-        self.url = "http://" + match.group(1)
-        self.identifier = match.group(2)
-        self.chapter = match.group(3)
-        self.volume = match.group(4)
+        path, self.chapter_string, self.chapter, self.volume = match.groups()
+        url = "http://view.thespectrum.net/series/{}?ch={}".format(
+            path, self.chapter_string)
+        ChapterExtractor.__init__(self, url)

-    def items(self):
-        params = {
-            "ch": self.identifier,
-            "page": 1,
-        }
-        page = self.request(self.url, params=params).text
-        data = self.get_job_metadata(page)
-        yield Message.Version, 1
-        yield Message.Directory, data.copy()
-        for i in range(1, data["count"]+1):
-            url = self.get_image_url(page)
-            text.nameext_from_url(url, data)
-            data["page"] = i
-            yield Message.Url, url, data.copy()
-            if i < data["count"]:
-                params["page"] += 1
-                page = self.request(self.url, params=params).text
-
-    def get_job_metadata(self, page):
-        """Collect metadata for extractor-job"""
+    def get_metadata(self, page):
         data = {
             "chapter": util.safe_int(self.chapter),
+            "chapter_string": self.chapter_string.replace("+", " "),
             "volume": util.safe_int(self.volume),
-            "identifier": self.identifier.replace("+", " "),
         }
         data = text.extract_all(page, (
             ('manga', '<title>', ' &#183; SPECTRUM NEXUS </title>'),
@@ -90,7 +66,9 @@ class SpectrumnexusChapterExtractor(AsynchronousExtractor):
         data["count"] = util.safe_int(data["count"])
         return data

-    @staticmethod
-    def get_image_url(page):
-        """Extract url of one manga page"""
-        return text.extract(page, '<img id="mainimage" src="', '"')[0]
+    def get_images(self, page):
+        params = {"page": 1}
+        while True:
+            yield text.extract(page, '<img id="mainimage" src="', '"')[0], None
+            params["page"] += 1
+            page = self.request(self.url, params=params).text
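
Several of the converted modules (mangafox, mangahere, mangareader, spectrumnexus) now express page-by-page crawling as a get_images() generator instead of an inline items() loop. These generators are deliberately unbounded: as the common.py hunk at the top shows, ChapterExtractor.items() zips them with range(1, count+1) and stops consuming once "count" images have been yielded. A stripped-down sketch of the pattern, assuming a urlbase attribute like the one the mangafox extractor sets up:

# Hedged sketch of the paginated get_images() pattern; termination is the
# caller's responsibility (items() stops after data["count"] pairs).
def get_images(self, page):
    pnum = 1
    while True:
        url, pos = text.extract(page, '<img src="', '"')
        yield url, None  # (image_url, per-image metadata) pair
        pnum += 1
        page = self.request("{}/{}.html".format(self.urlbase, pnum)).text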
