gallery-dl/gallery_dl/extractor/mangafox.py

# -*- coding: utf-8 -*-

# Copyright 2017-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract manga-chapters and entire manga from http://fanfox.net/"""

from .common import ChapterExtractor
from .. import text, exception
import re


class MangafoxChapterExtractor(ChapterExtractor):
    """Extractor for manga-chapters from fanfox.net"""
    category = "mangafox"
    # Matches chapter URLs on both mangafox.me and fanfox.net.
    # Group 1 captures the chapter path "/manga/<name>/[v<N>/]c<N>[suffix]",
    # i.e. everything up to (but not including) a trailing "/<page>.html".
    pattern = [(r"(?:https?://)?(?:www\.)?(?:mangafox\.me|fanfox\.net)"
                r"(/manga/[^/]+/(?:v\d+/)?c\d+[^/?&#]*)")]
    test = [
        ("http://fanfox.net/manga/kidou_keisatsu_patlabor/v05/c006.2/1.html", {
            "keyword": "36b570e9ef11b4748407324fe08bebbe4856e6fd",
            "content": "5c50c252dcf12ffecf68801f4db8a2167265f66c",
        }),
        ("http://mangafox.me/manga/kidou_keisatsu_patlabor/v05/c006.2/", None),
    ]
    root = "http://fanfox.net"

    def __init__(self, match):
        """Store the chapter's base URL and start at its first page

        'match' is the match object produced by 'pattern'; its first group
        is appended to 'root' to form the base URL that all page URLs
        ("<urlbase>/<page>.html") are derived from.
        """
        self.urlbase = self.root + match.group(1)
        ChapterExtractor.__init__(self, self.urlbase + "/1.html")

    def get_metadata(self, page):
        """Collect chapter metadata from the content of the first page

        Returns a dict with the keys 'manga', 'sid', 'cid', 'count',
        'chapter_string', 'volume', 'chapter' and 'chapter_minor'.
        'sid', 'cid', 'count', 'volume' and 'chapter' are passed through
        text.parse_int(); 'chapter_minor' is always a string.

        Raises exception.AuthorizationError if the page states that the
        chapter is licensed and not available.
        """
        if "Sorry, its licensed, and not available." in page:
            raise exception.AuthorizationError()
        data = text.extract_all(page, (
            ("manga"         , " - Read ", " Manga Scans "),
            ("sid"           , "var sid=", ";"),
            ("cid"           , "var cid=", ";"),
            ("count"         , "var total_pages=", ";"),
            ("chapter_string", 'var current_chapter="', '"'),
        ))[0]

        # Expected format: optional "v<volume>/" prefix, then "c<chapter>"
        # and an optional remainder (the minor part); leading zeros of the
        # volume and chapter numbers are not captured.
        # A missing or unexpected chapter string must not abort the whole
        # extraction with an AttributeError/TypeError, so fall back to
        # empty values in that case.
        match = re.match(
            r"(v0*(\d+)/)?c0*(\d+)(.*)", data.get("chapter_string") or "")
        if match:
            volume, chapter, minor = match.group(2, 3, 4)
        else:
            volume = chapter = minor = None

        # 'volume' is None for chapter strings without a volume prefix
        data["volume"] = volume
        data["chapter"] = chapter
        data["chapter_minor"] = minor or ""
        # drop the last space-separated word of the extracted title
        data["manga"] = (data.get("manga") or "").rpartition(" ")[0]
        for key in ("sid", "cid", "count", "volume", "chapter"):
            data[key] = text.parse_int(data.get(key))
        return data

    def get_images(self, page):
        """Yield an (image-url, None) tuple for each image of the chapter

        'page' is the content of the chapter's first page. From every
        page the first and the third '<img src="..."' URL are yielded
        (presumably the image of the current and of the following page -
        TODO confirm), the second one is skipped, and the page number is
        advanced by two before the next page gets requested.

        The generator ends as soon as an image URL cannot be found, so it
        terminates even if the consumer does not stop iterating after
        'count' images.
        """
        pnum = 1
        while True:
            url, pos = text.extract(page, '<img src="', '"')
            if not url:
                # no image on this page: nothing left to yield
                return
            yield url, None

            # skip the second image and take the third one
            _  , pos = text.extract(page, '<img src="', '"', pos)
            url, pos = text.extract(page, '<img src="', '"', pos)
            if not url:
                return
            yield url, None

            pnum += 2
            page = self.request("{}/{}.html".format(self.urlbase, pnum)).text
[mangafox] add chapter extractor 8 years ago			`# -- coding: utf-8 --`

use generic chapter-extractor in more modules 7 years ago			`# Copyright 2017-2018 Mike Fährmann`
[mangafox] add chapter extractor 8 years ago			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

use generic chapter-extractor in more modules 7 years ago			`"""Extract manga-chapters and entire manga from http://fanfox.net/"""`
[mangafox] add chapter extractor 8 years ago
use generic chapter-extractor in more modules 7 years ago			`from .common import ChapterExtractor`
rename safe_int to parse_int; move parse_* to text module 7 years ago			`from .. import text, exception`
[mangafox] add chapter extractor 8 years ago			`import re`


use generic chapter-extractor in more modules 7 years ago			`class MangafoxChapterExtractor(ChapterExtractor):`
			`"""Extractor for manga-chapters from fanfox.net"""`
[mangafox] add chapter extractor 8 years ago			`category = "mangafox"`
use generic chapter-extractor in more modules 7 years ago			`pattern = [(r"(?:https?://)?(?:www\.)?(?:mangafox\.me\|fanfox\.net)"`
			`r"(/manga/[^/]+/(?:v\d+/)?c\d+[^/?&#]*)")]`
			`test = [`
			`("http://fanfox.net/manga/kidou_keisatsu_patlabor/v05/c006.2/1.html", {`
			`"keyword": "36b570e9ef11b4748407324fe08bebbe4856e6fd",`
			`"content": "5c50c252dcf12ffecf68801f4db8a2167265f66c",`
			`}),`
			`("http://mangafox.me/manga/kidou_keisatsu_patlabor/v05/c006.2/", None),`
			`]`
			`root = "http://fanfox.net"`
[mangafox] add chapter extractor 8 years ago
			`def __init__(self, match):`
use generic chapter-extractor in more modules 7 years ago			`self.urlbase = self.root + match.group(1)`
			`ChapterExtractor.__init__(self, self.urlbase + "/1.html")`
[mangafox] add chapter extractor 8 years ago
use generic chapter-extractor in more modules 7 years ago			`def get_metadata(self, page):`
[mangafox] raise proper exception if chapter is not available 8 years ago			`if "Sorry, its licensed, and not available." in page:`
			`raise exception.AuthorizationError()`
[mangafox] add chapter extractor 8 years ago			`data = text.extract_all(page, (`
implement and use 'util.safe_int()' same as Python's 'int()', except it doesn't raise any exceptions and accepts a default value 7 years ago			`("manga" , " - Read ", " Manga Scans "),`
			`("sid" , "var sid=", ";"),`
			`("cid" , "var cid=", ";"),`
			`("count" , "var total_pages=", ";"),`
			`("chapter_string", 'var current_chapter="', '"'),`
[mangafox] add chapter extractor 8 years ago			`))[0]`
implement and use 'util.safe_int()' same as Python's 'int()', except it doesn't raise any exceptions and accepts a default value 7 years ago			`match = re.match(r"(v0(\d+)/)?c0(\d+)(.*)", data["chapter_string"])`
			`data["volume"] = match.group(2)`
[mangafox] add chapter extractor 8 years ago			`data["chapter"] = match.group(3)`
change keyword names to valid Python identifiers This commit mostly replaces all minus-signs ('-') in keyword names with underscores ('_') to allow them to be used in filter-expressions. For example 'gallery-id' got renamed to 'gallery_id'. (It is theoretically possible to access any variable, regardless of its name, with 'locals()["NAME"]', but that seems a bit too convoluted if just 'NAME' could be enough) 7 years ago			`data["chapter_minor"] = match.group(4) or ""`
use 'str.partition()' The (r)partition method is always faster then split() or any other method that has been replaced in this commit. 7 years ago			`data["manga"] = data["manga"].rpartition(" ")[0]`
implement and use 'util.safe_int()' same as Python's 'int()', except it doesn't raise any exceptions and accepts a default value 7 years ago			`for key in ("sid", "cid", "count", "volume", "chapter"):`
rename safe_int to parse_int; move parse_* to text module 7 years ago			`data[key] = text.parse_int(data[key])`
[mangafox] add chapter extractor 8 years ago			`return data`

use generic chapter-extractor in more modules 7 years ago			`def get_images(self, page):`
[mangafox] add chapter extractor 8 years ago			`pnum = 1`
			`while True:`
			`url, pos = text.extract(page, '<img src="', '"')`
use generic chapter-extractor in more modules 7 years ago			`yield url, None`
[mangafox] add chapter extractor 8 years ago			`_ , pos = text.extract(page, '<img src="', '"', pos)`
			`url, pos = text.extract(page, '<img src="', '"', pos)`
use generic chapter-extractor in more modules 7 years ago			`yield url, None`

[mangafox] add chapter extractor 8 years ago			`pnum += 2`
use generic chapter-extractor in more modules 7 years ago			`page = self.request("{}/{}.html".format(self.urlbase, pnum)).text`