[mangadex] add chapter- and manga-extractor

7 years ago · 749fbbfa6c
parent b58449fd88
commit 749fbbfa6c
8 changed files with 293 additions and 1 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,7 @@
 # Changelog
 ## Unreleased
 ## 1.3.0 - 2018-03-02
 - Added `--proxy` to explicitly specify a proxy server ([#76](https://github.com/mikf/gallery-dl/issues/76))
 - Added options to customize [archive ID formats](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractorarchive-format) and [undefined replacement fields](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractorkeywords-default)
--- a/docs/gallery-dl-example.conf
+++ b/docs/gallery-dl-example.conf
@ -0,0 +1,136 @@
 {
    "base-directory": "/tmp/",
    "netrc": false,
    "downloader":
    {
        "part": true,
        "part-directory": null,
        "http":
        {
            "rate": null,
            "retries": 5,
            "timeout": 30,
            "verify": true
        }
    },
    "extractor":
    {
        "archive": null,
        "proxy": null,
        "skip": true,
        "sleep": 0,
        "pixiv":
        {
            "user":
            {
                "directory": ["{category}", "{user[id]}"]
            },
            "bookmark":
            {
                "directory": ["{category}", "my bookmarks"]
            },
            "ugoira": true,
            "username": null,
            "password": null
        },
        "batoto":
        {
            "username": null,
            "password": null
        },
        "exhentai":
        {
            "wait-min": 3,
            "wait-max": 6,
            "original": true,
            "username": null,
            "password": null,
            "cookies": {
                "igneous": null,
                "s": null,
                "yay": "louder"
            }
        },
        "nijie":
        {
            "username": null,
            "password": null
        },
        "sankaku":
        {
            "wait-min": 2,
            "wait-max": 4,
            "username": null,
            "password": null
        },
        "seiga":
        {
            "username": null,
            "password": null
        },
        "gelbooru":
        {
            "filename": "{category}_{id:>07}_{md5}.{extension}",
            "api": true
        },
        "reddit":
        {
            "refresh-token": null,
            "comments": 500,
            "morecomments": false,
            "date-min": 0,
            "date-max": 253402210800,
            "date-format": "%Y-%m-%dT%H:%M:%S",
            "id-min": "0",
            "id-max": "ZIK0ZJ",
            "recursion": 0
        },
        "flickr":
        {
            "access-token": null,
            "access-token-secret": null,
            "metadata": false,
            "size-max": null
        },
        "deviantart":
        {
            "refresh-token": null,
            "flat": true,
            "mature": true,
            "original": true
        },
        "gfycat":
        {
            "format": "mp4"
        },
        "imgur":
        {
            "mp4": true
        },
        "tumblr":
        {
            "posts": "photo",
            "inline": false,
            "reblogs": true,
            "external": false
        },
        "recursive":
        {
            "blacklist": ["directlink", "oauth", "recursive", "test"]
        },
        "oauth":
        {
            "browser": true
        }
    },
    "output":
    {
        "mode": "auto",
        "shorten": true,
        "progress": true,
        "logfile": null,
        "unsupportedfile": null
    }
 }
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@ -47,6 +47,7 @@ Luscious             https://luscious.net/               Albums
 Manga Fox            http://fanfox.net/                  Chapters
 Manga Here           http://www.mangahere.co/            Chapters, Manga
 Manga Stream         https://mangastream.com/            Chapters
 Mangadex             https://mangadex.org/               Chapters, Manga
 Mangapanda           https://www.mangapanda.com/         Chapters, Manga
 MangaPark            https://mangapark.me/               Chapters, Manga
 Mangareader          https://www.mangareader.net/        Chapters, Manga
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@ -51,6 +51,7 @@ modules = [
    "konachan",
    "loveisover",
    "luscious",
    "mangadex",
    "mangafox",
    "mangahere",
    "mangapanda",
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@ -0,0 +1,148 @@
 # -*- coding: utf-8 -*-
 # Copyright 2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Extract manga-chapters and entire manga from https://mangadex.org/"""
 from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
 from urllib.parse import urljoin
 import json
 import re
 class MangadexExtractor:
    """Common settings mixed into every mangadex extractor class"""
    # site base URL; the extractors below build chapter URLs from it
    root = "https://mangadex.org"
    # category name shared by all mangadex extractors
    category = "mangadex"
 class MangadexChapterExtractor(MangadexExtractor, ChapterExtractor):
    """Extractor for manga-chapters from mangadex.org"""
    pattern = [r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)/chapter/(\d+)"]
    test = [
        ("https://mangadex.org/chapter/122094", {
            "keyword": "b4c83fe41f125eae745c2e00d29e087cc4eb78df",
            "content": "7ab3bef5caccb62b881f8e6e70359d3c7be8137f",
        }),
        # oneshot
        ("https://mangadex.org/chapter/138086", {
            "count": 64,
            "keyword": "9b1b7292f7dbcf10983fbdc34b8cdceeb47328ee",
        }),
    ]

    def __init__(self, match):
        """Store the chapter ID captured by 'pattern' and build its URL"""
        self.chapter_id = match.group(1)
        # always use the canonical root, whichever domain the input URL had
        url = self.root + "/chapter/" + self.chapter_id
        ChapterExtractor.__init__(self, url)

    def get_metadata(self, page):
        """Collect chapter metadata from the HTML of a chapter page

        Returns a dict with the keys 'manga', 'manga_id', 'volume',
        'chapter', 'chapter_minor', 'chapter_id', 'chapter_string',
        'group', 'lang' and 'language'.

        Raises AttributeError if the og:title value does not match the
        expected '<chapter info> (<manga title>)' form.
        """
        info    , pos = text.extract(page, '="og:title" content="', '"')
        manga_id, pos = text.extract(page, '/images/manga/', '.', pos)
        # the next two calls discard their value and only advance 'pos'
        _       , pos = text.extract(page, ' id="jump_group"', '', pos)
        _       , pos = text.extract(page, ' selected ', '', pos)
        # start searching 100 characters before the current position;
        # presumably the language title attribute precedes ' selected '
        # inside the same tag -- TODO confirm against the page markup
        language, ___ = text.extract(page, " title='", "'", pos-100)
        group   , pos = text.extract(page, '>', '<', pos)

        info = text.unescape(info)
        # groups: (1) volume, (2) chapter, (3) chapter suffix such as '.5',
        # (4) any other chapter description, (5) manga title in parentheses
        match = re.match(
            r"(?:(?:Vol\. (\d+) )?Ch\. (\d+)([^ ]*)|(.*)) "
            r"\(([^)]+)\)",
            info)

        # str.rstrip() takes a set of characters, not a suffix, and could
        # therefore eat trailing letters of the title; remove the literal
        # site suffix instead
        suffix = " - MangaDex"
        if info.endswith(suffix):
            chapter_string = info[:-len(suffix)]
        else:
            chapter_string = info

        return {
            "manga": match.group(5),
            "manga_id": util.safe_int(manga_id),
            "volume": util.safe_int(match.group(1)),
            "chapter": util.safe_int(match.group(2)),
            "chapter_minor": match.group(3) or "",
            "chapter_id": util.safe_int(self.chapter_id),
            "chapter_string": chapter_string,
            "group": text.unescape(group),
            "lang": util.language_to_code(language),
            "language": language,
        }

    def get_images(self, page):
        """Return a list of (image URL, None) tuples for a chapter page"""
        dataurl , pos = text.extract(page, "var dataurl = '", "'")
        pagelist, pos = text.extract(page, "var page_array = [", "]", pos)
        server  , pos = text.extract(page, "var server = '", "'", pos)

        # urljoin() keeps an absolute 'server' value as is and resolves a
        # relative one against the site root
        base = urljoin(self.root, server + dataurl + "/")

        # the page array is a JavaScript literal: convert its single quotes
        # to double quotes and drop a trailing comma to get valid JSON
        filenames = json.loads(
            "[" + pagelist.replace("'", '"').rstrip(",") + "]")
        return [(base + filename, None) for filename in filenames]
 class MangadexMangaExtractor(MangadexExtractor, MangaExtractor):
    """Extractor for manga from mangadex.org"""
    pattern = [r"(?:https?://)?(?:www\.)?(mangadex\.(?:org|com)/manga/\d+)"]
    test = [
        ("https://mangadex.org/manga/2946/souten-no-koumori", {
            "url": "9e77934759828458d0424473922e41f348719472",
            # 'keyword' (singular) is the key used by the other test
            # entries in this file
            "keyword": {
                "manga": "Souten no Koumori",
                "manga_id": 2946,
                "title": "Oneshot",
                "volume": int,
                "chapter": int,
                "chapter_minor": str,
                "chapter_id": int,
                "group": str,
                "contributor": str,
                "date": str,
                "views": int,
                "lang": str,
                "language": str,
            },
        }),
    ]

    def chapters(self, page):
        """Return a list of (chapter URL, metadata dict) tuples

        One entry is produced for every '<tr id=' ... '</tr>' section
        found in 'page'; the manga title and ID are read once from the
        page and copied into each metadata dict.
        """
        results = []
        extr = text.extract

        # og:title has the form '<title> (<something>)'; keep only the
        # part before the last ' ('
        manga = text.unescape(extr(
            page, '"og:title" content="', '"')[0].rpartition(" (")[0])
        manga_id = util.safe_int(extr(
            page, '/images/manga/', '.')[0])

        for info in text.extract_iter(page, "<tr id=", "</tr>"):
            # each extraction continues from where the previous one ended
            chid    , pos = extr(info, 'data-chapter-id="', '"')
            chapter , pos = extr(info, 'data-chapter-num="', '"', pos)
            volume  , pos = extr(info, 'data-volume-num="', '"', pos)
            title   , pos = extr(info, 'data-chapter-name="', '"', pos)
            language, pos = extr(info, " title='", "'", pos)
            group   , pos = extr(info, "<td>", "</td>", pos)
            user    , pos = extr(info, "<td>", "</td>", pos)
            views   , pos = extr(info, ">", "<", pos)
            date    , pos = extr(info, ' datetime="', '"', pos)

            # split e.g. '10.5' into '10' and the minor part '.5'
            chapter, sep, minor = chapter.partition(".")

            results.append((self.root + "/chapter/" + chid, {
                "manga": manga,
                # already converted to an integer above
                "manga_id": manga_id,
                "title": text.unescape(title),
                "volume": util.safe_int(volume),
                "chapter": util.safe_int(chapter),
                "chapter_minor": sep + minor,
                "chapter_id": util.safe_int(chid),
                "group": text.unescape(text.remove_html(group)),
                "contributor": text.remove_html(user),
                "views": util.safe_int(views),
                "date": date,
                "lang": util.language_to_code(language),
                "language": language,
            }))
        return results
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@ -155,6 +155,8 @@ def language_to_code(lang, default=None):
 CODES = {
    "ar": "Arabic",
    "bg": "Bulgarian",
    "ca": "Catalan",
    "cs": "Czech",
    "da": "Danish",
    "de": "German",
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
-__version__ = "1.3.0"
+__version__ = "1.3.1-dev"
--- a/test/test_extractors.py
+++ b/test/test_extractors.py
@ -18,7 +18,9 @@ SKIP = {
    "archivedmoe", "archiveofsins", "thebarchive",
    # temporary issues
    "imgchili",
    "powermanga",
    "pinterest",
 }