# gallery-dl/gallery_dl/extractor/hentaifox.py

# -*- coding: utf-8 -*-

# Copyright 2019-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://hentaifox.com/"""

from .common import GalleryExtractor, Extractor, Message
from .. import text
import json


class HentaifoxBase:
    """Shared constants for all hentaifox extractors"""
    # site root used to build absolute URLs
    root = "https://hentaifox.com"
    # extractor category name
    category = "hentaifox"


class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
    """Extractor for image galleries on hentaifox.com"""
    pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
    test = (
        ("https://hentaifox.com/gallery/56622/", {
            "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
            "keyword": "bcd6b67284f378e5cc30b89b761140e3e60fcd92",
            "count": 24,
        }),
        # 'split_tag' element (#1378)
        ("https://hentaifox.com/gallery/630/", {
            "keyword": {
                "artist": ["beti", "betty", "magi", "mimikaki"],
                "characters": [
                    "aerith gainsborough",
                    "tifa lockhart",
                    "yuffie kisaragi"
                ],
                "count": 32,
                "gallery_id": 630,
                "group": ["cu-little2"],
                "parody": ["darkstalkers | vampire", "final fantasy vii"],
                "tags": ["femdom", "fingering", "masturbation", "yuri"],
                "title": "Cu-Little Bakanyaï½ž",
                "type": "doujinshi",
            },
        }),
    )

    def __init__(self, match):
        """Initialize from a URL match; group 2 holds the gallery ID"""
        GalleryExtractor.__init__(self, match)
        self.gallery_id = match.group(2)

    @staticmethod
    def _split(txt):
        """Return the tag names contained in an HTML tag-list fragment"""
        names = []
        for tag in text.extract_iter(
                txt, "class='tag_btn", "<span class='t_badge"):
            # discard the rest of the opening element, keep what follows
            _, _, content = tag.partition(">")
            names.append(text.remove_html(content, "", ""))
        return names

    def metadata(self, page):
        """Collect gallery metadata from the gallery page"""
        extr = text.extract_from(page)

        def tag_list(label):
            # section between 'label' and the end of its list
            return self._split(extr(label, "</ul>"))

        # NOTE(review): 'extr' presumably advances through the page on
        # every call, so the lookups are kept in their original order --
        # confirm against text.extract_from before reordering.
        gallery_id = text.parse_int(self.gallery_id)
        title = text.unescape(extr("<h1>", "</h1>"))
        parody = tag_list(">Parodies:")
        characters = tag_list(">Characters:")
        tags = tag_list(">Tags:")
        artist = tag_list(">Artists:")
        group = tag_list(">Groups:")
        gallery_type = text.remove_html(extr(">Category:", "<span"))

        return {
            "gallery_id": gallery_id,
            "title"     : title,
            "parody"    : parody,
            "characters": characters,
            "tags"      : tags,
            "artist"    : artist,
            "group"     : group,
            "type"      : gallery_type,
            "language"  : "English",
            "lang"      : "en",
        }

    def images(self, page):
        """Return a list of (image URL, metadata) pairs for the gallery"""
        cover, pos = text.extract(page, '<img src="', '"')
        data , pos = text.extract(page, "$.parseJSON('", "');", pos)

        # path components of the cover URL between host and file name
        directory = "/".join(cover.split("/")[3:-1])
        template = "/" + directory + "/{}.{}"

        # single-letter extension codes used by the embedded JSON
        extensions = {"j": "jpg", "p": "png", "g": "gif"}
        primary = "https://i.hentaifox.com"
        secondary = "https://i2.hentaifox.com"

        results = []
        for num, info in json.loads(data).items():
            # each entry is "<ext code>,<width>,<height>"
            ext, width, height = info.split(",")
            location = template.format(num, extensions[ext])
            metadata = {
                "width"    : width,
                "height"   : height,
                "_fallback": (secondary + location,),
            }
            results.append((primary + location, metadata))

        return results


class HentaifoxSearchExtractor(HentaifoxBase, Extractor):
    """Extractor for search results and listings on hentaifox.com"""
    subcategory = "search"
    pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
               r"(/(?:parody|tag|artist|character|search|group)/[^/?%#]+)")
    test = (
        ("https://hentaifox.com/parody/touhou-project/"),
        ("https://hentaifox.com/character/reimu-hakurei/"),
        ("https://hentaifox.com/artist/distance/"),
        ("https://hentaifox.com/search/touhou/"),
        ("https://hentaifox.com/group/v-slash/"),
        ("https://hentaifox.com/tag/heterochromia/", {
            "pattern": HentaifoxGalleryExtractor.pattern,
            "count": ">= 60",
            "keyword": {
                "url"       : str,
                "gallery_id": int,
                "title"     : str,
            },
        }),
    )

    def __init__(self, match):
        """Initialize from a URL match; group 1 holds the listing path"""
        Extractor.__init__(self, match)
        self.path = match.group(1)

    def items(self):
        """Yield one Queue message per gallery found in the listing"""
        yield Message.Version, 1
        for gallery in self.galleries():
            yield Message.Queue, gallery["url"], gallery

    def galleries(self):
        """Yield a metadata dict for every gallery on all listing pages

        Pages are requested one after another, starting at page 1, until
        a page has no ">Next<" marker or the link preceding that marker
        does not point to another "/pag" URL.
        """
        num = 1

        while True:
            url = "{}{}/pag/{}/".format(self.root, self.path, num)
            page = self.request(url).text

            for info in text.extract_iter(
                    page, 'class="g_title"><a href="', '</a>'):
                # 'info' is '<href>"><title>'
                href, _, title = info.partition('">')

                yield {
                    "url"       : text.urljoin(self.root, href),
                    # last path component of the gallery link
                    "gallery_id": text.parse_int(
                        href.strip("/").rpartition("/")[2]),
                    "title"     : text.unescape(title),
                    "_extractor": HentaifoxGalleryExtractor,
                }

            # no "Next" marker: this was the last page
            pos = page.find(">Next<")
            if pos == -1:
                return

            # Only search backwards for the link once the marker is known
            # to exist, and guard against a missing value so that the
            # membership test below cannot fail on a non-string.
            next_href = text.rextract(page, "href=", ">", pos)[0]
            if not next_href or "/pag" not in next_href:
                return
            num += 1
[hentaifox] add chapter extractor (#160) 6 years ago			`# -*- coding: utf-8 -*-`

[hentaifox] improve image extraction (fixes #1366) build image URLs from embedded JSON data instead of rewriting thumbnail URLs 4 years ago			`# Copyright 2019-2021 Mike Fährmann`
[hentaifox] add chapter extractor (#160) 6 years ago			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for https://hentaifox.com/"""`

use GalleryExtractor as common base class 6 years ago			`from .common import GalleryExtractor, Extractor, Message`
[hentaifox] add chapter extractor (#160) 6 years ago			`from .. import text`
[hentaifox] improve image extraction (fixes #1366) build image URLs from embedded JSON data instead of rewriting thumbnail URLs 4 years ago			`import json`
[hentaifox] add chapter extractor (#160) 6 years ago

adjust metadata types for GalleryExtractors 6 years ago			`class HentaifoxBase():`
			`"""Base class for hentaifox extractors"""`
[hentaifox] add chapter extractor (#160) 6 years ago			`category = "hentaifox"`
adjust metadata types for GalleryExtractors 6 years ago			`root = "https://hentaifox.com"`


			`class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):`
			`"""Extractor for image galleries on hentaifox.com"""`
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 6 years ago			`pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"`
[hentaifox] improve metadata extraction (fixes #1378) 4 years ago			`test = (`
			`("https://hentaifox.com/gallery/56622/", {`
			`"pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",`
			`"keyword": "bcd6b67284f378e5cc30b89b761140e3e60fcd92",`
			`"count": 24,`
			`}),`
			`# 'split_tag' element (#1378)`
			`("https://hentaifox.com/gallery/630/", {`
			`"keyword": {`
			`"artist": ["beti", "betty", "magi", "mimikaki"],`
			`"characters": [`
			`"aerith gainsborough",`
			`"tifa lockhart",`
			`"yuffie kisaragi"`
			`],`
			`"count": 32,`
			`"gallery_id": 630,`
			`"group": ["cu-little2"],`
			`"parody": ["darkstalkers \| vampire", "final fantasy vii"],`
			`"tags": ["femdom", "fingering", "masturbation", "yuri"],`
			`"title": "Cu-Little Bakanyaï½ž",`
			`"type": "doujinshi",`
			`},`
			`}),`
			`)`
[hentaifox] add chapter extractor (#160) 6 years ago
			`def __init__(self, match):`
use GalleryExtractor as common base class 6 years ago			`GalleryExtractor.__init__(self, match)`
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 6 years ago			`self.gallery_id = match.group(2)`
[hentaifox] add chapter extractor (#160) 6 years ago
[hentaifox] improve metadata extraction (fixes #1378) 4 years ago			`@staticmethod`
			`def _split(txt):`
			`return [`
			`text.remove_html(tag.partition(">")[2], "", "")`
			`for tag in text.extract_iter(`
			`txt, "class='tag_btn", "<span class='t_badge")`
			`]`

			`def metadata(self, page):`
use 'text.extract_from()' in a few places 6 years ago			`extr = text.extract_from(page)`
[hentaifox] improve metadata extraction (fixes #1378) 4 years ago			`split = self._split`
[hentaifox] add chapter extractor (#160) 6 years ago
use 'text.extract_from()' in a few places 6 years ago			`return {`
			`"gallery_id": text.parse_int(self.gallery_id),`
			`"title" : text.unescape(extr("<h1>", "</h1>")),`
[hentaifox] improve metadata extraction (fixes #1378) 4 years ago			`"parody" : split(extr(">Parodies:" , "</ul>")),`
			`"characters": split(extr(">Characters:", "</ul>")),`
			`"tags" : split(extr(">Tags:" , "</ul>")),`
			`"artist" : split(extr(">Artists:" , "</ul>")),`
			`"group" : split(extr(">Groups:" , "</ul>")),`
[hentaifox] fix extraction 5 years ago			`"type" : text.remove_html(extr(">Category:", "<span")),`
use 'text.extract_from()' in a few places 6 years ago			`"language" : "English",`
			`"lang" : "en",`
			`}`
[hentaifox] add chapter extractor (#160) 6 years ago
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 6 years ago			`def images(self, page):`
[hentaifox] improve image extraction (fixes #1366) build image URLs from embedded JSON data instead of rewriting thumbnail URLs 4 years ago			`cover, pos = text.extract(page, '<img src="', '"')`
			`data , pos = text.extract(page, "$.parseJSON('", "');", pos)`
			`path = "/".join(cover.split("/")[3:-1])`

			`result = []`
			`append = result.append`
			`extmap = {"j": "jpg", "p": "png", "g": "gif"}`
			`urlfmt = ("/" + path + "/{}.{}").format`

			`server1 = "https://i.hentaifox.com"`
			`server2 = "https://i2.hentaifox.com"`

			`for num, image in json.loads(data).items():`
			`ext, width, height = image.split(",")`
			`path = urlfmt(num, extmap[ext])`
			`append((server1 + path, {`
			`"width" : width,`
			`"height" : height,`
			`"_fallback": (server2 + path,),`
			`}))`

			`return result`
[hentaifox] add extractor for search results (#160) 6 years ago

adjust metadata types for GalleryExtractors 6 years ago			`class HentaifoxSearchExtractor(HentaifoxBase, Extractor):`
[hentaifox] add extractor for search results (#160) 6 years ago			`"""Extractor for search results and listings on hentaifox.com"""`
			`subcategory = "search"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"`
[hentaifox] support searching by group (#1294) Groups on hentaifox lists the items to download the same way as the other pages (artists, search, tag, etc). Added group to the pattern to search, and the test. 4 years ago			`r"(/(?:parody\|tag\|artist\|character\|search\|group)/[^/?%#]+)")`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`test = (`
			`("https://hentaifox.com/parody/touhou-project/"),`
			`("https://hentaifox.com/character/reimu-hakurei/"),`
			`("https://hentaifox.com/artist/distance/"),`
			`("https://hentaifox.com/search/touhou/"),`
[hentaifox] support searching by group (#1294) Groups on hentaifox lists the items to download the same way as the other pages (artists, search, tag, etc). Added group to the pattern to search, and the test. 4 years ago			`("https://hentaifox.com/group/v-slash/"),`
[hentaifox] fix extraction 5 years ago			`("https://hentaifox.com/tag/heterochromia/", {`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`"pattern": HentaifoxGalleryExtractor.pattern,`
[hentaifox] fix extraction 5 years ago			`"count": ">= 60",`
[hentaifox] add extractor for search results (#160) 6 years ago			`"keyword": {`
[hentaifox] fix extraction 5 years ago			`"url" : str,`
[hentaifox] add extractor for search results (#160) 6 years ago			`"gallery_id": int,`
[hentaifox] fix extraction 5 years ago			`"title" : str,`
[hentaifox] add extractor for search results (#160) 6 years ago			`},`
			`}),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`)`
[hentaifox] add extractor for search results (#160) 6 years ago
			`def __init__(self, match):`
propagate 'match' to base extractor constructor 6 years ago			`Extractor.__init__(self, match)`
[hentaifox] add extractor for search results (#160) 6 years ago			`self.path = match.group(1)`

			`def items(self):`
			`yield Message.Version, 1`
			`for gallery in self.galleries():`
			`yield Message.Queue, gallery["url"], gallery`

			`def galleries(self):`
[hentaifox] fix extraction 5 years ago			`num = 1`
[hentaifox] add extractor for search results (#160) 6 years ago
			`while True:`
[hentaifox] fix extraction 5 years ago			`url = "{}{}/pag/{}/".format(self.root, self.path, num)`
[hentaifox] add extractor for search results (#160) 6 years ago			`page = self.request(url).text`

[hentaifox] fix extraction 5 years ago			`for info in text.extract_iter(`
			`page, 'class="g_title"><a href="', '</a>'):`
			`url, _, title = info.partition('">')`
[hentaifox] add extractor for search results (#160) 6 years ago
			`yield {`
[hentaifox] fix extraction 5 years ago			`"url" : text.urljoin(self.root, url),`
[hentaifox] add extractor for search results (#160) 6 years ago			`"gallery_id": text.parse_int(`
			`url.strip("/").rpartition("/")[2]),`
[hentaifox] fix extraction 5 years ago			`"title" : text.unescape(title),`
provide type information for Queue messages Child extractors are now directly constructed with Extractor.from_url() if the extractor class is known beforehand, instead of using extractor.find() and searching through all possible extractor classes. 6 years ago			`"_extractor": HentaifoxGalleryExtractor,`
[hentaifox] add extractor for search results (#160) 6 years ago			`}`

[hentaifox] fix extraction 5 years ago			`pos = page.find(">Next<")`
			`url = text.rextract(page, "href=", ">", pos)[0]`
[hentaifox] add extractor for search results (#160) 6 years ago			`if pos == -1 or "/pag" not in url:`
			`return`
[hentaifox] fix extraction 5 years ago			`num += 1`