# -*- coding: utf-8 -*-

# Copyright 2019-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://hentaifox.com/"""

from .common import GalleryExtractor, Extractor, Message
from .. import text
import json

# NOTE(review): this module was recovered from a copy that had been collapsed
# onto two lines and had two spans of text stripped out. Every region marked
# "NOTE(review): reconstructed" below was rebuilt rather than read from that
# copy and must be confirmed against the project history before merging.


class HentaifoxBase():
    """Base class for hentaifox extractors"""
    # Shared by every extractor in this module.
    category = "hentaifox"
    root = "https://hentaifox.com"


class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
    """Extractor for image galleries on hentaifox.com"""
    # Group 1 captures the "/gallery/<id>" path, group 2 the numeric id.
    pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
    # Sample URLs, each paired with the results expected for it.
    test = (
        ("https://hentaifox.com/gallery/56622/", {
            "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
            "keyword": "bcd6b67284f378e5cc30b89b761140e3e60fcd92",
            "count": 24,
        }),
        # 'split_tag' element (#1378)
        ("https://hentaifox.com/gallery/630/", {
            "keyword": {
                "artist": ["beti", "betty", "magi", "mimikaki"],
                "characters": [
                    "aerith gainsborough",
                    "tifa lockhart",
                    "yuffie kisaragi"
                ],
                "count": 32,
                "gallery_id": 630,
                "group": ["cu-little2"],
                "parody": ["darkstalkers | vampire", "final fantasy vii"],
                "tags": ["femdom", "fingering", "masturbation", "yuri"],
                "title": "Cu-Little Bakanyaï½ž",
                "type": "doujinshi",
            },
        }),
    )

    def __init__(self, match):
        """Initialize from a 'pattern' match and remember the gallery id."""
        GalleryExtractor.__init__(self, match)
        # Kept as the matched string; converted with parse_int() on use.
        self.gallery_id = match.group(2)

    @staticmethod
    def _split(txt):
        """Return the text of every 'tag_btn' element found in 'txt'.

        For each chunk produced by text.extract_iter(), everything up to
        and including the first ">" is dropped and the remainder is passed
        through text.remove_html().
        """
        # NOTE(review): reconstructed - the end delimiter below and the
        # closing of this list were missing from the recovered copy.
        return [
            text.remove_html(tag.partition(">")[2], "", "")
            for tag in text.extract_iter(
                txt, "class='tag_btn", "<span class='t_badge")
        ]

    # NOTE(review): reconstructed - metadata() and images() were entirely
    # absent from the recovered copy. Their existence is implied by the
    # "keyword" expectations in 'test' and by the module-level 'import json';
    # the bodies below need to be verified.
    def metadata(self, page):
        """Build the gallery metadata dict from the gallery page's HTML."""
        extr = text.extract_from(page)
        split = self._split

        return {
            "gallery_id": text.parse_int(self.gallery_id),
            "title"     : text.unescape(extr("<h1>", "</h1>")),
            "parody"    : split(extr(">Parodies:"  , "</ul>")),
            "characters": split(extr(">Characters:", "</ul>")),
            "tags"      : split(extr(">Tags:"      , "</ul>")),
            "artist"    : split(extr(">Artists:"   , "</ul>")),
            "group"     : split(extr(">Groups:"    , "</ul>")),
            "type"      : text.remove_html(extr(">Category:", "<span")),
            "language"  : "English",
            "lang"      : "en",
        }

    def images(self, page):
        """Return a list of (url, info) tuples for the gallery's images.

        Image numbers, extensions and dimensions come from the JSON object
        embedded in the page; the directory part of each URL is taken from
        the first <img> source on the page.
        """
        cover, pos = text.extract(page, '<img src="', '"')
        data , pos = text.extract(page, "$.parseJSON('", "');", pos)
        # Drop scheme, host and file name, keeping only the directory part.
        path = "/".join(cover.split("/")[3:-1])

        result = []
        append = result.append
        # One-letter codes used in the embedded JSON -> file extension.
        extmap = {"j": "jpg", "p": "png", "g": "gif"}
        urlfmt = ("/" + path + "/{}.{}").format

        server1 = "https://i.hentaifox.com"
        server2 = "https://i2.hentaifox.com"

        for num, image in json.loads(data).items():
            ext, width, height = image.split(",")
            path = urlfmt(num, extmap[ext])
            append((server1 + path, {
                "width"    : width,
                "height"   : height,
                # Same path on the second server, stored as an alternative.
                "_fallback": (server2 + path,),
            }))

        return result


# NOTE(review): reconstructed - the class statement, docstring, 'subcategory',
# 'pattern' and the start of 'test' (up to the "count" entry) were missing
# from the recovered copy. __init__ reads match.group(1), so 'pattern' must
# capture the listing path as its first group.
class HentaifoxSearchExtractor(HentaifoxBase, Extractor):
    """Extractor for search results and listings on hentaifox.com"""
    subcategory = "search"
    pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
               r"(/(?:parody|tag|artist|character|search|group)/[^/?%#]+)")
    test = (
        ("https://hentaifox.com/parody/touhou-project/"),
        ("https://hentaifox.com/character/reimu-hakurei/"),
        ("https://hentaifox.com/artist/distance/"),
        ("https://hentaifox.com/search/touhou/"),
        ("https://hentaifox.com/tag/heterochromia/", {
            "pattern": HentaifoxGalleryExtractor.pattern,
            "count": ">= 60",
            "keyword": {
                "url"       : str,
                "gallery_id": int,
                "title"     : str,
            },
        }),
    )

    def __init__(self, match):
        """Initialize from a 'pattern' match and remember the listing path."""
        Extractor.__init__(self, match)
        self.path = match.group(1)

    def items(self):
        """Yield a version message, then one Queue message per gallery."""
        yield Message.Version, 1
        for gallery in self.galleries():
            yield Message.Queue, gallery["url"], gallery

    def galleries(self):
        """Yield an info dict for every gallery on every listing page.

        Pages are requested as "<root><path>/pag/<n>/" starting at n=1.
        Iteration stops when a page has no ">Next<" link or when the last
        "href=" before that link does not point to another "/pag" page.
        """
        num = 1

        while True:
            page_url = "{}{}/pag/{}/".format(self.root, self.path, num)
            page = self.request(page_url).text

            # NOTE(review): reconstructed - the extract_iter() delimiters
            # after 'class="g_title">' and the partition() statement were
            # missing from the recovered copy; only the use of 'info',
            # 'url' and 'title' survived.
            for info in text.extract_iter(
                    page, 'class="g_title"><a href="', '</a>'):
                url, _, title = info.partition('">')

                yield {
                    "url"       : text.urljoin(self.root, url),
                    # The gallery id is the last path segment of the link.
                    "gallery_id": text.parse_int(
                        url.strip("/").rpartition("/")[2]),
                    "title"     : text.unescape(title),
                    "_extractor": HentaifoxGalleryExtractor,
                }

            pos = page.find(">Next<")
            if pos == -1:
                # No "Next" link at all: this was the last page.
                return

            # Look backwards from the "Next" link for its href value.
            # rextract() may find nothing (presumably returning None -
            # confirm against text.rextract), so test for that before
            # using 'in' on the result.
            next_url = text.rextract(page, "href=", ">", pos)[0]
            if not next_url or "/pag" not in next_url:
                return
            num += 1