# -*- coding: utf-8 -*- # Copyright 2015-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for https://www.hentai-foundry.com/""" from .common import Extractor, Message from .. import text, util BASE_PATTERN = r"(https?://)?(?:www\.)?hentai-foundry\.com" class HentaifoundryExtractor(Extractor): """Base class for hentaifoundry extractors""" category = "hentaifoundry" directory_fmt = ("{category}", "{user}") filename_fmt = "{category}_{index}_{title}.{extension}" archive_fmt = "{index}" cookies_domain = "www.hentai-foundry.com" root = "https://www.hentai-foundry.com" per_page = 25 def __init__(self, match): self.root = (match.group(1) or "https://") + "www.hentai-foundry.com" self.user = match.group(2) Extractor.__init__(self, match) self.page_url = "" self.start_post = 0 self.start_page = 1 def items(self): self._init_site_filters() data = self.metadata() for post_url in util.advance(self.posts(), self.start_post): image = self._parse_post(post_url) image.update(data) yield Message.Directory, image yield Message.Url, image["src"], image def skip(self, num): pages, posts = divmod(num, self.per_page) self.start_page += pages self.start_post += posts return num def metadata(self): return {"user": self.user} def posts(self): return self._pagination(self.page_url) def _pagination(self, url, begin='thumbTitle">', '<')), "artist" : text.unescape(extr('/profile">', '<')), "_body" : extr( '
Description
', '') .replace("\r\n", "\n"), "", "")), "ratings" : [text.unescape(r) for r in text.extract_iter(extr( "class='ratings_box'", ""), "title='", "'")], "date" : text.parse_datetime(extr("datetime='", "'")), "views" : text.parse_int(extr(">Views", "<")), "score" : text.parse_int(extr(">Vote Score", "<")), "media" : text.unescape(extr(">Media", "<").strip()), "tags" : text.split_html(extr( ">Tags ", "")), } body = data["_body"] if "", "").rpartition(">")[2]), "author" : text.unescape(extr('alt="', '"')), "date" : text.parse_datetime(extr( ">Updated<", "").rpartition(">")[2], "%B %d, %Y"), "status" : extr("class='indent'>", "<"), } for c in ("Chapters", "Words", "Comments", "Views", "Rating"): data[c.lower()] = text.parse_int(extr( ">" + c + ":", "<").replace(",", "")) data["description"] = text.unescape(extr( "class='storyDescript'>", ""), "title='", "'")] return text.nameext_from_url(data["src"], data) def _request_check(self, url, **kwargs): self.request = self._request_original # check for Enter button / front page # and update PHPSESSID and content filters if necessary response = self.request(url, **kwargs) content = response.content if len(content) < 5000 and \ b'
', '') class HentaifoundryStoryExtractor(HentaifoundryExtractor): """Extractor for a hentaifoundry story""" subcategory = "story" archive_fmt = "s_{index}" pattern = BASE_PATTERN + r"/stories/user/([^/?#]+)/(\d+)" example = "https://www.hentai-foundry.com/stories/user/USER/12345/TITLE" skip = Extractor.skip def __init__(self, match): HentaifoundryExtractor.__init__(self, match) self.index = match.group(3) def items(self): story_url = "{}/stories/user/{}/{}/x?enterAgree=1".format( self.root, self.user, self.index) story = self._parse_story(self.request(story_url).text) yield Message.Directory, story yield Message.Url, story["src"], story