# -*- coding: utf-8 -*- # Copyright 2014-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for https://e-hentai.org/ and https://exhentai.org/""" from .common import Extractor, Message from .. import text, util, exception from ..cache import cache import itertools import math BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org" class ExhentaiExtractor(Extractor): """Base class for exhentai extractors""" category = "exhentai" directory_fmt = ("{category}", "{gid} {title[:247]}") filename_fmt = "{gid}_{num:>04}_{image_token}_{filename}.{extension}" archive_fmt = "{gid}_{num}" cookies_domain = ".exhentai.org" cookies_names = ("ipb_member_id", "ipb_pass_hash") root = "https://exhentai.org" request_interval = 5.0 ciphers = "DEFAULT:!DH" LIMIT = False def __init__(self, match): Extractor.__init__(self, match) self.version = match.group(1) def initialize(self): domain = self.config("domain", "auto") if domain == "auto": domain = ("ex" if self.version == "ex" else "e-") + "hentai.org" self.root = "https://" + domain self.api_url = self.root + "/api.php" self.cookies_domain = "." + domain Extractor.initialize(self) if self.version != "ex": self.cookies.set("nw", "1", domain=self.cookies_domain) self.original = self.config("original", True) limits = self.config("limits", False) if limits and limits.__class__ is int: self.limits = limits self._remaining = 0 else: self.limits = False def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) if response.history and response.headers.get("Content-Length") == "0": self.log.info("blank page") raise exception.AuthorizationError() return response def login(self): """Login and set necessary cookies""" if self.LIMIT: raise exception.StopExtraction("Image limit reached!") if self.cookies_check(self.cookies_names): return username, password = self._get_auth_info() if username: return self.cookies_update(self._login_impl(username, password)) self.log.info("no username given; using e-hentai.org") self.root = "https://e-hentai.org" self.cookies_domain = ".e-hentai.org" self.cookies.set("nw", "1", domain=self.cookies_domain) self.original = False self.limits = False @cache(maxage=90*24*3600, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01" headers = { "Referer": "https://e-hentai.org/bounce_login.php?b=d&bt=1-1", } data = { "CookieDate": "1", "b": "d", "bt": "1-1", "UserName": username, "PassWord": password, "ipb_login_submit": "Login!", } response = self.request(url, method="POST", headers=headers, data=data) if b"You are now logged in as:" not in response.content: raise exception.AuthenticationError() return {c: response.cookies[c] for c in self.cookies_names} class ExhentaiGalleryExtractor(ExhentaiExtractor): """Extractor for image galleries from exhentai.org""" subcategory = "gallery" pattern = (BASE_PATTERN + r"(?:/g/(\d+)/([\da-f]{10})" r"|/s/([\da-f]{10})/(\d+)-(\d+))") example = "https://e-hentai.org/g/12345/67890abcde/" def __init__(self, match): ExhentaiExtractor.__init__(self, match) self.gallery_id = text.parse_int(match.group(2) or match.group(5)) self.gallery_token = match.group(3) self.image_token = match.group(4) self.image_num = text.parse_int(match.group(6), 1) self.key_start = None self.key_show = None self.key_next = None self.count = 0 def _init(self): source = self.config("source") if source == "hitomi": self.items = self._items_hitomi def favorite(self, slot="0"): url = self.root + "/gallerypopups.php" params = { "gid": self.gallery_id, "t" : self.gallery_token, "act": "addfav", } data = { "favcat" : slot, "apply" : "Apply Changes", "update" : "1", } self.request(url, method="POST", params=params, data=data) def items(self): self.login() if self.gallery_token: gpage = self._gallery_page() self.image_token = text.extr(gpage, 'hentai.org/s/', '"') if not self.image_token: self.log.debug("Page content:\n%s", gpage) raise exception.StopExtraction( "Failed to extract initial image token") ipage = self._image_page() else: ipage = self._image_page() part = text.extr(ipage, 'hentai.org/g/', '"') if not part: self.log.debug("Page content:\n%s", ipage) raise exception.StopExtraction( "Failed to extract gallery token") self.gallery_token = part.split("/")[1] gpage = self._gallery_page() data = self.get_metadata(gpage) self.count = text.parse_int(data["filecount"]) yield Message.Directory, data def _validate_response(response): # declared inside 'items()' to be able to access 'data' if not response.history and response.headers.get( "content-type", "").startswith("text/html"): page = response.text self.log.warning("'%s'", page) if " requires GP" in page: gp = self.config("gp") if gp == "stop": raise exception.StopExtraction("Not enough GP") elif gp == "wait": input("Press ENTER to continue.") return response.url self.log.info("Falling back to non-original downloads") self.original = False return data["_url_1280"] self._report_limits(data) return True images = itertools.chain( (self.image_from_page(ipage),), self.images_from_api()) for url, image in images: data.update(image) if self.limits: self._check_limits(data) if "/fullimg" in url: data["_http_validate"] = _validate_response else: data["_http_validate"] = None yield Message.Url, url, data fav = self.config("fav") if fav is not None: self.favorite(fav) def _items_hitomi(self): if self.config("metadata", False): data = self.metadata_from_api() data["date"] = text.parse_timestamp(data["posted"]) else: data = {} from .hitomi import HitomiGalleryExtractor url = "https://hitomi.la/galleries/{}.html".format(self.gallery_id) data["_extractor"] = HitomiGalleryExtractor yield Message.Queue, url, data def get_metadata(self, page): """Extract gallery metadata""" data = self.metadata_from_page(page) if self.config("metadata", False): data.update(self.metadata_from_api()) data["date"] = text.parse_timestamp(data["posted"]) return data def metadata_from_page(self, page): extr = text.extract_from(page) api_url = extr('var api_url = "', '"') if api_url: self.api_url = api_url data = { "gid" : self.gallery_id, "token" : self.gallery_token, "thumb" : extr("background:transparent url(", ")"), "title" : text.unescape(extr('