# -*- coding: utf-8 -*- # Copyright 2014-2017 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extract images from galleries at https://exhentai.org/""" from .common import Extractor, Message from .. import text, util, exception from ..cache import cache import time import random import requests class ExhentaiGalleryExtractor(Extractor): """Extractor for image galleries from exhentai.org""" category = "exhentai" subcategory = "gallery" directory_fmt = ["{category}", "{gallery_id}"] filename_fmt = "{gallery_id}_{num:>04}_{image_token}_{name}.{extension}" pattern = [r"(?:https?://)?(g\.e-|e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"] test = [ ("https://exhentai.org/g/960460/4f0e369d82/", { "keyword": "173277161e28162dcc755d2e7a88e6cd750f2477", "content": "493d759de534355c9f55f8e365565b62411de146", }), ("https://exhentai.org/g/960461/4f0e369d82/", { "exception": exception.NotFoundError, }), ("http://exhentai.org/g/962698/7f02358e00/", { "exception": exception.AuthorizationError, }), ] root = "https://exhentai.org" cookienames = ("ipb_member_id", "ipb_pass_hash") cookiedomain = ".exhentai.org" def __init__(self, match): Extractor.__init__(self) self.key = {} self.count = 0 self.version, self.gid, self.token = match.groups() self.gid = util.safe_int(self.gid) self.original = self.config("original", True) self.wait_min = self.config("wait-min", 3) self.wait_max = self.config("wait-max", 6) if self.wait_max < self.wait_min: self.wait_max = self.wait_min self.session.headers["Referer"] = self.root + "/" def items(self): self.login() yield Message.Version, 1 url = "{}/g/{}/{}/".format(self.root, self.gid, self.token) response = self.request(url, fatal=False) page = response.text if response.status_code == 404 and "Gallery Not Available" in page: raise exception.AuthorizationError() if self._is_sadpanda(response): 
# NOTE(review): the surrounding text looks like a whitespace-collapsed copy of
# a Python module: statements that belong on separate, indented lines share
# one physical line, and several string literals passed to text.extract_all()
# are split across lines or empty. The search markers in those literals may
# have been lost -- TODO confirm against a pristine copy before relying on
# them. Block nesting described below is inferred from statement order only.
#
# What the visible code does:
#   ExhentaiGalleryExtractor.__init__(match)
#       Unpacks match.groups() into version, gid and token, converts gid with
#       util.safe_int, reads the "original", "wait-min" and "wait-max" config
#       values (True, 3 and 6 are passed as the second argument, presumably
#       as defaults), raises wait_max to wait_min when it is smaller, and
#       sets the session "Referer" header to root + "/".
#   items()
#       Calls login(), yields Message.Version, then requests the gallery page
#       with fatal=False. Raises exception.AuthorizationError for a 404 whose
#       body contains "Gallery Not Available" or when _is_sadpanda(response)
#       is true (after logging "sadpanda.jpg"); raises
#       exception.NotFoundError("gallery") when the page starts with
#       "Key missing" or "Gallery not found". Otherwise stores data["count"]
#       in self.count, yields Message.Directory with the metadata and one
#       Message.Url per (url, image) pair from get_images(). For URLs
#       containing "/fullimg.php" data["extension"] is set to ""; the
#       self.wait(1.5) call presumably belongs to that same branch -- verify.
#   get_job_metadata(page)
#       Starts from gallery_id/gallery_token, lets text.extract_all() add
#       title, title_jp, date, language, size, size_units and count, then
#       adds "lang" via util.language_to_code, unescapes both titles with
#       text.unescape and converts "count" with util.safe_int.
#   get_images(page)
#       Yields image_from_page() for root + "/s/" + the value text.extract()
#       returns for the 'hentai.org/s/' ... '"' delimiters, then yields
#       everything from images_from_api().
#   image_from_page(url)
#       Continues past the end of this span and is not described here.
self.log.info("sadpanda.jpg") raise exception.AuthorizationError() if page.startswith(("Key missing", "Gallery not found")): raise exception.NotFoundError("gallery") data = self.get_job_metadata(page) self.count = data["count"] yield Message.Directory, data for url, image in self.get_images(page): data.update(image) if "/fullimg.php" in url: data["extension"] = "" self.wait(1.5) yield Message.Url, url, data def get_job_metadata(self, page): """Collect metadata for extractor-job""" data = { "gallery_id" : self.gid, "gallery_token": self.token, } text.extract_all(page, ( ("title" , '

', '

'), ("title_jp" , '

', '

'), ("date" , '>Posted:', ''), ("language" , '>Language:', ' '), ("size" , '>File Size:', ' '), ("size_units", '', '<'), ("count" , '>Length:', ' '), ), values=data) data["lang"] = util.language_to_code(data["language"]) data["title"] = text.unescape(data["title"]) data["title_jp"] = text.unescape(data["title_jp"]) data["count"] = util.safe_int(data["count"]) return data def get_images(self, page): """Collect url and metadata for all images in this gallery""" part = text.extract(page, 'hentai.org/s/', '"')[0] yield self.image_from_page(self.root + "/s/" + part) yield from self.images_from_api() def image_from_page(self, url): """Get image url and data from webpage""" self.wait() page = self.request(url).text data = text.extract_all(page, ( (None , '