# -*- coding: utf-8 -*- # Copyright 2014-2016 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extract images from galleries at https://exhentai.org/""" from .common import Extractor, Message from .. import config, text, iso639_1, exception from ..cache import cache import time import random import requests class ExhentaiGalleryExtractor(Extractor): """Extractor for image-galleries from exhentai.org""" category = "exhentai" subcategory = "gallery" directory_fmt = ["{category}", "{gallery-id}"] filename_fmt = "{gallery-id}_{num:>04}_{image-token}_{name}.{extension}" pattern = [r"(?:https?://)?(?:g\.e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"] test = [("https://exhentai.org/g/960460/4f0e369d82/", { "keyword": "623f8c86c9fe38e964682dd4309b96922655b900", "content": "493d759de534355c9f55f8e365565b62411de146", })] api_url = "https://exhentai.org/api.php" def __init__(self, match): Extractor.__init__(self) self.key = {} self.count = 0 self.gid, self.token = match.groups() self.original = config.interpolate(("extractor", "exhentai", "download-original"), True) self.wait_min = config.interpolate(("extractor", "exhentai", "wait-min"), 3) self.wait_max = config.interpolate(("extractor", "exhentai", "wait-max"), 6) if self.wait_max < self.wait_min: self.wait_max = self.wait_min def items(self): self.login() yield Message.Version, 1 yield Message.Headers, self.setup_headers() yield Message.Cookies, self.session.cookies url = "https://exhentai.org/g/{}/{}/".format(self.gid, self.token) page = self.request(url).text if page.startswith(("Key missing", "Gallery not found")): raise exception.NotFoundError("gallery") data = self.get_job_metadata(page) self.count = int(data["count"]) yield Message.Directory, data for url, image in self.get_images(page): data.update(image) if "/fullimg.php" in url: data["extension"] = "" self.wait((1, 2)) yield Message.Url, url, data def setup_headers(self): """Initialize headers""" self.session.headers.update({ "User-Agent": "Mozilla/5.0", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", "Referer": "https://exhentai.org/", }) headers = self.session.headers.copy() headers["Accept"] = "image/png,image/*;q=0.8,*/*;q=0.5" return headers def get_job_metadata(self, page): """Collect metadata for extractor-job""" data = { "gallery-id" : self.gid, "gallery-token": self.token, } text.extract_all(page, ( ("title" , '

', '

'), ("title_jp" , '

', '

'), ("date" , '>Posted:', ''), ("language" , '>Language:', ' '), ("size" , '>File Size:', ' '), ("size-units", '', '<'), ("count" , '>Length:', ' '), ), values=data) data["lang"] = iso639_1.language_to_code(data["language"]) data["title"] = text.unescape(data["title"]) data["title_jp"] = text.unescape(data["title_jp"]) return data def get_images(self, page): """Collect url and metadata for all images in this gallery""" url = "https://exhentai.org/s/" + text.extract(page, 'hentai.org/s/', '"')[0] yield self.image_from_page(url) yield from self.images_from_api() def image_from_page(self, url): """Get image url and data from webpage""" self.wait() page = self.request(url).text data = text.extract_all(page, ( (None , '