class ImagehostImageExtractor(Extractor):
    """Shared base for extractors that fetch one image from an imagehost.

    Subclasses implement get_info() and may override metadata().  The
    class attributes below are read by __init__() and items().
    """
    basecategory = "imagehost"
    subcategory = "image"
    archive_fmt = "{token}"
    # Use "https" for the page URL and upgrade "http:" image URLs.
    https = True
    # None, a preset name ("simple" / "complex"), or other form data;
    # any truthy value makes items() send a POST request.
    params = None
    # Passed through unchanged to self.request().
    cookies = None
    encoding = None

    def __init__(self, match):
        """Build the page URL and token from groups 1 and 2 of 'match'."""
        Extractor.__init__(self, match)

        scheme_suffix = "s" if self.https else ""
        self.page_url = "http{}://{}".format(scheme_suffix, match.group(1))
        self.token = match.group(2)

        # Expand a preset name into the form fields sent by items().
        preset = self.params
        if preset == "simple":
            self.params = {
                "imgContinue": "Continue+to+image+...+",
            }
        elif preset == "complex":
            self.params = {
                "op": "view",
                "id": self.token,
                "pre": "1",
                "adb": "1",
                "next": "Continue+to+image+...+",
            }

    def items(self):
        """Request the page and yield a Directory and a Url message."""
        http_method = "POST" if self.params else "GET"
        response = self.request(
            self.page_url,
            method=http_method,
            data=self.params,
            cookies=self.cookies,
            encoding=self.encoding,
        )
        page = response.text

        url, filename = self.get_info(page)
        data = text.nameext_from_url(filename, {"token": self.token})
        data.update(self.metadata(page))

        # Replace a leading "http:" with "https:" when https is enabled.
        if self.https and url.startswith("http:"):
            url = "https:" + url[5:]

        yield Message.Directory, data
        yield Message.Url, url, data

    def get_info(self, page):
        """Return (image-url, filename-source) for 'page'.

        Hook for subclasses; this base version returns None.
        """

    def metadata(self, page):
        """Return extra metadata to merge into the result (none here)."""
        return ()
"0c8768055e4e20e7c7259608b67799171b691140", "keyword": { "size" : 18, "width" : 64, "height": 32, "hash" : "94d56c599223c59f3feb71ea603484d1", }, }), ("https://imx.to/img-57a2050547b97.html", { # old-style URL "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204", "content": "54592f2635674c25677c6872db3709d343cdf92f", "keyword": { "size" : 5284, "width" : 320, "height": 160, "hash" : "40da6aaa7b8c42b18ef74309bbc713fc", }, }), ("https://img.yt/img-57a2050547b97.html", { # img.yt domain "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204", }), ("https://imx.to/img-57a2050547b98.html", { "exception": exception.NotFoundError, }), ) params = "simple" encoding = "utf-8" def __init__(self, match): ImagehostImageExtractor.__init__(self, match) if "/img-" in self.page_url: self.page_url = self.page_url.replace("img.yt", "imx.to") self.url_ext = True else: self.url_ext = False def get_info(self, page): url, pos = text.extract( page, '
", "").replace(" ", "")[:-1] width, _, height = extr(">", " px").partition("x") return { "size" : text.parse_bytes(size), "width" : text.parse_int(width), "height": text.parse_int(height), "hash" : extr(">", ""), } class ImxtoGalleryExtractor(ImagehostImageExtractor): """Extractor for image galleries from imx.to""" category = "imxto" subcategory = "gallery" pattern = r"(?:https?://)?(?:www\.)?(imx\.to/g/([^/?#]+))" test = ("https://imx.to/g/ozdy", { "pattern": ImxtoImageExtractor.pattern, "keyword": {"title": "untitled gallery"}, "count": 40, }) def items(self): page = self.request(self.page_url).text title, pos = text.extract(page, '
")[2]).strip(), } for url in text.extract_iter(page, "") data = {"_extractor": PixhostImageExtractor} for url in text.extract_iter(page, '', '<', pos) return url, text.unescape(filename) class PostimgGalleryExtractor(ImagehostImageExtractor): """Extractor for images galleries from postimages.org""" category = "postimg" subcategory = "gallery" pattern = (r"(?:https?://)?((?:www\.)?(?:postimg|pixxxels)\.(?:cc|org)" r"/(?:gallery/)([^/?#]+)/?)") test = ("https://postimg.cc/gallery/wxpDLgX", { "pattern": PostimgImageExtractor.pattern, "count": 22, }) def items(self): page = self.request(self.page_url).text data = {"_extractor": PostimgImageExtractor} for url in text.extract_iter(page, ' class="thumb">