# -*- coding: utf-8 -*- # Copyright 2018-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extract hentai-gallery from https://myhentaigallery.com/""" from .common import GalleryExtractor from .. import text, exception class MyhentaigalleryGalleryExtractor(GalleryExtractor): """Extractor for image galleries from myhentaigallery.com""" category = "myhentaigallery" directory_fmt = ("{category}", "{gallery_id} [{artist}] {title}") pattern = (r"(?:https?://)?(myhentaigallery\.com" r"/gallery/thumbnails/[0-9]+)") test = ( ("https://myhentaigallery.com/gallery/thumbnails/16247"), ("https://myhentaigallery.com/gallery/thumbnails/15224"), ) def __init__(self, match): url = "https://" + match.group(1) GalleryExtractor.__init__(self, match, url) self.session.headers["Referer"] = url def metadata(self, page): extr = text.extract_from(page) split = text.split_html image = extr('
\n\n

', '

') if not title: raise exception.NotFoundError("gallery") data = { "title" : text.unescape(title), "gallery_id": text.parse_int(image.split("/")[-2]), "tags" : split(extr('
\nCategories:', '
')), } artists = split(extr('
\nArtists:', '
')) data["artist"] = artists[0] if artists else "Unknown" return data def images(self, page): extr = text.extract_iter return [ (text.unescape(url).replace("/thumbnail/", "/original/"), None) for url in extr(page, 'class="comic-thumb">\n