# -*- coding: utf-8 -*- # Copyright 2015-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Base classes for extractors for danbooru and co""" from .common import Extractor, Message, SharedConfigMixin from .. import text, exception from xml.etree import ElementTree import collections import datetime import operator import re class BooruExtractor(SharedConfigMixin, Extractor): """Base class for all booru extractors""" basecategory = "booru" filename_fmt = "{category}_{id}_{md5}.{extension}" api_url = "" post_url = "" per_page = 50 page_start = 1 page_limit = None sort = False ugoira = True def __init__(self, match): super().__init__(match) self.params = {} self.extags = self.post_url and self.config("tags", False) def skip(self, num): pages = num // self.per_page if self.page_limit and pages + self.page_start > self.page_limit: pages = self.page_limit - self.page_start self.page_start += pages return pages * self.per_page def items(self): yield Message.Version, 1 data = self.get_metadata() self.reset_page() while True: images = self.parse_response( self.request(self.api_url, params=self.params)) for image in images: try: if "pixiv_ugoira_frame_data" in image and \ "large_file_url" in image and not self.ugoira: url = image["large_file_url"] else: url = image["file_url"] except KeyError: continue if url.startswith("/"): url = text.urljoin(self.api_url, url) image.update(data) text.nameext_from_url(url, image) if self.extags: self.extended_tags(image) yield Message.Directory, image yield Message.Url, url, image if len(images) < self.per_page: return self.update_page(image) def reset_page(self): """Initialize params to point to the first page""" self.params["page"] = self.page_start def update_page(self, data): """Update params to point to the next page""" def parse_response(self, response): """Parse JSON API response""" images = response.json() if self.sort: images.sort(key=operator.itemgetter("score", "id"), reverse=True) return images def get_metadata(self): """Collect metadata for extractor-job""" return {} def extended_tags(self, image, page=None): """Retrieve extended tag information""" if not page: url = self.post_url.format(image["id"]) page = self.request(url).text tags = collections.defaultdict(list) tags_html = text.extract(page, '