# -*- coding: utf-8 -*- # Copyright 2015-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Base classes for extractors for danbooru and co""" from .common import SharedConfigExtractor, Message from .. import text from urllib.parse import urljoin from xml.etree import ElementTree import datetime import operator class BooruExtractor(SharedConfigExtractor): """Base class for all booru extractors""" basecategory = "booru" filename_fmt = "{category}_{id}_{md5}.{extension}" archive_fmt = "{id}" api_url = "" per_page = 50 page_start = 1 page_limit = None sort = False def __init__(self, match): super().__init__() self.params = {} def skip(self, num): pages = num // self.per_page if self.page_limit and pages + self.page_start > self.page_limit: pages = self.page_limit - self.page_start self.page_start += pages return pages * self.per_page def items(self): yield Message.Version, 1 yield Message.Directory, self.get_metadata() self.reset_page() while True: images = self.parse_response( self.request(self.api_url, params=self.params)) for data in images: try: url = data["file_url"] if url.startswith("/"): url = urljoin(self.api_url, url) yield Message.Url, url, text.nameext_from_url(url, data) except KeyError: continue if len(images) < self.per_page: return self.update_page(data) def reset_page(self): """Initialize params to point to the first page""" self.params["page"] = self.page_start def update_page(self, data): """Update params to point to the next page""" def parse_response(self, response): """Parse JSON API response""" images = response.json() if self.sort: images.sort(key=operator.itemgetter("score", "id"), reverse=True) return images def get_metadata(self): """Collect metadata for extractor-job""" return {} class XmlParserMixin(): """Mixin for XML based API responses""" def parse_response(self, response): root = ElementTree.fromstring(response.text) return [post.attrib for post in root] class DanbooruPageMixin(): """Pagination for Danbooru v2""" def update_page(self, data): self.params["page"] = "b{}".format(data["id"]) class MoebooruPageMixin(): """Pagination for Moebooru and Danbooru v1""" def update_page(self, data): if self.page_limit: self.params["page"] = None self.params["before_id"] = data["id"] else: self.params["page"] += 1 class GelbooruPageMixin(): """Pagination for Gelbooru-like sites""" def reset_page(self): self.params["pid"] = self.page_start - 1 def update_page(self, data): self.params["pid"] += 1 class TagMixin(): """Extraction of images based on search-tags""" subcategory = "tag" directory_fmt = ["{category}", "{tags}"] def __init__(self, match): super().__init__(match) self.tags = text.unquote(match.group("tags").replace("+", " ")) self.params["tags"] = self.tags self.params["limit"] = self.per_page def get_metadata(self): return {"tags": self.tags} class PoolMixin(): """Extraction of image-pools""" subcategory = "pool" directory_fmt = ["{category}", "pool", "{pool}"] def __init__(self, match): super().__init__(match) self.pool = match.group("pool") self.params["tags"] = "pool:" + self.pool self.params["limit"] = self.per_page def get_metadata(self): return {"pool": self.pool} class PostMixin(): """Extraction of a single image-post""" subcategory = "post" def __init__(self, match): super().__init__(match) self.post = match.group("post") self.params["tags"] = "id:" + self.post class PopularMixin(): """Extraction and metadata handling for Danbooru v2""" subcategory = "popular" directory_fmt = ["{category}", "popular", "{scale}", "{date}"] page_start = None sort = True def __init__(self, match): super().__init__(match) self.params.update(text.parse_query(match.group("query"))) def get_metadata(self, fmt="%Y-%m-%d"): date = self.get_date() or datetime.datetime.utcnow().strftime(fmt) scale = self.get_scale() or "day" if scale == "week": dt = datetime.datetime.strptime(date, fmt) dt -= datetime.timedelta(days=dt.weekday()) date = dt.strftime(fmt) elif scale == "month": date = date[:-3] return {"date": date, "scale": scale} def get_scale(self): if "scale" in self.params: return self.params["scale"] return None def get_date(self): if "date" in self.params: return self.params["date"][:10] return None class MoebooruPopularMixin(PopularMixin): """Extraction and metadata handling for Moebooru and Danbooru v1""" def __init__(self, match): super().__init__(match) self.scale = match.group("scale") def get_date(self): if "year" in self.params: return "{:>04}-{:>02}-{:>02}".format( self.params["year"], self.params.get("month", "01"), self.params.get("day", "01")) return None def get_scale(self): if self.scale and self.scale.startswith("by_"): return self.scale[3:] return self.scale