# -*- coding: utf-8 -*- # Copyright 2016-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extract images from https://www.tumblr.com/""" from .common import Extractor, Message from .. import text, oauth, extractor, exception from datetime import datetime, timedelta import re def _original_inline_image(url): return re.sub( (r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?" r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"), r"https://\1_1280.\2", url ) def _original_video(url): return re.sub( (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com" r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"), r"https://\1.\2", url ) POST_TYPES = frozenset(( "text", "quote", "link", "answer", "video", "audio", "photo", "chat")) BASE_PATTERN = ( r"(?:tumblr:(?:https?://)?([^/]+)|" r"(?:https?://)?([^.]+\.tumblr\.com))") class TumblrExtractor(Extractor): """Base class for tumblr extractors""" category = "tumblr" directory_fmt = ("{category}", "{name}") filename_fmt = "{category}_{blog_name}_{id}_{num:>02}.{extension}" archive_fmt = "{id}_{num}" cookiedomain = None def __init__(self, match): Extractor.__init__(self, match) self.blog = match.group(1) or match.group(2) self.api = TumblrAPI(self) self.types = self._setup_posttypes() self.avatar = self.config("avatar", False) self.inline = self.config("inline", True) self.reblogs = self.config("reblogs", True) self.external = self.config("external", False) if len(self.types) == 1: self.api.posts_type = next(iter(self.types)) elif not self.types: self.log.warning("no valid post types selected") if self.reblogs == "same-blog": self._skip_reblog = self._skip_reblog_same_blog self.date_min, self.api.before = self._get_date_min_max(0, None) def items(self): blog = None yield Message.Version, 1 for post in self.posts(): if self.date_min > post["timestamp"]: return if post["type"] not in self.types: continue if not blog: blog = self.api.info(self.blog) blog["uuid"] = self.blog yield Message.Directory, blog.copy() if self.avatar: url = self.api.avatar(self.blog) yield self._prepare_avatar(url, post.copy(), blog) reblog = "reblogged_from_id" in post if reblog and self._skip_reblog(post): continue post["reblogged"] = reblog post["blog"] = blog post["date"] = text.parse_timestamp(post["timestamp"]) post["num"] = 0 if "trail" in post: del post["trail"] if "photos" in post: # type "photo" or "link" photos = post["photos"] del post["photos"] for photo in photos: post["photo"] = photo photo.update(photo["original_size"]) del photo["original_size"] del photo["alt_sizes"] yield self._prepare_image(photo["url"], post) url = post.get("audio_url") # type "audio" if url and url.startswith("https://a.tumblr.com/"): yield self._prepare(url, post) url = post.get("video_url") # type "video" if url: yield self._prepare(_original_video(url), post) if self.inline and "reblog" in post: # inline media # only "chat" posts are missing a "reblog" key in their # API response, but they can't contain images/videos anyway body = post["reblog"]["comment"] + post["reblog"]["tree_html"] for url in re.findall('= data["total_posts"]: return def likes(self, blog): """Retrieve liked posts""" params = {"limit": "50", "before": self.before} while True: posts = self._call(blog, "likes", params)["liked_posts"] if not posts: return yield from posts params["before"] = posts[-1]["liked_timestamp"] def _call(self, blog, endpoint, params, **kwargs): if self.api_key: params["api_key"] = self.api_key url = "https://api.tumblr.com/v2/blog/{}/{}".format( blog, endpoint) response = self.request(url, params=params, **kwargs) try: data = response.json() except ValueError: data = response.text status = response.status_code else: status = data["meta"]["status"] if 200 <= status < 400: return data["response"] if status == 403: raise exception.AuthorizationError() elif status == 404: raise exception.NotFoundError("user or post") elif status == 429: # daily rate limit if response.headers.get("x-ratelimit-perday-remaining") == "0": reset = response.headers.get("x-ratelimit-perday-reset") t = (datetime.now() + timedelta(seconds=float(reset))).time() self.log.error("Daily API rate limit exceeded") raise exception.StopExtraction( "Aborting - Rate limit will reset at %s", "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second)) # hourly rate limit reset = response.headers.get("x-ratelimit-perhour-reset") if reset: self.log.info("Hourly API rate limit exceeded") self.extractor.wait(seconds=reset) return self._call(blog, endpoint, params) raise exception.StopExtraction(data)