# -*- coding: utf-8 -*-

# Copyright 2016-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.tumblr.com/"""

from .common import Extractor, Message
from .. import text, oauth, exception
from datetime import datetime, timedelta
import re


def _original_video(url):
    """Return 'url' rewritten to the un-suffixed form of a video file

    For URLs on a vt/vtt/ve(.media).tumblr.com host whose filename is
    'tumblr_<id>_<digits>.<ext>', the '_<digits>' part is dropped and the
    scheme is forced to https. URLs that do not match are returned
    unchanged.
    """
    return re.sub(
        (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
         r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"),
        r"https://\1.\2", url
    )


# post types that items() accepts (compared against post["type"])
POST_TYPES = frozenset((
    "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))

# Capture groups:
#   1: host given after an explicit "tumblr:" prefix
#   2: blog name from "www.tumblr.com/blog/[view/]<name>"
#   3: "<name>.tumblr.com" host
BASE_PATTERN = (
    r"(?:tumblr:(?:https?://)?([^/]+)|"
    r"(?:https?://)?"
    r"(?:www\.tumblr\.com/blog/(?:view/)?([\w-]+)|"
    r"([\w-]+\.tumblr\.com)))"
)


class TumblrExtractor(Extractor):
    """Base class for tumblr extractors"""
    category = "tumblr"
    directory_fmt = ("{category}", "{blog_name}")
    filename_fmt = "{category}_{blog_name}_{id}_{num:>02}.{extension}"
    archive_fmt = "{id}_{num}"
    cookiedomain = None

    def __init__(self, match):
        """Set up blog identifier, API object and config options

        'match' is expected to expose the three BASE_PATTERN groups as
        groups 1-3 (presumably every subclass pattern starts with
        BASE_PATTERN - TODO confirm against the subclasses).
        """
        Extractor.__init__(self, match)

        # group 2 is a bare blog name and needs the domain appended;
        # groups 1 and 3 already are complete host names
        name = match.group(2)
        if name:
            self.blog = name + ".tumblr.com"
        else:
            self.blog = match.group(1) or match.group(3)

        self.api = TumblrAPI(self)
        self.types = self._setup_posttypes()

        self.avatar = self.config("avatar", False)
        self.inline = self.config("inline", True)
        self.reblogs = self.config("reblogs", True)
        self.external = self.config("external", False)
        self.original = self.config("original", True)

        if len(self.types) == 1:
            # exactly one selected type is handed to the API object
            self.api.posts_type = next(iter(self.types))
        elif not self.types:
            self.log.warning("no valid post types selected")

        if self.reblogs == "same-blog":
            self._skip_reblog = self._skip_reblog_same_blog

        self.date_min, self.api.before = self._get_date_min_max(0, None)

    def items(self):
        """Collect the media of every post returned by self.posts()

        Iteration stops at the first post whose timestamp is below
        'self.date_min'; posts whose type is not in 'self.types' are
        skipped. Blog info is fetched once, for the first accepted post.

        NOTE(review): the tail of this method is missing from this copy
        of the file (see the marker at the end of the loop body).
        """
        blog = None

        for post in self.posts():
            if self.date_min > post["timestamp"]:
                return
            if post["type"] not in self.types:
                continue
            if not blog:
                blog = self.api.info(self.blog)
                blog["uuid"] = self.blog

                if self.avatar:
                    url = self.api.avatar(self.blog)
                    yield Message.Directory, {"blog": blog}
                    yield self._prepare_avatar(url, post.copy(), blog)

            reblog = "reblogged_from_id" in post
            if reblog and self._skip_reblog(post):
                continue
            post["reblogged"] = reblog

            if "trail" in post:
                del post["trail"]
            post["blog"] = blog
            post["date"] = text.parse_timestamp(post["timestamp"])
            posts = []

            if "photos" in post:  # type "photo" or "link"
                photos = post["photos"]
                del post["photos"]

                for photo in photos:
                    post["photo"] = photo

                    # pick the largest variant among original and alt sizes
                    best_photo = photo["original_size"]
                    for alt_photo in photo["alt_sizes"]:
                        if (alt_photo["height"] > best_photo["height"] or
                                alt_photo["width"] > best_photo["width"]):
                            best_photo = alt_photo
                    photo.update(best_photo)

                    if self.original and "/s2048x3072/" in photo["url"] and (
                            photo["width"] == 2048 or photo["height"] == 3072):
                        photo["url"] = self._original_image(photo["url"])

                    del photo["original_size"]
                    del photo["alt_sizes"]
                    posts.append(
                        self._prepare_image(photo["url"], post.copy()))
                    del post["photo"]

            url = post.get("audio_url")  # type "audio"
            if url and url.startswith("https://a.tumblr.com/"):
                posts.append(self._prepare(url, post.copy()))

            url = post.get("video_url")  # type "video"
            if url:
                posts.append(self._prepare(_original_video(url), post.copy()))

            if self.inline and "reblog" in post:  # inline media
                # only "chat" posts are missing a "reblog" key in their
                # API response, but they can't contain images/videos anyway
                body = post["reblog"]["comment"] + post["reblog"]["tree_html"]

                # NOTE(review): SOURCE TEXT LOST FROM HERE.
                # The next statement survives only as the fragment
                #     for url in re.findall('
                # The rest of items() and everything that followed it is
                # missing from this copy of the file.

    # NOTE(review): ... LOST REGION CONTINUES UP TO HERE.
    #
    # Text resumes with the orphaned tail of a statement:
    #     = data["total_posts"]: return
    #
    # Names used in this file whose definitions are not visible in this
    # copy: TumblrAPI, posts, _setup_posttypes, _skip_reblog,
    # _skip_reblog_same_blog, _prepare, _prepare_image, _prepare_avatar,
    # _original_image (and _get_date_min_max, unless the Extractor base
    # class provides it).
    #
    # The two methods below belong to a class whose header was inside the
    # lost region - presumably the TumblrAPI class instantiated in
    # TumblrExtractor.__init__; confirm when restoring. They are kept at
    # method indentation so they line up once the lost text is put back
    # in place of these notes. The nesting throughout this file was
    # recovered from syntax and data flow, not from the original
    # whitespace, and should be diffed against an intact copy.

    def likes(self, blog):
        """Retrieve liked posts"""
        params = {"limit": "50", "before": self.before}
        while True:
            posts = self._call(blog, "likes", params)["liked_posts"]
            if not posts:
                return
            yield from posts
            # continue from the like-timestamp of the last returned post
            params["before"] = posts[-1]["liked_timestamp"]

    def _call(self, blog, endpoint, params, **kwargs):
        """Call 'endpoint' for 'blog' and return the "response" object

        'params' is modified in place when an API key is set. Extra
        keyword arguments are passed through to self.request().

        Raises AuthorizationError for status 403, NotFoundError for 404,
        and StopExtraction for an exhausted daily rate limit or any other
        unsuccessful status. An exceeded hourly rate limit is waited out
        and the call is repeated.
        """
        if self.api_key:
            params["api_key"] = self.api_key
        url = "https://api.tumblr.com/v2/blog/{}/{}".format(
            blog, endpoint)

        response = self.request(url, params=params, **kwargs)

        try:
            data = response.json()
        except ValueError:
            # body is not JSON: fall back to raw text and HTTP status
            data = response.text
            status = response.status_code
        else:
            status = data["meta"]["status"]
            if 200 <= status < 400:
                return data["response"]

        if status == 403:
            raise exception.AuthorizationError()
        elif status == 404:
            raise exception.NotFoundError("user or post")
        elif status == 429:

            # daily rate limit
            if response.headers.get("x-ratelimit-perday-remaining") == "0":
                reset = response.headers.get("x-ratelimit-perday-reset")
                t = (datetime.now() + timedelta(seconds=float(reset))).time()

                self.log.error("Daily API rate limit exceeded")

                api_key = self.api_key or self.session.auth.consumer_key
                if api_key == self.API_KEY:
                    self.log.info("Register your own OAuth application and "
                                  "use its credentials to prevent this error: "
                                  "https://github.com/mikf/gallery-dl/blob/mas"
                                  "ter/docs/configuration.rst#extractortumblra"
                                  "pi-key--api-secret")

                raise exception.StopExtraction(
                    "Aborting - Rate limit will reset at %s",
                    "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second))

            # hourly rate limit
            reset = response.headers.get("x-ratelimit-perhour-reset")
            if reset:
                self.log.info("Hourly API rate limit exceeded")
                self.extractor.wait(seconds=reset)
                # forward kwargs so the retry is issued with the same
                # request options as the first attempt
                return self._call(blog, endpoint, params, **kwargs)

        raise exception.StopExtraction(data)