# -*- coding: utf-8 -*-

# Copyright 2016-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://www.tumblr.com/"""

from .common import Extractor, Message
from .. import text, util, exception
from ..cache import memcache
import re


def _original_image(url):
    """Return a tuple of candidate URLs for the image at 'url'.

    URLs of the form
    'http(s)://<digits>.media.tumblr.com[/<hex>]/tumblr_<name>_<size>.<ext>'
    yield three entries: two URLs below
    'https://s3.amazonaws.com/data.tumblr.com' followed by the unmodified
    input. Any other URL yields the 1-tuple '(url,)'.
    """
    match = re.match(
        r"https?://\d+\.media\.tumblr\.com"
        r"((/[0-9a-f]+)?/tumblr_[^/?&#.]+_)\d+(\.[0-9a-z]+)", url)
    if not match:
        return (url,)
    root = "https://s3.amazonaws.com/data.tumblr.com"
    # path: everything up to and including the '_' before the size number
    # key:  the optional '/<hex>' path component (None when absent)
    # ext:  the file extension including the leading dot
    path, key, ext = match.groups()
    return (
        # size suffix becomes "raw" when the hex component is present,
        # "1280" otherwise
        "".join((root, path, "raw" if key else "1280", ext)),
        "".join((root, path, "500", ext)),
        url,
    )


def _original_video(url):
    """Drop the '_<digits>' suffix from a vt.media.tumblr.com video URL.

    The result always uses the https scheme for the rewritten part.
    Strings that do not contain such a URL are returned unchanged.
    """
    return re.sub(
        (r"https?://vt\.media\.tumblr\.com"
         r"/tumblr_([^_]+)_\d+\.([0-9a-z]+)"),
        r"https://vt.media.tumblr.com/tumblr_\1.\2", url
    )


# Names of the known post types.
# NOTE(review): not referenced in the visible text; presumably consumed by
# TumblrExtractor._setup_posttypes() -- confirm.
POST_TYPES = frozenset((
    "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))

# Regex fragment with two alternative capture groups:
#   group 1: whatever follows a 'tumblr:' prefix (optional scheme skipped)
#   group 2: a '<name>.tumblr.com' host name
# TumblrExtractor.__init__() uses whichever of the two groups is non-empty.
BASE_PATTERN = (
    r"(?:tumblr:(?:https?://)?([^/]+)|"
    r"(?:https?://)?([^.]+\.tumblr\.com))")


class TumblrExtractor(Extractor):
    """Base class for tumblr extractors"""
    category = "tumblr"
    directory_fmt = ["{category}", "{name}"]
    filename_fmt = "{category}_{blog_name}_{id}o{offset}.{extension}"
    archive_fmt = "{id}_{offset}"

    def __init__(self, match):
        """Set up blog name, API object and options from a regex match.

        'match' must provide the blog identifier in group 1 or group 2.
        """
        Extractor.__init__(self)
        self.blog = match.group(1) or match.group(2)
        # NOTE(review): TumblrAPI and _setup_posttypes() are not defined in
        # the visible text (see the FIXME in items()).
        self.api = TumblrAPI(self)
        self.types = self._setup_posttypes()
        # config options and their defaults
        self.inline = self.config("inline", False)
        self.reblogs = self.config("reblogs", True)
        self.external = self.config("external", False)

        if len(self.types) == 1:
            # exactly one post type selected: hand it to the API object
            self.api.posts_type = next(iter(self.types))
        elif not self.types:
            self.log.warning("no valid post types selected")

    def items(self):
        """Generate messages for all selected posts of the blog.

        Yields a Version message, a Directory message carrying the blog
        info, and then the results of _prepare()/_prepare_image() for the
        photos, audio, video and (optionally) inline images of each post.
        Posts are skipped when their type is not in self.types or when they
        are reblogs and self.reblogs is false. Post dicts are modified in
        place.
        """
        blog = self.api.info(self.blog)
        yield Message.Version, 1
        yield Message.Directory, blog

        for post in self.posts():
            if post["type"] not in self.types:
                continue

            # a post counts as a reblog if it has a "reblogged_from_id" key
            reblog = "reblogged_from_id" in post
            if reblog and not self.reblogs:
                continue
            post["reblogged"] = reblog

            post["blog"] = blog
            # starting value of the "offset" field that filename_fmt and
            # archive_fmt refer to
            post["offset"] = 0

            if "trail" in post:
                del post["trail"]

            if "photos" in post:  # type "photo" or "link"
                photos = post["photos"]
                del post["photos"]

                for photo in photos:
                    post["photo"] = photo
                    # flatten: merge the "original_size" entries into the
                    # photo dict itself and drop the nested size info
                    photo.update(photo["original_size"])
                    del photo["original_size"]
                    del photo["alt_sizes"]
                    yield self._prepare_image(photo["url"], post)

            if "audio_url" in post:  # type: "audio"
                yield self._prepare(post["audio_url"], post)

            if "video_url" in post:  # type: "video"
                yield self._prepare(_original_video(post["video_url"]), post)

            if self.inline:  # inline images
                for key in ("body", "description", "source"):
                    if key in post:
                        # FIXME(review): SOURCE is damaged on the next line.
                        # Everything between the opening quote of the
                        # re.findall() pattern and what looks like the tail
                        # of a comparison against data["total_posts"] is
                        # missing: the rest of items(), and whatever followed
                        # it up to the end of a pagination loop. The fragment
                        # is kept verbatim and is not valid Python; restore
                        # the lost text from version control before use.
                        for url in re.findall('= data["total_posts"]: return

    # NOTE(review): the header of the class that owns the two methods below
    # is part of the missing text. They read self.api_key, self.session and
    # self.extractor, so they presumably belong to the TumblrAPI class that
    # TumblrExtractor.__init__() instantiates -- confirm once restored.

    def likes(self, blog):
        """Retrieve liked posts

        Generator over the entries of "liked_posts", fetched in pages with
        a "limit" of 50. After each page, "before" is set to the
        "liked_timestamp" of its last post; iteration ends on the first
        empty page.
        """
        params = {"limit": 50}
        while True:
            posts = self._call(blog, "likes", params)["liked_posts"]
            if not posts:
                return
            yield from posts
            params["before"] = posts[-1]["liked_timestamp"]

    def _call(self, blog, endpoint, params):
        """Send a GET request to a blog endpoint and return its "response".

        Requests 'https://api.tumblr.com/v2/blog/<blog>/<endpoint>' with
        'params' as query parameters. If self.api_key is set, it is stored
        in 'params' under "api_key" (the caller's dict is modified).

        Raises AuthorizationError on status 403 and NotFoundError on
        status 404; any other non-200 status logs the full response and
        raises StopExtraction.
        """
        if self.api_key:
            params["api_key"] = self.api_key
        url = "https://api.tumblr.com/v2/blog/{}/{}".format(
            blog, endpoint)

        response = self.session.get(url, params=params).json()
        # the status is taken from the JSON body ("meta" -> "status"),
        # not from the HTTP response object
        status = response["meta"]["status"]

        if status == 200:
            return response["response"]
        elif status == 403:
            raise exception.AuthorizationError()
        elif status == 404:
            raise exception.NotFoundError("user or post")
        else:
            self.extractor.log.error(response)
            raise exception.StopExtraction()