gallery-dl/gallery_dl/extractor/tumblr.py

# -*- coding: utf-8 -*-

# Copyright 2016-2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://www.tumblr.com/"""

from .common import Extractor, Message
from .. import text, exception
from ..cache import memcache


class TumblrExtractor(Extractor):
    """Base class for tumblr extractors"""
    category = "tumblr"
    directory_fmt = ["{category}", "{name}"]
    filename_fmt = "{category}_{blog[name]}_{id}{offset}.{extension}"

    def __init__(self, match):
        """Remember the blog name and create the API interface

        'match' is the match object produced by one of the subclass
        patterns; its first group is used as the blog name.
        """
        Extractor.__init__(self)
        self.user = match.group(1)
        # the API object is created with "photo" as its post type
        self.api = TumblrAPI(self, "photo")

    def items(self):
        """Yield a Url message for every photo of every relevant post

        A Version and a Directory message (carrying the blog info) are
        emitted first.  Each post dict is then reused as metadata for all
        of its photos; only 'extension', 'offset' and 'photo' change
        between two photos of the same post.
        """
        blog = self.api.info(self.user)
        yield Message.Version, 1
        yield Message.Directory, blog

        for post in self.posts():
            # posts without (or with empty) photo data produce no output
            photos = post.pop("photos", None)
            if not photos:
                continue
            # 'trail' is only discarded, so a missing key must not abort
            # the extraction with a KeyError
            post.pop("trail", None)

            for offset, photo in enumerate(photos, 1):
                # flatten the original-size entry into the photo itself;
                # this is what provides photo["url"]
                photo.update(photo.pop("original_size"))
                # alternative sizes are not needed; tolerate their absence
                photo.pop("alt_sizes", None)

                post["extension"] = photo["url"].rpartition(".")[2]
                post["offset"] = "o{}".format(offset)
                post["photo"] = photo
                post["blog"] = blog
                yield Message.Url, photo["url"], post

    def posts(self):
        """Return an iterable containing all relevant posts

        Subclasses are expected to override this method; the base
        implementation has no body and therefore returns None.
        """


class TumblrUserExtractor(TumblrExtractor):
    """Extractor for all images from a tumblr-user"""
    subcategory = "user"
    # group 1 is the blog name; an optional '/page/<number>' suffix and a
    # trailing slash are accepted, anything else after the host is not
    pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com(?:/page/\d+)?/?$"]
    test = [("http://demo.tumblr.com/", {
        "pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg",
        "count": 1,
    })]

    def posts(self):
        """Return the posts of the blog without any extra query filter"""
        no_filter = {}
        return self.api.posts(self.user, no_filter)


class TumblrPostExtractor(TumblrExtractor):
    """Extractor for images from a single post on tumblr"""
    subcategory = "post"
    # group 1 is the blog name, group 2 the numeric post id
    pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com/post/(\d+)"]
    test = [("http://demo.tumblr.com/post/459265350", {
        "pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg",
        "count": 1,
    })]

    def __init__(self, match):
        """Store the post id (second pattern group) next to the blog name"""
        super().__init__(match)
        self.post_id = match.group(2)

    def posts(self):
        """Return the posts selected by the stored post id"""
        query = {"id": self.post_id}
        return self.api.posts(self.user, query)


class TumblrTagExtractor(TumblrExtractor):
    """Extractor for images from a tumblr-user by tag"""
    subcategory = "tag"
    # group 1 is the blog name, group 2 the (possibly URL-quoted) tag
    pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com/tagged/(.+)"]
    test = [("http://demo.tumblr.com/tagged/Times%20Square", {
        "pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg",
        "count": 1,
    })]

    def __init__(self, match):
        """Store the unquoted tag (second pattern group)"""
        super().__init__(match)
        quoted_tag = match.group(2)
        self.tag = text.unquote(quoted_tag)

    def posts(self):
        """Return the posts of the blog filtered by the stored tag"""
        query = {"tag": self.tag}
        return self.api.posts(self.user, query)


class TumblrAPI():
    """Minimal interface for the Tumblr API v2"""
    API_KEY = "O3hU2tMi5e4Qs5t3vezEi6L0qRORJ5y9oUpSGsrWu8iA3UCc3B"

    def __init__(self, extractor, typ=None):
        """Bind the API to an extractor

        'extractor' supplies config(), request() and log; 'typ' is sent
        as the 'type' parameter of every posts() query.
        """
        # config() receives the class-level key as second argument,
        # presumably as fallback when no "api-key" option is set
        self.api_key = extractor.config("api-key", TumblrAPI.API_KEY)
        # parameters applied to every posts() query
        self.params = {"offset": 0, "limit": 50, "type": typ}
        self.extractor = extractor

    # NOTE(review): keyarg=1 presumably makes the blog name the cache
    # key -- confirm against ..cache.memcache
    @memcache(keyarg=1)
    def info(self, blog):
        """Return general information about a blog"""
        return self._call(blog, "info", {})["blog"]

    def posts(self, blog, params):
        """Retrieve published posts

        'params' holds additional query parameters.  It is copied, so the
        caller's dict is left untouched; entries of self.params take
        precedence over equally named entries in 'params'.
        Returns a generator over the post objects of all result pages.
        """
        query = dict(params)
        query.update(self.params)
        return self._pagination(blog, "posts", query)

    def _call(self, blog, endpoint, params):
        """Send one API request and return its 'response' object

        Raises exception.NotFoundError if the reported status is 404 and
        exception.StopExtraction (after logging the whole response) for
        any other status than 200.
        """
        # work on a copy: adding the key must not alter the caller's dict
        query = dict(params)
        query["api_key"] = self.api_key
        url = "https://api.tumblr.com/v2/blog/{}.tumblr.com/{}".format(
            blog, endpoint)

        # the status is taken from the JSON body; fatal=False presumably
        # keeps request() from raising on HTTP errors before that
        response = self.extractor.request(
            url, params=query, fatal=False).json()
        status = response["meta"]["status"]
        if status == 404:
            raise exception.NotFoundError("user")
        if status != 200:
            self.extractor.log.error(response)
            raise exception.StopExtraction()

        return response["response"]

    def _pagination(self, blog, endpoint, params):
        """Yield the posts of all result pages

        'params' must contain 'offset' and 'limit'; its 'offset' entry is
        advanced in place by 'limit' after every page until it reaches
        the 'total_posts' value reported by the API.
        """
        while True:
            data = self._call(blog, endpoint, params)
            yield from data["posts"]
            params["offset"] += params["limit"]
            if params["offset"] >= data["total_posts"]:
                return
[tumblr] add extractor 9 years ago			`# -- coding: utf-8 --`

[tumblr] keyword consistency 8 years ago			`# Copyright 2016-2017 Mike Fährmann`
[tumblr] add extractor 9 years ago			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extract images from https://www.tumblr.com/"""`

			`from .common import Extractor, Message`
[tumblr] use API v2 (#48) 7 years ago			`from .. import text, exception`
			`from ..cache import memcache`
[tumblr] add extractor 9 years ago
code adjustments according to pep8 nr2 8 years ago
[tumblr] use API v2 (#48) 7 years ago			`class TumblrExtractor(Extractor):`
			`"""Base class for tumblr extractors"""`
[tumblr] add extractor 9 years ago			`category = "tumblr"`
[tumblr] use API v2 (#48) 7 years ago			`directory_fmt = ["{category}", "{name}"]`
			`filename_fmt = "{category}_{blog[name]}_{id}{offset}.{extension}"`
[tumblr] add extractor 9 years ago
			`def __init__(self, match):`
			`Extractor.__init__(self)`
			`self.user = match.group(1)`
[tumblr] use API v2 (#48) 7 years ago			`self.api = TumblrAPI(self, "photo")`
[tumblr] add extractor 9 years ago
			`def items(self):`
[tumblr] use API v2 (#48) 7 years ago			`blog = self.api.info(self.user)`
[tumblr] add extractor 9 years ago			`yield Message.Version, 1`
[tumblr] use API v2 (#48) 7 years ago			`yield Message.Directory, blog`
[tumblr] add post-extractor 9 years ago
[tumblr] use API v2 (#48) 7 years ago			`for post in self.posts():`
			`if "photos" not in post:`
			`continue`
[tumblr] add post-extractor 9 years ago			`photos = post["photos"]`
[tumblr] use API v2 (#48) 7 years ago			`del post["photos"]`
			`del post["trail"]`
			`for offset, photo in enumerate(photos, 1):`
			`photo.update(photo["original_size"])`
			`del photo["original_size"]`
			`del photo["alt_sizes"]`
			`post["extension"] = photo["url"].rpartition(".")[2]`
			`post["offset"] = "o{}".format(offset)`
			`post["photo"] = photo`
			`post["blog"] = blog`
			`yield Message.Url, photo["url"], post`

			`def posts(self):`
			`"""Return an iterable containing all relevant posts"""`


			`class TumblrUserExtractor(TumblrExtractor):`
			`"""Extractor for all images from a tumblr-user"""`
			`subcategory = "user"`
			`pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com(?:/page/\d+)?/?$"]`
			`test = [("http://demo.tumblr.com/", {`
			`"pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg",`
			`"count": 1,`
			`})]`

			`def posts(self):`
			`return self.api.posts(self.user, {})`


			`class TumblrPostExtractor(TumblrExtractor):`
consistent extractor naming scheme + docstrings 8 years ago			`"""Extractor for images from a single post on tumblr"""`
[tumblr] add post-extractor 9 years ago			`subcategory = "post"`
			`pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com/post/(\d+)"]`
			`test = [("http://demo.tumblr.com/post/459265350", {`
[tumblr] use API v2 (#48) 7 years ago			`"pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg",`
[tumblr] update tests 7 years ago			`"count": 1,`
[tumblr] add post-extractor 9 years ago			`})]`

			`def __init__(self, match):`
[tumblr] use API v2 (#48) 7 years ago			`TumblrExtractor.__init__(self, match)`
			`self.post_id = match.group(2)`
[tumblr] add tag-extractor 9 years ago
[tumblr] use API v2 (#48) 7 years ago			`def posts(self):`
			`return self.api.posts(self.user, {"id": self.post_id})`
[tumblr] add tag-extractor 9 years ago
[tumblr] use API v2 (#48) 7 years ago
			`class TumblrTagExtractor(TumblrExtractor):`
consistent extractor naming scheme + docstrings 8 years ago			`"""Extractor for images from a tumblr-user by tag"""`
[tumblr] add tag-extractor 9 years ago			`subcategory = "tag"`
			`pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com/tagged/(.+)"]`
replace space characters in unit test URLs 7 years ago			`test = [("http://demo.tumblr.com/tagged/Times%20Square", {`
[tumblr] use API v2 (#48) 7 years ago			`"pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg",`
[tumblr] update tests 7 years ago			`"count": 1,`
[tumblr] add tag-extractor 9 years ago			`})]`

			`def __init__(self, match):`
[tumblr] use API v2 (#48) 7 years ago			`TumblrExtractor.__init__(self, match)`
			`self.tag = text.unquote(match.group(2))`

			`def posts(self):`
			`return self.api.posts(self.user, {"tag": self.tag})`


			`class TumblrAPI():`
			`"""Minimal interface for the Tumblr API v2"""`
			`API_KEY = "O3hU2tMi5e4Qs5t3vezEi6L0qRORJ5y9oUpSGsrWu8iA3UCc3B"`

			`def __init__(self, extractor, typ=None):`
			`self.api_key = extractor.config("api-key", TumblrAPI.API_KEY)`
			`self.params = {"offset": 0, "limit": 50, "type": typ}`
			`self.extractor = extractor`

			`@memcache(keyarg=1)`
			`def info(self, blog):`
			`"""Return general information about a blog"""`
			`return self._call(blog, "info", {})["blog"]`

			`def posts(self, blog, params):`
			`"""Retrieve published posts"""`
			`params.update(self.params)`
			`return self._pagination(blog, "posts", params)`

			`def _call(self, blog, endpoint, params):`
			`params["api_key"] = self.api_key`
			`url = "https://api.tumblr.com/v2/blog/{}.tumblr.com/{}".format(`
			`blog, endpoint)`

			`response = self.extractor.request(`
			`url, params=params, fatal=False).json()`
			`if response["meta"]["status"] == 404:`
			`raise exception.NotFoundError("user")`
			`elif response["meta"]["status"] != 200:`
			`self.extractor.log.error(response)`
			`raise exception.StopExtraction()`

			`return response["response"]`

			`def _pagination(self, blog, endpoint, params):`
			`while True:`
			`data = self._call(blog, endpoint, params)`
			`yield from data["posts"]`
			`params["offset"] += params["limit"]`
			`if params["offset"] >= data["total_posts"]:`
			`return`