[tumblr] update pagination logic (#2191)

1 year ago · de670bd7de
parent 98c9fdb414
commit de670bd7de
1 changed file with 69 additions and 41 deletions
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright 2016-2022 Mike Fährmann
+# Copyright 2016-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@ -269,7 +269,7 @@ class TumblrExtractor(Extractor):
 class TumblrUserExtractor(TumblrExtractor):
-    """Extractor for all images from a tumblr-user"""
+    """Extractor for a Tumblr user's posts"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"(?:/page/\d+|/archive)?/?$"
    test = (
@ -307,6 +307,16 @@ class TumblrUserExtractor(TumblrExtractor):
            "options": (("date-min", "201804"), ("date-max", "201805"),
                        ("date-format", "%Y%m"))
        }),
        # pagination with 'date-max' (#2191) and 'api-key'
        ("https://donttrustthetits.tumblr.com/", {
            "options": (
                ("access-token", None),
                ("original", False),
                ("date-max", "2015-04-25T00:00:00"),
                ("date-min", "2015-04-01T00:00:00"),
            ),
            "count": 316,
        }),
        ("https://demo.tumblr.com/page/2"),
        ("https://demo.tumblr.com/archive"),
        ("tumblr:http://www.b-authentique.com/"),
@ -321,7 +331,7 @@ class TumblrUserExtractor(TumblrExtractor):
 class TumblrPostExtractor(TumblrExtractor):
-    """Extractor for images from a single post on tumblr"""
+    """Extractor for a single Tumblr post"""
    subcategory = "post"
    pattern = BASE_PATTERN + r"/(?:post/|image/)?(\d+)"
    test = (
@ -389,7 +399,7 @@ class TumblrPostExtractor(TumblrExtractor):
 class TumblrTagExtractor(TumblrExtractor):
-    """Extractor for images from a tumblr-user by tag"""
+    """Extractor for a Tumblr user's posts by tag"""
    subcategory = "tag"
    pattern = BASE_PATTERN + r"/tagged/([^/?#]+)"
    test = (
@ -412,7 +422,7 @@ class TumblrTagExtractor(TumblrExtractor):
 class TumblrLikesExtractor(TumblrExtractor):
-    """Extractor for images from a tumblr-user's liked posts"""
+    """Extractor for a Tumblr user's liked posts"""
    subcategory = "likes"
    directory_fmt = ("{category}", "{blog_name}", "likes")
    archive_fmt = "f_{blog[name]}_{id}_{num}"
@ -431,7 +441,11 @@ class TumblrLikesExtractor(TumblrExtractor):
 class TumblrAPI(oauth.OAuth1API):
-    """Minimal interface for the Tumblr API v2"""
+    """Interface for the Tumblr API v2
    https://github.com/tumblr/docs/blob/master/api.md
    """
    ROOT = "https://api.tumblr.com"
    API_KEY = "O3hU2tMi5e4Qs5t3vezEi6L0qRORJ5y9oUpSGsrWu8iA3UCc3B"
    API_SECRET = "sFdsK3PDdP2QpYMRAoq0oDnw0sFS24XigXmdfnaeNZpJpqAn03"
    BLOG_CACHE = {}
@ -442,55 +456,46 @@ class TumblrAPI(oauth.OAuth1API):
    def info(self, blog):
        """Return general information about a blog"""
-        if blog not in self.BLOG_CACHE:
+        try:
-            self.BLOG_CACHE[blog] = self._call(blog, "info", {})["blog"]
+            return self.BLOG_CACHE[blog]
-        return self.BLOG_CACHE[blog]
+        except KeyError:
            endpoint = "/v2/blog/{}/info".format(blog)
            params = {"api_key": self.api_key} if self.api_key else None
            self.BLOG_CACHE[blog] = blog = self._call(endpoint, params)["blog"]
            return blog
    def avatar(self, blog, size="512"):
        """Retrieve a blog avatar"""
        if self.api_key:
-            url_fmt = "https://api.tumblr.com/v2/blog/{}/avatar/{}?api_key={}"
+            return "{}/v2/blog/{}/avatar/{}?api_key={}".format(
-            return url_fmt.format(blog, size, self.api_key)
+                self.ROOT, blog, size, self.api_key)
        endpoint = "/v2/blog/{}/avatar".format(blog)
        params = {"size": size}
-        data = self._call(blog, "avatar", params, allow_redirects=False)
+        return self._call(
-        return data["avatar_url"]
+            endpoint, params, allow_redirects=False)["avatar_url"]
    def posts(self, blog, params):
        """Retrieve published posts"""
-        params["offset"] = self.extractor.config("offset") or 0
+        params["offset"] = self.extractor.config("offset")
-        params["limit"] = 50
+        params["limit"] = "50"
        params["reblog_info"] = "true"
        params["type"] = self.posts_type
        params["before"] = self.before
-        if self.posts_type:
+        if self.before and params["offset"]:
-            params["type"] = self.posts_type
+            self.log.warning("'offset' and 'date-max' cannot be used together")
        if self.before:
            params["before"] = self.before
-        while True:
+        return self._pagination(blog, "/posts", params, cache=True)
            data = self._call(blog, "posts", params)
            self.BLOG_CACHE[blog] = data["blog"]
            yield from data["posts"]
            params["offset"] += params["limit"]
            if params["offset"] >= data["total_posts"]:
                return
    def likes(self, blog):
        """Retrieve liked posts"""
        params = {"limit": "50", "before": self.before}
-        while True:
+        return self._pagination(blog, "/likes", params, key="liked_posts")
            posts = self._call(blog, "likes", params)["liked_posts"]
            if not posts:
                return
            yield from posts
            params["before"] = posts[-1]["liked_timestamp"]
-    def _call(self, blog, endpoint, params, **kwargs):
+    def _call(self, endpoint, params, **kwargs):
-        if self.api_key:
+        url = self.ROOT + endpoint
-            params["api_key"] = self.api_key
+        kwargs["params"] = params
-        url = "https://api.tumblr.com/v2/blog/{}/{}".format(
+        response = self.request(url, **kwargs)
            blog, endpoint)
        response = self.request(url, params=params, **kwargs)
        try:
            data = response.json()
@ -535,7 +540,7 @@ class TumblrAPI(oauth.OAuth1API):
                if self.extractor.config("ratelimit") == "wait":
                    self.extractor.wait(seconds=reset)
-                    return self._call(blog, endpoint, params)
+                    return self._call(endpoint, params, **kwargs)
                t = (datetime.now() + timedelta(seconds=float(reset))).time()
                raise exception.StopExtraction(
@ -547,6 +552,29 @@ class TumblrAPI(oauth.OAuth1API):
            if reset:
                self.log.info("Hourly API rate limit exceeded")
                self.extractor.wait(seconds=reset)
-                return self._call(blog, endpoint, params)
+                return self._call(endpoint, params, **kwargs)
        raise exception.StopExtraction(data)
    def _pagination(self, blog, endpoint, params, key="posts", cache=False):
        endpoint = "/v2/blog/{}{}".format(blog, endpoint)
        if self.api_key:
            params["api_key"] = self.api_key
        while True:
            data = self._call(endpoint, params)
            if cache:
                self.BLOG_CACHE[blog] = data["blog"]
                cache = False
            yield from data[key]
            try:
                endpoint = data["_links"]["next"]["href"]
            except KeyError:
                return
            params = None
            if self.api_key:
                endpoint += "&api_key=" + self.api_key