[tumblr] update pagination logic (#2191)

1 year ago · de670bd7de
parent 98c9fdb414
commit de670bd7de
1 changed files with 69 additions and 41 deletions
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2022 Mike Fährmann
+# Copyright 2016-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@ -269,7 +269,7 @@ class TumblrExtractor(Extractor):


 class TumblrUserExtractor(TumblrExtractor):
-    """Extractor for all images from a tumblr-user"""
+    """Extractor for a Tumblr user's posts"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"(?:/page/\d+|/archive)?/?$"
    test = (
@ -307,6 +307,16 @@ class TumblrUserExtractor(TumblrExtractor):
            "options": (("date-min", "201804"), ("date-max", "201805"),
                        ("date-format", "%Y%m"))
        }),
+        # pagination with 'date-max' (#2191) and 'api-key'
+        ("https://donttrustthetits.tumblr.com/", {
+            "options": (
+                ("access-token", None),
+                ("original", False),
+                ("date-max", "2015-04-25T00:00:00"),
+                ("date-min", "2015-04-01T00:00:00"),
+            ),
+            "count": 316,
+        }),
        ("https://demo.tumblr.com/page/2"),
        ("https://demo.tumblr.com/archive"),
        ("tumblr:http://www.b-authentique.com/"),
@ -321,7 +331,7 @@ class TumblrUserExtractor(TumblrExtractor):


 class TumblrPostExtractor(TumblrExtractor):
-    """Extractor for images from a single post on tumblr"""
+    """Extractor for a single Tumblr post"""
    subcategory = "post"
    pattern = BASE_PATTERN + r"/(?:post/|image/)?(\d+)"
    test = (
@ -389,7 +399,7 @@ class TumblrPostExtractor(TumblrExtractor):


 class TumblrTagExtractor(TumblrExtractor):
-    """Extractor for images from a tumblr-user by tag"""
+    """Extractor for Tumblr user's posts by tag"""
    subcategory = "tag"
    pattern = BASE_PATTERN + r"/tagged/([^/?#]+)"
    test = (
@ -412,7 +422,7 @@ class TumblrTagExtractor(TumblrExtractor):


 class TumblrLikesExtractor(TumblrExtractor):
-    """Extractor for images from a tumblr-user's liked posts"""
+    """Extractor for a Tumblr user's liked posts"""
    subcategory = "likes"
    directory_fmt = ("{category}", "{blog_name}", "likes")
    archive_fmt = "f_{blog[name]}_{id}_{num}"
@ -431,7 +441,11 @@ class TumblrLikesExtractor(TumblrExtractor):


 class TumblrAPI(oauth.OAuth1API):
-    """Minimal interface for the Tumblr API v2"""
+    """Interface for the Tumblr API v2
+
+    https://github.com/tumblr/docs/blob/master/api.md
+    """
+    ROOT = "https://api.tumblr.com"
    API_KEY = "O3hU2tMi5e4Qs5t3vezEi6L0qRORJ5y9oUpSGsrWu8iA3UCc3B"
    API_SECRET = "sFdsK3PDdP2QpYMRAoq0oDnw0sFS24XigXmdfnaeNZpJpqAn03"
    BLOG_CACHE = {}
@ -442,55 +456,46 @@ class TumblrAPI(oauth.OAuth1API):

    def info(self, blog):
        """Return general information about a blog"""
-        if blog not in self.BLOG_CACHE:
-            self.BLOG_CACHE[blog] = self._call(blog, "info", {})["blog"]
+        try:
            return self.BLOG_CACHE[blog]
+        except KeyError:
+            endpoint = "/v2/blog/{}/info".format(blog)
+            params = {"api_key": self.api_key} if self.api_key else None
+            self.BLOG_CACHE[blog] = blog = self._call(endpoint, params)["blog"]
+            return blog

    def avatar(self, blog, size="512"):
        """Retrieve a blog avatar"""
        if self.api_key:
-            url_fmt = "https://api.tumblr.com/v2/blog/{}/avatar/{}?api_key={}"
-            return url_fmt.format(blog, size, self.api_key)
+            return "{}/v2/blog/{}/avatar/{}?api_key={}".format(
+                self.ROOT, blog, size, self.api_key)
+        endpoint = "/v2/blog/{}/avatar".format(blog)
        params = {"size": size}
-        data = self._call(blog, "avatar", params, allow_redirects=False)
-        return data["avatar_url"]
+        return self._call(
+            endpoint, params, allow_redirects=False)["avatar_url"]

    def posts(self, blog, params):
        """Retrieve published posts"""
-        params["offset"] = self.extractor.config("offset") or 0
-        params["limit"] = 50
+        params["offset"] = self.extractor.config("offset")
+        params["limit"] = "50"
        params["reblog_info"] = "true"
-
-        if self.posts_type:
        params["type"] = self.posts_type
-        if self.before:
        params["before"] = self.before

-        while True:
-            data = self._call(blog, "posts", params)
-            self.BLOG_CACHE[blog] = data["blog"]
-            yield from data["posts"]
-            params["offset"] += params["limit"]
-            if params["offset"] >= data["total_posts"]:
-                return
+        if self.before and params["offset"]:
+            self.log.warning("'offset' and 'date-max' cannot be used together")
+
+        return self._pagination(blog, "/posts", params, cache=True)

    def likes(self, blog):
        """Retrieve liked posts"""
        params = {"limit": "50", "before": self.before}
-        while True:
-            posts = self._call(blog, "likes", params)["liked_posts"]
-            if not posts:
-                return
-            yield from posts
-            params["before"] = posts[-1]["liked_timestamp"]
-
-    def _call(self, blog, endpoint, params, **kwargs):
-        if self.api_key:
-            params["api_key"] = self.api_key
-        url = "https://api.tumblr.com/v2/blog/{}/{}".format(
-            blog, endpoint)
+        return self._pagination(blog, "/likes", params, key="liked_posts")

-        response = self.request(url, params=params, **kwargs)
+    def _call(self, endpoint, params, **kwargs):
+        url = self.ROOT + endpoint
+        kwargs["params"] = params
+        response = self.request(url, **kwargs)

        try:
            data = response.json()
@ -535,7 +540,7 @@ class TumblrAPI(oauth.OAuth1API):

                if self.extractor.config("ratelimit") == "wait":
                    self.extractor.wait(seconds=reset)
-                    return self._call(blog, endpoint, params)
+                    return self._call(endpoint, params, **kwargs)

                t = (datetime.now() + timedelta(seconds=float(reset))).time()
                raise exception.StopExtraction(
@ -547,6 +552,29 @@ class TumblrAPI(oauth.OAuth1API):
            if reset:
                self.log.info("Hourly API rate limit exceeded")
                self.extractor.wait(seconds=reset)
-                return self._call(blog, endpoint, params)
+                return self._call(endpoint, params, **kwargs)

        raise exception.StopExtraction(data)
+
+    def _pagination(self, blog, endpoint, params, key="posts", cache=False):
+        endpoint = "/v2/blog/{}{}".format(blog, endpoint)
+        if self.api_key:
+            params["api_key"] = self.api_key
+
+        while True:
+            data = self._call(endpoint, params)
+
+            if cache:
+                self.BLOG_CACHE[blog] = data["blog"]
+                cache = False
+
+            yield from data[key]
+
+            try:
+                endpoint = data["_links"]["next"]["href"]
+            except KeyError:
+                return
+
+            params = None
+            if self.api_key:
+                endpoint += "&api_key=" + self.api_key