[tumblr] add options to control extraction behavior (#48)

- posts : list of post-types to inspect - inline : scan post bodies for inline images - external: follow external links
7 years ago · 12de658937
parent 077f8c12be
commit 12de658937
3 changed files with 135 additions and 49 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@ -577,6 +577,40 @@ Description Minimum and maximum wait time in seconds between each image
 =========== =====


+extractor.tumblr.external
+-------------------------
+=========== =====
+Type        ``bool``
+Default     ``false``
+Description Follow external URLs (e.g. from "Link" posts) and try to extract
+            images from them.
+=========== =====
+
+
+extractor.tumblr.inline
+-----------------------
+=========== =====
+Type        ``bool``
+Default     ``false``
+Description Search posts for inline images.
+=========== =====
+
+
+extractor.tumblr.posts
+----------------------
+=========== =====
+Type        ``string``
+Default     ``"photo"``
+Description A comma-separated list of post types to extract images, etc. from.
+            For example: ``"text,link,photo"``.
+
+            Possible types are ``text``, ``quote``, ``link``, ``answer``,
+            ``video``, ``audio``, ``photo``, ``chat``.
+
+            You can use ``"all"`` instead of listing all types separately.
+=========== =====
+
+

 API Tokens & IDs
 ================
@ -590,7 +624,7 @@ extractor.deviantart.client-id & .client-secret
 -----------------------------------------------
 =========== =====
 Type        ``string``
-How To      - login and visit DeviantArt's `Applications & Keys`_ section
+How To      - login and visit DeviantArt's `Applications & Keys`_ section
            - click "Register your Application"
            - click "Save" (top right; default settings are fine)
            - copy ``client_id`` and ``client_secret`` of your new "Untitled"
@ -602,7 +636,7 @@ extractor.flickr.api-key & .api-secret
 --------------------------------------
 =========== =====
 Type        ``string``
-How To      - login and `Create an App`_ in Flickr's `App Garden`_
+How To      - login and `Create an App`_ in Flickr's `App Garden`_
            - click "APPLY FOR A NON-COMMERCIAL KEY"
            - fill out the form with a random name and description
              and click "SUBMIT"
@ -615,7 +649,7 @@ extractor.pawoo.access-token
 ----------------------------
 =========== =====
 Type        ``string``
-How To
+How To
 =========== =====


@ -623,7 +657,7 @@ extractor.pinterest.access-token
 --------------------------------
 =========== =====
 Type        ``string``
-How To
+How To
 =========== =====


@ -631,7 +665,7 @@ extractor.reddit.client-id & .user-agent
 ----------------------------------------
 =========== =====
 Type        ``string``
-How To      - login and visit the apps_ section of your account's preferences
+How To      - login and visit the apps_ section of your account's preferences
            - click the "are you a developer? create an app..." button
            - fill out the form, choose "installed app", preferably set
              "http://localhost:6414/" as "redirect uri" and finally click
@ -644,6 +678,21 @@ How To      - login and visit the apps_ section of your account's preferences
 =========== =====


+extractor.tumblr.api-key
+------------------------
+=========== =====
+Type        ``string``
+How To      - login and visit Tumblr's Applications_ section
+            - click "Register application"
+            - fill out the form: use a random name and description, set
+              https://example.org/ as "Application Website" and "Default
+              callback URL"
+            - solve Google's "I'm not a robot" challenge and click "Register"
+            - copy your ``OAuth Consumer Key`` and put it in your configuration
+              file
+=========== =====
+
+
 .. |.netrc| replace:: ``.netrc``
 .. |tempfile.gettempdir()| replace:: ``tempfile.gettempdir()``
 .. |requests.request()| replace:: ``requests.request()``
@ -675,3 +724,4 @@ How To      - login and visit the apps_ section of your account's preferences
 .. _`App Garden`:          https://www.flickr.com/services/
 .. _apps:                  https://www.reddit.com/prefs/apps/
 .. _`API access rules`:    https://github.com/reddit/reddit/wiki/API
+.. _Applications:          https://www.tumblr.com/oauth/apps
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@ -100,6 +100,12 @@
        {
            "mp4": true
        },
+        "tumblr":
+        {
+            "posts": "photo",
+            "inline": false,
+            "external": false
+        },
        "recursive":
        {
            "blacklist": ["directlink", "oauth", "recursive", "test"]
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@ -14,6 +14,26 @@ from ..cache import memcache
 import re


+def _original_image(url):
+    return re.sub(
+        (r"https?://\d+\.media\.tumblr\.com"
+         r"/([0-9a-f]+)/tumblr_([^/?&#.]+)_\d+\.([0-9a-z]+)"),
+        r"http://data.tumblr.com/\1/tumblr_\2_raw.\3", url
+    )
+
+
+def _original_video(url):
+    return re.sub(
+        (r"https?://vt\.media\.tumblr\.com"
+         r"/tumblr_([^_]+)_\d+\.([0-9a-z]+)"),
+        r"https://vt.media.tumblr.com/tumblr_\1.\2", url
+    )
+
+
+POST_TYPES = frozenset((
+    "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+
+
 class TumblrExtractor(Extractor):
    """Base class for tumblr extractors"""
    category = "tumblr"
@ -25,85 +45,94 @@ class TumblrExtractor(Extractor):
        self.user = match.group(1)
        self.api = TumblrAPI(self)

+        self.inline = self.config("inline", False)
+        self.external = self.config("external", False)
+
+        types = self.config("posts", ("photo",))
+        if types == "all":
+            self.types = POST_TYPES
+        elif types:
+            if isinstance(types, str):
+                types = types.split(",")
+            self.types = frozenset(types)
+        else:
+            self.types = frozenset()
+
    def items(self):
        blog = self.api.info(self.user)
        yield Message.Version, 1
        yield Message.Directory, blog

        for post in self.posts():
+            if post["type"] not in self.types:
+                continue
+
            post["blog"] = blog
            post["offset"] = 0

            if "trail" in post:
                del post["trail"]

-            if "photos" in post:
+            if "photos" in post:  # type "photo" or "link"
                photos = post["photos"]
                del post["photos"]

                for photo in photos:
+                    post["photo"] = photo
                    photo.update(photo["original_size"])
-                    photo["url"] = self._original_image(photo["url"])
                    del photo["original_size"]
                    del photo["alt_sizes"]
-                    post["extension"] = photo["url"].rpartition(".")[2]
-                    post["offset"] += 1
-                    post["photo"] = photo
-                    yield Message.Url, photo["url"], post
+                    yield self._prepare(photo["url"], post)

            if "audio_url" in post:  # type: "audio"
-                post["extension"] = None
-                post["offset"] += 1
-                yield Message.Url, post["audio_url"], post
+                yield self._prepare(
+                    post["audio_url"], post, None)

            if "video_url" in post:  # type: "video"
-                url = post["video_url"]
-                post["extension"] = url.rpartition(".")[2]
-                post["offset"] += 1
-                yield Message.Url, self._original_video(url), post
+                yield self._prepare(
+                    post["video_url"], post, _original_video)

-            if "description" in post:  # inline images
-                for url in re.findall(r' src="([^"]+)"', post["description"]):
-                    post["extension"] = url.rpartition(".")[2]
-                    post["offset"] += 1
-                    yield Message.Url, self._original_image(url), post
-
-            if "permalink_url" in post:  # external video/audio
-                yield Message.Queue, post["permalink_url"], post
+            if self.inline:  # inline images
+                for key in ("body", "description"):
+                    if key in post:
+                        for url in re.findall('<img src="([^"]+)"', post[key]):
+                            yield self._prepare(url, post)

-            if "url" in post:  # type: "link"
-                yield Message.Queue, post["url"], post
+            if self.external:  # external links
+                post["extension"] = None
+                for key in ("permalink_url", "url"):
+                    if key in post:
+                        yield Message.Queue, post[key], post

    def posts(self):
        """Return an iterable containing all relevant posts"""

    @staticmethod
-    def _original_image(url):
-        return re.sub(
-            (r"https?://\d+\.media\.tumblr\.com"
-             r"/([0-9a-f]+)/tumblr_([^/?&#.]+)_\d+\.([0-9a-z]+)"),
-            r"http://data.tumblr.com/\1/tumblr_\2_raw.\3", url
-        )
-
-    @staticmethod
-    def _original_video(url):
-        return re.sub(
-            (r"https?://vt\.media\.tumblr\.com"
-             r"/tumblr_([^_]+)_\d+\.([0-9a-z]+)"),
-            r"https://vt.media.tumblr.com/tumblr_\1.\2", url
-        )
+    def _prepare(url, post, transform=_original_image):
+        if transform:
+            url = transform(url)
+        post["offset"] += 1
+        return Message.Url, url, text.nameext_from_url(url, post)


 class TumblrUserExtractor(TumblrExtractor):
    """Extractor for all images from a tumblr-user"""
    subcategory = "user"
    pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com(?:/page/\d+)?/?$"]
-    test = [("http://demo.tumblr.com/", {
+    test = [
+        ("http://demo.tumblr.com/", {
+            "pattern": (r"https?://\d+\.media\.tumblr\.com"
+                        r"/tumblr_[^/_]+_\d+\.jpg"),
+            "count": 1,
+        }),
+        ("http://demo.tumblr.com/", {
            "pattern": (r"https?://(?:$|"
                        r"\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280\.jpg|"
                        r"w+\.tumblr\.com/audio_file/demo/\d+/tumblr_\w+)"),
            "count": 3,
-    })]
+            "options": (("posts", "all"), ("external", True), ("inline", True))
+        }),
+    ]

    def posts(self):
        return self.api.posts(self.user, {})
@ -121,6 +150,7 @@ class TumblrPostExtractor(TumblrExtractor):
    def __init__(self, match):
        TumblrExtractor.__init__(self, match)
        self.post_id = match.group(2)
+        self.types = POST_TYPES

    def posts(self):
        return self.api.posts(self.user, {"id": self.post_id})
@ -170,7 +200,7 @@ class TumblrAPI():
        response = self.extractor.request(
            url, params=params, fatal=False).json()
        if response["meta"]["status"] == 404:
-            raise exception.NotFoundError("user")
+            raise exception.NotFoundError("user or post")
        elif response["meta"]["status"] != 200:
            self.extractor.log.error(response)
            raise exception.StopExtraction()