[tumblr] attempt to extract full-resolution photos

- for photos with apparent width == 2048 or height == 3072
- can be disabled with 'original' option
pull/2827/head
Mike Fährmann 2 years ago
parent d0adc13e23
commit df1c643dda
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -2237,6 +2237,20 @@ Description
Search posts for inline images and videos.
extractor.tumblr.original
-------------------------
Type
``bool``
Default
``true``
Description
Download full-resolution ``photo`` images.
For each photo with "maximum" resolution
(width equal to 2048 or height equal to 3072),
use an extra HTTP request to find the URL to its full-resolution version.
extractor.tumblr.reblogs
------------------------
Type

@ -284,6 +284,7 @@
"external": false,
"inline": true,
"posts": "all",
"original": true,
"reblogs": true
},
"twitter":

@ -64,6 +64,7 @@ class TumblrExtractor(Extractor):
self.inline = self.config("inline", True)
self.reblogs = self.config("reblogs", True)
self.external = self.config("external", False)
self.original = self.config("original", True)
if len(self.types) == 1:
self.api.posts_type = next(iter(self.types))
@ -110,12 +111,17 @@ class TumblrExtractor(Extractor):
for photo in photos:
post["photo"] = photo
best_photo = photo["original_size"]
for alt_photo in photo["alt_sizes"]:
if (alt_photo["height"] > best_photo["height"] or
alt_photo["width"] > best_photo["width"]):
best_photo = alt_photo
photo.update(best_photo)
if "/s2048x3072/" in photo["url"] and self.original:
photo["url"] = self._original_image(photo["url"])
del photo["original_size"]
del photo["alt_sizes"]
yield self._prepare_image(photo["url"], post)
@ -205,6 +211,12 @@ class TumblrExtractor(Extractor):
def _skip_reblog_same_blog(self, post):
return self.blog != post.get("reblogged_root_uuid")
def _original_image(self, url):
    """Try to resolve the full-resolution URL for a size-limited photo.

    Rewrites the first "/s2048x3072/" path segment of 'url' to
    "/s99999x99999/", requests that address with an HTML-accepting
    'Accept' header, and returns the text found between the first
    '" src="' marker and the following '"' in the response body.

    If no such value can be extracted, the unmodified 'url' is
    returned instead, so the caller never replaces a working photo URL
    with an empty one.
    """
    resized = url.replace("/s2048x3072/", "/s99999x99999/", 1)
    headers = {"Accept": "text/html,*/*;q=0.8"}
    response = self.request(resized, headers=headers)

    # NOTE(review): text.extract() presumably yields a falsy first element
    # (None or "") when the markers are absent - confirm against its
    # implementation. Either way, an empty result is not a usable URL.
    original = text.extract(response.text, '" src="', '"')[0]
    return original or url
class TumblrUserExtractor(TumblrExtractor):
"""Extractor for all images from a tumblr-user"""
@ -284,6 +296,12 @@ class TumblrPostExtractor(TumblrExtractor):
("https://mikf123.tumblr.com/post/181022380064/chat-post", {
"count": 0,
}),
("https://mikf123.tumblr.com/image/689860196535762944", {
"pattern": r"^https://\d+\.media\.tumblr\.com"
r"/134791621559a79793563b636b5fe2c6"
r"/8f1131551cef6e74-bc/s99999x99999"
r"/188cf9b8915b0d0911c6c743d152fc62e8f38491\.png$",
}),
("http://ziemniax.tumblr.com/post/109697912859/", {
"exception": exception.NotFoundError, # HTML response (#297)
}),

Loading…
Cancel
Save