[tumblr] extend 'reblogs' functionality (#103)

Setting 'reblogs' to "deleted" will check whether the parent post of
each reblog has been deleted. If it has, the reblog's media content is
downloaded; otherwise the reblog is skipped.

This is a rather costly operation (1 API request per reblogged post)
and should therefore be used with care.
pull/133/head
Mike Fährmann 6 years ago
parent c9b8e6aefc
commit a666ddd16b
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -614,16 +614,20 @@ extractor.tumblr.inline
=========== =====
Type ``bool``
Default ``false``
Description Search posts for inline images.
Description Search posts for inline images and videos.
=========== =====
extractor.tumblr.reblogs
------------------------
=========== =====
Type ``bool``
Type ``bool`` or ``string``
Default ``true``
Description Extract images from reblogged posts.
Description * ``true``: Extract media from reblogged posts
* ``false``: Skip reblogged posts
* ``"deleted"``: Skip reblogged posts, but download from them
anyway if the parent post has been deleted
(requires 1 additional API request per reblogged post)
=========== =====

@ -73,7 +73,7 @@ class TumblrExtractor(Extractor):
yield Message.Directory, blog.copy()
reblog = "reblogged_from_id" in post
if reblog and not self.reblogs:
if reblog and self._skip_reblog(post):
continue
post["reblogged"] = reblog
@ -158,6 +158,19 @@ class TumblrExtractor(Extractor):
return Message.Url, url, post
def _skip_reblog(self, post):
    """Decide whether a reblogged post should be skipped.

    Unless the 'reblogs' option is the string "deleted", the result is
    simply the negation of that option.  In "deleted" mode, the post
    referenced by ``post["reblogged_root_url"]`` is requested from the
    API, and the reblog is kept only if that request raises
    ``NotFoundError``.

    Returns ``True`` to skip the reblog, ``False`` to process it.
    """
    if self.reblogs != "deleted":
        return not self.reblogs

    match = re.match(
        TumblrPostExtractor.pattern[0], post["reblogged_root_url"])
    if match:
        # NOTE(review): groups 1/2 appear to be alternative captures of
        # the blog name and group 3 the post ID -- confirm against
        # TumblrPostExtractor.pattern.
        blog = match.group(1) or match.group(2)
        try:
            # Only the outcome of the request matters; the returned post
            # is discarded.  The 'None' default keeps an empty result
            # from raising StopIteration, which would surface as a
            # RuntimeError inside the calling generator (PEP 479).  An
            # empty result is not treated as proof of deletion, so the
            # reblog is still skipped in that case.
            next(self.api.posts(blog, {"id": match.group(3)}), None)
        except exception.NotFoundError:
            return False
    return True
class TumblrUserExtractor(TumblrExtractor):
"""Extractor for all images from a tumblr-user"""

Loading…
Cancel
Save