[tumblr] provide fallback URLs (#64)

Each image now produces 3 URLs:
- amazonaws.com _raw (or _1280 for older images)
- amazonaws.com _500
- media.tumblr.com (URL returned by API)
pull/79/head
Mike Fährmann 7 years ago
parent b837420291
commit 9fccd7b783
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -15,12 +15,20 @@ import re
def _original_image(url): def _original_image(url):
if url.endswith(".gif") and "_inline_" in url: match = re.match(
return url r"https?://\d+\.media\.tumblr\.com"
return re.sub( r"((/[0-9a-f]+)?/tumblr_[^/?&#.]+)_\d+\.([0-9a-z]+)",
(r"https?://\d+\.media\.tumblr\.com" url)
r"/([0-9a-f]+/tumblr_[^/?&#.]+)_\d+\.([0-9a-z]+)"),
r"https://s3.amazonaws.com/data.tumblr.com/\1_raw.\2", url if not match:
return (url,)
root = "https://s3.amazonaws.com/data.tumblr.com"
path, key, ext = match.groups()
return (
"".join((root, path, "_raw." if key else "_1280.", ext)),
"".join((root, path, "_500.", ext)),
url,
) )
@ -90,7 +98,7 @@ class TumblrExtractor(Extractor):
photo.update(photo["original_size"]) photo.update(photo["original_size"])
del photo["original_size"] del photo["original_size"]
del photo["alt_sizes"] del photo["alt_sizes"]
yield self._prepare(_original_image(photo["url"]), post) yield self._prepare_image(photo["url"], post)
if "audio_url" in post: # type: "audio" if "audio_url" in post: # type: "audio"
yield self._prepare(post["audio_url"], post) yield self._prepare(post["audio_url"], post)
@ -102,7 +110,7 @@ class TumblrExtractor(Extractor):
for key in ("body", "description"): for key in ("body", "description"):
if key in post: if key in post:
for url in re.findall('<img src="([^"]+)"', post[key]): for url in re.findall('<img src="([^"]+)"', post[key]):
yield self._prepare(_original_image(url), post) yield self._prepare_image(url, post)
if self.external: # external links if self.external: # external links
post["extension"] = None post["extension"] = None
@ -139,6 +147,12 @@ class TumblrExtractor(Extractor):
post["offset"] += 1 post["offset"] += 1
return Message.Url, url, text.nameext_from_url(url, post) return Message.Url, url, text.nameext_from_url(url, post)
@staticmethod
def _prepare_image(url, post):
    """Build a Urllist message for the image at 'url'"""
    # Count this file within the current post.
    post["offset"] += 1
    # _original_image() returns a tuple of candidate URLs for the image.
    candidates = _original_image(url)
    # Filename and extension metadata are taken from the URL passed in,
    # not from any of the derived candidates.
    metadata = text.nameext_from_url(url, post)
    return Message.Urllist, candidates, metadata
class TumblrUserExtractor(TumblrExtractor): class TumblrUserExtractor(TumblrExtractor):
"""Extractor for all images from a tumblr-user""" """Extractor for all images from a tumblr-user"""
@ -146,13 +160,13 @@ class TumblrUserExtractor(TumblrExtractor):
pattern = [BASE_PATTERN + r"(?:/page/\d+)?/?$"] pattern = [BASE_PATTERN + r"(?:/page/\d+)?/?$"]
test = [ test = [
("http://demo.tumblr.com/", { ("http://demo.tumblr.com/", {
"pattern": (r"https?://\d+\.media\.tumblr\.com" "pattern": (r"https://s3\.amazonaws\.com/data\.tumblr\.com"
r"/tumblr_[^/_]+_\d+\.jpg"), r"/tumblr_[^/_]+_\d+\.jpg"),
"count": 1, "count": 1,
}), }),
("http://demo.tumblr.com/", { ("http://demo.tumblr.com/", {
"pattern": (r"https?://(?:$|" "pattern": (r"https?://(?:$|"
r"\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280\.jpg|" r"s3\.amazonaws\.com/data\.tumblr\.com/.+_1280\.jpg|"
r"w+\.tumblr\.com/audio_file/demo/\d+/tumblr_\w+)"), r"w+\.tumblr\.com/audio_file/demo/\d+/tumblr_\w+)"),
"count": 3, "count": 3,
"options": (("posts", "all"), ("external", True), "options": (("posts", "all"), ("external", True),
@ -171,7 +185,8 @@ class TumblrPostExtractor(TumblrExtractor):
subcategory = "post" subcategory = "post"
pattern = [BASE_PATTERN + r"/post/(\d+)"] pattern = [BASE_PATTERN + r"/post/(\d+)"]
test = [("http://demo.tumblr.com/post/459265350", { test = [("http://demo.tumblr.com/post/459265350", {
"pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg", "pattern": (r"https://s3\.amazonaws\.com/data\.tumblr\.com"
r"/tumblr_[^/_]+_1280.jpg"),
"count": 1, "count": 1,
})] })]
@ -193,7 +208,8 @@ class TumblrTagExtractor(TumblrExtractor):
subcategory = "tag" subcategory = "tag"
pattern = [BASE_PATTERN + r"/tagged/([^/?&#]+)"] pattern = [BASE_PATTERN + r"/tagged/([^/?&#]+)"]
test = [("http://demo.tumblr.com/tagged/Times%20Square", { test = [("http://demo.tumblr.com/tagged/Times%20Square", {
"pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg", "pattern": (r"https://s3\.amazonaws\.com/data\.tumblr\.com"
r"/tumblr_[^/_]+_1280.jpg"),
"count": 1, "count": 1,
})] })]

Loading…
Cancel
Save