[twitter] add option to extract TwitPic embeds (#579)

pull/586/head
Mike Fährmann 5 years ago
parent 254f7c3999
commit 25d5ec4ff3
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -1067,6 +1067,15 @@ Description Extract images from retweets.
=========== =====
extractor.twitter.twitpic
-------------------------
=========== =====
Type ``bool``
Default ``false``
Description Extract `TwitPic <https://twitpic.com/>`__ embeds.
=========== =====
extractor.twitter.videos
------------------------
=========== =====

@ -144,6 +144,7 @@
{
"content": false,
"retweets": true,
"twitpic": false,
"videos": false
},
"vsco":

@ -30,6 +30,7 @@ class TwitterExtractor(Extractor):
self._user_dict = None
self.logged_in = False
self.retweets = self.config("retweets", True)
self.twitpic = self.config("twitpic", False)
self.content = self.config("content", False)
self.videos = self.config("videos", False)
@ -79,6 +80,26 @@ class TwitterExtractor(Extractor):
urls = [url + size for size in self.sizes]
yield Message.Urllist, urls, data
if self.twitpic and "//twitpic.com/" in tweet:
urls = [
url for url in text.extract_iter(
tweet, 'data-expanded-url="', '"')
if "//twitpic.com/" in url
]
if "num" not in data:
if urls:
yield Message.Directory, data
data["num"] = 0
for data["num"], url in enumerate(urls, data["num"]+1):
response = self.request(url, fatal=False)
if response.status_code >= 400:
continue
url = text.extract(
response.text, 'name="twitter:image" value="', '"')[0]
yield Message.Url, url, text.nameext_from_url(url, data)
def metadata(self):
"""Return general metadata"""
return {}
@ -230,7 +251,7 @@ class TwitterExtractor(Extractor):
for tweet in text.extract_iter(
data["items_html"], '<div class="tweet ', '\n</li>'):
yield tweet
if data.get("min_position") is None:
if data["has_more_items"] and "min_position" not in data:
pass
@ -348,6 +369,12 @@ class TwitterTweetExtractor(TwitterExtractor):
"count": 4,
"keyword": "0c627af2b8cdccc7e0da8fd221155c4a4a3141a8",
}),
# TwitPic embeds (#579)
("https://twitter.com/i/web/status/112900228289540096", {
"options": (("twitpic", True),),
"pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
"count": 3,
}),
)
def __init__(self, match):

Loading…
Cancel
Save