Add search downloading to twitter.py (#448)

Adds the ability to download search results from twitter.com/search. Since Twitter only allows downloading up to 3,200 of a user's most recent tweets, you are unable to download old images from users with a lot of tweets. To work around this limit, you can use the Twitter search to get the tweets from the time periods that the timeline could not reach. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use looks something like this: https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en

The _tweets_from_api function had to be changed because, for search results, it could not get the next page using the last "data-tweet-id": the API would return the same JSON again, but with a "min_position" string added. Using this string as the "max_position" param from the second page onwards correctly returns the following pages. As far as I know, this change does not interfere with how the other extractors work. The two regex patterns in the timeline and media extractors had to be changed so that they do not match the search URL.
pull/465/head
Alice 5 years ago committed by Mike Fährmann
parent 1693d97bd3
commit bcddcca6db

@ -140,6 +140,11 @@ class TwitterExtractor(Extractor):
if not data["has_more_items"]:
return
if "min_position" in data:
position = data["min_position"]
if "max_position" in params and position == params["max_position"]:
return
else:
position = text.parse_int(text.extract(
tweet, 'data-tweet-id="', '"')[0])
if "max_position" in params and position >= params["max_position"]:
@ -151,7 +156,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
"""Extractor for all images from a user's timeline"""
subcategory = "timeline"
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
r"/([^/?&#]+)/?(?:$|[?#])")
r"/((?!search)[^/?&#]+)/?(?:$|[?#])")
test = (
("https://twitter.com/supernaturepics", {
"range": "1-40",
@ -171,7 +176,7 @@ class TwitterMediaExtractor(TwitterExtractor):
"""Extractor for all images from a user's Media Tweets"""
subcategory = "media"
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
r"/([^/?&#]+)/media(?!\w)")
r"/((?!search)[^/?&#]+)/media(?!\w)")
test = (
("https://twitter.com/supernaturepics/media", {
"range": "1-40",
@ -185,6 +190,17 @@ class TwitterMediaExtractor(TwitterExtractor):
self.root, self.user)
return self._tweets_from_api(url)
class TwitterSearchExtractor(TwitterExtractor):
    """Extractor for all images from a search timeline"""
    subcategory = "search"
    # Capture the raw (still URL-encoded) value of the "q" query parameter.
    # Preceding parameters are skipped one whole "name=value&" pair at a
    # time, so "q" is found even when an earlier parameter contains the
    # letter "q" (e.g. "src=typed_query"), and a parameter whose name merely
    # ends in "q" is not mistaken for it. The value may be followed by the
    # end of the URL, another parameter ("&") or a fragment ("#").
    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
               r"/search/?\?(?:[^&#]*&)*q=([^/?&#]+)(?:$|[&#])")
    test = ()

    def tweets(self):
        """Return the tweets of the search timeline for the matched query"""
        # NOTE(review): self.user is presumably set by the base class from
        # the pattern's first group, i.e. the search query here -- confirm.
        url = "{}/i/search/timeline?f=tweets&q={}".format(
            self.root, self.user)
        return self._tweets_from_api(url)
class TwitterTweetExtractor(TwitterExtractor):
"""Extractor for images from individual tweets"""

Loading…
Cancel
Save