Add search downloading to twitter.py (#448)

Adds the ability to download search results from twitter.com/search. Since Twitter only allows downloading up to 3,200 of a user's most recent tweets, you are unable to download old images from users with a lot of tweets. To work around this, you can use the Twitter search to get the tweets from the time periods beyond the point where you were cut off. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use looks something like this: https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en The _tweets_from_api function had to be changed because, for search results, it would not get the next page of results when using the last "data-tweet-id". Instead, it would return the same JSON, but with a "min_position" string added. Using this string as the "max_position" param from the second page onwards correctly returned the following pages. As far as I know, this change does not interfere with how the other extractors work. The 2 regex patterns in the existing extractors had to be changed so that they do not match the search URL.
5 years ago · bcddcca6db
parent 1693d97bd3
commit bcddcca6db
1 changed files with 22 additions and 6 deletions
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@ -140,6 +140,11 @@ class TwitterExtractor(Extractor):
            if not data["has_more_items"]:
                return

+            if "min_position" in data:
+                position = data["min_position"]
+                if "max_position" in params and position == params["max_position"]:
+                    return
+            else:
                position = text.parse_int(text.extract(
                    tweet, 'data-tweet-id="', '"')[0])
                if "max_position" in params and position >= params["max_position"]:
@ -151,7 +156,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
    """Extractor for all images from a user's timeline"""
    subcategory = "timeline"
    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
-               r"/([^/?&#]+)/?(?:$|[?#])")
+               r"/((?!search)[^/?&#]+)/?(?:$|[?#])")
    test = (
        ("https://twitter.com/supernaturepics", {
            "range": "1-40",
@ -171,7 +176,7 @@ class TwitterMediaExtractor(TwitterExtractor):
    """Extractor for all images from a user's Media Tweets"""
    subcategory = "media"
    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
-               r"/([^/?&#]+)/media(?!\w)")
+               r"/((?!search)[^/?&#]+)/media(?!\w)")
    test = (
        ("https://twitter.com/supernaturepics/media", {
            "range": "1-40",
@ -185,6 +190,17 @@ class TwitterMediaExtractor(TwitterExtractor):
            self.root, self.user)
        return self._tweets_from_api(url)

+class TwitterSearchExtractor(TwitterExtractor):
+    """Extractor for all images from a search timeline"""
+    subcategory = "search"
+    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
+               r"/search[^q]+q=([^/?&#]+)(?:$|&)")
+    test = ()
+    
+    def tweets(self):
+        url = "{}/i/search/timeline?f=tweets&q={}".format(
+            self.root, self.user)
+        return self._tweets_from_api(url)

 class TwitterTweetExtractor(TwitterExtractor):
    """Extractor for images from individual tweets"""