[twitter] ignore previously seen Tweets (#2712)

Duplicate Tweets occur primarily in /with_replies results when logged in.
pull/2739/head
Mike Fährmann 2 years ago
parent 4b2a0a0eda
commit 1d14928bd9
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@@ -2415,6 +2415,16 @@ Description
Extract `TwitPic <https://twitpic.com/>`__ embeds.
extractor.twitter.unique
------------------------
Type
``bool``
Default
``true``
Description
Ignore previously seen Tweets.
extractor.twitter.users
-----------------------
Type

@@ -288,6 +288,7 @@
"strategy": null,
"text-tweets": false,
"twitpic": false,
"unique": true,
"users": "timeline",
"videos": true
},

@@ -64,6 +64,11 @@ class TwitterExtractor(Extractor):
tweets = self._expand_tweets(self.tweets())
self.tweets = lambda : tweets
if self.config("unique", True):
seen_tweets = set()
else:
seen_tweets = None
for tweet in self.tweets():
if "legacy" in tweet:
@@ -71,6 +76,11 @@ class TwitterExtractor(Extractor):
else:
data = tweet
if seen_tweets is not None:
if data["id_str"] in seen_tweets:
continue
seen_tweets.add(data["id_str"])
if not self.retweets and "retweeted_status_id_str" in data:
self.log.debug("Skipping %s (retweet)", data["id_str"])
continue

Loading…
Cancel
Save