[twitter] improve 'cards-blacklist' (#2875)

allow blacklisting cards by domain and by 'name:domain',
where 'domain' is taken from a card's 'vanity_url' value
pull/2967/head
Mike Fährmann 2 years ago
parent aaf6992bae
commit e99a9b2aff
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -2362,9 +2362,15 @@ extractor.twitter.cards-blacklist
Type
``list`` of ``strings``
Example
``["player", "summary"]``
``["summary", "youtube.com", "player:twitch.tv"]``
Description
List of card types to ignore
List of card types to ignore.
Possible values are:
* card names
* card domains (a card's ``vanity_url`` value)
* ``<card name>:<card domain>``
extractor.twitter.conversations

@ -41,7 +41,7 @@ class TwitterExtractor(Extractor):
self.quoted = self.config("quoted", False)
self.videos = self.config("videos", True)
self.cards = self.config("cards", False)
self.cards_blacklist = self.config("cards-blacklist") or ()
self.cards_blacklist = self.config("cards-blacklist")
self._user = self._user_obj = None
self._user_cache = {}
self._init_sizes()
@ -180,16 +180,21 @@ class TwitterExtractor(Extractor):
card = card["legacy"]
name = card["name"].rpartition(":")[2]
if name in self.cards_blacklist:
return
bvals = card["binding_values"]
if isinstance(bvals, list):
bvals = {bval["key"]: bval["value"]
for bval in card["binding_values"]}
cbl = self.cards_blacklist
if cbl:
if name in cbl:
return
if "vanity_url" in bvals:
domain = bvals["vanity_url"]["string_value"]
if domain in cbl or name + ":" + domain in cbl:
return
if name in ("summary", "summary_large_image"):
bvals = card["binding_values"]
if isinstance(bvals, list):
bvals = {
bval["key"]: bval["value"]
for bval in card["binding_values"]
}
for prefix in ("photo_image_full_size_",
"summary_photo_image_",
"thumbnail_image_"):
@ -206,15 +211,7 @@ class TwitterExtractor(Extractor):
files.append(value)
return
elif name == "unified_card":
bvals = card["binding_values"]
if isinstance(bvals, list):
for bval in card["binding_values"]:
if bval["key"] == "unified_card":
bval = bval["value"]["string_value"]
break
else:
bval = bvals["unified_card"]["string_value"]
data = json.loads(bval)
data = json.loads(bvals["unified_card"]["string_value"])
self._extract_media(tweet, data["media_entities"].values(), files)
return
@ -761,6 +758,12 @@ class TwitterTweetExtractor(TwitterExtractor):
("https://twitter.com/i/web/status/1466183847628865544", {
"count": 0,
}),
# 'cards-blacklist' option
("https://twitter.com/i/web/status/1571141912295243776", {
"options": (("cards", "ytdl"),
("cards-blacklist", ("twitch.tv",))),
"count": 0,
}),
# original retweets (#1026)
("https://twitter.com/jessica_3978/status/1296304589591810048", {
"options": (("retweets", "original"),),

Loading…
Cancel
Save