[twitter] add 'syndication' option (#2354)

to fetch age-restricted content using Twitter's syndication API
pull/2484/head
Mike Fährmann 3 years ago
parent a53cfc845e
commit 1171911dc3
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -2182,6 +2182,16 @@ Description
``4096x4096``, ``orig``, ``large``, ``medium``, and ``small``. ``4096x4096``, ``orig``, ``large``, ``medium``, and ``small``.
extractor.twitter.syndication
-----------------------------
Type
``bool``
Default
``false``
Description
Retrieve age-restricted content using Twitter's syndication API.
extractor.twitter.logout extractor.twitter.logout
------------------------ ------------------------
Type Type

@ -217,23 +217,24 @@ class TwitterExtractor(Extractor):
if "legacy" in tweet: if "legacy" in tweet:
tweet = tweet["legacy"] tweet = tweet["legacy"]
tget = tweet.get
entities = tweet["entities"] entities = tweet["entities"]
tdata = { tdata = {
"tweet_id" : text.parse_int(tweet["id_str"]), "tweet_id" : text.parse_int(tweet["id_str"]),
"retweet_id" : text.parse_int( "retweet_id" : text.parse_int(
tweet.get("retweeted_status_id_str")), tget("retweeted_status_id_str")),
"quote_id" : text.parse_int( "quote_id" : text.parse_int(
tweet.get("quoted_status_id_str")), tget("quoted_status_id_str")),
"reply_id" : text.parse_int( "reply_id" : text.parse_int(
tweet.get("in_reply_to_status_id_str")), tget("in_reply_to_status_id_str")),
"date" : text.parse_datetime( "date" : text.parse_datetime(
tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"), tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
"user" : user, "user" : user,
"lang" : tweet["lang"], "lang" : tweet["lang"],
"favorite_count": tweet["favorite_count"], "favorite_count": tget("favorite_count"),
"quote_count" : tweet["quote_count"], "quote_count" : tget("quote_count"),
"reply_count" : tweet["reply_count"], "reply_count" : tget("reply_count"),
"retweet_count" : tweet["retweet_count"], "retweet_count" : tget("retweet_count"),
} }
hashtags = entities.get("hashtags") hashtags = entities.get("hashtags")
@ -248,7 +249,7 @@ class TwitterExtractor(Extractor):
"nick": u["name"], "nick": u["name"],
} for u in mentions] } for u in mentions]
content = tweet["full_text"] content = tget("full_text") or tget("text") or ""
urls = entities.get("urls") urls = entities.get("urls")
if urls: if urls:
for url in urls: for url in urls:
@ -269,33 +270,36 @@ class TwitterExtractor(Extractor):
return tdata return tdata
def _transform_user(self, user): def _transform_user(self, user):
uid = user.get("rest_id") or user["id_str"]
try: try:
return self._user_cache[user.get("rest_id") or user["id_str"]] return self._user_cache[uid]
except KeyError: except KeyError:
pass pass
uid = user.get("rest_id") or user["id_str"]
if "legacy" in user: if "legacy" in user:
user = user["legacy"] user = user["legacy"]
uget = user.get
entities = user["entities"] entities = user["entities"]
self._user_cache[uid] = udata = { self._user_cache[uid] = udata = {
"id" : text.parse_int(uid), "id" : text.parse_int(uid),
"name" : user["screen_name"], "name" : user["screen_name"],
"nick" : user["name"], "nick" : user["name"],
"location" : user["location"], "location" : uget("location"),
"date" : text.parse_datetime( "date" : text.parse_datetime(
user["created_at"], "%a %b %d %H:%M:%S %z %Y"), uget("created_at"), "%a %b %d %H:%M:%S %z %Y"),
"verified" : user.get("verified", False), "verified" : uget("verified", False),
"profile_banner" : user.get("profile_banner_url", ""), "profile_banner" : uget("profile_banner_url", ""),
"profile_image" : user.get( "profile_image" : uget(
"profile_image_url_https", "").replace("_normal.", "."), "profile_image_url_https", "").replace("_normal.", "."),
"favourites_count": user["favourites_count"], "favourites_count": uget("favourites_count"),
"followers_count" : user["followers_count"], "followers_count" : uget("followers_count"),
"friends_count" : user["friends_count"], "friends_count" : uget("friends_count"),
"listed_count" : user["listed_count"], "listed_count" : uget("listed_count"),
"media_count" : user["media_count"], "media_count" : uget("media_count"),
"statuses_count" : user["statuses_count"], "statuses_count" : uget("statuses_count"),
} }
descr = user["description"] descr = user["description"]
@ -653,6 +657,11 @@ class TwitterTweetExtractor(TwitterExtractor):
("https://twitter.com/i/web/status/1486373748911575046", { ("https://twitter.com/i/web/status/1486373748911575046", {
"count": 4, "count": 4,
}), }),
# age-restricted (#2354)
("https://twitter.com/mightbecursed/status/1492954264909479936", {
"options": (("syndication", True),),
"count": 1,
}),
) )
def __init__(self, match): def __init__(self, match):
@ -770,6 +779,7 @@ class TwitterAPI():
} }
self._nsfw_warning = True self._nsfw_warning = True
self._syndication = extractor.config("syndication")
self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
self._user = None self._user = None
@ -1153,9 +1163,10 @@ class TwitterAPI():
elif esw("conversationthread-"): elif esw("conversationthread-"):
tweets.extend(entry["content"]["items"]) tweets.extend(entry["content"]["items"])
elif esw("tombstone-"): elif esw("tombstone-"):
self._report_tombstone( item = entry["content"]["itemContent"]
entry, item["tweet_results"] = \
entry["content"]["itemContent"]["tombstoneInfo"]) {"result": {"tombstone": item["tombstoneInfo"]}}
tweets.append(entry)
elif esw("cursor-bottom-"): elif esw("cursor-bottom-"):
cursor = entry["content"] cursor = entry["content"]
if not cursor.get("stopOnEmptyResponse", True): if not cursor.get("stopOnEmptyResponse", True):
@ -1168,7 +1179,9 @@ class TwitterAPI():
tweet = ((entry.get("content") or entry["item"]) tweet = ((entry.get("content") or entry["item"])
["itemContent"]["tweet_results"]["result"]) ["itemContent"]["tweet_results"]["result"])
if "tombstone" in tweet: if "tombstone" in tweet:
self._report_tombstone(entry, tweet["tombstone"]) tweet = self._process_tombstone(
entry, tweet["tombstone"])
if not tweet:
continue continue
if "tweet" in tweet: if "tweet" in tweet:
tweet = tweet["tweet"] tweet = tweet["tweet"]
@ -1259,10 +1272,45 @@ class TwitterAPI():
return return
variables["cursor"] = cursor variables["cursor"] = cursor
def _report_tombstone(self, entry, tombstone): def _process_tombstone(self, entry, tombstone):
text = (tombstone.get("richText") or tombstone["text"])["text"] text = (tombstone.get("richText") or tombstone["text"])["text"]
if text.startswith("Age-restricted") and self._nsfw_warning: tweet_id = entry["entryId"].rpartition("-")[2]
self.extractor.log.warning(text)
if text.startswith("Age-restricted"):
if self._syndication:
return self._syndication_tweet(tweet_id)
elif self._nsfw_warning:
self._nsfw_warning = False self._nsfw_warning = False
self.extractor.log.debug( self.extractor.log.warning('"%s"', text)
"Skipping %s (%s)", entry["entryId"].rpartition("-")[2], text)
self.extractor.log.debug("Skipping %s (\"%s\")", tweet_id, text)
def _syndication_tweet(self, tweet_id):
    """Fetch a tweet through Twitter's syndication endpoint.

    Requests ``https://cdn.syndication.twimg.com/tweet?id=<tweet_id>``
    via ``self.extractor.request`` and patches the decoded JSON object
    in place: the ``user`` object gets empty ``description`` and
    ``entities`` values, and an ``extended_entities`` entry is built
    from the response's ``video`` or ``photos`` data when present.

    Args:
        tweet_id: Tweet ID as a string; appended to the URL as-is.

    Returns:
        A dict with the keys ``rest_id`` (the response's ``id_str``),
        ``legacy`` (the patched response object) and ``user`` (the
        response's ``user`` object).

    Raises:
        KeyError: If the response has no ``user`` or ``id_str`` key.
    """
    tweet = self.extractor.request(
        "https://cdn.syndication.twimg.com/tweet?id=" + tweet_id).json()

    # NOTE(review): presumably the syndication payload lacks these
    # fields and later user processing expects them -- confirm.
    user = tweet["user"]
    user["description"] = ""
    user["entities"] = {"description": {}}

    if "video" in tweet:
        video = tweet["video"]
        variants = video.get("variants")
        # An empty or missing variant list used to raise IndexError;
        # in that case the tweet is returned without any media.
        if variants:
            # Keep only the last variant.
            # NOTE(review): assumes the last entry is the preferred
            # one (e.g. best quality) -- confirm against the API.
            del variants[:-1]
            variants[0]["url"] = variants[0]["src"]
            tweet["extended_entities"] = {"media": [{
                "video_info"   : video,
                # no dimensions are taken from the video data
                "original_info": {"width" : 0, "height": 0},
            }]}
    elif "photos" in tweet:
        photos = tweet["photos"]
        for photo in photos:
            photo["media_url_https"] = photo["url"]
            photo["original_info"] = {
                "width" : photo["width"],
                "height": photo["height"],
            }
        tweet["extended_entities"] = {"media": photos}

    return {
        "rest_id": tweet["id_str"],
        "legacy" : tweet,
        "user"   : user,
    }

Loading…
Cancel
Save