|
|
@ -217,23 +217,24 @@ class TwitterExtractor(Extractor):
|
|
|
|
if "legacy" in tweet:
|
|
|
|
if "legacy" in tweet:
|
|
|
|
tweet = tweet["legacy"]
|
|
|
|
tweet = tweet["legacy"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tget = tweet.get
|
|
|
|
entities = tweet["entities"]
|
|
|
|
entities = tweet["entities"]
|
|
|
|
tdata = {
|
|
|
|
tdata = {
|
|
|
|
"tweet_id" : text.parse_int(tweet["id_str"]),
|
|
|
|
"tweet_id" : text.parse_int(tweet["id_str"]),
|
|
|
|
"retweet_id" : text.parse_int(
|
|
|
|
"retweet_id" : text.parse_int(
|
|
|
|
tweet.get("retweeted_status_id_str")),
|
|
|
|
tget("retweeted_status_id_str")),
|
|
|
|
"quote_id" : text.parse_int(
|
|
|
|
"quote_id" : text.parse_int(
|
|
|
|
tweet.get("quoted_status_id_str")),
|
|
|
|
tget("quoted_status_id_str")),
|
|
|
|
"reply_id" : text.parse_int(
|
|
|
|
"reply_id" : text.parse_int(
|
|
|
|
tweet.get("in_reply_to_status_id_str")),
|
|
|
|
tget("in_reply_to_status_id_str")),
|
|
|
|
"date" : text.parse_datetime(
|
|
|
|
"date" : text.parse_datetime(
|
|
|
|
tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
|
|
|
|
tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
|
|
|
|
"user" : user,
|
|
|
|
"user" : user,
|
|
|
|
"lang" : tweet["lang"],
|
|
|
|
"lang" : tweet["lang"],
|
|
|
|
"favorite_count": tweet["favorite_count"],
|
|
|
|
"favorite_count": tget("favorite_count"),
|
|
|
|
"quote_count" : tweet["quote_count"],
|
|
|
|
"quote_count" : tget("quote_count"),
|
|
|
|
"reply_count" : tweet["reply_count"],
|
|
|
|
"reply_count" : tget("reply_count"),
|
|
|
|
"retweet_count" : tweet["retweet_count"],
|
|
|
|
"retweet_count" : tget("retweet_count"),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
hashtags = entities.get("hashtags")
|
|
|
|
hashtags = entities.get("hashtags")
|
|
|
@ -248,7 +249,7 @@ class TwitterExtractor(Extractor):
|
|
|
|
"nick": u["name"],
|
|
|
|
"nick": u["name"],
|
|
|
|
} for u in mentions]
|
|
|
|
} for u in mentions]
|
|
|
|
|
|
|
|
|
|
|
|
content = tweet["full_text"]
|
|
|
|
content = tget("full_text") or tget("text") or ""
|
|
|
|
urls = entities.get("urls")
|
|
|
|
urls = entities.get("urls")
|
|
|
|
if urls:
|
|
|
|
if urls:
|
|
|
|
for url in urls:
|
|
|
|
for url in urls:
|
|
|
@ -269,33 +270,36 @@ class TwitterExtractor(Extractor):
|
|
|
|
return tdata
|
|
|
|
return tdata
|
|
|
|
|
|
|
|
|
|
|
|
def _transform_user(self, user):
|
|
|
|
def _transform_user(self, user):
|
|
|
|
|
|
|
|
uid = user.get("rest_id") or user["id_str"]
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
return self._user_cache[user.get("rest_id") or user["id_str"]]
|
|
|
|
return self._user_cache[uid]
|
|
|
|
except KeyError:
|
|
|
|
except KeyError:
|
|
|
|
pass
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
uid = user.get("rest_id") or user["id_str"]
|
|
|
|
|
|
|
|
if "legacy" in user:
|
|
|
|
if "legacy" in user:
|
|
|
|
user = user["legacy"]
|
|
|
|
user = user["legacy"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uget = user.get
|
|
|
|
entities = user["entities"]
|
|
|
|
entities = user["entities"]
|
|
|
|
|
|
|
|
|
|
|
|
self._user_cache[uid] = udata = {
|
|
|
|
self._user_cache[uid] = udata = {
|
|
|
|
"id" : text.parse_int(uid),
|
|
|
|
"id" : text.parse_int(uid),
|
|
|
|
"name" : user["screen_name"],
|
|
|
|
"name" : user["screen_name"],
|
|
|
|
"nick" : user["name"],
|
|
|
|
"nick" : user["name"],
|
|
|
|
"location" : user["location"],
|
|
|
|
"location" : uget("location"),
|
|
|
|
"date" : text.parse_datetime(
|
|
|
|
"date" : text.parse_datetime(
|
|
|
|
user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
|
|
|
|
uget("created_at"), "%a %b %d %H:%M:%S %z %Y"),
|
|
|
|
"verified" : user.get("verified", False),
|
|
|
|
"verified" : uget("verified", False),
|
|
|
|
"profile_banner" : user.get("profile_banner_url", ""),
|
|
|
|
"profile_banner" : uget("profile_banner_url", ""),
|
|
|
|
"profile_image" : user.get(
|
|
|
|
"profile_image" : uget(
|
|
|
|
"profile_image_url_https", "").replace("_normal.", "."),
|
|
|
|
"profile_image_url_https", "").replace("_normal.", "."),
|
|
|
|
"favourites_count": user["favourites_count"],
|
|
|
|
"favourites_count": uget("favourites_count"),
|
|
|
|
"followers_count" : user["followers_count"],
|
|
|
|
"followers_count" : uget("followers_count"),
|
|
|
|
"friends_count" : user["friends_count"],
|
|
|
|
"friends_count" : uget("friends_count"),
|
|
|
|
"listed_count" : user["listed_count"],
|
|
|
|
"listed_count" : uget("listed_count"),
|
|
|
|
"media_count" : user["media_count"],
|
|
|
|
"media_count" : uget("media_count"),
|
|
|
|
"statuses_count" : user["statuses_count"],
|
|
|
|
"statuses_count" : uget("statuses_count"),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
descr = user["description"]
|
|
|
|
descr = user["description"]
|
|
|
@ -653,6 +657,11 @@ class TwitterTweetExtractor(TwitterExtractor):
|
|
|
|
("https://twitter.com/i/web/status/1486373748911575046", {
|
|
|
|
("https://twitter.com/i/web/status/1486373748911575046", {
|
|
|
|
"count": 4,
|
|
|
|
"count": 4,
|
|
|
|
}),
|
|
|
|
}),
|
|
|
|
|
|
|
|
# age-restricted (#2354)
|
|
|
|
|
|
|
|
("https://twitter.com/mightbecursed/status/1492954264909479936", {
|
|
|
|
|
|
|
|
"options": (("syndication", True),),
|
|
|
|
|
|
|
|
"count": 1,
|
|
|
|
|
|
|
|
}),
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, match):
|
|
|
|
def __init__(self, match):
|
|
|
@ -770,6 +779,7 @@ class TwitterAPI():
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
self._nsfw_warning = True
|
|
|
|
self._nsfw_warning = True
|
|
|
|
|
|
|
|
self._syndication = extractor.config("syndication")
|
|
|
|
self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
|
|
|
|
self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
|
|
|
|
self._user = None
|
|
|
|
self._user = None
|
|
|
|
|
|
|
|
|
|
|
@ -1153,9 +1163,10 @@ class TwitterAPI():
|
|
|
|
elif esw("conversationthread-"):
|
|
|
|
elif esw("conversationthread-"):
|
|
|
|
tweets.extend(entry["content"]["items"])
|
|
|
|
tweets.extend(entry["content"]["items"])
|
|
|
|
elif esw("tombstone-"):
|
|
|
|
elif esw("tombstone-"):
|
|
|
|
self._report_tombstone(
|
|
|
|
item = entry["content"]["itemContent"]
|
|
|
|
entry,
|
|
|
|
item["tweet_results"] = \
|
|
|
|
entry["content"]["itemContent"]["tombstoneInfo"])
|
|
|
|
{"result": {"tombstone": item["tombstoneInfo"]}}
|
|
|
|
|
|
|
|
tweets.append(entry)
|
|
|
|
elif esw("cursor-bottom-"):
|
|
|
|
elif esw("cursor-bottom-"):
|
|
|
|
cursor = entry["content"]
|
|
|
|
cursor = entry["content"]
|
|
|
|
if not cursor.get("stopOnEmptyResponse", True):
|
|
|
|
if not cursor.get("stopOnEmptyResponse", True):
|
|
|
@ -1168,7 +1179,9 @@ class TwitterAPI():
|
|
|
|
tweet = ((entry.get("content") or entry["item"])
|
|
|
|
tweet = ((entry.get("content") or entry["item"])
|
|
|
|
["itemContent"]["tweet_results"]["result"])
|
|
|
|
["itemContent"]["tweet_results"]["result"])
|
|
|
|
if "tombstone" in tweet:
|
|
|
|
if "tombstone" in tweet:
|
|
|
|
self._report_tombstone(entry, tweet["tombstone"])
|
|
|
|
tweet = self._process_tombstone(
|
|
|
|
|
|
|
|
entry, tweet["tombstone"])
|
|
|
|
|
|
|
|
if not tweet:
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
if "tweet" in tweet:
|
|
|
|
if "tweet" in tweet:
|
|
|
|
tweet = tweet["tweet"]
|
|
|
|
tweet = tweet["tweet"]
|
|
|
@ -1259,10 +1272,45 @@ class TwitterAPI():
|
|
|
|
return
|
|
|
|
return
|
|
|
|
variables["cursor"] = cursor
|
|
|
|
variables["cursor"] = cursor
|
|
|
|
|
|
|
|
|
|
|
|
def _report_tombstone(self, entry, tombstone):
|
|
|
|
def _process_tombstone(self, entry, tombstone):
|
|
|
|
text = (tombstone.get("richText") or tombstone["text"])["text"]
|
|
|
|
text = (tombstone.get("richText") or tombstone["text"])["text"]
|
|
|
|
if text.startswith("Age-restricted") and self._nsfw_warning:
|
|
|
|
tweet_id = entry["entryId"].rpartition("-")[2]
|
|
|
|
self.extractor.log.warning(text)
|
|
|
|
|
|
|
|
|
|
|
|
if text.startswith("Age-restricted"):
|
|
|
|
|
|
|
|
if self._syndication:
|
|
|
|
|
|
|
|
return self._syndication_tweet(tweet_id)
|
|
|
|
|
|
|
|
elif self._nsfw_warning:
|
|
|
|
self._nsfw_warning = False
|
|
|
|
self._nsfw_warning = False
|
|
|
|
self.extractor.log.debug(
|
|
|
|
self.extractor.log.warning('"%s"', text)
|
|
|
|
"Skipping %s (%s)", entry["entryId"].rpartition("-")[2], text)
|
|
|
|
|
|
|
|
|
|
|
|
self.extractor.log.debug("Skipping %s (\"%s\")", tweet_id, text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _syndication_tweet(self, tweet_id):
|
|
|
|
|
|
|
|
tweet = self.extractor.request(
|
|
|
|
|
|
|
|
"https://cdn.syndication.twimg.com/tweet?id=" + tweet_id).json()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tweet["user"]["description"] = ""
|
|
|
|
|
|
|
|
tweet["user"]["entities"] = {"description": {}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if "video" in tweet:
|
|
|
|
|
|
|
|
video = tweet["video"]
|
|
|
|
|
|
|
|
del video["variants"][:-1]
|
|
|
|
|
|
|
|
video["variants"][0]["url"] = video["variants"][0]["src"]
|
|
|
|
|
|
|
|
tweet["extended_entities"] = {"media": [{
|
|
|
|
|
|
|
|
"video_info" : video,
|
|
|
|
|
|
|
|
"original_info": {"width" : 0, "height": 0},
|
|
|
|
|
|
|
|
}]}
|
|
|
|
|
|
|
|
elif "photos" in tweet:
|
|
|
|
|
|
|
|
for p in tweet["photos"]:
|
|
|
|
|
|
|
|
p["media_url_https"] = p["url"]
|
|
|
|
|
|
|
|
p["original_info"] = {
|
|
|
|
|
|
|
|
"width" : p["width"],
|
|
|
|
|
|
|
|
"height": p["height"],
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
tweet["extended_entities"] = {"media": tweet["photos"]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
|
|
|
"rest_id": tweet["id_str"],
|
|
|
|
|
|
|
|
"legacy" : tweet,
|
|
|
|
|
|
|
|
"user" : tweet["user"],
|
|
|
|
|
|
|
|
}
|
|
|
|