# -*- coding: utf-8 -*- # Copyright 2016-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for https://twitter.com/""" from .common import Extractor, Message from .. import text, exception from ..cache import cache, memcache import hashlib import time class TwitterExtractor(Extractor): """Base class for twitter extractors""" category = "twitter" directory_fmt = ("{category}", "{user[screen_name]}") filename_fmt = "{id_str}_{num}.{extension}" archive_fmt = "{id_str}_{num}" cookiedomain = ".twitter.com" root = "https://twitter.com" sizes = (":orig", ":large", ":medium", ":small") def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) self.retweets = self.config("retweets", True) self.replies = self.config("replies", True) self.twitpic = self.config("twitpic", False) self.videos = self.config("videos", True) def items(self): self.login() metadata = self.metadata() yield Message.Version, 1 for tweet in self.tweets(): if not self.retweets and "retweeted_status_id_str" in tweet or \ not self.replies and "in_reply_to_user_id_str" in tweet: continue if self.twitpic: self._extract_twitpic(tweet) if "extended_entities" not in tweet: continue tweet.update(metadata) yield Message.Directory, tweet for tweet["num"], media in enumerate( tweet["extended_entities"]["media"], 1): tweet.update(media["original_info"]) if "video_info" in media and self.videos: if self.videos == "ytdl": url = "ytdl:{}/i/web/status/{}".format( self.root, tweet["id_str"]) tweet["extension"] = None yield Message.Url, url, tweet else: video_info = media["video_info"] variant = max( video_info["variants"], key=lambda v: v.get("bitrate", 0), ) tweet["duration"] = video_info.get( "duration_millis", 0) / 1000 tweet["bitrate"] = variant.get("bitrate", 0) url = variant["url"] text.nameext_from_url(url, tweet) yield Message.Url, url, tweet elif "media_url_https" in media: url = media["media_url_https"] urls = [url + size for size in self.sizes] text.nameext_from_url(url, tweet) yield Message.Urllist, urls, tweet else: url = media["media_url"] text.nameext_from_url(url, tweet) yield Message.Url, url, tweet def _extract_twitpic(self, tweet): twitpics = [] for url in tweet["entities"].get("urls", ()): url = url["expanded_url"] if "//twitpic.com/" in url: response = self.request(url, fatal=False) if response.status_code >= 400: continue url = text.extract( response.text, 'name="twitter:image" value="', '"')[0] twitpics.append({ "original_info": {}, "media_url" : url, }) if twitpics: if "extended_entities" in tweet: tweet["extended_entities"]["media"].extend(twitpics) else: tweet["extended_entities"] = {"media": twitpics} def metadata(self): """Return general metadata""" return {} def tweets(self): """Yield all relevant tweet objects""" def login(self): username, password = self._get_auth_info() if username: self._update_cookies(self._login_impl(username, password)) @cache(maxage=360*24*3600, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) url = "https://mobile.twitter.com/i/nojs_router" params = {"path": "/login"} headers = {"Referer": self.root + "/", "Origin": self.root} page = self.request( url, method="POST", params=params, headers=headers, data={}).text pos = page.index('name="authenticity_token"') token = text.extract(page, 'value="', '"', pos)[0] url = "https://mobile.twitter.com/sessions" data = { "authenticity_token" : token, "session[username_or_email]": username, "session[password]" : password, "remember_me" : "1", "wfa" : "1", "commit" : "+Log+in+", "ui_metrics" : "", } response = self.request(url, method="POST", data=data) cookies = { cookie.name: cookie.value for cookie in self.session.cookies if cookie.domain == self.cookiedomain } if "/error" in response.url or "auth_token" not in cookies: raise exception.AuthenticationError() return cookies class TwitterTimelineExtractor(TwitterExtractor): """Extractor for all images from a user's timeline""" subcategory = "timeline" pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" r"/(?!search)([^/?&#]+)/?(?:$|[?#])") test = ( ("https://twitter.com/supernaturepics", { "range": "1-40", "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4", }), ("https://mobile.twitter.com/supernaturepics?p=i"), ) def tweets(self): return TwitterAPI(self).timeline_profile(self.user) class TwitterMediaExtractor(TwitterExtractor): """Extractor for all images from a user's Media Tweets""" subcategory = "media" pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" r"/(?!search)([^/?&#]+)/media(?!\w)") test = ( ("https://twitter.com/supernaturepics/media", { "range": "1-40", "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4", }), ("https://mobile.twitter.com/supernaturepics/media#t"), ) def tweets(self): return TwitterAPI(self).timeline_media(self.user) class TwitterSearchExtractor(TwitterExtractor): """Extractor for all images from a search timeline""" subcategory = "search" directory_fmt = ("{category}", "Search", "{search}") pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" r"/search/?\?(?:[^&#]+&)*q=([^&#]+)") test = ("https://twitter.com/search?q=nature", { "range": "1-40", "count": 40, }) def metadata(self): return {"search": self.user} def tweets(self): return TwitterAPI(self).search(self.user) class TwitterTweetExtractor(TwitterExtractor): """Extractor for images from individual tweets""" subcategory = "tweet" pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" r"/([^/?&#]+|i/web)/status/(\d+)") test = ( ("https://twitter.com/supernaturepics/status/604341487988576256", { "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580", "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab", }), # 4 images ("https://twitter.com/perrypumas/status/894001459754180609", { "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6", }), # video ("https://twitter.com/perrypumas/status/1065692031626829824", { "options": (("videos", True),), "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5", }), # content with emoji, newlines, hashtags (#338) ("https://twitter.com/playpokemon/status/1263832915173048321", { "keyword": {"full_text": ( r"re:Gear up for #PokemonSwordShieldEX with special Mystery " "Gifts! \n\nYou’ll be able to receive four Galarian form " "Pokémon with Hidden Abilities, plus some very useful items. " "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ " )}, }), # Reply to another tweet (#403) ("https://twitter.com/tyson_hesse/status/1103767554424598528", { "options": (("videos", "ytdl"),), "pattern": r"ytdl:https://twitter.com/i/web.+/1103767554424598528", }), # 'replies' option (#705) ("https://twitter.com/tyson_hesse/status/1103767554424598528", { "options": (("replies", False),), "count": 0, }), # /i/web/ URL ("https://twitter.com/i/web/status/1155074198240292865", { "pattern": r"https://pbs.twimg.com/media/EAel0vUUYAAZ4Bq.jpg:orig", }), # quoted tweet (#526) ("https://twitter.com/Pistachio/status/1222690391817932803", { "pattern": r"https://pbs\.twimg\.com/media/EPfMfDUU8AAnByO\.jpg", }), # TwitPic embeds (#579) ("https://twitter.com/i/web/status/112900228289540096", { "options": (("twitpic", True),), "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg", "count": 3, }), ) def __init__(self, match): TwitterExtractor.__init__(self, match) self.tweet_id = match.group(2) def tweets(self): return TwitterAPI(self).tweet(self.tweet_id) class TwitterBookmarkExtractor(TwitterExtractor): """Extractor for bookmarked tweets""" subcategory = "bookmark" pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()" test = ("https://twitter.com/i/bookmarks",) def tweets(self): return TwitterAPI(self).bookmarks() class TwitterAPI(): def __init__(self, extractor): self.extractor = extractor self.headers = { "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu" "4FA33AGWWjCpTnA", "x-guest-token": None, "x-twitter-client-language": "en", "x-twitter-active-user": "yes", "x-csrf-token": None, "Origin": "https://twitter.com", "Referer": "https://twitter.com/", } self.params = { "include_profile_interstitial_type": "1", "include_blocking": "1", "include_blocked_by": "1", "include_followed_by": "1", "include_want_retweets": "1", "include_mute_edge": "1", "include_can_dm": "1", "include_can_media_tag": "1", "skip_status": "1", "cards_platform": "Web-12", "include_cards": "1", "include_composer_source": "true", "include_ext_alt_text": "true", "include_reply_count": "1", "tweet_mode": "extended", "include_entities": "true", "include_user_entities": "true", "include_ext_media_color": "true", "include_ext_media_availability": "true", "send_error_codes": "true", "simple_quoted_tweet": "true", # "count": "20", "count": "100", "cursor": None, "ext": "mediaStats%2ChighlightedLabel%2CcameraMoment", "include_quote_count": "true", } cookies = self.extractor.session.cookies # CSRF csrf = hashlib.md5(str(time.time()).encode()).hexdigest() self.headers["x-csrf-token"] = csrf cookies.set("ct0", csrf, domain=".twitter.com") if cookies.get("auth_token", domain=".twitter.com"): self.headers["x-twitter-auth-type"] = "OAuth2Session" else: # guest token guest_token = _guest_token(self.extractor, self.headers) self.headers["x-guest-token"] = guest_token cookies.set("gt", guest_token, domain=".twitter.com") def tweet(self, tweet_id): endpoint = "2/timeline/conversation/{}.json".format(tweet_id) for tweet in self._pagination(endpoint): if tweet["id_str"] == tweet_id: return (tweet,) return () def timeline_profile(self, screen_name): user = self.user_by_screen_name(screen_name) endpoint = "2/timeline/profile/{}.json".format(user["rest_id"]) return self._pagination(endpoint) def timeline_media(self, screen_name): user = self.user_by_screen_name(screen_name) endpoint = "2/timeline/media/{}.json".format(user["rest_id"]) return self._pagination(endpoint) def search(self, query): endpoint = "2/search/adaptive.json" params = self.params.copy() params["q"] = query return self._pagination( endpoint, params, "sq-I-t-", "sq-cursor-bottom") def bookmarks(self): endpoint = "2/timeline/bookmark.json" return self._pagination(endpoint) @memcache() def user_by_screen_name(self, screen_name): endpoint = "graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName" params = { "variables": '{"screen_name":"' + screen_name + '"' ',"withHighlightedLabel":true}' } return self._call(endpoint, params)["data"]["user"] def _call(self, endpoint, params): url = "https://api.twitter.com/" + endpoint response = self.extractor.request( url, params=params, headers=self.headers, fatal=None) if response.status_code < 400: return response.json() if response.status_code == 429: self.extractor.wait(until=response.headers["x-rate-limit-reset"]) return self._call(endpoint, params) raise exception.StopExtraction( "%s %s (%s)", response.status_code, response.reason, response.text) def _pagination(self, endpoint, params=None, entry_tweet="tweet-", entry_cursor="cursor-bottom-"): if params is None: params = self.params.copy() while True: cursor = None data = self._call(endpoint, params) tweets = data["globalObjects"]["tweets"] users = data["globalObjects"]["users"] instr = data["timeline"]["instructions"][0] for entry in instr["addEntries"]["entries"]: if entry["entryId"].startswith(entry_tweet): tid = entry["content"]["item"]["content"]["tweet"]["id"] if tid not in tweets: self.extractor.log.debug( "Skipping unavailable Tweet %s", tid) continue tweet = tweets[tid] tweet["user"] = users[tweet["user_id_str"]] if "quoted_status_id_str" in tweet: quoted = tweets[tweet["quoted_status_id_str"]] tweet["author"] = tweet["user"] if "extended_entities" in quoted: tweet["extended_entities"] = \ quoted["extended_entities"] elif "retweeted_status_id_str" in tweet: retweet = tweets[tweet["retweeted_status_id_str"]] tweet["author"] = users[retweet["user_id_str"]] else: tweet["author"] = tweet["user"] yield tweet elif entry["entryId"].startswith(entry_cursor): cursor = entry["content"]["operation"]["cursor"]["value"] if not cursor or params["cursor"] == cursor: return params["cursor"] = cursor @cache(maxage=3600) def _guest_token(extr, headers): return extr.request( "https://api.twitter.com/1.1/guest/activate.json", method="POST", headers=headers, ).json().get("guest_token")