# -*- coding: utf-8 -*- # Copyright 2016-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extract images from https://twitter.com/""" from .common import Extractor, Message from .. import text class TwitterExtractor(Extractor): """Base class for twitter extractors""" category = "twitter" directory_fmt = ["{category}", "{user}"] filename_fmt = "{tweet_id}_{num}.{extension}" archive_fmt = "{tweet_id}_{retweet_id}_{num}" root = "https://twitter.com" def __init__(self, match): Extractor.__init__(self) self.user = match.group(1) self.retweets = self.config("retweets", True) def items(self): yield Message.Version, 1 yield Message.Directory, self.metadata() for tweet in self.tweets(): images = list(text.extract_iter( tweet, 'data-image-url="', '"')) if not images: continue data = self._data_from_tweet(tweet) if not self.retweets and data["retweet_id"]: continue for data["num"], url in enumerate(images, 1): text.nameext_from_url(url, data) yield Message.Url, url + ":orig", data def metadata(self): """Return general metadata""" return {"user": self.user} def tweets(self): """Yield HTML content of all relevant tweets""" return () @staticmethod def _data_from_tweet(tweet): data = text.extract_all(tweet, ( ("tweet_id" , 'data-tweet-id="' , '"'), ("retweet_id", 'data-retweet-id="' , '"'), ("retweeter" , 'data-retweeter="' , '"'), ("user" , 'data-screen-name="', '"'), ("username" , 'data-name="' , '"'), ("user_id" , 'data-user-id="' , '"'), ))[0] for key in ("tweet_id", "retweet_id", "user_id"): data[key] = text.parse_int(data[key]) data["retweeter"] = data["retweeter"] or "" return data def _tweets_from_api(self, url): params = { "include_available_features": "1", "include_entities": "1", "reset_error_state": "false", "lang": "en", } headers = { "X-Requested-With": "XMLHttpRequest", "X-Twitter-Active-User": "yes", "Referer": "{}/{}".format(self.root, self.user) } while True: data = self.request(url, params=params, headers=headers).json() for tweet in text.extract_iter( data["items_html"], '