gallery-dl/gallery_dl/extractor/twitter.py

# -*- coding: utf-8 -*-

# Copyright 2016-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://twitter.com/"""

from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
import json

# URL prefix shared by the extractor patterns in this module:
# optional scheme, optional "www." or "mobile." subdomain,
# followed by either twitter.com or nitter.net
BASE_PATTERN = (
    r"(?:https?://)?(?:www\.|mobile\.)?"
    r"(?:twitter\.com|nitter\.net)"
)


class TwitterExtractor(Extractor):
    """Base class for twitter extractors"""
    # Subclasses supply tweets() (and optionally metadata()); items() filters
    # the returned tweets by the configured options and emits one directory
    # message per tweet followed by one URL message per media file.
    category = "twitter"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{retweet_id}_{num}"
    cookiedomain = ".twitter.com"
    root = "https://twitter.com"

    def __init__(self, match):
        """Store the first match group as 'user' and read all options"""
        Extractor.__init__(self, match)
        self.user = match.group(1)
        self.retweets = self.config("retweets", True)
        self.replies = self.config("replies", True)
        self.twitpic = self.config("twitpic", False)
        self.quoted = self.config("quoted", True)
        self.videos = self.config("videos", True)
        self.cards = self.config("cards", False)
        # maps user ID strings to the dicts built by _transform_user()
        self._user_cache = {}

    def items(self):
        """Yield messages for the media files of all tweets from tweets()"""
        self.login()
        metadata = self.metadata()
        yield Message.Version, 1

        for tweet in self.tweets():

            # filter tweets according to the 'retweets', 'replies'
            # and 'quoted' options
            if not self.retweets and "retweeted_status_id_str" in tweet:
                self.log.debug("Skipping %s (retweet)", tweet["id_str"])
                continue
            if not self.replies and "in_reply_to_user_id_str" in tweet:
                self.log.debug("Skipping %s (reply)", tweet["id_str"])
                continue
            if not self.quoted and "quoted" in tweet:
                self.log.debug("Skipping %s (quoted tweet)", tweet["id_str"])
                continue

            # collect file dicts from all enabled sources
            files = []
            if "extended_entities" in tweet:
                self._extract_media(tweet, files)
            if "card" in tweet and self.cards:
                self._extract_card(tweet, files)
            if self.twitpic:
                self._extract_twitpic(tweet, files)
            if not files:
                continue

            tdata = self._transform_tweet(tweet)
            tdata.update(metadata)
            yield Message.Directory, tdata
            # 'num' is stored in 'tdata' so that it gets copied
            # into each file dict by the update() call below
            for tdata["num"], file in enumerate(files, 1):
                file.update(tdata)
                url = file.pop("url")
                if "extension" not in file:
                    text.nameext_from_url(url, file)
                yield Message.Url, url, file

    def _extract_media(self, tweet, files):
        """Append a file dict for each media entity of 'tweet' to 'files'"""
        for media in tweet["extended_entities"]["media"]:
            width = media["original_info"].get("width", 0)
            height = media["original_info"].get("height", 0)

            if "video_info" in media:
                if self.videos == "ytdl":
                    # hand the tweet URL over to youtube-dl
                    files.append({
                        "url": "ytdl:{}/i/web/status/{}".format(
                            self.root, tweet["id_str"]),
                        "width"    : width,
                        "height"   : height,
                        "extension": None,
                    })
                elif self.videos:
                    # select the variant with the highest bitrate;
                    # variants without a 'bitrate' value count as 0
                    video_info = media["video_info"]
                    variant = max(
                        video_info["variants"],
                        key=lambda v: v.get("bitrate", 0),
                    )
                    files.append({
                        "url"     : variant["url"],
                        "width"   : width,
                        "height"  : height,
                        "bitrate" : variant.get("bitrate", 0),
                        "duration": video_info.get(
                            "duration_millis", 0) / 1000,
                    })
            elif "media_url_https" in media:
                # request the ':orig' size and list smaller sizes as fallback
                url = media["media_url_https"]
                files.append(text.nameext_from_url(url, {
                    "url"      : url + ":orig",
                    "_fallback": [url+":large", url+":medium", url+":small"],
                    "width"    : width,
                    "height"   : height,
                }))
            else:
                files.append({"url": media["media_url"]})

    def _extract_card(self, tweet, files):
        """Append a file dict for the card of 'tweet' to 'files'"""
        card = tweet["card"]
        if card["name"] in ("summary", "summary_large_image"):
            bvals = card["binding_values"]
            # use the first image that exists, in order of preference
            for prefix in ("photo_image_full_size_",
                           "summary_photo_image_",
                           "thumbnail_image_"):
                for size in ("original", "x_large", "large", "small"):
                    key = prefix + size
                    if key in bvals:
                        # Append a copy: items() updates the file dict and
                        # pops its 'url', which would otherwise modify the
                        # tweet object itself and raise a KeyError when
                        # the same tweet object gets processed again.
                        files.append(bvals[key]["image_value"].copy())
                        return
        else:
            # every other card type is handed over to youtube-dl
            url = "ytdl:{}/i/web/status/{}".format(self.root, tweet["id_str"])
            files.append({"url": url})

    def _extract_twitpic(self, tweet, files):
        """Append a file dict for each TwitPic link in 'tweet' to 'files'"""
        for url in tweet["entities"].get("urls", ()):
            url = url["expanded_url"]
            if "//twitpic.com/" in url and "/photos/" not in url:
                response = self.request(url, fatal=False)
                if response.status_code >= 400:
                    continue
                # the image URL is read from the page's 'twitter:image' tag
                url = text.extract(
                    response.text, 'name="twitter:image" value="', '"')[0]
                if url:
                    files.append({"url": url})

    def _transform_tweet(self, tweet):
        """Build a new metadata dict from a tweet object"""
        entities = tweet["entities"]
        tdata = {
            "tweet_id"      : text.parse_int(tweet["id_str"]),
            "retweet_id"    : text.parse_int(
                tweet.get("retweeted_status_id_str")),
            "quote_id"      : text.parse_int(
                tweet.get("quoted_status_id_str")),
            "reply_id"      : text.parse_int(
                tweet.get("in_reply_to_status_id_str")),
            "date"          : text.parse_datetime(
                tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
            "user"          : self._transform_user(tweet["user"]),
            "lang"          : tweet["lang"],
            "content"       : tweet["full_text"],
            "favorite_count": tweet["favorite_count"],
            "quote_count"   : tweet["quote_count"],
            "reply_count"   : tweet["reply_count"],
            "retweet_count" : tweet["retweet_count"],
        }

        # 'hashtags', 'mentions' and 'reply_to' are only set when available
        hashtags = entities.get("hashtags")
        if hashtags:
            tdata["hashtags"] = [t["text"] for t in hashtags]

        mentions = entities.get("user_mentions")
        if mentions:
            tdata["mentions"] = [{
                "id": text.parse_int(u["id_str"]),
                "name": u["screen_name"],
                "nick": u["name"],
            } for u in mentions]

        if "in_reply_to_screen_name" in tweet:
            tdata["reply_to"] = tweet["in_reply_to_screen_name"]

        # fall back to 'user' if the tweet has no separate 'author'
        if "author" in tweet:
            tdata["author"] = self._transform_user(tweet["author"])
        else:
            tdata["author"] = tdata["user"]

        return tdata

    def _transform_user(self, user):
        """Return the metadata dict for a user object

        Results are stored in 'self._user_cache' by user ID,
        so each user gets transformed only once per extractor.
        """
        uid = user["id_str"]
        cache = self._user_cache

        if uid not in cache:
            cache[uid] = {
                "id"              : text.parse_int(uid),
                "name"            : user["screen_name"],
                "nick"            : user["name"],
                "description"     : user["description"],
                "location"        : user["location"],
                "date"            : text.parse_datetime(
                    user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
                "verified"        : user.get("verified", False),
                "profile_banner"  : user.get("profile_banner_url", ""),
                "profile_image"   : user.get(
                    "profile_image_url_https", "").replace("_normal.", "."),
                "favourites_count": user["favourites_count"],
                "followers_count" : user["followers_count"],
                "friends_count"   : user["friends_count"],
                "listed_count"    : user["listed_count"],
                "media_count"     : user["media_count"],
                "statuses_count"  : user["statuses_count"],
            }
        return cache[uid]

    def metadata(self):
        """Return general metadata"""
        return {}

    def tweets(self):
        """Yield all relevant tweet objects"""
        # Return an empty iterable instead of None, so that items()
        # does not fail for a subclass that does not override this method.
        return ()

    def login(self):
        """Log in and update session cookies if a username is configured"""
        username, password = self._get_auth_info()
        if username:
            self._update_cookies(self._login_impl(username, password))

    @cache(maxage=360*24*3600, keyarg=1)
    def _login_impl(self, username, password):
        """Log in via the mobile login form and return the session cookies

        The result is a dict of name-value pairs for all session cookies
        whose domain equals 'cookiedomain'.
        Raises AuthenticationError if the login form has no
        'authenticity_token' field or if the login attempt fails.
        """
        self.log.info("Logging in as %s", username)

        url = "https://mobile.twitter.com/i/nojs_router"
        params = {"path": "/login"}
        headers = {"Referer": self.root + "/", "Origin": self.root}
        page = self.request(
            url, method="POST", params=params, headers=headers, data={}).text

        # str.index() would raise a bare ValueError for a missing token
        pos = page.find('name="authenticity_token"')
        if pos < 0:
            self.log.debug(
                "Unable to find 'authenticity_token' on the login page")
            raise exception.AuthenticationError()
        token = text.extract(page, 'value="', '"', pos)[0]

        url = "https://mobile.twitter.com/sessions"
        data = {
            "authenticity_token"        : token,
            "session[username_or_email]": username,
            "session[password]"         : password,
            "remember_me"               : "1",
            "wfa"                       : "1",
            "commit"                    : "+Log+in+",
            "ui_metrics"                : "",
        }
        response = self.request(url, method="POST", data=data)
        cookies = {
            cookie.name: cookie.value
            for cookie in self.session.cookies
            if cookie.domain == self.cookiedomain
        }

        # a failed login redirects to an error page
        # or does not set an 'auth_token' cookie
        if "/error" in response.url or "auth_token" not in cookies:
            raise exception.AuthenticationError()
        return cookies


class TwitterTimelineExtractor(TwitterExtractor):
    """Extractor for all images from a user's timeline"""
    subcategory = "timeline"
    # group 1: screen name (or 'id:<number>')
    # group 2: numeric ID from an '/intent/user?user_id=<number>' URL
    pattern = (
        BASE_PATTERN +
        r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])|intent/user\?user_id=(\d+))"
    )
    test = (
        ("https://twitter.com/supernaturepics", {
            "range": "1-40",
            "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
        }),
        ("https://mobile.twitter.com/supernaturepics?p=i"),
        ("https://www.twitter.com/id:2976459548"),
        ("https://twitter.com/intent/user?user_id=2976459548"),
    )

    def __init__(self, match):
        """Use the 'id:<number>' form if the URL contained a user ID"""
        TwitterExtractor.__init__(self, match)
        user_id = match.group(2)
        if user_id:
            self.user = "id:{}".format(user_id)

    def tweets(self):
        """Return the tweets of the user's profile timeline"""
        api = TwitterAPI(self)
        return api.timeline_profile(self.user)


class TwitterMediaExtractor(TwitterExtractor):
    """Extractor for all images from a user's Media Tweets"""
    subcategory = "media"
    pattern = "".join((
        BASE_PATTERN,
        r"/(?!search)([^/?#]+)/media(?!\w)",
    ))
    test = (
        ("https://twitter.com/supernaturepics/media", {
            "range": "1-40",
            "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
        }),
        ("https://mobile.twitter.com/supernaturepics/media#t"),
        ("https://www.twitter.com/id:2976459548/media"),
    )

    def tweets(self):
        """Return the tweets of the user's media timeline"""
        api = TwitterAPI(self)
        return api.timeline_media(self.user)


class TwitterLikesExtractor(TwitterExtractor):
    """Extractor for liked tweets"""
    subcategory = "likes"
    pattern = "".join((
        BASE_PATTERN,
        r"/(?!search)([^/?#]+)/likes(?!\w)",
    ))
    test = ("https://twitter.com/supernaturepics/likes",)

    def tweets(self):
        """Return the tweets of the user's favorites timeline"""
        api = TwitterAPI(self)
        return api.timeline_favorites(self.user)


class TwitterBookmarkExtractor(TwitterExtractor):
    """Extractor for bookmarked tweets"""
    subcategory = "bookmark"
    # the empty group provides the 'group(1)' read by the base class
    pattern = "".join((BASE_PATTERN, r"/i/bookmarks()"))
    test = ("https://twitter.com/i/bookmarks",)

    def tweets(self):
        """Return the tweets of the bookmark timeline"""
        api = TwitterAPI(self)
        return api.timeline_bookmark()


class TwitterListExtractor(TwitterExtractor):
    """Extractor for Twitter lists"""
    subcategory = "list"
    # group 1 is the list ID; it ends up in 'self.user'
    pattern = "".join((BASE_PATTERN, r"/i/lists/(\d+)/?$"))
    test = ("https://twitter.com/i/lists/784214683683127296", {
        "range": "1-40",
        "count": 40,
        "archive": False,
    })

    def tweets(self):
        """Return the tweets of the list's timeline"""
        api = TwitterAPI(self)
        return api.timeline_list(self.user)


class TwitterListMembersExtractor(TwitterExtractor):
    """Extractor for members of a Twitter list"""
    subcategory = "list-members"
    # group 1 is the list ID; it ends up in 'self.user'
    pattern = "".join((BASE_PATTERN, r"/i/lists/(\d+)/members"))
    test = ("https://twitter.com/i/lists/784214683683127296/members",)

    def items(self):
        """Yield a queue message with an 'intent/user' URL per list member"""
        self.login()
        members = TwitterAPI(self).list_members(self.user)
        for member in members:
            # let the timeline extractor handle each queued URL
            member["_extractor"] = TwitterTimelineExtractor
            profile_url = "{}/intent/user?user_id={}".format(
                self.root, member["rest_id"])
            yield Message.Queue, profile_url, member


class TwitterSearchExtractor(TwitterExtractor):
    """Extractor for all images from a search timeline"""
    subcategory = "search"
    directory_fmt = ("{category}", "Search", "{search}")
    # group 1 is the still URL-quoted value of the 'q' parameter
    pattern = "".join((
        BASE_PATTERN,
        r"/search/?\?(?:[^&#]+&)*q=([^&#]+)",
    ))
    test = ("https://twitter.com/search?q=nature", {
        "range": "1-40",
        "count": 40,
        "archive": False,
    })

    def metadata(self):
        """Return the unquoted search query as 'search'"""
        query = text.unquote(self.user)
        return {"search": query}

    def tweets(self):
        """Return the tweets found by the unquoted search query"""
        api = TwitterAPI(self)
        return api.search(text.unquote(self.user))


class TwitterTweetExtractor(TwitterExtractor):
    """Extractor for images from individual tweets"""
    subcategory = "tweet"
    # group 1: screen name or 'i/web'; group 2: tweet ID
    pattern = "".join((
        BASE_PATTERN,
        r"/([^/?#]+|i/web)/status/(\d+)",
    ))
    test = (
        ("https://twitter.com/supernaturepics/status/604341487988576256", {
            "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580",
            "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
        }),
        # 4 images
        ("https://twitter.com/perrypumas/status/894001459754180609", {
            "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6",
        }),
        # video
        ("https://twitter.com/perrypumas/status/1065692031626829824", {
            "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5",
        }),
        # content with emoji, newlines, hashtags (#338)
        ("https://twitter.com/playpokemon/status/1263832915173048321", {
            "keyword": {"content": (
                r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
                "Gifts! \n\nYou’ll be able to receive four Galarian form "
                "Pokémon with Hidden Abilities, plus some very useful items. "
                "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
            )},
        }),
        # Reply to deleted tweet (#403, #838)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "pattern": r"https://pbs.twimg.com/media/EDzS7VrU0AAFL4_.jpg:orig",
        }),
        # 'replies' option (#705)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "options": (("replies", False),),
            "count": 0,
        }),
        # quoted tweet (#526, #854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "pattern": r"https://pbs\.twimg\.com/media/Ea[KG].+\.jpg",
            "count": 8,
        }),
        # "quoted" option (#854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "options": (("quoted", False),),
            "pattern": r"https://pbs\.twimg\.com/media/EaK.+\.jpg",
            "count": 4,
        }),
        # TwitPic embeds (#579)
        ("https://twitter.com/i/web/status/112900228289540096", {
            "options": (("twitpic", True),),
            "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
            "count": 3,
        }),
        # Nitter tweet (#890)
        ("https://nitter.net/ed1conf/status/1163841619336007680", {
            "url": "0f6a841e23948e4320af7ae41125e0c5b3cadc98",
            "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
        }),
        # Twitter card (#1005)
        ("https://twitter.com/billboard/status/1306599586602135555", {
            "options": (("cards", True),),
            "pattern": r"https://pbs.twimg.com/card_img/\d+/",
        }),
        # original retweets (#1026)
        ("https://twitter.com/jessica_3978/status/1296304589591810048", {
            "options": (("retweets", "original"),),
            "count": 2,
            "keyword": {
                "tweet_id": 1296296016002547713,
                "date"    : "dt:2020-08-20 04:00:28",
            },
        }),
    )

    def __init__(self, match):
        """Store the tweet ID from the second match group"""
        TwitterExtractor.__init__(self, match)
        tweet_id = match.group(2)
        self.tweet_id = tweet_id

    def tweets(self):
        """Return the requested tweet and the tweets it quotes"""
        api = TwitterAPI(self)
        return api.tweet(self.tweet_id)


class TwitterAPI():
    """Minimal client for the API endpoints used by the Twitter extractors

    All requests are sent through the given extractor's request() method.
    Timeline methods return generators of tweet objects whose 'user' field
    (and, where applicable, 'author') has been filled in with a user object.
    """

    def __init__(self, extractor):
        """Set up headers, default parameters, CSRF token and API root"""
        self.extractor = extractor
        self.headers = {
            "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
                             "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
                             "4FA33AGWWjCpTnA",
            "x-guest-token": None,
            "x-twitter-client-language": "en",
            "x-twitter-active-user": "yes",
            "x-csrf-token": None,
            "Origin": "https://twitter.com",
            "Referer": "https://twitter.com/",
        }
        # default query parameters; copied by every timeline request
        self.params = {
            "include_profile_interstitial_type": "1",
            "include_blocking": "1",
            "include_blocked_by": "1",
            "include_followed_by": "1",
            "include_want_retweets": "1",
            "include_mute_edge": "1",
            "include_can_dm": "1",
            "include_can_media_tag": "1",
            "skip_status": "1",
            "cards_platform": "Web-12",
            "include_cards": "1",
            "include_composer_source": "true",
            "include_ext_alt_text": "true",
            "include_reply_count": "1",
            "tweet_mode": "extended",
            "include_entities": "true",
            "include_user_entities": "true",
            "include_ext_media_color": "true",
            "include_ext_media_availability": "true",
            "send_error_codes": "true",
            "simple_quoted_tweet": "true",
            #  "count": "20",
            "count": "100",
            "cursor": None,
            "ext": "mediaStats,highlightedLabel,cameraMoment",
            "include_quote_count": "true",
        }

        cookies = self.extractor.session.cookies

        # CSRF: the same generated token is sent as header and as cookie
        csrf = util.generate_csrf_token()
        self.headers["x-csrf-token"] = csrf
        cookies.set("ct0", csrf, domain=".twitter.com")

        if cookies.get("auth_token", domain=".twitter.com"):
            # logged in
            self.root = "https://twitter.com/i/api/"
            self.headers["x-twitter-auth-type"] = "OAuth2Session"
        else:
            # guest
            self.root = "https://api.twitter.com/"
            guest_token = self._guest_token()
            self.headers["x-guest-token"] = guest_token
            cookies.set("gt", guest_token, domain=".twitter.com")

    def tweet(self, tweet_id):
        """Return a list with the given tweet and the tweets it quotes"""
        endpoint = "2/timeline/conversation/{}.json".format(tweet_id)
        tweets = []
        for tweet in self._pagination(endpoint):
            # '_retweet_id_str' is set by _pagination() when a retweet
            # has been replaced by its original tweet
            if tweet["id_str"] == tweet_id or \
                    tweet.get("_retweet_id_str") == tweet_id:
                tweets.append(tweet)
                if "quoted_status_id_str" in tweet:
                    # continue with the quoted tweet as the next target
                    tweet_id = tweet["quoted_status_id_str"]
                else:
                    break
        return tweets

    def timeline_profile(self, screen_name):
        """Return a generator over the tweets of a user's profile timeline"""
        user_id = self._user_id_by_screen_name(screen_name)
        endpoint = "2/timeline/profile/{}.json".format(user_id)
        return self._pagination(endpoint)

    def timeline_media(self, screen_name):
        """Return a generator over the tweets of a user's media timeline"""
        user_id = self._user_id_by_screen_name(screen_name)
        endpoint = "2/timeline/media/{}.json".format(user_id)
        return self._pagination(endpoint)

    def timeline_favorites(self, screen_name):
        """Return a generator over the tweets favorited by a user"""
        user_id = self._user_id_by_screen_name(screen_name)
        endpoint = "2/timeline/favorites/{}.json".format(user_id)
        return self._pagination(endpoint)

    def timeline_bookmark(self):
        """Return a generator over the tweets of the bookmark timeline"""
        endpoint = "2/timeline/bookmark.json"
        return self._pagination(endpoint)

    def timeline_list(self, list_id):
        """Return a generator over the tweets of a list's timeline"""
        endpoint = "2/timeline/list.json"
        params = self.params.copy()
        params["list_id"] = list_id
        params["ranking_mode"] = "reverse_chronological"
        return self._pagination(endpoint, params)

    def search(self, query):
        """Return a generator over the tweets found by a search query"""
        endpoint = "2/search/adaptive.json"
        params = self.params.copy()
        params["q"] = query
        params["tweet_search_mode"] = "live"
        params["query_source"] = "typed_query"
        params["pc"] = "1"
        params["spelling_corrections"] = "1"
        # search results use different entryId prefixes than timelines
        return self._pagination(
            endpoint, params, "sq-I-t-", "sq-cursor-bottom")

    def list_members(self, list_id):
        """Return a generator over the user objects of a list's members"""
        endpoint = "graphql/M74V2EwlxxVYGB4DbyAphQ/ListMembers"
        variables = {
            "listId": list_id,
            "count" : 20,
            "withTweetResult": False,
            "withUserResult" : False,
        }
        return self._pagination_members(endpoint, variables)

    def list_by_rest_id(self, list_id):
        """Return the list object for 'list_id'

        Raises NotFoundError if the response contains no such object.
        """
        endpoint = "graphql/LXXTUytSX1QY-2p8Xp9BFA/ListByRestId"
        # json.dumps() escapes the value; the compact separators keep
        # the same output format as the previous hand-built string
        variables = {
            "listId": list_id,
            "withUserResult": False,
        }
        params = {"variables": json.dumps(variables, separators=(",", ":"))}
        try:
            return self._call(endpoint, params)["data"]["list"]
        except KeyError:
            raise exception.NotFoundError("list")

    def user_by_screen_name(self, screen_name):
        """Return the user object for 'screen_name'

        Raises NotFoundError if the response contains no such object.
        """
        endpoint = "graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName"
        # json.dumps() escapes quotes and backslashes in 'screen_name',
        # which plain string concatenation turned into invalid JSON
        variables = {
            "screen_name": screen_name,
            "withHighlightedLabel": True,
        }
        params = {"variables": json.dumps(variables, separators=(",", ":"))}
        try:
            return self._call(endpoint, params)["data"]["user"]
        except KeyError:
            raise exception.NotFoundError("user")

    def _user_id_by_screen_name(self, screen_name):
        """Return the user ID for a screen name or an 'id:<number>' string"""
        if screen_name.startswith("id:"):
            return screen_name[3:]
        return self.user_by_screen_name(screen_name)["rest_id"]

    @cache(maxage=3600)
    def _guest_token(self):
        """Request and return a new guest token"""
        endpoint = "1.1/guest/activate.json"
        return self._call(endpoint, None, "POST")["guest_token"]

    def _call(self, endpoint, params, method="GET"):
        """Send an API request and return its decoded JSON response

        Requests answered with status 429 are repeated after waiting;
        any other status code >= 400 raises StopExtraction.
        """
        url = self.root + endpoint

        # loop instead of recursing, so that repeated rate limit
        # responses cannot exhaust the recursion limit
        while True:
            response = self.extractor.request(
                url, method=method, params=params, headers=self.headers,
                fatal=None)
            if response.status_code < 400:
                return response.json()
            if response.status_code != 429:
                break
            # rate limited: wait until the time given by the response
            # header, or for 60 seconds if that header is missing
            until = response.headers.get("x-rate-limit-reset")
            self.extractor.wait(until=until, seconds=(None if until else 60))

        try:
            msg = ", ".join(
                '"' + error["message"] + '"'
                for error in response.json()["errors"]
            )
        except Exception:
            # no usable error list; fall back to the raw response body
            msg = response.text
        raise exception.StopExtraction(
            "%s %s (%s)", response.status_code, response.reason, msg)

    def _pagination(self, endpoint, params=None,
                    entry_tweet="tweet-", entry_cursor="cursor-bottom-"):
        """Yield the tweet objects from all pages of a timeline endpoint

        'params' defaults to a copy of 'self.params' and gets its 'cursor'
        value updated for each new page. 'entry_tweet' and 'entry_cursor'
        are the 'entryId' prefixes of tweet and cursor entries.
        """
        if params is None:
            params = self.params.copy()
        original_retweets = (self.extractor.retweets == "original")

        while True:
            cursor = tweet = None
            data = self._call(endpoint, params)

            instr = data["timeline"]["instructions"]
            if not instr:
                return
            tweets = data["globalObjects"]["tweets"]
            users = data["globalObjects"]["users"]

            for entry in instr[0]["addEntries"]["entries"]:

                if entry["entryId"].startswith(entry_tweet):
                    try:
                        tweet = tweets[
                            entry["content"]["item"]["content"]["tweet"]["id"]]
                    except KeyError:
                        self.extractor.log.debug(
                            "Skipping %s (deleted)",
                            entry["entryId"][len(entry_tweet):])
                        continue

                    if "retweeted_status_id_str" in tweet:
                        retweet = tweets.get(tweet["retweeted_status_id_str"])
                        if original_retweets:
                            # yield the original tweet in place of the
                            # retweet and remember the retweet's ID
                            if not retweet:
                                continue
                            retweet["_retweet_id_str"] = tweet["id_str"]
                            tweet = retweet
                        elif retweet:
                            tweet["author"] = users[retweet["user_id_str"]]
                    tweet["user"] = users[tweet["user_id_str"]]
                    yield tweet

                    if "quoted_status_id_str" in tweet:
                        quoted = tweets.get(tweet["quoted_status_id_str"])
                        if quoted:
                            # Annotate a copy: the same tweet object may
                            # also be a regular entry of this page and
                            # must not carry the 'quoted' marker or the
                            # quoting user's data in that case.
                            quoted = quoted.copy()
                            quoted["author"] = users[quoted["user_id_str"]]
                            quoted["user"] = tweet["user"]
                            quoted["quoted"] = True
                            yield quoted

                elif entry["entryId"].startswith(entry_cursor):
                    cursor = entry["content"]["operation"]["cursor"]
                    if not cursor.get("stopOnEmptyResponse"):
                        # keep going even if there are no tweets
                        tweet = True
                    cursor = cursor["value"]

            # a 'replaceEntry' instruction overrides the cursor found above
            if "replaceEntry" in instr[-1]:
                cursor = (instr[-1]["replaceEntry"]["entry"]
                          ["content"]["operation"]["cursor"]["value"])

            # stop without a cursor or when the page had no tweets
            if not cursor or not tweet:
                return
            params["cursor"] = cursor

    def _pagination_members(self, endpoint, variables):
        """Yield the user objects from all pages of a list-members endpoint

        'variables' gets its 'cursor' value updated for each new page.
        Raises AuthorizationError if the response has no instructions.
        """
        while True:
            cursor = entry = stop = None
            params = {"variables": json.dumps(variables)}
            data = self._call(endpoint, params)

            try:
                instructions = (data["data"]["list"]["members_timeline"]
                                ["timeline"]["instructions"])
            except KeyError:
                raise exception.AuthorizationError()

            for instr in instructions:
                if instr["type"] == "TimelineAddEntries":
                    for entry in instr["entries"]:
                        if entry["entryId"].startswith("user-"):
                            yield entry["content"]["itemContent"]["user"]
                        elif entry["entryId"].startswith("cursor-bottom-"):
                            cursor = entry["content"]["value"]
                elif instr["type"] == "TimelineTerminateTimeline":
                    if instr["direction"] == "Bottom":
                        stop = True

            # stop when told to, without a cursor, or without any entries
            if stop or not cursor or not entry:
                return
            variables["cursor"] = cursor
-												[twitter] add extractor

											
										
										
											8 years ago
+								# -*- coding: utf-8 -*-
-												[twitter] force old login page layout (fixes #584, fixes #598)

											
										
										
											5 years ago
+								# Copyright 2016-2020 Mike Fährmann
-												[twitter] add extractor

											
										
										
											8 years ago
+								#
 								# This program is free software; you can redistribute it and/or modify
 								# it under the terms of the GNU General Public License version 2 as
 								# published by the Free Software Foundation.
-												[twitter] force old login page layout (fixes #584, fixes #598)

											
										
										
											5 years ago
+								"""Extractors for https://twitter.com/"""
-												[twitter] add extractor

											
										
										
											8 years ago
 								from .common import Extractor, Message
-												add a general 'generate_csrf_token()' function

											
										
										
											4 years ago
+								from .. import text, util, exception
-												[twitter] don't cache results of 'user_by_screen_name()'

A 'keyarg=1' argument to the memcache decorator would have worked as
well, but keeping the user object in memory isn't useful for the vast
majority of use cases and only wastes space.

(closes #817)

											
										
										
											4 years ago
+								from ..cache import cache
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											4 years ago
+								import json
-												code adjustments according to pep8 nr2

											
										
										
											8 years ago
-												[twitter] add support for nitter.net URLs in pattern (#890)

Please note that URLs are only "translated", all requests are still
done always via the Twitter API.
											
										
										
											4 years ago
+								BASE_PATTERN = (
 								    r"(?:https?://)?(?:www\.|mobile\.)?"
 								    r"(?:twitter\.com|nitter\.net)"
 								)
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								class TwitterExtractor(Extractor):
 								    """Base class for twitter extractors"""
-												[twitter] add extractor

											
										
										
											8 years ago
+								    category = "twitter"
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								    directory_fmt = ("{category}", "{user[name]}")
 								    filename_fmt = "{tweet_id}_{num}.{extension}"
 								    archive_fmt = "{tweet_id}_{retweet_id}_{num}"
-												[twitter] use a simpler data structure to store cookies in cache

Use a dict with name-value pairs instead of an entire
RequestsCookieJar object.

											
										
										
											5 years ago
+								    cookiedomain = ".twitter.com"
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								    root = "https://twitter.com"
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
+								    def __init__(self, match):
-												propagate 'match' to base extractor constructor

											
										
										
											6 years ago
+								        Extractor.__init__(self, match)
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
+								        self.user = match.group(1)
 								        self.retweets = self.config("retweets", True)
-												[twitter] add 'replies' option (closes #705)

											
										
										
											4 years ago
+								        self.replies = self.config("replies", True)
-												[twitter] add option to extract TwitPic embeds (#579)

											
										
										
											5 years ago
+								        self.twitpic = self.config("twitpic", False)
-												[twitter] add option to filter media from quoted tweets (#854)

											
										
										
											4 years ago
+								        self.quoted = self.config("quoted", True)
-												[twitter] change default value for 'videos' to 'true'

Every other 'videos' option defaulted to 'true', except Twitter.

											
										
										
											5 years ago
+								        self.videos = self.config("videos", True)
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								        self.cards = self.config("cards", False)
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								        self._user_cache = {}
-												[twitter] add experimental 'videos' option (#99)

Enabling this option will detect videos in tweets and output them as
"unsupported" URLs, so that these can then be downloaded with youtube-dl

There are a lot of improvements to be made to the current
implementation, but it works and does what it is supposed to, even if
inefficient as can be ...

											
										
										
											6 years ago
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								    def items(self):
-												[twitter] add login support (#214)

											
										
										
											6 years ago
+								        self.login()
-												[twitter] improve

- update metadata structure
  - combine all user… entries into their own dict
  - let 'user' always specify the Timeline owner
  - add 'author' entry that specifies the original Tweet author
- create directories per post (closes #491)
- fix username issues with /i/web/ URLs

											
										
										
											5 years ago
+								        metadata = self.metadata()
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								        yield Message.Version, 1
 								        for tweet in self.tweets():
-												[twitter] restore TwitPic support

											
										
										
											4 years ago
-												[twitter] add debug messages for all skipped Tweets (#867)

											
										
										
											4 years ago
+								            if not self.retweets and "retweeted_status_id_str" in tweet:
 								                self.log.debug("Skipping %s (retweet)", tweet["id_str"])
 								                continue
 								            if not self.replies and "in_reply_to_user_id_str" in tweet:
 								                self.log.debug("Skipping %s (reply)", tweet["id_str"])
 								                continue
 								            if not self.quoted and "quoted" in tweet:
 								                self.log.debug("Skipping %s (quoted tweet)", tweet["id_str"])
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								                continue
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								            files = []
 								            if "extended_entities" in tweet:
 								                self._extract_media(tweet, files)
 								            if "card" in tweet and self.cards:
 								                self._extract_card(tweet, files)
-												[twitter] restore TwitPic support

											
										
										
											4 years ago
+								            if self.twitpic:
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								                self._extract_twitpic(tweet, files)
 								            if not files:
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								                continue
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								            tdata = self._transform_tweet(tweet)
 								            tdata.update(metadata)
 								            yield Message.Directory, tdata
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								            for tdata["num"], file in enumerate(files, 1):
 								                file.update(tdata)
 								                url = file.pop("url")
 								                if "extension" not in file:
 								                    text.nameext_from_url(url, file)
 								                yield Message.Url, url, file
 								    def _extract_media(self, tweet, files):
 								        for media in tweet["extended_entities"]["media"]:
-												[twitter] update GraphQL endpoint & fix width/height entries

											
										
										
											4 years ago
+								            width = media["original_info"].get("width", 0)
 								            height = media["original_info"].get("height", 0)
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
 								            if "video_info" in media:
 								                if self.videos == "ytdl":
 								                    files.append({
 								                        "url": "ytdl:{}/i/web/status/{}".format(
 								                            self.root, tweet["id_str"]),
 								                        "width"    : width,
 								                        "height"   : height,
 								                        "extension": None,
 								                    })
 								                elif self.videos:
 								                    video_info = media["video_info"]
 								                    variant = max(
 								                        video_info["variants"],
 								                        key=lambda v: v.get("bitrate", 0),
 								                    )
 								                    files.append({
 								                        "url"     : variant["url"],
 								                        "width"   : width,
 								                        "height"  : height,
 								                        "bitrate" : variant.get("bitrate", 0),
 								                        "duration": video_info.get(
 								                            "duration_millis", 0) / 1000,
 								                    })
 								            elif "media_url_https" in media:
 								                url = media["media_url_https"]
 								                files.append(text.nameext_from_url(url, {
 								                    "url"      : url + ":orig",
 								                    "_fallback": [url+":large", url+":medium", url+":small"],
 								                    "width"    : width,
 								                    "height"   : height,
 								                }))
 								            else:
 								                files.append({"url": media["media_url"]})
 								    def _extract_card(self, tweet, files):
 								        card = tweet["card"]
 								        if card["name"] in ("summary", "summary_large_image"):
 								            bvals = card["binding_values"]
 								            for prefix in ("photo_image_full_size_",
 								                           "summary_photo_image_",
 								                           "thumbnail_image_"):
 								                for size in ("original", "x_large", "large", "small"):
 								                    key = prefix + size
 								                    if key in bvals:
 								                        files.append(bvals[key]["image_value"])
 								                        return
 								        else:
 								            url = "ytdl:{}/i/web/status/{}".format(self.root, tweet["id_str"])
 								            files.append({"url": url})
 								    def _extract_twitpic(self, tweet, files):
-												[twitter] restore TwitPic support

											
										
										
											4 years ago
+								        for url in tweet["entities"].get("urls", ()):
 								            url = url["expanded_url"]
-												[twitter] improve twitpic extraction (fixes #1019)

- ignore twitpic.com/photos/… URLs
- ignore empty image URLs

											
										
										
											4 years ago
+								            if "//twitpic.com/" in url and "/photos/" not in url:
-												[twitter] restore TwitPic support

											
										
										
											4 years ago
+								                response = self.request(url, fatal=False)
 								                if response.status_code >= 400:
 								                    continue
 								                url = text.extract(
 								                    response.text, 'name="twitter:image" value="', '"')[0]
-												[twitter] improve twitpic extraction (fixes #1019)

- ignore twitpic.com/photos/… URLs
- ignore empty image URLs

											
										
										
											4 years ago
+								                if url:
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								                    files.append({"url": url})
-												[twitter] restore TwitPic support

											
										
										
											4 years ago
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								    def _transform_tweet(self, tweet):
 								        entities = tweet["entities"]
 								        tdata = {
 								            "tweet_id"      : text.parse_int(tweet["id_str"]),
 								            "retweet_id"    : text.parse_int(
 								                tweet.get("retweeted_status_id_str")),
 								            "quote_id"      : text.parse_int(
 								                tweet.get("quoted_status_id_str")),
 								            "reply_id"      : text.parse_int(
 								                tweet.get("in_reply_to_status_id_str")),
 								            "date"          : text.parse_datetime(
 								                tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
 								            "user"          : self._transform_user(tweet["user"]),
 								            "lang"          : tweet["lang"],
 								            "content"       : tweet["full_text"],
 								            "favorite_count": tweet["favorite_count"],
 								            "quote_count"   : tweet["quote_count"],
 								            "reply_count"   : tweet["reply_count"],
 								            "retweet_count" : tweet["retweet_count"],
 								        }
 								        hashtags = entities.get("hashtags")
 								        if hashtags:
 								            tdata["hashtags"] = [t["text"] for t in hashtags]
 								        mentions = entities.get("user_mentions")
 								        if mentions:
 								            tdata["mentions"] = [{
 								                "id": text.parse_int(u["id_str"]),
 								                "name": u["screen_name"],
 								                "nick": u["name"],
 								            } for u in mentions]
-												[twitter] add 'reply_to' metadata to replies

											
										
										
											4 years ago
+								        if "in_reply_to_screen_name" in tweet:
 								            tdata["reply_to"] = tweet["in_reply_to_screen_name"]
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								        if "author" in tweet:
 								            tdata["author"] = self._transform_user(tweet["author"])
-												[twitter] always provide an 'author' field (#831, #833)

The idea was to have less metadata clutter for most Tweets where
'author' and 'user' are the same (non-retweets), and only provide
a 'user' field.

The original Tweet author could be gotten with
{author[…]|user[…]}, but basically no one knows about that.

											
										
										
											4 years ago
+								        else:
 								            tdata["author"] = tdata["user"]
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
 								        return tdata
 								    def _transform_user(self, user):
 								        uid = user["id_str"]
 								        cache = self._user_cache
 								        if uid not in cache:
 								            cache[uid] = {
 								                "id"              : text.parse_int(uid),
 								                "name"            : user["screen_name"],
 								                "nick"            : user["name"],
 								                "description"     : user["description"],
 								                "location"        : user["location"],
 								                "date"            : text.parse_datetime(
 								                    user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
 								                "verified"        : user.get("verified", False),
 								                "profile_banner"  : user.get("profile_banner_url", ""),
 								                "profile_image"   : user.get(
 								                    "profile_image_url_https", "").replace("_normal.", "."),
 								                "favourites_count": user["favourites_count"],
 								                "followers_count" : user["followers_count"],
 								                "friends_count"   : user["friends_count"],
 								                "listed_count"    : user["listed_count"],
 								                "media_count"     : user["media_count"],
 								                "statuses_count"  : user["statuses_count"],
 								            }
 								        return cache[uid]
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								    def metadata(self):
 								        """Return general metadata"""
-												[twitter] improve

- update metadata structure
  - combine all user… entries into their own dict
  - let 'user' always specify the Timeline owner
  - add 'author' entry that specifies the original Tweet author
- create directories per post (closes #491)
- fix username issues with /i/web/ URLs

											
										
										
											5 years ago
+								        return {}
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
 								    def tweets(self):
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								        """Yield all relevant tweet objects"""
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
-												[twitter] add login support (#214)

											
										
										
											6 years ago
+								    def login(self):
 								        """Log in using the credentials from _get_auth_info(), if any"""
 								        username, password = self._get_auth_info()
 								        # without a username no login is attempted
 								        if username:
 								            # pass the cookie dict returned by _login_impl()
 								            # on to _update_cookies()
 								            self._update_cookies(self._login_impl(username, password))
 								    @cache(maxage=360*24*3600, keyarg=1)
 								    def _login_impl(self, username, password):
 								        self.log.info("Logging in as %s", username)
-												[twitter] login using the mobile nojs login page

											
										
										
											4 years ago
+								        url = "https://mobile.twitter.com/i/nojs_router"
 								        params = {"path": "/login"}
 								        headers = {"Referer": self.root + "/", "Origin": self.root}
 								        page = self.request(
 								            url, method="POST", params=params, headers=headers, data={}).text
-												[twitter] add login support (#214)

											
										
										
											6 years ago
+								        pos = page.index('name="authenticity_token"')
-												[twitter] login using the mobile nojs login page

											
										
										
											4 years ago
+								        token = text.extract(page, 'value="', '"', pos)[0]
-												[twitter] add login support (#214)

											
										
										
											6 years ago
-												[twitter] login using the mobile nojs login page

											
										
										
											4 years ago
+								        url = "https://mobile.twitter.com/sessions"
-												[twitter] add login support (#214)

											
										
										
											6 years ago
+								        data = {
-												[twitter] login using the mobile nojs login page

											
										
										
											4 years ago
+								            "authenticity_token"        : token,
-												[twitter] add login support (#214)

											
										
										
											6 years ago
+								            "session[username_or_email]": username,
 								            "session[password]"         : password,
 								            "remember_me"               : "1",
-												[twitter] login using the mobile nojs login page

											
										
										
											4 years ago
+								            "wfa"                       : "1",
 								            "commit"                    : "+Log+in+",
 								            "ui_metrics"                : "",
-												[twitter] add login support (#214)

											
										
										
											6 years ago
+								        }
-												[twitter] login using the mobile nojs login page

											
										
										
											4 years ago
+								        response = self.request(url, method="POST", data=data)
 								        cookies = {
-												[twitter] use a simpler data structure to store cookies in cache

Use a dict with name-value pairs instead of an entire
RequestsCookieJar object.

											
										
										
											5 years ago
+								            cookie.name: cookie.value
 								            for cookie in self.session.cookies
-												[twitter] login using the mobile nojs login page

											
										
										
											4 years ago
+								            if cookie.domain == self.cookiedomain
-												[twitter] use a simpler data structure to store cookies in cache

Use a dict with name-value pairs instead of an entire
RequestsCookieJar object.

											
										
										
											5 years ago
+								        }
-												[twitter] login using the mobile nojs login page

											
										
										
											4 years ago
 								        if "/error" in response.url or "auth_token" not in cookies:
 								            raise exception.AuthenticationError()
 								        return cookies
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
+								class TwitterTimelineExtractor(TwitterExtractor):
 								    """Extractor for all images from a user's timeline"""
 								    subcategory = "timeline"
-												[twitter] support '/intent/user?user_id=…' URLs (#980)

											
										
										
											4 years ago
+								    pattern = BASE_PATTERN + \
-												remove '&' from URL patterns

'/?&#' -> '/?#' and '?&#' -> '?#'

According to https://www.ietf.org/rfc/rfc3986.txt, URLs are
"organized hierarchically" by using "the slash ("/"), question
mark ("?"), and number sign ("#") characters to delimit components"

											
										
										
											4 years ago
+								        r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])|intent/user\?user_id=(\d+))"
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								    test = (
 								        ("https://twitter.com/supernaturepics", {
 								            "range": "1-40",
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								            "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								        }),
 								        ("https://mobile.twitter.com/supernaturepics?p=i"),
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											4 years ago
+								        ("https://www.twitter.com/id:2976459548"),
-												[twitter] support '/intent/user?user_id=…' URLs (#980)

											
										
										
											4 years ago
+								        ("https://twitter.com/intent/user?user_id=2976459548"),
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								    )
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
-												[twitter] support '/intent/user?user_id=…' URLs (#980)

											
										
										
											4 years ago
+								    def __init__(self, match):
 								        TwitterExtractor.__init__(self, match)
 								        uid = match.group(2)
 								        if uid:
 								            self.user = "id:" + uid
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
+								    def tweets(self):
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								        return TwitterAPI(self).timeline_profile(self.user)
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
 								class TwitterMediaExtractor(TwitterExtractor):
 								    """Extractor for all images from a user's Media Tweets"""
 								    subcategory = "media"
-												remove '&' from URL patterns

'/?&#' -> '/?#' and '?&#' -> '?#'

According to https://www.ietf.org/rfc/rfc3986.txt, URLs are
"organized hierarchically" by using "the slash ("/"), question
mark ("?"), and number sign ("#") characters to delimit components"

											
										
										
											4 years ago
+								    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/media(?!\w)"
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								    test = (
 								        ("https://twitter.com/supernaturepics/media", {
 								            "range": "1-40",
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								            "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								        }),
 								        ("https://mobile.twitter.com/supernaturepics/media#t"),
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											4 years ago
+								        ("https://www.twitter.com/id:2976459548/media"),
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								    )
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
 								    def tweets(self):
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								        return TwitterAPI(self).timeline_media(self.user)
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											5 years ago
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											4 years ago
+								class TwitterLikesExtractor(TwitterExtractor):
 								    """Extractor for liked tweets"""
 								    subcategory = "likes"
-												remove '&' from URL patterns

'/?&#' -> '/?#' and '?&#' -> '?#'

According to https://www.ietf.org/rfc/rfc3986.txt, URLs are
"organized hierarchically" by using "the slash ("/"), question
mark ("?"), and number sign ("#") characters to delimit components"

											
										
										
											4 years ago
+								    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)"
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											4 years ago
+								    test = ("https://twitter.com/supernaturepics/likes",)
 								    def tweets(self):
 								        return TwitterAPI(self).timeline_favorites(self.user)
 								class TwitterBookmarkExtractor(TwitterExtractor):
 								    """Extractor for bookmarked tweets"""
 								    subcategory = "bookmark"
-												[twitter] add support for nitter.net URLs in pattern (#890)

Please note that URLs are only "translated", all requests are still
done always via the Twitter API.
											
										
										
											4 years ago
+								    pattern = BASE_PATTERN + r"/i/bookmarks()"
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											4 years ago
+								    test = ("https://twitter.com/i/bookmarks",)
 								    def tweets(self):
 								        return TwitterAPI(self).timeline_bookmark()
-												[twitter] add 'list' extractor (#1096)

											
										
										
											4 years ago
class TwitterListExtractor(TwitterExtractor):
    """Extractor for the Tweets of a Twitter list"""
    subcategory = "list"
    # group 1 captures the numeric list ID
    pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$"
    test = ("https://twitter.com/i/lists/784214683683127296", {
        "range": "1-40",
        "count": 40,
        "archive": False,
    })

    def tweets(self):
        """Return the timeline of the list whose ID was given in the URL"""
        # the base class stores the first pattern group as 'self.user';
        # for this extractor that group is the list ID
        list_id = self.user
        return TwitterAPI(self).timeline_list(list_id)
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											4 years ago
class TwitterListMembersExtractor(TwitterExtractor):
    """Extractor for the member accounts of a Twitter list"""
    subcategory = "list-members"
    pattern = BASE_PATTERN + r"/i/lists/(\d+)/members"
    test = ("https://twitter.com/i/lists/784214683683127296/members",)

    def items(self):
        """Yield one Queue message per list member

        Each member dict is tagged with the timeline extractor and
        paired with an 'intent/user' URL built from its 'rest_id'.
        """
        self.login()
        build_url = "{}/intent/user?user_id={}".format
        members = TwitterAPI(self).list_members(self.user)

        for member in members:
            member["_extractor"] = TwitterTimelineExtractor
            target = build_url(self.root, member["rest_id"])
            yield Message.Queue, target, member
-												Add search downloading to twitter.py (#448)

Adds the functionality to download search results on twitter.com/search. Since twitter only allows downloading of up to 3,200 of a user's most recent tweets, you will be unable to download old images from users with a lot of tweets. To bypass this, you can use the twitter search to get the tweets from the sections in time you were stopped at. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use will look something like this https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en

The _tweets_from_api function had to be changed because it would not get the next page of results using the last "data-tweet-id". It would return the same JSON but with a "min_position" string added. Using this string for the "max_position" param from the second page onwards correctly returned the next pages. This change does not interfere with how the other extractors work as far as I know. The 2 regex patterns in the extractors had to be changed to not match the search URL.
											
										
										
											5 years ago
class TwitterSearchExtractor(TwitterExtractor):
    """Extractor for all images from a search timeline"""
    subcategory = "search"
    directory_fmt = ("{category}", "Search", "{search}")
    # group 1 captures the (still URL-encoded) value of the 'q' parameter
    pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
    test = ("https://twitter.com/search?q=nature", {
        "range": "1-40",
        "count": 40,
        "archive": False,
    })

    def metadata(self):
        """Return the decoded search query under the 'search' key"""
        query = text.unquote(self.user)
        return {"search": query}

    def tweets(self):
        """Return the search results for the decoded query"""
        api = TwitterAPI(self)
        return api.search(text.unquote(self.user))
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											5 years ago
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
class TwitterTweetExtractor(TwitterExtractor):
    """Extractor for images from individual tweets"""
    subcategory = "tweet"
    # group 1: user name (or 'i/web'), group 2: numeric Tweet ID
    pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)"
    test = (
        ("https://twitter.com/supernaturepics/status/604341487988576256", {
            "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580",
            "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
        }),
        # 4 images
        ("https://twitter.com/perrypumas/status/894001459754180609", {
            "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6",
        }),
        # video
        ("https://twitter.com/perrypumas/status/1065692031626829824", {
            "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5",
        }),
        # content with emoji, newlines, hashtags (#338)
        ("https://twitter.com/playpokemon/status/1263832915173048321", {
            "keyword": {"content": (
                r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
                "Gifts! \n\nYou’ll be able to receive four Galarian form "
                "Pokémon with Hidden Abilities, plus some very useful items. "
                "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
            )},
        }),
        # Reply to deleted tweet (#403, #838)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "pattern": r"https://pbs.twimg.com/media/EDzS7VrU0AAFL4_.jpg:orig",
        }),
        # 'replies' option (#705)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "options": (("replies", False),),
            "count": 0,
        }),
        # quoted tweet (#526, #854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "pattern": r"https://pbs\.twimg\.com/media/Ea[KG].+\.jpg",
            "count": 8,
        }),
        # "quoted" option (#854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "options": (("quoted", False),),
            "pattern": r"https://pbs\.twimg\.com/media/EaK.+\.jpg",
            "count": 4,
        }),
        # TwitPic embeds (#579)
        ("https://twitter.com/i/web/status/112900228289540096", {
            "options": (("twitpic", True),),
            "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
            "count": 3,
        }),
        # Nitter tweet (#890)
        ("https://nitter.net/ed1conf/status/1163841619336007680", {
            "url": "0f6a841e23948e4320af7ae41125e0c5b3cadc98",
            "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
        }),
        # Twitter card (#1005)
        ("https://twitter.com/billboard/status/1306599586602135555", {
            "options": (("cards", True),),
            "pattern": r"https://pbs.twimg.com/card_img/\d+/",
        }),
        # original retweets (#1026)
        ("https://twitter.com/jessica_3978/status/1296304589591810048", {
            "options": (("retweets", "original"),),
            "count": 2,
            "keyword": {
                "tweet_id": 1296296016002547713,
                "date"    : "dt:2020-08-20 04:00:28",
            },
        }),
    )

    def __init__(self, match):
        """Store the Tweet ID (second pattern group) from the matched URL"""
        TwitterExtractor.__init__(self, match)
        self.tweet_id = match.group(2)

    def tweets(self):
        """Return the Tweet(s) the API yields for the stored Tweet ID"""
        api = TwitterAPI(self)
        return api.tweet(self.tweet_id)
-												[twitter] handle API rate limits (#526)

											
										
										
											5 years ago
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								class TwitterAPI():
 								    def __init__(self, extractor):
 								        self.extractor = extractor
 								        self.headers = {
 								            "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
 								                             "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
 								                             "4FA33AGWWjCpTnA",
 								            "x-guest-token": None,
 								            "x-twitter-client-language": "en",
 								            "x-twitter-active-user": "yes",
 								            "x-csrf-token": None,
 								            "Origin": "https://twitter.com",
 								            "Referer": "https://twitter.com/",
 								        }
 								        self.params = {
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											5 years ago
+								            "include_profile_interstitial_type": "1",
 								            "include_blocking": "1",
 								            "include_blocked_by": "1",
 								            "include_followed_by": "1",
 								            "include_want_retweets": "1",
 								            "include_mute_edge": "1",
 								            "include_can_dm": "1",
 								            "include_can_media_tag": "1",
 								            "skip_status": "1",
 								            "cards_platform": "Web-12",
 								            "include_cards": "1",
 								            "include_composer_source": "true",
 								            "include_ext_alt_text": "true",
 								            "include_reply_count": "1",
 								            "tweet_mode": "extended",
 								            "include_entities": "true",
 								            "include_user_entities": "true",
 								            "include_ext_media_color": "true",
 								            "include_ext_media_availability": "true",
 								            "send_error_codes": "true",
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								            "simple_quoted_tweet": "true",
 								            #  "count": "20",
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											5 years ago
+								            "count": "100",
 								            "cursor": None,
-												[twitter] improve pagination

											
										
										
											4 years ago
+								            "ext": "mediaStats,highlightedLabel,cameraMoment",
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								            "include_quote_count": "true",
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											5 years ago
+								        }
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
 								        cookies = self.extractor.session.cookies
 								        # CSRF
-												add a general 'generate_csrf_token()' function

											
										
										
											4 years ago
+								        csrf = util.generate_csrf_token()
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								        self.headers["x-csrf-token"] = csrf
 								        cookies.set("ct0", csrf, domain=".twitter.com")
 								        if cookies.get("auth_token", domain=".twitter.com"):
-												[twitter] use 'https://twitter.com/i/api/' for logged in users

Doesn't seem to make a difference from what I can tell,
i.e. downloaded files are the same, but the website does it.

											
										
										
											4 years ago
+								            # logged in
 								            self.root = "https://twitter.com/i/api/"
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								            self.headers["x-twitter-auth-type"] = "OAuth2Session"
 								        else:
-												[twitter] use 'https://twitter.com/i/api/' for logged in users

Doesn't seem to make a difference from what I can tell,
i.e. downloaded files are the same, but the website does it.

											
										
										
											4 years ago
+								            # guest
 								            self.root = "https://api.twitter.com/"
-												[twitter] move '_guest_token()' into TwitterAPI class

											
										
										
											4 years ago
+								            guest_token = self._guest_token()
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								            self.headers["x-guest-token"] = guest_token
 								            cookies.set("gt", guest_token, domain=".twitter.com")
 								    def tweet(self, tweet_id):
 								        endpoint = "2/timeline/conversation/{}.json".format(tweet_id)
-												[twitter] improve handling of quoted tweets (#854)

Split each "quote" into two parts:
- the original tweet
- the tweet that quoted the original

											
										
										
											4 years ago
+								        tweets = []
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								        for tweet in self._pagination(endpoint):
-												[twitter] extend 'retweets' option (closes #1026)

Setting 'retweets' to '"original"' will use metadata from the
original retweeted Tweets, and not from the Retweet entry.

											
										
										
											4 years ago
+								            if tweet["id_str"] == tweet_id or \
 								                    tweet.get("_retweet_id_str") == tweet_id:
-												[twitter] improve handling of quoted tweets (#854)

Split each "quote" into two parts:
- the original tweet
- the tweet that quoted the original

											
										
										
											4 years ago
+								                tweets.append(tweet)
 								                if "quoted_status_id_str" in tweet:
 								                    tweet_id = tweet["quoted_status_id_str"]
 								                else:
 								                    break
 								        return tweets
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
 								    def timeline_profile(self, screen_name):
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											4 years ago
+								        user_id = self._user_id_by_screen_name(screen_name)
 								        endpoint = "2/timeline/profile/{}.json".format(user_id)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								        return self._pagination(endpoint)
 								    def timeline_media(self, screen_name):
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											4 years ago
+								        user_id = self._user_id_by_screen_name(screen_name)
 								        endpoint = "2/timeline/media/{}.json".format(user_id)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								        return self._pagination(endpoint)
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											4 years ago
+								    def timeline_favorites(self, screen_name):
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											4 years ago
+								        user_id = self._user_id_by_screen_name(screen_name)
 								        endpoint = "2/timeline/favorites/{}.json".format(user_id)
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											4 years ago
+								        return self._pagination(endpoint)
 								    def timeline_bookmark(self):
 								        endpoint = "2/timeline/bookmark.json"
 								        return self._pagination(endpoint)
-												[twitter] add 'list' extractor (#1096)

											
										
										
											4 years ago
+								    def timeline_list(self, list_id):
 								        endpoint = "2/timeline/list.json"
 								        params = self.params.copy()
 								        params["list_id"] = list_id
 								        params["ranking_mode"] = "reverse_chronological"
 								        return self._pagination(endpoint, params)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								    def search(self, query):
 								        endpoint = "2/search/adaptive.json"
 								        params = self.params.copy()
-												[twitter] improve search results (fixes #847)

Adding 'tweet_search_mode=live' to the query parameters
is the most important part here.

											
										
										
											4 years ago
+								        params["q"] = query
 								        params["tweet_search_mode"] = "live"
 								        params["query_source"] = "typed_query"
 								        params["pc"] = "1"
 								        params["spelling_corrections"] = "1"
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								        return self._pagination(
 								            endpoint, params, "sq-I-t-", "sq-cursor-bottom")
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											4 years ago
+								    def list_members(self, list_id):
 								        endpoint = "graphql/M74V2EwlxxVYGB4DbyAphQ/ListMembers"
 								        variables = {
 								            "listId": list_id,
 								            "count" : 20,
 								            "withTweetResult": False,
 								            "withUserResult" : False,
 								        }
 								        return self._pagination_members(endpoint, variables)
-												[twitter] add 'list' extractor (#1096)

											
										
										
											4 years ago
+								    def list_by_rest_id(self, list_id):
 								        endpoint = "graphql/LXXTUytSX1QY-2p8Xp9BFA/ListByRestId"
 								        params = {"variables": '{"listId":"' + list_id + '"'
 								                               ',"withUserResult":false}'}
 								        try:
 								            return self._call(endpoint, params)["data"]["list"]
 								        except KeyError:
 								            raise exception.NotFoundError("list")
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								    def user_by_screen_name(self, screen_name):
-												[twitter] update GraphQL endpoint & fix width/height entries

											
										
										
											4 years ago
+								        endpoint = "graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName"
 								        params = {"variables": '{"screen_name":"' + screen_name + '"'
 								                               ',"withHighlightedLabel":true}'}
-												[twitter] raise proper exception if user doesn't exist (#891)

											
										
										
											4 years ago
+								        try:
 								            return self._call(endpoint, params)["data"]["user"]
 								        except KeyError:
 								            raise exception.NotFoundError("user")
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											4 years ago
+								    def _user_id_by_screen_name(self, screen_name):
 								        if screen_name.startswith("id:"):
 								            return screen_name[3:]
 								        return self.user_by_screen_name(screen_name)["rest_id"]
-												[twitter] move '_guest_token()' into TwitterAPI class

											
										
										
											4 years ago
+								    @cache(maxage=3600)
 								    def _guest_token(self):
 								        endpoint = "1.1/guest/activate.json"
 								        return self._call(endpoint, None, "POST")["guest_token"]
 								    def _call(self, endpoint, params, method="GET"):
-												[twitter] use 'https://twitter.com/i/api/' for logged in users

Doesn't seem to make a difference from what I can tell,
i.e. downloaded files are the same, but the website does it.

											
										
										
											4 years ago
+								        url = self.root + endpoint
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								        response = self.extractor.request(
-												[twitter] move '_guest_token()' into TwitterAPI class

											
										
										
											4 years ago
+								            url, method=method, params=params, headers=self.headers,
 								            fatal=None)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								        if response.status_code < 400:
 								            return response.json()
 								        if response.status_code == 429:
-												[twitter] handle 429 responses without x-rate-limit-reset header

											
										
										
											4 years ago
+								            until = response.headers.get("x-rate-limit-reset")
 								            self.extractor.wait(until=until, seconds=(None if until else 60))
 								            return self._call(endpoint, params, method)
-												[twitter] improve error message formatting

											
										
										
											4 years ago
 								        try:
 								            msg = ", ".join(
 								                '"' + error["message"] + '"'
 								                for error in response.json()["errors"]
 								            )
 								        except Exception:
 								            msg = response.text
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								        raise exception.StopExtraction(
-												[twitter] improve error message formatting

											
										
										
											4 years ago
+								            "%s %s (%s)", response.status_code, response.reason, msg)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
 								    def _pagination(self, endpoint, params=None,
 								                    entry_tweet="tweet-", entry_cursor="cursor-bottom-"):
 								        if params is None:
 								            params = self.params.copy()
-												[twitter] extend 'retweets' option (closes #1026)

Setting 'retweets' to '"original"' will use metadata from the
original retweeted Tweets, and not from the Retweet entry.

											
										
										
											4 years ago
+								        original_retweets = (self.extractor.retweets == "original")
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											5 years ago
 								        while True:
-												[twitter] improve pagination

											
										
										
											4 years ago
+								            cursor = tweet = None
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								            data = self._call(endpoint, params)
-												[twitter] improve pagination

											
										
										
											4 years ago
 								            instr = data["timeline"]["instructions"]
 								            if not instr:
 								                return
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											5 years ago
+								            tweets = data["globalObjects"]["tweets"]
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								            users = data["globalObjects"]["users"]
-												[twitter] improve pagination

											
										
										
											4 years ago
+								            for entry in instr[0]["addEntries"]["entries"]:
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
 								                if entry["entryId"].startswith(entry_tweet):
-												[twitter] improve handling of deleted tweets (fixes #838)

											
										
										
											4 years ago
+								                    try:
 								                        tweet = tweets[
 								                            entry["content"]["item"]["content"]["tweet"]["id"]]
 								                    except KeyError:
-												[twitter] skip unavailable tweets

											
										
										
											4 years ago
+								                        self.extractor.log.debug(
-												[twitter] add debug messages for all skipped Tweets (#867)

											
										
										
											4 years ago
+								                            "Skipping %s (deleted)",
 								                            entry["entryId"][len(entry_tweet):])
-												[twitter] skip unavailable tweets

											
										
										
											4 years ago
+								                        continue
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
-												[twitter] improve handling of quoted tweets (#854)

Split each "quote" into two parts:
- the original tweet
- the tweet that quoted the original

											
										
										
											4 years ago
+								                    if "retweeted_status_id_str" in tweet:
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								                        retweet = tweets.get(tweet["retweeted_status_id_str"])
-												[twitter] extend 'retweets' option (closes #1026)

Setting 'retweets' to '"original"' will use metadata from the
original retweeted Tweets, and not from the Retweet entry.

											
										
										
											4 years ago
+								                        if original_retweets:
 								                            if not retweet:
 								                                continue
 								                            retweet["_retweet_id_str"] = tweet["id_str"]
 								                            tweet = retweet
 								                        elif retweet:
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								                            tweet["author"] = users[retweet["user_id_str"]]
-												[twitter] extend 'retweets' option (closes #1026)

Setting 'retweets' to '"original"' will use metadata from the
original retweeted Tweets, and not from the Retweet entry.

											
										
										
											4 years ago
+								                    tweet["user"] = users[tweet["user_id_str"]]
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								                    yield tweet
-												[twitter] improve handling of quoted tweets (#854)

Split each "quote" into two parts:
- the original tweet
- the tweet that quoted the original

											
										
										
											4 years ago
+								                    if "quoted_status_id_str" in tweet:
 								                        quoted = tweets.get(tweet["quoted_status_id_str"])
 								                        if quoted:
 								                            quoted["author"] = users[quoted["user_id_str"]]
 								                            quoted["user"] = tweet["user"]
-												[twitter] add option to filter media from quoted tweets (#854)

											
										
										
											4 years ago
+								                            quoted["quoted"] = True
-												[twitter] improve handling of quoted tweets (#854)

Split each "quote" into two parts:
- the original tweet
- the tweet that quoted the original

											
										
										
											4 years ago
+								                            yield quoted
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								                elif entry["entryId"].startswith(entry_cursor):
-												[twitter] improve pagination

											
										
										
											4 years ago
+								                    cursor = entry["content"]["operation"]["cursor"]
 								                    if not cursor.get("stopOnEmptyResponse"):
 								                        # keep going even if there are no tweets
 								                        tweet = True
 								                    cursor = cursor["value"]
 								            if "replaceEntry" in instr[-1] :
 								                cursor = (instr[-1]["replaceEntry"]["entry"]
 								                          ["content"]["operation"]["cursor"]["value"])
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
-												[twitter] improve pagination

											
										
										
											4 years ago
+								            if not cursor or not tweet:
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											5 years ago
+								                return
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								            params["cursor"] = cursor
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											4 years ago
 								    def _pagination_members(self, endpoint, variables):
 								        while True:
 								            cursor = entry = stop = None
 								            params = {"variables": json.dumps(variables)}
 								            data = self._call(endpoint, params)
 								            try:
 								                instructions = (data["data"]["list"]["members_timeline"]
 								                                ["timeline"]["instructions"])
 								            except KeyError:
 								                raise exception.AuthorizationError()
 								            for instr in instructions:
 								                if instr["type"] == "TimelineAddEntries":
 								                    for entry in instr["entries"]:
 								                        if entry["entryId"].startswith("user-"):
 								                            yield entry["content"]["itemContent"]["user"]
 								                        elif entry["entryId"].startswith("cursor-bottom-"):
 								                            cursor = entry["content"]["value"]
 								                elif instr["type"] == "TimelineTerminateTimeline":
 								                    if instr["direction"] == "Bottom":
 								                        stop = True
 								            if stop or not cursor or not entry:
 								                return
 								            variables["cursor"] = cursor