gallery-dl/gallery_dl/extractor/twitter.py

# -*- coding: utf-8 -*-

# Copyright 2016-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://twitter.com/"""

from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
import json

# Optional scheme and optional "www."/"mobile." subdomain, followed by one
# of the accepted hosts. Extractor patterns append their path part to this.
BASE_PATTERN = (
    r"(?:https?://)?"
    r"(?:www\.|mobile\.)?"
    r"(?:twitter\.com|nitter\.net)"
)


class TwitterExtractor(Extractor):
    """Base class for twitter extractors"""
    category = "twitter"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{retweet_id}_{num}"
    cookiedomain = ".twitter.com"
    cookienames = ("auth_token",)
    root = "https://twitter.com"

    def __init__(self, match):
        """Remember the matched user and load all extractor options

        The first pattern group is stored as self.user. Every option is
        read with self.config() and stored as an attribute of the same
        or a similar name.
        """
        Extractor.__init__(self, match)
        self.user = match.group(1)

        # (attribute name, option name, default value)
        options = (
            ("textonly", "text-tweets", False),
            ("retweets", "retweets", False),
            ("replies", "replies", True),
            ("twitpic", "twitpic", False),
            ("pinned", "pinned", False),
            ("quoted", "quoted", False),
            ("videos", "videos", True),
            ("cards", "cards", True),
        )
        for attr, option, default in options:
            setattr(self, attr, self.config(option, default))

        self._user_cache = {}
        self._init_sizes()

    def _init_sizes(self):
        size = self.config("size")
        if size is None:
            self._size_image = "orig"
            self._size_fallback = ("4096x4096", "large", "medium", "small")
        else:
            if isinstance(size, str):
                size = size.split(",")
            self._size_image = size[0]
            self._size_fallback = size[1:]

    def items(self):
        """Yield directory and URL messages for all selected tweets

        Logs in, creates the API object, and walks over self.tweets().
        Retweets, quoted tweets, and replies are dropped according to the
        'retweets', 'quoted', and 'replies' options. Tweets without any
        files are dropped unless 'text-tweets' is enabled.
        """
        self.login()
        self.api = TwitterAPI(self)
        metadata = self.metadata()

        for tweet in self.tweets():
            # the fields inspected below may be nested under 'legacy'
            data = tweet["legacy"] if "legacy" in tweet else tweet

            if "retweeted_status_id_str" in data and not self.retweets:
                self.log.debug("Skipping %s (retweet)", data["id_str"])
                continue

            if "quoted_by_id_str" in data and not self.quoted:
                self.log.debug("Skipping %s (quoted tweet)", data["id_str"])
                continue

            if "in_reply_to_user_id_str" in data:
                if not self.replies:
                    skip_reply = True
                elif self.replies == "self":
                    # keep only replies addressed to the tweet's own user
                    skip_reply = (data["in_reply_to_user_id_str"] !=
                                  data["user_id_str"])
                else:
                    skip_reply = False
                if skip_reply:
                    self.log.debug("Skipping %s (reply)", data["id_str"])
                    continue

            files = []
            if "extended_entities" in data:
                entities = data["extended_entities"]["media"]
                self._extract_media(data, entities, files)
            if "card" in tweet and self.cards:
                self._extract_card(tweet, files)
            if self.twitpic:
                self._extract_twitpic(data, files)
            if not (files or self.textonly):
                continue

            tdata = self._transform_tweet(tweet)
            tdata.update(metadata)
            yield Message.Directory, tdata

            for num, file in enumerate(files, 1):
                # 'num' is stored in tdata so it gets copied into 'file'
                tdata["num"] = num
                file.update(tdata)
                url = file.pop("url")
                if "extension" not in file:
                    text.nameext_from_url(url, file)
                yield Message.Url, url, file

    def _extract_media(self, tweet, entities, files):
        for media in entities:
            width = media["original_info"].get("width", 0)
            height = media["original_info"].get("height", 0)

            if "video_info" in media:
                if self.videos == "ytdl":
                    files.append({
                        "url": "ytdl:{}/i/web/status/{}".format(
                            self.root, tweet["id_str"]),
                        "width"    : width,
                        "height"   : height,
                        "extension": None,
                    })
                elif self.videos:
                    video_info = media["video_info"]
                    variant = max(
                        video_info["variants"],
                        key=lambda v: v.get("bitrate", 0),
                    )
                    files.append({
                        "url"     : variant["url"],
                        "width"   : width,
                        "height"  : height,
                        "bitrate" : variant.get("bitrate", 0),
                        "duration": video_info.get(
                            "duration_millis", 0) / 1000,
                    })
            elif "media_url_https" in media:
                url = media["media_url_https"]
                base, _, fmt = url.rpartition(".")
                base += "?format=" + fmt + "&name="
                files.append(text.nameext_from_url(url, {
                    "url"      : base + self._size_image,
                    "width"    : width,
                    "height"   : height,
                    "_fallback": self._image_fallback(base),
                }))
            else:
                files.append({"url": media["media_url"]})

    def _image_fallback(self, base):
        for fmt in self._size_fallback:
            yield base + fmt

    def _extract_card(self, tweet, files):
        card = tweet["card"]
        if "legacy" in card:
            card = card["legacy"]
        name = card["name"]

        if name in ("summary", "summary_large_image"):
            bvals = card["binding_values"]
            if isinstance(bvals, list):
                bvals = {
                    bval["key"]: bval["value"]
                    for bval in card["binding_values"]
                }
            for prefix in ("photo_image_full_size_",
                           "summary_photo_image_",
                           "thumbnail_image_"):
                for size in ("original", "x_large", "large", "small"):
                    key = prefix + size
                    if key in bvals:
                        value = bvals[key].get("image_value")
                        if value and "url" in value:
                            files.append(value)
                            return
        elif name == "unified_card":
            bvals = card["binding_values"]
            if isinstance(bvals, list):
                for bval in card["binding_values"]:
                    if bval["key"] == "unified_card":
                        bval = bval["value"]["string_value"]
                        break
            else:
                bval = bvals["unified_card"]["string_value"]
            data = json.loads(bval)
            if data.get("type") == "image_carousel_website":
                self._extract_media(
                    tweet, data["media_entities"].values(), files)
                return

        if self.cards == "ytdl":
            tweet_id = tweet.get("rest_id") or tweet["id_str"]
            url = "ytdl:{}/i/web/status/{}".format(self.root, tweet_id)
            files.append({"url": url})

    def _extract_twitpic(self, tweet, files):
        for url in tweet["entities"].get("urls", ()):
            url = url["expanded_url"]
            if "//twitpic.com/" in url and "/photos/" not in url:
                response = self.request(url, fatal=False)
                if response.status_code >= 400:
                    continue
                url = text.extract(
                    response.text, 'name="twitter:image" value="', '"')[0]
                if url:
                    files.append({"url": url})

    def _transform_tweet(self, tweet):
        """Build the metadata dict for a single tweet object

        The user object is taken from tweet['core'] if present and from
        tweet['user'] otherwise; all remaining fields are read from
        tweet['legacy'] if that key exists and from the tweet itself
        if not.
        """
        if "core" in tweet:
            raw_user = tweet["core"]["user_results"]["result"]
        else:
            raw_user = tweet["user"]
        user = self._transform_user(raw_user)

        if "legacy" in tweet:
            tweet = tweet["legacy"]
        entities = tweet["entities"]

        get = tweet.get
        parse_int = text.parse_int
        tdata = {
            "tweet_id"      : parse_int(tweet["id_str"]),
            "retweet_id"    : parse_int(get("retweeted_status_id_str")),
            "quote_id"      : parse_int(get("quoted_status_id_str")),
            "reply_id"      : parse_int(get("in_reply_to_status_id_str")),
            "date"          : text.parse_datetime(
                tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
            "user"          : user,
            "lang"          : tweet["lang"],
            "favorite_count": tweet["favorite_count"],
            "quote_count"   : tweet["quote_count"],
            "reply_count"   : tweet["reply_count"],
            "retweet_count" : tweet["retweet_count"],
        }

        hashtags = entities.get("hashtags")
        if hashtags:
            tdata["hashtags"] = [tag["text"] for tag in hashtags]

        mentions = entities.get("user_mentions")
        if mentions:
            tdata["mentions"] = [
                {
                    "id": parse_int(mention["id_str"]),
                    "name": mention["screen_name"],
                    "nick": mention["name"],
                }
                for mention in mentions
            ]

        # replace shortened links in the text with their expanded form
        content = tweet["full_text"]
        for url in entities.get("urls") or ():
            content = content.replace(url["url"], url["expanded_url"])

        # drop a trailing t.co link that survived the replacement above
        head, _, last = content.rpartition(" ")
        if last.startswith("https://t.co/"):
            tdata["content"] = head
        else:
            tdata["content"] = content

        if "in_reply_to_screen_name" in tweet:
            tdata["reply_to"] = tweet["in_reply_to_screen_name"]
        if "quoted_by_id_str" in tweet:
            tdata["quote_by"] = parse_int(tweet["quoted_by_id_str"])

        if "author" in tweet:
            tdata["author"] = self._transform_user(tweet["author"])
        else:
            tdata["author"] = user

        return tdata

    def _transform_user(self, user):
        try:
            return self._user_cache[user.get("rest_id") or user["id_str"]]
        except KeyError:
            pass

        uid = user.get("rest_id") or user["id_str"]
        if "legacy" in user:
            user = user["legacy"]
        entities = user["entities"]

        self._user_cache[uid] = udata = {
            "id"              : text.parse_int(uid),
            "name"            : user["screen_name"],
            "nick"            : user["name"],
            "location"        : user["location"],
            "date"            : text.parse_datetime(
                user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
            "verified"        : user.get("verified", False),
            "profile_banner"  : user.get("profile_banner_url", ""),
            "profile_image"   : user.get(
                "profile_image_url_https", "").replace("_normal.", "."),
            "favourites_count": user["favourites_count"],
            "followers_count" : user["followers_count"],
            "friends_count"   : user["friends_count"],
            "listed_count"    : user["listed_count"],
            "media_count"     : user["media_count"],
            "statuses_count"  : user["statuses_count"],
        }

        descr = user["description"]
        urls = entities["description"].get("urls")
        if urls:
            for url in urls:
                descr = descr.replace(url["url"], url["expanded_url"])
        udata["description"] = descr

        if "url" in entities:
            url = entities["url"]["urls"][0]
            udata["url"] = url.get("expanded_url") or url.get("url")

        return udata

    def _users_result(self, users):
        """Yield a queue message for every user object in 'users'

        The 'users' option selects the target: unset or "timeline" queues
        the user's timeline, "media" queues the media timeline, and any
        other value is used as a format string that is filled in with the
        user object.
        """
        userfmt = self.config("users")
        if userfmt == "media":
            cls = TwitterMediaExtractor
            template = self.root + "/id:{rest_id}/media"
        elif not userfmt or userfmt == "timeline":
            cls = TwitterTimelineExtractor
            template = self.root + "/i/user/{rest_id}"
        else:
            cls = None
            template = userfmt
        build_url = template.format_map

        for user in users:
            user["_extractor"] = cls
            yield Message.Queue, build_url(user), user

    def metadata(self):
        """Return general metadata

        items() merges the returned dict into the metadata of every
        tweet; this default implementation contributes nothing.
        """
        return {}

    def tweets(self):
        """Yield all relevant tweet objects

        items() iterates over the return value of this method. The base
        implementation consists of this docstring only, so extractors
        that rely on items() provide their own version.
        """

    def login(self):
        if not self._check_cookies(self.cookienames):
            username, password = self._get_auth_info()
            if username:
                self._update_cookies(self._login_impl(username, password))

    @cache(maxage=360*24*3600, keyarg=1)
    def _login_impl(self, username, password):
        """Log in with username and password and return the session cookies

        Raises exception.AuthenticationError if the login ends up on a
        two-factor verification or error page, or if no 'auth_token'
        cookie was set.

        NOTE(review): the result is wrapped by cache() with 'keyarg=1',
        presumably caching per username - confirm against the cache module.
        """
        self.log.info("Logging in as %s", username)

        token = util.generate_token()
        self.session.cookies.clear()
        self.request(self.root + "/login")

        # the same token is sent as cookie and as form field
        response = self.request(
            self.root + "/sessions",
            method="POST",
            cookies={"_mb_tk": token},
            data={
                "redirect_after_login"      : "/",
                "remember_me"               : "1",
                "authenticity_token"        : token,
                "wfa"                       : "1",
                "ui_metrics"                : "{}",
                "session[username_or_email]": username,
                "session[password]"         : password,
            },
        )

        if "/account/login_verification" in response.url:
            raise exception.AuthenticationError(
                "Login with two-factor authentication is not supported")

        cookies = {}
        for cookie in self.session.cookies:
            cookies[cookie.name] = cookie.value

        if "auth_token" not in cookies or "/error" in response.url:
            raise exception.AuthenticationError()
        return cookies


class TwitterTimelineExtractor(TwitterExtractor):
    """Extractor for Tweets from a user's timeline"""
    subcategory = "timeline"
    # group 1: user name from '/<name>'
    # group 2: numeric ID from '/i/user/<id>' or '/intent/user?user_id=<id>'
    pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])"
               r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
    test = (
        ("https://twitter.com/supernaturepics", {
            "range": "1-40",
            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
        }),
        # suspended account (#2216)
        ("https://twitter.com/realDonaldTrump", {
            "exception": exception.NotFoundError,
        }),
        ("https://mobile.twitter.com/supernaturepics?p=i"),
        ("https://www.twitter.com/id:2976459548"),
        ("https://twitter.com/i/user/2976459548"),
        ("https://twitter.com/intent/user?user_id=2976459548"),
    )

    def __init__(self, match):
        """Use 'id:<number>' as user when the URL contains a numeric ID"""
        TwitterExtractor.__init__(self, match)
        numeric_id = match.group(2)
        if not numeric_id:
            return
        self.user = "id:" + numeric_id

    def tweets(self):
        """Return the timeline Tweets the API reports for self.user"""
        timeline = self.api.user_tweets(self.user)
        return timeline


class TwitterRepliesExtractor(TwitterExtractor):
    """Extractor for Tweets from a user's timeline including replies"""
    subcategory = "replies"
    # (?!\w) rejects paths that merely start with 'with_replies'
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/with_replies(?!\w)"
    test = (
        ("https://twitter.com/supernaturepics/with_replies", {
            "range": "1-40",
            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
        }),
        ("https://mobile.twitter.com/supernaturepics/with_replies#t"),
        ("https://www.twitter.com/id:2976459548/with_replies"),
    )

    def tweets(self):
        """Return the Tweets and replies the API reports for self.user"""
        return self.api.user_tweets_and_replies(self.user)


class TwitterMediaExtractor(TwitterExtractor):
    """Extractor for Tweets from a user's Media timeline"""
    subcategory = "media"
    # (?!\w) rejects paths that merely start with 'media'
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/media(?!\w)"
    test = (
        ("https://twitter.com/supernaturepics/media", {
            "range": "1-40",
            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
        }),
        ("https://mobile.twitter.com/supernaturepics/media#t"),
        ("https://www.twitter.com/id:2976459548/media"),
    )

    def tweets(self):
        """Return the media timeline the API reports for self.user"""
        return self.api.user_media(self.user)


class TwitterLikesExtractor(TwitterExtractor):
    """Extractor for liked tweets"""
    subcategory = "likes"
    # (?!\w) rejects paths that merely start with 'likes'
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)"
    test = ("https://twitter.com/supernaturepics/likes",)

    def metadata(self):
        """Return the user whose likes are listed as 'user_likes'"""
        return {"user_likes": self.user}

    def tweets(self):
        """Return the liked Tweets the API reports for self.user"""
        return self.api.user_likes(self.user)


class TwitterBookmarkExtractor(TwitterExtractor):
    """Extractor for bookmarked tweets"""
    subcategory = "bookmark"
    # the empty group exists because TwitterExtractor.__init__
    # reads match.group(1)
    pattern = BASE_PATTERN + r"/i/bookmarks()"
    test = ("https://twitter.com/i/bookmarks",)

    def tweets(self):
        """Return the bookmarked Tweets reported by the API"""
        return self.api.user_bookmarks()


class TwitterListExtractor(TwitterExtractor):
    """Extractor for Twitter lists"""
    subcategory = "list"
    # group 1 is the numeric list ID; it ends up in self.user
    pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$"
    test = ("https://twitter.com/i/lists/784214683683127296", {
        "range": "1-40",
        "count": 40,
        "archive": False,
    })

    def tweets(self):
        """Return the latest Tweets of the list whose ID is in self.user"""
        return self.api.list_latest_tweets_timeline(self.user)


class TwitterListMembersExtractor(TwitterExtractor):
    """Extractor for members of a Twitter list"""
    subcategory = "list-members"
    pattern = BASE_PATTERN + r"/i/lists/(\d+)/members"
    test = ("https://twitter.com/i/lists/784214683683127296/members",)

    def items(self):
        """Return queue messages for all members of the list"""
        self.login()
        api = TwitterAPI(self)
        # self.user holds the numeric list ID captured by the pattern
        members = api.list_members(self.user)
        return self._users_result(members)


class TwitterFollowingExtractor(TwitterExtractor):
    """Extractor for followed users"""
    subcategory = "following"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/following(?!\w)"
    test = (
        ("https://twitter.com/supernaturepics/following"),
        ("https://www.twitter.com/id:2976459548/following"),
    )

    def items(self):
        """Return queue messages for all users followed by self.user"""
        self.login()
        api = TwitterAPI(self)
        followed = api.user_following(self.user)
        return self._users_result(followed)


class TwitterSearchExtractor(TwitterExtractor):
    """Extractor for Twitter search results"""
    subcategory = "search"
    # group 1 is the still percent-encoded value of the 'q' parameter
    pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
    test = ("https://twitter.com/search?q=nature", {
        "range": "1-40",
        "count": 40,
        "archive": False,
    })

    def metadata(self):
        """Return the search query as 'search' metadata

        The value is only percent-decoded, exactly as before, so that
        existing directory and file names do not change.
        """
        return {"search": text.unquote(self.user)}

    def tweets(self):
        """Return the search results for the decoded query

        In a URL query string, '+' stands for a space while a literal
        plus sign is written as '%2B'. The '+' characters therefore get
        replaced before percent-decoding; otherwise a query like
        'from:user+filter:images' would be sent with literal plus signs.
        NOTE(review): assumes text.unquote() leaves '+' untouched, like
        urllib.parse.unquote() does - confirm.
        """
        query = text.unquote(self.user.replace("+", " "))
        return self.api.search_adaptive(query)


class TwitterEventExtractor(TwitterExtractor):
    """Extractor for Tweets from a Twitter Event"""
    subcategory = "event"
    # the 'event' fields used here are supplied by metadata()
    directory_fmt = ("{category}", "Events",
                     "{event[id]} {event[short_title]}")
    # group 1 is the numeric event ID; it ends up in self.user
    pattern = BASE_PATTERN + r"/i/events/(\d+)"
    test = ("https://twitter.com/i/events/1484669206993903616", {
        "range": "1-20",
        "count": ">5",
    })

    def metadata(self):
        """Return the API's live event data as 'event' metadata"""
        return {"event": self.api.live_event(self.user)}

    def tweets(self):
        """Return the timeline of the event whose ID is in self.user"""
        return self.api.live_event_timeline(self.user)


class TwitterTweetExtractor(TwitterExtractor):
    """Extractor for images from individual tweets"""
    subcategory = "tweet"
    pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)"
    test = (
        ("https://twitter.com/supernaturepics/status/604341487988576256", {
            "url": "88a40f7d25529c2501c46f2218f9e0de9aa634b4",
            "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
        }),
        # 4 images
        ("https://twitter.com/perrypumas/status/894001459754180609", {
            "url": "3a2a43dc5fb79dd5432c701d8e55e87c4e551f47",
        }),
        # video
        ("https://twitter.com/perrypumas/status/1065692031626829824", {
            "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5",
        }),
        # content with emoji, newlines, hashtags (#338)
        ("https://twitter.com/playpokemon/status/1263832915173048321", {
            "keyword": {"content": (
                r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
                "Gifts! \n\nYou’ll be able to receive four Galarian form "
                "Pokémon with Hidden Abilities, plus some very useful items. "
                "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
            )},
        }),
        # Reply to deleted tweet (#403, #838)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "pattern": r"https://pbs.twimg.com/media/EDzS7VrU0AAFL4_",
        }),
        # 'replies' option (#705)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "options": (("replies", False),),
            "count": 0,
        }),
        # 'replies' to self (#1254)
        ("https://twitter.com/i/web/status/1424882930803908612", {
            "options": (("replies", "self"),),
            "count": 4,
            "keyword": {"user": {
                "description": "re:business email-- rhettaro.bloom@gmail.com "
                               "patreon- http://patreon.com/Princecanary",
                "url": "http://princecanary.tumblr.com",
            }},
        }),
        ("https://twitter.com/i/web/status/1424898916156284928", {
            "options": (("replies", "self"),),
            "count": 0,
        }),
        # "quoted" option (#854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "options": (("quoted", True),),
            "pattern": r"https://pbs\.twimg\.com/media/Ea[KG].+=jpg",
            "count": 8,
        }),
        # quoted tweet (#526, #854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "pattern": r"https://pbs\.twimg\.com/media/EaK.+=jpg",
            "count": 4,
        }),
        # TwitPic embeds (#579)
        ("https://twitter.com/i/web/status/112900228289540096", {
            "options": (("twitpic", True), ("cards", False)),
            "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
            "count": 3,
        }),
        # Nitter tweet (#890)
        ("https://nitter.net/ed1conf/status/1163841619336007680", {
            "url": "4a9ea898b14d3c112f98562d0df75c9785e239d9",
            "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
        }),
        # Twitter card (#1005)
        ("https://twitter.com/billboard/status/1306599586602135555", {
            "options": (("cards", True),),
            "pattern": r"https://pbs.twimg.com/card_img/\d+/",
        }),
        # unified_card with image_carousel_website
        ("https://twitter.com/doax_vv_staff/status/1479438945662685184", {
            "options": (("cards", True),),
            "pattern": r"https://pbs\.twimg\.com/media/F.+=png",
            "count": 6,
        }),
        # unified_card without type
        ("https://twitter.com/i/web/status/1466183847628865544", {
            "count": 0,
        }),
        # original retweets (#1026)
        ("https://twitter.com/jessica_3978/status/1296304589591810048", {
            "options": (("retweets", "original"),),
            "count": 2,
            "keyword": {
                "tweet_id"  : 1296296016002547713,
                "retweet_id": 1296296016002547713,
                "date"      : "dt:2020-08-20 04:00:28",
            },
        }),
        # all Tweets from a conversation (#1319)
        ("https://twitter.com/BlankArts_/status/1323314488611872769", {
            "options": (("conversations", True),),
            "count": ">= 50",
        }),
        # retweet with missing media entities (#1555)
        ("https://twitter.com/morino_ya/status/1392763691599237121", {
            "options": (("retweets", True),),
            "count": 4,
        }),
        # deleted quote tweet (#2225)
        ("https://twitter.com/i/web/status/1460044411165888515", {
            "count": 0,
        }),
    )

    def __init__(self, match):
        """Store the Tweet ID captured by the second pattern group"""
        TwitterExtractor.__init__(self, match)
        self.tweet_id = match.group(2)

    def tweets(self):
        if self.config("conversations", False):
            return self.api.tweet_detail(self.tweet_id)

        tweets = []
        tweet_id = self.tweet_id
        for tweet in self.api.tweet_detail(tweet_id):
            if tweet["rest_id"] == tweet_id or \
                    tweet.get("_retweet_id_str") == tweet_id:
                tweets.append(tweet)

                tweet_id = tweet["legacy"].get("quoted_status_id_str")
                if not tweet_id:
                    break
        return tweets


class TwitterImageExtractor(Extractor):
    """Extractor for direct links to images on pbs.twimg.com"""
    category = "twitter"
    subcategory = "image"
    # group 1: media ID, group 2: image format ('?format=jpg' or '.jpg')
    pattern = r"https?://pbs\.twimg\.com/media/([\w-]+)(?:\?format=|\.)(\w+)"
    test = (
        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG?format=jpg&name=orig", {
            "options": (("size", "4096x4096,orig"),),
            "url": "cb3042a6f6826923da98f0d2b66c427e9385114c",
        }),
        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG.jpg:orig"),
    )

    def __init__(self, match):
        """Store media ID and format and set up the image sizes"""
        Extractor.__init__(self, match)
        media_id, fmt = match.groups()
        self.id = media_id
        self.fmt = fmt
        # reuse the size handling of TwitterExtractor without inheriting
        TwitterExtractor._init_sizes(self)

    def items(self):
        """Yield a directory message and the image URL in preferred size"""
        template = "https://pbs.twimg.com/media/{}?format={}&name="
        base = template.format(self.id, self.fmt)
        fallback = TwitterExtractor._image_fallback(self, base)

        data = {
            "filename": self.id,
            "extension": self.fmt,
            "_fallback": fallback,
        }

        yield Message.Directory, data
        yield Message.Url, base + self._size_image, data


class TwitterAPI():

    def __init__(self, extractor):
        """Prepare headers, default parameters, and authentication state

        'extractor' provides the session (and its cookies) as well as
        the cookie domain used for all lookups below.
        """
        self.extractor = extractor

        self.root = "https://twitter.com/i/api"
        # headers sent with every API request;
        # None values are filled in further below where applicable
        self.headers = {
            "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
                             "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
                             "4FA33AGWWjCpTnA",
            "x-guest-token": None,
            "x-twitter-auth-type": None,
            "x-twitter-client-language": "en",
            "x-twitter-active-user": "yes",
            "x-csrf-token": None,
            "Referer": "https://twitter.com/",
        }
        # base query parameters, copied by the non-GraphQL endpoint methods
        self.params = {
            "include_profile_interstitial_type": "1",
            "include_blocking": "1",
            "include_blocked_by": "1",
            "include_followed_by": "1",
            "include_want_retweets": "1",
            "include_mute_edge": "1",
            "include_can_dm": "1",
            "include_can_media_tag": "1",
            "include_ext_has_nft_avatar": "1",
            "skip_status": "1",
            "cards_platform": "Web-12",
            "include_cards": "1",
            "include_ext_alt_text": "true",
            "include_quote_count": "true",
            "include_reply_count": "1",
            "tweet_mode": "extended",
            "include_entities": "true",
            "include_user_entities": "true",
            "include_ext_media_color": "true",
            "include_ext_media_availability": "true",
            "include_ext_sensitive_media_warning": "true",
            "send_error_codes": "true",
            "simple_quoted_tweet": "true",
            "count": "100",
            "cursor": None,
            "ext": "mediaStats,highlightedLabel,hasNftAvatar,"
                   "voiceInfo,superFollowMetadata",
        }
        # GraphQL variables merged into every paginated request
        self.variables = {
            "includePromotedContent": False,
            "withSuperFollowsUserFields": True,
            "withBirdwatchPivots": False,
            "withDownvotePerspective": False,
            "withReactionsMetadata": False,
            "withReactionsPerspective": False,
            "withSuperFollowsTweetFields": True,
            "withClientEventToken": False,
            "withBirdwatchNotes": False,
            "withVoice": True,
            "withV2Timeline": False,
            "__fs_interactive_text": False,
            "__fs_dont_mention_me_view_api_enabled": False,
        }
        # compact JSON encoder (no whitespace after separators)
        self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode

        jar = extractor.session.cookies
        domain = extractor.cookiedomain

        # CSRF: reuse the 'ct0' cookie or create and store a new token
        token = jar.get("ct0", domain=domain)
        if not token:
            token = util.generate_token()
            jar.set("ct0", token, domain=domain)
        self.headers["x-csrf-token"] = token

        if not jar.get("auth_token", domain=domain):
            # guest
            guest = self._guest_token()
            jar.set("gt", guest, domain=domain)
            self.headers["x-guest-token"] = guest
        else:
            # logged in
            self.headers["x-twitter-auth-type"] = "OAuth2Session"

    def tweet_detail(self, tweet_id):
        """Return paginated 'TweetDetail' results for 'tweet_id'"""
        return self._pagination_tweets(
            "/graphql/aD0-HB47XIOxiBl5kTkX5Q/TweetDetail",
            {
                "focalTweetId": tweet_id,
                "with_rux_injections": False,
                "withCommunity": True,
                "withQuickPromoteEligibilityTweetFields": True,
                "withBirdwatchNotes": False,
            },
            # keys leading from "data" to the timeline instructions
            ("threaded_conversation_with_injections",),
        )

    def user_tweets(self, screen_name):
        """Return paginated 'UserTweets' results for 'screen_name'"""
        user_id = self._user_id_by_screen_name(screen_name)
        return self._pagination_tweets(
            "/graphql/LNhjy8t3XpIrBYM-ms7sPQ/UserTweets",
            {
                "userId": user_id,
                "count": 100,
                "withQuickPromoteEligibilityTweetFields": True,
            },
        )

    def user_tweets_and_replies(self, screen_name):
        """Return paginated 'UserTweetsAndReplies' results for 'screen_name'"""
        user_id = self._user_id_by_screen_name(screen_name)
        return self._pagination_tweets(
            "/graphql/Vg5aF036K40ST3FWvnvRGA/UserTweetsAndReplies",
            {
                "userId": user_id,
                "count": 100,
                "withCommunity": True,
            },
        )

    def user_media(self, screen_name):
        """Return paginated 'UserMedia' results for 'screen_name'"""
        user_id = self._user_id_by_screen_name(screen_name)
        return self._pagination_tweets(
            "/graphql/Hl6C7ac051l_QBe3HjGz_A/UserMedia",
            {
                "userId": user_id,
                "count": 100,
            },
        )

    def user_likes(self, screen_name):
        """Return paginated 'Likes' results for 'screen_name'"""
        user_id = self._user_id_by_screen_name(screen_name)
        return self._pagination_tweets(
            "/graphql/smISlRVSnz-GaU_XpU_akw/Likes",
            {
                "userId": user_id,
                "count": 100,
            },
        )

    def user_bookmarks(self):
        """Return paginated 'Bookmarks' results"""
        return self._pagination_tweets(
            "/graphql/yKNebSjZKbo2tOd-Qdc7Xg/Bookmarks",
            {"count": 100},
            # keys leading from "data" to the timeline instructions
            ("bookmark_timeline", "timeline"),
        )

    def list_latest_tweets_timeline(self, list_id):
        """Return paginated 'ListLatestTweetsTimeline' results for 'list_id'"""
        return self._pagination_tweets(
            "/graphql/RxUL5UHi4Msxt_P9O1729w/ListLatestTweetsTimeline",
            {
                "listId": list_id,
                "count": 100,
            },
            # keys leading from "data" to the timeline instructions
            ("list", "tweets_timeline", "timeline"),
        )

    def search_adaptive(self, query):
        """Return paginated results of the adaptive search for 'query'"""
        params = self.params.copy()
        # search-specific additions on top of the common parameters
        params.update((
            ("q", query),
            ("tweet_search_mode", "live"),
            ("query_source", "typed_query"),
            ("pc", "1"),
            ("spelling_corrections", "1"),
        ))
        return self._pagination_legacy("/2/search/adaptive.json", params)

    def live_event_timeline(self, event_id):
        """Return paginated results of the 'recap' timeline of 'event_id'"""
        params = self.params.copy()
        params.update((
            ("timeline_id", "recap"),
            ("urt", "true"),
            ("get_annotations", "true"),
        ))
        return self._pagination_legacy(
            "/2/live_event/timeline/{}.json".format(event_id), params)

    def live_event(self, event_id):
        """Return the 'live_events' object of 'event_id'"""
        # common parameters with "count" replaced and "urt" added
        params = dict(self.params, count="0", urt="true")
        response = self._call(
            "/1.1/live_event/1/{}/timeline.json".format(event_id), params)
        return response["twitter_objects"]["live_events"][event_id]

    def list_by_rest_id(self, list_id):
        """Return the 'list' object of the 'ListByRestId' response

        Raises exception.NotFoundError when a KeyError occurs
        while requesting or accessing that object.
        """
        variables = self._json_dumps({
            "listId": list_id,
            "withSuperFollowsUserFields": True,
        })
        try:
            response = self._call(
                "/graphql/BWEhzAk7k8TwbU4lKH2dpw/ListByRestId",
                {"variables": variables})
            return response["data"]["list"]
        except KeyError:
            raise exception.NotFoundError("list")

    def list_members(self, list_id):
        """Return paginated 'ListMembers' results for 'list_id'"""
        return self._pagination_users(
            "/graphql/kk9RQtSa2sc-4_9figZVBw/ListMembers",
            {
                "listId": list_id,
                "count": 100,
                "withSafetyModeUserFields": True,
            },
            # keys leading from "data" to the timeline instructions
            ("list", "members_timeline", "timeline"),
        )

    def user_following(self, screen_name):
        """Return paginated 'Following' results for 'screen_name'"""
        user_id = self._user_id_by_screen_name(screen_name)
        return self._pagination_users(
            "/graphql/kz464_e4MAOXc3bGOA9kow/Following",
            {
                "userId": user_id,
                "count": 100,
            },
        )

    def user_by_screen_name(self, screen_name):
        """Return the user 'result' object of 'UserByScreenName'"""
        variables = self._json_dumps({
            "screen_name": screen_name,
            "withSafetyModeUserFields": True,
            "withSuperFollowsUserFields": True,
        })
        response = self._call(
            "/graphql/7mjxD3-C6BxitPMVQ6w0-Q/UserByScreenName",
            {"variables": variables})
        return response["data"]["user"]["result"]

    def _user_id_by_screen_name(self, screen_name):
        """Return the 'rest_id' belonging to 'screen_name'

        Names of the form 'id:<value>' are not looked up; the part
        after the prefix is returned directly.
        Raises exception.NotFoundError if no 'rest_id' can be obtained.
        """
        prefix = "id:"
        if screen_name.startswith(prefix):
            return screen_name[len(prefix):]

        # empty placeholder so the membership test in the handler
        # works even if the lookup itself raised the KeyError
        user = ()
        try:
            user = self.user_by_screen_name(screen_name)
            return user["rest_id"]
        except KeyError:
            if "unavailable_message" not in user:
                raise exception.NotFoundError("user")
            message = "{} ({})".format(
                user["unavailable_message"].get("text"),
                user.get("reason"))
            raise exception.NotFoundError(message, False)

    @cache(maxage=3600)
    def _guest_token(self):
        """Return the 'guest_token' of a POST to 'guest/activate.json'"""
        # NOTE(review): the 'cache' decorator presumably keeps the result
        # for 'maxage' seconds -- confirm against the ..cache module
        response = self._call(
            "/1.1/guest/activate.json", None,
            "https://api.twitter.com", "POST")
        return str(response["guest_token"])

    def _call(self, endpoint, params, root=None, method="GET", warning=True):
        """Send an API request and return its decoded JSON response

        endpoint: path that gets appended to 'root'
        params:   query parameters of the request (or None)
        root:     base URL; defaults to 'self.root'
        method:   HTTP method
        warning:  log the 'errors' of a successful response as warning

        The request is repeated after waiting when the rate limit is
        exceeded (429), and once more as guest when the logged-in account
        is blocked and the 'logout' option is enabled.

        Raises exception.StopExtraction for every other error response,
        including responses whose body is not valid JSON.
        """
        if root is None:
            root = self.root

        while True:
            response = self.extractor.request(
                root + endpoint, method=method, params=params,
                headers=self.headers, fatal=None)

            # update 'x-csrf-token' header (#1170)
            csrf_token = response.cookies.get("ct0")
            if csrf_token:
                self.headers["x-csrf-token"] = csrf_token

            try:
                data = response.json()
            except ValueError:
                # not a JSON body (e.g. an HTML error page): use its text
                # as error message instead of failing with a ValueError,
                # so rate-limit and error handling below still apply
                data = None
                errors = response.text
            else:
                if "errors" in data:
                    try:
                        errors = ", ".join(
                            e["message"] for e in data["errors"])
                    except Exception:
                        # unexpected structure; keep the raw value
                        errors = data["errors"]
                else:
                    errors = ""

            if data is not None and response.status_code < 400:
                # success
                if errors and warning:
                    self.extractor.log.warning(errors)
                return data

            if response.status_code == 429:
                # rate limit exceeded
                until = response.headers.get("x-rate-limit-reset")
                # fall back to a fixed delay without a reset timestamp
                seconds = None if until else 60
                self.extractor.wait(until=until, seconds=seconds)
                continue

            if response.status_code == 401 and \
                    "have been blocked from viewing" in errors:
                # account blocked
                extr = self.extractor
                if self.headers["x-twitter-auth-type"] and \
                        extr.config("logout"):
                    # drop the login and switch to guest credentials
                    guest_token = self._guest_token()
                    extr.session.cookies.set(
                        "gt", guest_token, domain=extr.cookiedomain)
                    extr._cookiefile = None
                    del extr.session.cookies["auth_token"]
                    self.headers["x-guest-token"] = guest_token
                    self.headers["x-twitter-auth-type"] = None
                    extr.log.info("Retrying API request as guest")
                    continue

            # error
            raise exception.StopExtraction(
                "%s %s (%s)", response.status_code, response.reason, errors)

    def _pagination_legacy(self, endpoint, params):
        """Yield Tweets from an endpoint returning 'globalObjects' timelines

        The request is repeated with the bottom cursor of the previous
        response stored in params["cursor"] ('params' is modified in
        place) until a response has no instructions, no cursor, or
        no Tweets.

        Tweet objects are taken from the response's 'globalObjects' and
        get a "user" entry (plus "author" for retweets and quoted Tweets)
        before being yielded.
        """
        original_retweets = (self.extractor.retweets == "original")

        while True:
            # 'tweet' doubles as "got results" flag for the check at the end
            cursor = tweet = None
            data = self._call(endpoint, params)

            instr = data["timeline"]["instructions"]
            if not instr:
                return
            tweet_ids = []
            tweets = data["globalObjects"]["tweets"]
            users = data["globalObjects"]["users"]

            # collect tweet IDs and cursor value
            for entry in instr[0]["addEntries"]["entries"]:
                entry_startswith = entry["entryId"].startswith

                if entry_startswith(("tweet-", "sq-I-t-")):
                    tweet_ids.append(
                        entry["content"]["item"]["content"]["tweet"]["id"])

                elif entry_startswith("homeConversation-"):
                    # IDs of the whole conversation, in reversed order
                    tweet_ids.extend(
                        entry["content"]["timelineModule"]["metadata"]
                        ["conversationMetadata"]["allTweetIds"][::-1])

                elif entry_startswith(("cursor-bottom-", "sq-cursor-bottom")):
                    cursor = entry["content"]["operation"]["cursor"]
                    if not cursor.get("stopOnEmptyResponse", True):
                        # keep going even if there are no tweets
                        tweet = True
                    cursor = cursor["value"]

                elif entry_startswith("conversationThread-"):
                    # '[6:]' strips the "tweet-" prefix from the entry ID
                    tweet_ids.extend(
                        item["entryId"][6:]
                        for item in entry["content"]["timelineModule"]["items"]
                        if item["entryId"].startswith("tweet-")
                    )

            # process tweets
            for tweet_id in tweet_ids:
                try:
                    tweet = tweets[tweet_id]
                except KeyError:
                    self.extractor.log.debug("Skipping %s (deleted)", tweet_id)
                    continue

                if "retweeted_status_id_str" in tweet:
                    retweet = tweets.get(tweet["retweeted_status_id_str"])
                    if original_retweets:
                        # yield the retweeted Tweet in place of the retweet
                        if not retweet:
                            continue
                        retweet["retweeted_status_id_str"] = retweet["id_str"]
                        retweet["_retweet_id_str"] = tweet["id_str"]
                        tweet = retweet
                    elif retweet:
                        # keep the retweet, but add the original's author
                        # and, if missing, its media entities
                        tweet["author"] = users[retweet["user_id_str"]]
                        if "extended_entities" in retweet and \
                                "extended_entities" not in tweet:
                            tweet["extended_entities"] = \
                                retweet["extended_entities"]
                tweet["user"] = users[tweet["user_id_str"]]
                yield tweet

                if "quoted_status_id_str" in tweet:
                    quoted = tweets.get(tweet["quoted_status_id_str"])
                    if quoted:
                        # work on a shallow copy; the object in 'tweets'
                        # keeps its original top-level entries
                        quoted = quoted.copy()
                        quoted["author"] = users[quoted["user_id_str"]]
                        quoted["user"] = tweet["user"]
                        quoted["quoted_by_id_str"] = tweet["id_str"]
                        yield quoted

            # update cursor value
            # (a 'replaceEntry' as last instruction overrides the cursor
            # collected from the entries above)
            if "replaceEntry" in instr[-1] :
                cursor = (instr[-1]["replaceEntry"]["entry"]
                          ["content"]["operation"]["cursor"]["value"])

            if not cursor or not tweet:
                return
            params["cursor"] = cursor

    def _pagination_tweets(self, endpoint, variables, path=None):
        """Yield Tweet results from a GraphQL timeline endpoint

        'variables' is modified in place: the common 'self.variables' are
        merged into it and its "cursor" entry is updated for each
        follow-up request.

        'path' lists the keys leading from the response's "data" to the
        object containing "instructions"; without it, the instructions
        are read from data["user"]["result"]["timeline"]["timeline"].

        Stops when the instructions or entries cannot be found, or when
        a response yields no cursor or no Tweets.
        """
        variables.update(self.variables)
        original_retweets = (self.extractor.retweets == "original")
        pinned_tweet = self.extractor.pinned

        while True:
            params = {"variables": self._json_dumps(variables)}
            data = self._call(endpoint, params)["data"]

            try:
                if path is None:
                    instructions = (data["user"]["result"]["timeline"]
                                    ["timeline"]["instructions"])
                else:
                    for key in path:
                        data = data[key]
                    instructions = data["instructions"]

                entries = instructions[0]["entries"]
            except (KeyError, IndexError):
                return

            tweets = []
            # 'tweet' doubles as "got results" flag for the check at the end
            tweet = cursor = None

            if pinned_tweet:
                # only considered for the first response
                pinned_tweet = False
                if instructions[-1]["type"] == "TimelinePinEntry":
                    tweets.append(instructions[-1]["entry"])

            # collect tweet entries and cursor value
            for entry in entries:
                esw = entry["entryId"].startswith

                if esw("tweet-"):
                    tweets.append(entry)
                elif esw("homeConversation-"):
                    tweets.extend(entry["content"]["items"])
                elif esw("conversationthread-"):
                    tweets.extend(entry["content"]["items"])
                elif esw("cursor-bottom-"):
                    cursor = entry["content"]
                    if not cursor.get("stopOnEmptyResponse", True):
                        # keep going even if there are no tweets
                        tweet = True
                    cursor = cursor.get("value")

            # process tweet entries
            for entry in tweets:
                try:
                    # the item content is found under "content"
                    # or, if that is missing or empty, under "item"
                    tweet = ((entry.get("content") or entry["item"])
                             ["itemContent"]["tweet_results"]["result"])
                    legacy = tweet["legacy"]
                except KeyError:
                    self.extractor.log.debug(
                        "Skipping %s (deleted)",
                        (entry.get("entryId") or "").rpartition("-")[2])
                    continue

                if "retweeted_status_result" in legacy:
                    retweet = legacy["retweeted_status_result"]["result"]
                    if original_retweets:
                        # yield the retweeted Tweet in place of the retweet
                        try:
                            retweet["legacy"]["retweeted_status_id_str"] = \
                                retweet["rest_id"]
                            retweet["_retweet_id_str"] = tweet["rest_id"]
                            tweet = retweet
                        except KeyError:
                            continue
                    else:
                        # keep the retweet, but add the original's ID,
                        # author and, if missing, its media entities
                        try:
                            legacy["retweeted_status_id_str"] = \
                                retweet["rest_id"]
                            legacy["author"] = \
                                retweet["core"]["user_results"]["result"]
                            if "extended_entities" in retweet["legacy"] and \
                                    "extended_entities" not in legacy:
                                legacy["extended_entities"] = \
                                    retweet["legacy"]["extended_entities"]
                        except KeyError:
                            pass

                yield tweet

                if "quoted_status_result" in tweet:
                    try:
                        quoted = tweet["quoted_status_result"]["result"]
                        # store the quoted Tweet's own user as "author" and
                        # replace its "core" with the one of the quoting Tweet
                        quoted["legacy"]["author"] = \
                            quoted["core"]["user_results"]["result"]
                        quoted["core"] = tweet["core"]
                        quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"]
                        yield quoted
                    except KeyError:
                        self.extractor.log.debug(
                            "Skipping quote of %s (deleted)",
                            tweet.get("rest_id"))
                        continue

            if not tweet or not cursor:
                return
            variables["cursor"] = cursor

    def _pagination_users(self, endpoint, variables, path=None):
        """Yield user results from a GraphQL timeline endpoint

        'variables' is modified in place: the common 'self.variables' are
        merged into it and its "cursor" entry is updated for each
        follow-up request.

        'path' lists the keys leading from the response's "data" to the
        object containing "instructions"; without it, that object is
        data["user"]["result"]["timeline"]["timeline"].
        """
        variables.update(self.variables)

        while True:
            timeline = self._call(
                endpoint, {"variables": self._json_dumps(variables)})["data"]

            try:
                if path is None:
                    timeline = timeline["user"]["result"]
                    timeline = timeline["timeline"]["timeline"]
                else:
                    for key in path:
                        timeline = timeline[key]
                instructions = timeline["instructions"]
            except KeyError:
                return

            # 'entry' stays None unless at least one entry was seen
            next_cursor = entry = None
            terminated = False

            for instruction in instructions:
                kind = instruction["type"]

                if kind == "TimelineAddEntries":
                    for entry in instruction["entries"]:
                        entry_id = entry["entryId"]
                        if entry_id.startswith("user-"):
                            user = (entry["content"]["itemContent"]
                                    ["user_results"]["result"])
                            # results without 'rest_id' are skipped
                            if "rest_id" in user:
                                yield user
                        elif entry_id.startswith("cursor-bottom-"):
                            next_cursor = entry["content"]["value"]

                elif kind == "TimelineTerminateTimeline":
                    if instruction["direction"] == "Bottom":
                        terminated = True

            if terminated or not next_cursor or not entry:
                return
            variables["cursor"] = next_cursor
-												[twitter] add extractor

											
										
										
											8 years ago
+								# -*- coding: utf-8 -*-
-												[twitter] support "image_carousel_website" unified cards

											
										
										
											3 years ago
+								# Copyright 2016-2022 Mike Fährmann
-												[twitter] add extractor

											
										
										
											8 years ago
+								#
 								# This program is free software; you can redistribute it and/or modify
 								# it under the terms of the GNU General Public License version 2 as
 								# published by the Free Software Foundation.
-												[twitter] force old login page layout (fixes #584, fixes #598)

											
										
										
											5 years ago
+								"""Extractors for https://twitter.com/"""
-												[twitter] add extractor

											
										
										
											8 years ago
 								from .common import Extractor, Message
-												add a general 'generate_csrf_token()' function

											
										
										
											4 years ago
+								from .. import text, util, exception
-												[twitter] don't cache results of 'user_by_screen_name()'

A 'keyarg=1' argument to the memcache decorator would have worked as
well, but keeping the user object in memory isn't useful for the vast
majority of use cases and only wastes space.

(closes #817)

											
										
										
											4 years ago
+								from ..cache import cache
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											4 years ago
+								import json
-												code adjustments according to pep8 nr2

											
										
										
											8 years ago
-												[twitter] add support for nitter.net URLs in pattern (#890)

Please note that URLs are only "translated", all requests are still
done always via the Twitter API.
											
										
										
											4 years ago
+								BASE_PATTERN = (
 								    r"(?:https?://)?(?:www\.|mobile\.)?"
 								    r"(?:twitter\.com|nitter\.net)"
 								)
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								class TwitterExtractor(Extractor):
 								    """Base class for twitter extractors"""
-												[twitter] add extractor

											
										
										
											8 years ago
+								    category = "twitter"
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								    directory_fmt = ("{category}", "{user[name]}")
 								    filename_fmt = "{tweet_id}_{num}.{extension}"
 								    archive_fmt = "{tweet_id}_{retweet_id}_{num}"
-												[twitter] use a simpler data structure to store cookies in cache

Use a dict with name-value pairs instead of an entire
RequestsCookieJar object.

											
										
										
											5 years ago
+								    cookiedomain = ".twitter.com"
-												[twitter] skip login if 'auth_token' cookie is present

											
										
										
											4 years ago
+								    cookienames = ("auth_token",)
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								    root = "https://twitter.com"
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
+								    def __init__(self, match):
-												propagate 'match' to base extractor constructor

											
										
										
											6 years ago
+								        Extractor.__init__(self, match)
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
+								        self.user = match.group(1)
-												[twitter] rename 'text-only' to 'text-tweets' (#570)

											
										
										
											3 years ago
+								        self.textonly = self.config("text-tweets", False)
-												[twitter] change some defaults

- 'retweets' option: true -> false
- 'quoted' option  : true -> false

  i.e. disable downloading tweets from other user's timelines by default

- search directory:
    '["{category}", "Search", "{search}"]' ->
    '["{category}", "{user[name]}"]'

  i.e. change it to the same as other twitter extractors (#1308)

											
										
										
											3 years ago
+								        self.retweets = self.config("retweets", False)
-												[twitter] add 'replies' option (closes #705)

											
										
										
											4 years ago
+								        self.replies = self.config("replies", True)
-												[twitter] add option to extract TwitPic embeds (#579)

											
										
										
											5 years ago
+								        self.twitpic = self.config("twitpic", False)
-												[twitter] add 'pinned' option

											
										
										
											3 years ago
+								        self.pinned = self.config("pinned", False)
-												[twitter] change some defaults

- 'retweets' option: true -> false
- 'quoted' option  : true -> false

  i.e. disable downloading tweets from other user's timelines by default

- search directory:
    '["{category}", "Search", "{search}"]' ->
    '["{category}", "{user[name]}"]'

  i.e. change it to the same as other twitter extractors (#1308)

											
										
										
											3 years ago
+								        self.quoted = self.config("quoted", False)
-												[twitter] change default value for 'videos' to 'true'

Every other 'videos' option defaulted to 'true', except Twitter.

											
										
										
											5 years ago
+								        self.videos = self.config("videos", True)
-												[twitter] changes to 'cards' option

- change default value to 'true'
- only invoke youtube-dl for cards unsupported by gallery
  when 'cards' is set to "ytdl"

"cards": true   --> only download card images
"cards": "ytdl" --> download card images and
                    use youtube_dl on otherwise unsupported cards

											
										
										
											3 years ago
+								        self.cards = self.config("cards", True)
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								        self._user_cache = {}
-												[twitter] fix extractor for direct image links (fixes #2030)

											
										
										
											3 years ago
+								        self._init_sizes()
-												[twitter] add experimental 'videos' option (#99)

Enabling this option will detect videos in tweets and output them as
"unsupported" URLs, so that these can then be downloaded with youtube-dl

There are a lot of improvements to be made to the current
implementation, but it works and does what it is supposed to, even if
inefficient as can be ...

											
										
										
											6 years ago
-												[twitter] fix extractor for direct image links (fixes #2030)

											
										
										
											3 years ago
+								    def _init_sizes(self):
-												[twitter] add 'size' option (#1881)

											
										
										
											3 years ago
+								        size = self.config("size")
 								        if size is None:
 								            self._size_image = "orig"
-												[twitter] include '4096x4096' as a default image fallback

(closes #2107, closes #1881)

											
										
										
											3 years ago
+								            self._size_fallback = ("4096x4096", "large", "medium", "small")
-												[twitter] add 'size' option (#1881)

											
										
										
											3 years ago
+								        else:
 								            if isinstance(size, str):
 								                size = size.split(",")
 								            self._size_image = size[0]
 								            self._size_fallback = size[1:]
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								    def items(self):
-												[twitter] add login support (#214)

											
										
										
											6 years ago
+								        self.login()
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        self.api = TwitterAPI(self)
-												[twitter] improve

- update metadata structure
  - combine all user… entries into their own dict
  - let 'user' always specify the Timeline owner
  - add 'author' entry that specifies the original Tweet author
- create directories per post (closes #491)
- fix username issues with /i/web/ URLs

											
										
										
											5 years ago
+								        metadata = self.metadata()
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
 								        for tweet in self.tweets():
-												[twitter] restore TwitPic support

											
										
										
											4 years ago
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            if "legacy" in tweet:
 								                data = tweet["legacy"]
 								            else:
 								                data = tweet
 								            if not self.retweets and "retweeted_status_id_str" in data:
 								                self.log.debug("Skipping %s (retweet)", data["id_str"])
-												[twitter] add debug messages for all skipped Tweets (#867)

											
										
										
											4 years ago
+								                continue
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            if not self.quoted and "quoted_by_id_str" in data:
 								                self.log.debug("Skipping %s (quoted tweet)", data["id_str"])
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								                continue
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            if "in_reply_to_user_id_str" in data and (
-												[twitter] extend 'replies' option (#1254)

Allow setting 'replies' to "self" to only download self-replies.

											
										
										
											3 years ago
+								                not self.replies or (
 								                    self.replies == "self" and
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								                    data["in_reply_to_user_id_str"] != data["user_id_str"]
-												[twitter] extend 'replies' option (#1254)

Allow setting 'replies' to "self" to only download self-replies.

											
										
										
											3 years ago
+								                )
 								            ):
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								                self.log.debug("Skipping %s (reply)", data["id_str"])
-												[twitter] extend 'replies' option (#1254)

Allow setting 'replies' to "self" to only download self-replies.

											
										
										
											3 years ago
+								                continue
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								            files = []
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            if "extended_entities" in data:
-												[twitter] support "image_carousel_website" unified cards

											
										
										
											3 years ago
+								                self._extract_media(
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								                    data, data["extended_entities"]["media"], files)
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								            if "card" in tweet and self.cards:
 								                self._extract_card(tweet, files)
-												[twitter] restore TwitPic support

											
										
										
											4 years ago
+								            if self.twitpic:
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								                self._extract_twitpic(data, files)
-												[twitter] add 'text-only' option (#570)

											
										
										
											3 years ago
+								            if not files and not self.textonly:
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								                continue
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								            tdata = self._transform_tweet(tweet)
 								            tdata.update(metadata)
 								            yield Message.Directory, tdata
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								            for tdata["num"], file in enumerate(files, 1):
 								                file.update(tdata)
 								                url = file.pop("url")
 								                if "extension" not in file:
 								                    text.nameext_from_url(url, file)
 								                yield Message.Url, url, file
-												[twitter] support "image_carousel_website" unified cards

											
										
										
											3 years ago
+								    def _extract_media(self, tweet, entities, files):
 								        for media in entities:
-												[twitter] update GraphQL endpoint & fix width/height entries

											
										
										
											4 years ago
+								            width = media["original_info"].get("width", 0)
 								            height = media["original_info"].get("height", 0)
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
 								            if "video_info" in media:
 								                if self.videos == "ytdl":
 								                    files.append({
 								                        "url": "ytdl:{}/i/web/status/{}".format(
 								                            self.root, tweet["id_str"]),
 								                        "width"    : width,
 								                        "height"   : height,
 								                        "extension": None,
 								                    })
 								                elif self.videos:
 								                    video_info = media["video_info"]
 								                    variant = max(
 								                        video_info["variants"],
 								                        key=lambda v: v.get("bitrate", 0),
 								                    )
 								                    files.append({
 								                        "url"     : variant["url"],
 								                        "width"   : width,
 								                        "height"  : height,
 								                        "bitrate" : variant.get("bitrate", 0),
 								                        "duration": video_info.get(
 								                            "duration_millis", 0) / 1000,
 								                    })
 								            elif "media_url_https" in media:
 								                url = media["media_url_https"]
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											4 years ago
+								                base, _, fmt = url.rpartition(".")
 								                base += "?format=" + fmt + "&name="
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								                files.append(text.nameext_from_url(url, {
-												[twitter] add 'size' option (#1881)

											
										
										
											3 years ago
+								                    "url"      : base + self._size_image,
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								                    "width"    : width,
 								                    "height"   : height,
-												[twitter] remove old-style URLs from image fallback lists

											
										
										
											3 years ago
+								                    "_fallback": self._image_fallback(base),
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								                }))
 								            else:
 								                files.append({"url": media["media_url"]})
-												[twitter] add 'size' option (#1881)

											
										
										
											3 years ago
+								    def _image_fallback(self, base):
 								        for fmt in self._size_fallback:
 								            yield base + fmt
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											4 years ago
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								    def _extract_card(self, tweet, files):
 								        card = tweet["card"]
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								        if "legacy" in card:
 								            card = card["legacy"]
-												[twitter] support "image_carousel_website" unified cards

											
										
										
											3 years ago
+								        name = card["name"]
 								        if name in ("summary", "summary_large_image"):
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								            bvals = card["binding_values"]
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            if isinstance(bvals, list):
 								                bvals = {
 								                    bval["key"]: bval["value"]
 								                    for bval in card["binding_values"]
 								                }
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								            for prefix in ("photo_image_full_size_",
 								                           "summary_photo_image_",
 								                           "thumbnail_image_"):
 								                for size in ("original", "x_large", "large", "small"):
 								                    key = prefix + size
 								                    if key in bvals:
-												[twitter] ensure card entries have a 'url' (#1868)

											
										
										
											3 years ago
+								                        value = bvals[key].get("image_value")
 								                        if value and "url" in value:
 								                            files.append(value)
 								                            return
-												[twitter] support "image_carousel_website" unified cards

											
										
										
											3 years ago
+								        elif name == "unified_card":
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            bvals = card["binding_values"]
 								            if isinstance(bvals, list):
 								                for bval in card["binding_values"]:
 								                    if bval["key"] == "unified_card":
-												[twitter] fix unified cards from search results

											
										
										
											3 years ago
+								                        bval = bval["value"]["string_value"]
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								                        break
 								            else:
-												[twitter] fix unified cards from search results

											
										
										
											3 years ago
+								                bval = bvals["unified_card"]["string_value"]
 								            data = json.loads(bval)
-												[twitter] fix several errors (#2212, #2216, #2225)

- fix Tweets with deleted quotes
- fix suspended Tweets without 'legacy' entry
- fix unified_cards without 'type'

											
										
										
											3 years ago
+								            if data.get("type") == "image_carousel_website":
-												[twitter] support "image_carousel_website" unified cards

											
										
										
											3 years ago
+								                self._extract_media(
 								                    tweet, data["media_entities"].values(), files)
 								                return
-												[twitter] changes to 'cards' option

- change default value to 'true'
- only invoke youtube-dl for cards unsupported by gallery
  when 'cards' is set to "ytdl"

"cards": true   --> only download card images
"cards": "ytdl" --> download card images and
                    use youtube_dl on otherwise unsupported cards

											
										
										
											3 years ago
+								        if self.cards == "ytdl":
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            tweet_id = tweet.get("rest_id") or tweet["id_str"]
 								            url = "ytdl:{}/i/web/status/{}".format(self.root, tweet_id)
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								            files.append({"url": url})
 								    def _extract_twitpic(self, tweet, files):
-												[twitter] restore TwitPic support

											
										
										
											4 years ago
+								        for url in tweet["entities"].get("urls", ()):
 								            url = url["expanded_url"]
-												[twitter] improve twitpic extraction (fixes #1019)

- ignore twitpic.com/photos/… URLs
- ignore empty image URLs

											
										
										
											4 years ago
+								            if "//twitpic.com/" in url and "/photos/" not in url:
-												[twitter] restore TwitPic support

											
										
										
											4 years ago
+								                response = self.request(url, fatal=False)
 								                if response.status_code >= 400:
 								                    continue
 								                url = text.extract(
 								                    response.text, 'name="twitter:image" value="', '"')[0]
-												[twitter] improve twitpic extraction (fixes #1019)

- ignore twitpic.com/photos/… URLs
- ignore empty image URLs

											
										
										
											4 years ago
+								                if url:
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								                    files.append({"url": url})
-												[twitter] restore TwitPic support

											
										
										
											4 years ago
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								    def _transform_tweet(self, tweet):
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								        if "core" in tweet:
 								            user = self._transform_user(
 								                tweet["core"]["user_results"]["result"])
 								        else:
 								            user = self._transform_user(tweet["user"])
 								        if "legacy" in tweet:
 								            tweet = tweet["legacy"]
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								        entities = tweet["entities"]
 								        tdata = {
 								            "tweet_id"      : text.parse_int(tweet["id_str"]),
 								            "retweet_id"    : text.parse_int(
 								                tweet.get("retweeted_status_id_str")),
 								            "quote_id"      : text.parse_int(
 								                tweet.get("quoted_status_id_str")),
 								            "reply_id"      : text.parse_int(
 								                tweet.get("in_reply_to_status_id_str")),
 								            "date"          : text.parse_datetime(
 								                tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            "user"          : user,
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								            "lang"          : tweet["lang"],
 								            "favorite_count": tweet["favorite_count"],
 								            "quote_count"   : tweet["quote_count"],
 								            "reply_count"   : tweet["reply_count"],
 								            "retweet_count" : tweet["retweet_count"],
 								        }
 								        hashtags = entities.get("hashtags")
 								        if hashtags:
 								            tdata["hashtags"] = [t["text"] for t in hashtags]
 								        mentions = entities.get("user_mentions")
 								        if mentions:
 								            tdata["mentions"] = [{
 								                "id": text.parse_int(u["id_str"]),
 								                "name": u["screen_name"],
 								                "nick": u["name"],
 								            } for u in mentions]
-												[twitter] resolve t.co URLs in 'content' (#1532)

											
										
										
											3 years ago
+								        content = tweet["full_text"]
 								        urls = entities.get("urls")
 								        if urls:
 								            for url in urls:
 								                content = content.replace(url["url"], url["expanded_url"])
-												[twitter] strip useless t.co links (#1532)

The 'full_text' of Tweets with media content usually ends with a t.co
link to itself. This commit removes those.

											
										
										
											3 years ago
+								        txt, _, tco = content.rpartition(" ")
 								        tdata["content"] = txt if tco.startswith("https://t.co/") else content
-												[twitter] resolve t.co URLs in 'content' (#1532)

											
										
										
											3 years ago
-												[twitter] add 'reply_to' metadata to replies

											
										
										
											4 years ago
+								        if "in_reply_to_screen_name" in tweet:
 								            tdata["reply_to"] = tweet["in_reply_to_screen_name"]
-												[twitter] add 'quote_by' metadata field (#1481)

Only present for tweets quoted by another tweet.
Represents the tweet_id of said tweet quoting this one.

											
										
										
											3 years ago
+								        if "quoted_by_id_str" in tweet:
 								            tdata["quote_by"] = text.parse_int(tweet["quoted_by_id_str"])
-												[twitter] add 'reply_to' metadata to replies

											
										
										
											4 years ago
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								        if "author" in tweet:
 								            tdata["author"] = self._transform_user(tweet["author"])
-												[twitter] always provide an 'author' field (#831, #833)

The idea was to have less metadata clutter for most Tweets were
'author' and 'user' are the same (non-retweets), and only provide
a 'user' field.

The original Tweet author could be gotten with
{author[…]|user[…]}, but basically no one knows about that.

											
										
										
											4 years ago
+								        else:
 								            tdata["author"] = tdata["user"]
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
 								        return tdata
 								    def _transform_user(self, user):
-												[twitter] slightly improve '_transform_user()'

											
										
										
											3 years ago
+								        try:
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            return self._user_cache[user.get("rest_id") or user["id_str"]]
-												[twitter] slightly improve '_transform_user()'

											
										
										
											3 years ago
+								        except KeyError:
 								            pass
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								        uid = user.get("rest_id") or user["id_str"]
 								        if "legacy" in user:
 								            user = user["legacy"]
-												[twitter] add 'url' to user objects (#1532, #1787)

											
										
										
											3 years ago
+								        entities = user["entities"]
-												[twitter] slightly improve '_transform_user()'

											
										
										
											3 years ago
+								        self._user_cache[uid] = udata = {
 								            "id"              : text.parse_int(uid),
 								            "name"            : user["screen_name"],
 								            "nick"            : user["name"],
 								            "location"        : user["location"],
 								            "date"            : text.parse_datetime(
 								                user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
 								            "verified"        : user.get("verified", False),
 								            "profile_banner"  : user.get("profile_banner_url", ""),
 								            "profile_image"   : user.get(
 								                "profile_image_url_https", "").replace("_normal.", "."),
 								            "favourites_count": user["favourites_count"],
 								            "followers_count" : user["followers_count"],
 								            "friends_count"   : user["friends_count"],
 								            "listed_count"    : user["listed_count"],
 								            "media_count"     : user["media_count"],
 								            "statuses_count"  : user["statuses_count"],
 								        }
-												[twitter] add 'url' to user objects (#1532, #1787)

											
										
										
											3 years ago
-												[twitter] expand t.co links in user descriptions (#1532, #1787)

											
										
										
											3 years ago
+								        descr = user["description"]
 								        urls = entities["description"].get("urls")
 								        if urls:
 								            for url in urls:
 								                descr = descr.replace(url["url"], url["expanded_url"])
 								        udata["description"] = descr
-												[twitter] add 'url' to user objects (#1532, #1787)

											
										
										
											3 years ago
+								        if "url" in entities:
-												[twitter] fix 'url' extraction for users without 'expanded_url'

(#1532, #1787)

											
										
										
											3 years ago
+								            url = entities["url"]["urls"][0]
 								            udata["url"] = url.get("expanded_url") or url.get("url")
-												[twitter] add 'url' to user objects (#1532, #1787)

											
										
										
											3 years ago
-												[twitter] slightly improve '_transform_user()'

											
										
										
											3 years ago
+								        return udata
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
-												[twitter] implement 'users' option (#1337)

											
										
										
											4 years ago
+								    def _users_result(self, users):
-												[twitter] allow specifying a custom format for user results

(#1337)

											
										
										
											4 years ago
+								        userfmt = self.config("users")
 								        if not userfmt or userfmt == "timeline":
 								            cls = TwitterTimelineExtractor
 								            fmt = (self.root + "/i/user/{rest_id}").format_map
 								        elif userfmt == "media":
-												[twitter] implement 'users' option (#1337)

											
										
										
											4 years ago
+								            cls = TwitterMediaExtractor
-												[twitter] allow specifying a custom format for user results

(#1337)

											
										
										
											4 years ago
+								            fmt = (self.root + "/id:{rest_id}/media").format_map
-												[twitter] implement 'users' option (#1337)

											
										
										
											4 years ago
+								        else:
-												[twitter] allow specifying a custom format for user results

(#1337)

											
										
										
											4 years ago
+								            cls = None
 								            fmt = userfmt.format_map
-												[twitter] implement 'users' option (#1337)

											
										
										
											4 years ago
 								        for user in users:
 								            user["_extractor"] = cls
-												[twitter] allow specifying a custom format for user results

(#1337)

											
										
										
											4 years ago
+								            yield Message.Queue, fmt(user), user
-												[twitter] implement 'users' option (#1337)

											
										
										
											4 years ago
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								    def metadata(self):
 								        """Return general metadata"""
-												[twitter] improve

- update metadata structure
  - combine all user… entries into their own dict
  - let 'user' always specify the Timeline owner
  - add 'author' entry that specifies the original Tweet author
- create directories per post (closes #491)
- fix username issues with /i/web/ URLs

											
										
										
											5 years ago
+								        return {}
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
 								    def tweets(self):
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								        """Yield all relevant tweet objects"""
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
-												[twitter] add login support (#214)

											
										
										
											6 years ago
+								    def login(self):
-												[twitter] skip login if 'auth_token' cookie is present

											
										
										
											4 years ago
+								        if not self._check_cookies(self.cookienames):
 								            username, password = self._get_auth_info()
 								            if username:
 								                self._update_cookies(self._login_impl(username, password))
-												[twitter] add login support (#214)

											
										
										
											6 years ago
 								    @cache(maxage=360*24*3600, keyarg=1)
 								    def _login_impl(self, username, password):
 								        self.log.info("Logging in as %s", username)
-												rename 'generate_csrf_token()' to just 'generate_token()'

and add a 'size' argument

											
										
										
											4 years ago
+								        token = util.generate_token()
-												[twitter] fix login with username & password

It is no longer possible to get an 'authenticity_token' from Twitter's
JavaScript-free login form, which got disabled a few days ago.

Generating a random 16 byte hex string client-side and sending that as
a cookie alongside the regular login form works just as well.

											
										
										
											4 years ago
+								        self.session.cookies.clear()
 								        self.request(self.root + "/login")
-												[twitter] login using the mobile nojs login page

											
										
										
											4 years ago
-												[twitter] fix login with username & password

It is no longer possible to get an 'authenticity_token' from Twitter's
JavaScript-free login form, which got disabled a few days ago.

Generating a random 16 byte hex string client-side and sending that as
a cookie alongside the regular login form works just as well.

											
										
										
											4 years ago
+								        url = self.root + "/sessions"
 								        cookies = {
 								            "_mb_tk": token,
 								        }
-												[twitter] add login support (#214)

											
										
										
											6 years ago
+								        data = {
-												[twitter] fix login with username & password

It is no longer possible to get an 'authenticity_token' from Twitter's
JavaScript-free login form, which got disabled a few days ago.

Generating a random 16 byte hex string client-side and sending that as
a cookie alongside the regular login form works just as well.

											
										
										
											4 years ago
+								            "redirect_after_login"      : "/",
 								            "remember_me"               : "1",
-												[twitter] login using the mobile nojs login page

											
										
										
											4 years ago
+								            "authenticity_token"        : token,
-												[twitter] fix login with username & password

It is no longer possible to get an 'authenticity_token' from Twitter's
JavaScript-free login form, which got disabled a few days ago.

Generating a random 16 byte hex string client-side and sending that as
a cookie alongside the regular login form works just as well.

											
										
										
											4 years ago
+								            "wfa"                       : "1",
 								            "ui_metrics"                : "{}",
-												[twitter] add login support (#214)

											
										
										
											6 years ago
+								            "session[username_or_email]": username,
 								            "session[password]"         : password,
 								        }
-												[twitter] fix login with username & password

It is no longer possible to get an 'authenticity_token' from Twitter's
JavaScript-free login form, which got disabled a few days ago.

Generating a random 16 byte hex string client-side and sending that as
a cookie alongside the regular login form works just as well.

											
										
										
											4 years ago
+								        response = self.request(
 								            url, method="POST", cookies=cookies, data=data)
-												[twitter] better error message when logging in with 2FA (#1409)

											
										
										
											4 years ago
+								        if "/account/login_verification" in response.url:
 								            raise exception.AuthenticationError(
 								                "Login with two-factor authentication is not supported")
-												[twitter] login using the mobile nojs login page

											
										
										
											4 years ago
+								        cookies = {
-												[twitter] use a simpler data structure to store cookies in cache

Use a dict with name-value pairs instead of an entire
RequestsCookieJar object.

											
										
										
											5 years ago
+								            cookie.name: cookie.value
 								            for cookie in self.session.cookies
 								        }
-												[twitter] login using the mobile nojs login page

											
										
										
											4 years ago
 								        if "/error" in response.url or "auth_token" not in cookies:
 								            raise exception.AuthenticationError()
 								        return cookies
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
+								class TwitterTimelineExtractor(TwitterExtractor):
-												[twitter] support '/with_replies' URLs (closes #1833)

											
										
										
											3 years ago
+								    """Extractor for Tweets from a user's timeline"""
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
+								    subcategory = "timeline"
-												[twitter] match '/i/user/ID' URLs

											
										
										
											4 years ago
+								    pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])"
 								               r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								    test = (
 								        ("https://twitter.com/supernaturepics", {
 								            "range": "1-40",
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											4 years ago
+								            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								        }),
-												[twitter] fix several errors (#2212, #2216, #2225)

- fix Tweets with deleted quotes
- fix suspended Tweets without 'legacy' entry
- fix unified_cards without 'type'

											
										
										
											3 years ago
+								        # suspended account (#2216)
 								        ("https://twitter.com/realDonaldTrump", {
 								            "exception": exception.NotFoundError,
 								        }),
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								        ("https://mobile.twitter.com/supernaturepics?p=i"),
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											4 years ago
+								        ("https://www.twitter.com/id:2976459548"),
-												[twitter] match '/i/user/ID' URLs

											
										
										
											4 years ago
+								        ("https://twitter.com/i/user/2976459548"),
-												[twitter] support '/intent/user?user_id=…' URLs (#980)

											
										
										
											4 years ago
+								        ("https://twitter.com/intent/user?user_id=2976459548"),
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								    )
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
-												[twitter] support '/intent/user?user_id=…' URLs (#980)

											
										
										
											4 years ago
+								    def __init__(self, match):
 								        TwitterExtractor.__init__(self, match)
-												[twitter] match '/i/user/ID' URLs

											
										
										
											4 years ago
+								        user_id = match.group(2)
 								        if user_id:
 								            self.user = "id:" + user_id
-												[twitter] support '/intent/user?user_id=…' URLs (#980)

											
										
										
											4 years ago
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
+								    def tweets(self):
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        return self.api.user_tweets(self.user)
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
-												[twitter] support '/with_replies' URLs (closes #1833)

											
										
										
											3 years ago
+								class TwitterRepliesExtractor(TwitterExtractor):
 								    """Extractor for Tweets from a user's timeline including replies"""
 								    subcategory = "replies"
 								    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/with_replies(?!\w)"
 								    test = (
 								        ("https://twitter.com/supernaturepics/with_replies", {
 								            "range": "1-40",
 								            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
 								        }),
 								        ("https://mobile.twitter.com/supernaturepics/with_replies#t"),
 								        ("https://www.twitter.com/id:2976459548/with_replies"),
 								    )
 								    def tweets(self):
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        return self.api.user_tweets_and_replies(self.user)
-												[twitter] support '/with_replies' URLs (closes #1833)

											
										
										
											3 years ago
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
+								class TwitterMediaExtractor(TwitterExtractor):
-												[twitter] support '/with_replies' URLs (closes #1833)

											
										
										
											3 years ago
+								    """Extractor for Tweets from a user's Media timeline"""
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
+								    subcategory = "media"
-												remove '&' from URL patterns

'/?&#' -> '/?#' and '?&#' -> '?#'

According to https://www.ietf.org/rfc/rfc3986.txt, URLs are
"organized hierarchically" by using "the slash ("/"), question
mark ("?"), and number sign ("#") characters to delimit components"

											
										
										
											4 years ago
+								    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/media(?!\w)"
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								    test = (
 								        ("https://twitter.com/supernaturepics/media", {
 								            "range": "1-40",
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											4 years ago
+								            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								        }),
 								        ("https://mobile.twitter.com/supernaturepics/media#t"),
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											4 years ago
+								        ("https://www.twitter.com/id:2976459548/media"),
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								    )
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
 								    def tweets(self):
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        return self.api.user_media(self.user)
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											5 years ago
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											4 years ago
+								class TwitterLikesExtractor(TwitterExtractor):
 								    """Extractor for liked tweets"""
 								    subcategory = "likes"
-												remove '&' from URL patterns

'/?&#' -> '/?#' and '?&#' -> '?#'

According to https://www.ietf.org/rfc/rfc3986.txt, URLs are
"organized hierarchically" by using "the slash ("/"), question
mark ("?"), and number sign ("#") characters to delimit components"

											
										
										
											4 years ago
+								    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)"
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											4 years ago
+								    test = ("https://twitter.com/supernaturepics/likes",)
-												[twitter] add 'user_likes' metadata field for liked tweets

i.e. the 'screen_name' of the user whose liked tweets get extracted.

Ideally this would replace 'user' or at least be in the same format,
but that would break backwards compatibility or be impossible/too
complicated thanks to API result differences.

(#1421)

											
										
										
											4 years ago
+								    def metadata(self):
 								        return {"user_likes": self.user}
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											4 years ago
+								    def tweets(self):
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        return self.api.user_likes(self.user)
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											4 years ago
 								class TwitterBookmarkExtractor(TwitterExtractor):
 								    """Extractor for bookmarked tweets"""
 								    subcategory = "bookmark"
-												[twitter] add support for nitter.net URLs in pattern (#890)

Please note that URLs are only "translated"; all requests are still
always made via the Twitter API.
											
										
										
											4 years ago
+								    pattern = BASE_PATTERN + r"/i/bookmarks()"
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											4 years ago
+								    test = ("https://twitter.com/i/bookmarks",)
 								    def tweets(self):
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        return self.api.user_bookmarks()
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											4 years ago
-												[twitter] add 'list' extractor (#1096)

											
										
										
											4 years ago
+								class TwitterListExtractor(TwitterExtractor):
 								    """Extractor for Twitter lists"""
 								    subcategory = "list"
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											4 years ago
+								    pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$"
-												[twitter] add 'list' extractor (#1096)

											
										
										
											4 years ago
+								    test = ("https://twitter.com/i/lists/784214683683127296", {
 								        "range": "1-40",
 								        "count": 40,
 								        "archive": False,
 								    })
 								    def tweets(self):
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        return self.api.list_latest_tweets_timeline(self.user)
-												[twitter] add 'list' extractor (#1096)

											
										
										
											4 years ago
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											4 years ago
+								class TwitterListMembersExtractor(TwitterExtractor):
 								    """Extractor for members of a Twitter list"""
 								    subcategory = "list-members"
 								    pattern = BASE_PATTERN + r"/i/lists/(\d+)/members"
 								    test = ("https://twitter.com/i/lists/784214683683127296/members",)
 								    def items(self):
 								        self.login()
-												[twitter] implement 'users' option (#1337)

											
										
										
											4 years ago
+								        return self._users_result(TwitterAPI(self).list_members(self.user))
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											4 years ago
-												[twitter] add extractor for followed users (#1337)

https://twitter.com/USER/following or
https://twitter.com/id:USERID/following

											
										
										
											4 years ago
+								class TwitterFollowingExtractor(TwitterExtractor):
 								    """Extractor for followed users"""
 								    subcategory = "following"
 								    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/following(?!\w)"
 								    test = (
 								        ("https://twitter.com/supernaturepics/following"),
 								        ("https://www.twitter.com/id:2976459548/following"),
 								    )
 								    def items(self):
 								        self.login()
-												[twitter] implement 'users' option (#1337)

											
										
										
											4 years ago
+								        return self._users_result(TwitterAPI(self).user_following(self.user))
-												[twitter] add extractor for followed users (#1337)

https://twitter.com/USER/following or
https://twitter.com/id:USERID/following

											
										
										
											4 years ago
-												Add search downloading to twitter.py (#448)

Adds the functionality to download search results on twitter.com/search. Since Twitter only allows downloading up to 3,200 of a user's most recent tweets, you will be unable to download old images from users with a lot of tweets. To bypass this, you can use the Twitter search to get the tweets from the time range at which you were cut off. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use will look something like this: https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en

The _tweets_from_api function had to be changed because it would not get the next page of results using the last "data-tweet-id". It would return the same JSON but with a "min_position" string added. Using this string for the "max_position" param from the second page onwards correctly returned the next pages. This change does not interfere with how the other extractors work as far as I know. The 2 regex patterns in the extractors had to be changed to not match the search URL.
											
										
										
											5 years ago
+								class TwitterSearchExtractor(TwitterExtractor):
-												[twitter] add 'event' extractor (closes #2109)

											
										
										
											3 years ago
+								    """Extractor for Twitter search results"""
-												Add search downloading to twitter.py (#448)

Adds the functionality to download search results on twitter.com/search. Since Twitter only allows downloading up to 3,200 of a user's most recent tweets, you will be unable to download old images from users with a lot of tweets. To bypass this, you can use the Twitter search to get the tweets from the time range at which you were cut off. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use will look something like this: https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en

The _tweets_from_api function had to be changed because it would not get the next page of results using the last "data-tweet-id". It would return the same JSON but with a "min_position" string added. Using this string for the "max_position" param from the second page onwards correctly returned the next pages. This change does not interfere with how the other extractors work as far as I know. The 2 regex patterns in the extractors had to be changed to not match the search URL.
											
										
										
											5 years ago
+								    subcategory = "search"
-												[twitter] add support for nitter.net URLs in pattern (#890)

Please note that URLs are only "translated"; all requests are still
always made via the Twitter API.
											
										
										
											4 years ago
+								    pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											5 years ago
+								    test = ("https://twitter.com/search?q=nature", {
 								        "range": "1-40",
 								        "count": 40,
-												update extractor test results

											
										
										
											4 years ago
+								        "archive": False,
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											5 years ago
+								    })
 								    def metadata(self):
-												[twitter] improve pagination

											
										
										
											4 years ago
+								        return {"search": text.unquote(self.user)}
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											5 years ago
-												Add search downloading to twitter.py (#448)

Adds the functionality to download search results on twitter.com/search. Since Twitter only allows downloading up to 3,200 of a user's most recent tweets, you will be unable to download old images from users with a lot of tweets. To bypass this, you can use the Twitter search to get the tweets from the time range at which you were cut off. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use will look something like this: https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en

The _tweets_from_api function had to be changed because it would not get the next page of results using the last "data-tweet-id". It would return the same JSON but with a "min_position" string added. Using this string for the "max_position" param from the second page onwards correctly returned the next pages. This change does not interfere with how the other extractors work as far as I know. The 2 regex patterns in the extractors had to be changed to not match the search URL.
											
										
										
											5 years ago
+								    def tweets(self):
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        return self.api.search_adaptive(text.unquote(self.user))
-												[twitter] add 'event' extractor (closes #2109)

											
										
										
											3 years ago
 								class TwitterEventExtractor(TwitterExtractor):
 								    """Extractor for Tweets from a Twitter Event"""
 								    subcategory = "event"
-												[twitter] define directory format for events (#2109)

											
										
										
											3 years ago
+								    directory_fmt = ("{category}", "Events",
 								                     "{event[id]} {event[short_title]}")
-												[twitter] add 'event' extractor (closes #2109)

											
										
										
											3 years ago
+								    pattern = BASE_PATTERN + r"/i/events/(\d+)"
 								    test = ("https://twitter.com/i/events/1484669206993903616", {
 								        "range": "1-20",
 								        "count": ">5",
 								    })
 								    def metadata(self):
 								        return {"event": self.api.live_event(self.user)}
 								    def tweets(self):
 								        return self.api.live_event_timeline(self.user)
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											5 years ago
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								class TwitterTweetExtractor(TwitterExtractor):
-												[twitter] changes and improvements

- rename User- to TimelineExtractor
- rename 'userid' to 'user_id' to conform to the other ..._id values
- adjust archive_fmt to deal with retweets
- emulate browser behavior for API calls

											
										
										
											6 years ago
+								    """Extractor for images from individual tweets"""
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								    subcategory = "tweet"
-												remove '&' from URL patterns

'/?&#' -> '/?#' and '?&#' -> '?#'

According to https://www.ietf.org/rfc/rfc3986.txt, URLs are
"organized hierarchically" by using "the slash ("/"), question
mark ("?"), and number sign ("#") characters to delimit components"

											
										
										
											4 years ago
+								    pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)"
-												simplify extractor constants

- single strings for URL patterns
- tuples instead of lists for 'directory_fmt' and 'test'
- single-tuple tests where applicable

											
										
										
											6 years ago
+								    test = (
-												[twitter] replace unit test URLs

https://twitter.com/PicturesEarth was deleted

											
										
										
											5 years ago
+								        ("https://twitter.com/supernaturepics/status/604341487988576256", {
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											4 years ago
+								            "url": "88a40f7d25529c2501c46f2218f9e0de9aa634b4",
-												[twitter] replace unit test URLs

https://twitter.com/PicturesEarth was deleted

											
										
										
											5 years ago
+								            "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
-												[twitter] ignore "Promoted Tweets"

											
										
										
											7 years ago
+								        }),
-												[twitter] extract 'date' metadata (#224)

											
										
										
											5 years ago
+								        # 4 images
-												[twitter] ignore "Promoted Tweets"

											
										
										
											7 years ago
+								        ("https://twitter.com/perrypumas/status/894001459754180609", {
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											4 years ago
+								            "url": "3a2a43dc5fb79dd5432c701d8e55e87c4e551f47",
-												[twitter] extract 'date' metadata (#224)

											
										
										
											5 years ago
+								        }),
 								        # video
 								        ("https://twitter.com/perrypumas/status/1065692031626829824", {
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								            "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5",
-												[twitter] ignore "Promoted Tweets"

											
										
										
											7 years ago
+								        }),
-												[twitter] improve 'content' formatting; add option (#338)

- include emoticons
- leave newlines intact
- remove pic.twitter.com/ links at the end

											
										
										
											5 years ago
+								        # content with emoji, newlines, hashtags (#338)
-												update extractor test results

- don't run Instagram tests on Travis anymore
- replace Twitter test because timeline was made private
- update Hiperdex domain to '.com' (again ...)

											
										
										
											4 years ago
+								        ("https://twitter.com/playpokemon/status/1263832915173048321", {
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											4 years ago
+								            "keyword": {"content": (
-												update extractor test results

- don't run Instagram tests on Travis anymore
- replace Twitter test because timeline was made private
- update Hiperdex domain to '.com' (again ...)

											
										
										
											4 years ago
+								                r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
 								                "Gifts! \n\nYou’ll be able to receive four Galarian form "
 								                "Pokémon with Hidden Abilities, plus some very useful items. "
 								                "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
-												update test results

- twitter:

    Don't test the whole kwdict, only the actual content, since the
    keyword hash changes whenever that user changes his display name.

- khinsider:

    Download host changed

											
										
										
											5 years ago
+								            )},
-												[twitter] improve 'content' formatting; add option (#338)

- include emoticons
- leave newlines intact
- remove pic.twitter.com/ links at the end

											
										
										
											5 years ago
+								        }),
-												[twitter] update tests

											
										
										
											4 years ago
+								        # Reply to deleted tweet (#403, #838)
 								        ("https://twitter.com/i/web/status/1170041925560258560", {
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											4 years ago
+								            "pattern": r"https://pbs.twimg.com/media/EDzS7VrU0AAFL4_",
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											5 years ago
+								        }),
-												[twitter] add 'replies' option (closes #705)

											
										
										
											4 years ago
+								        # 'replies' option (#705)
-												[twitter] update tests

											
										
										
											4 years ago
+								        ("https://twitter.com/i/web/status/1170041925560258560", {
-												[twitter] add 'replies' option (closes #705)

											
										
										
											4 years ago
+								            "options": (("replies", False),),
 								            "count": 0,
 								        }),
-												[twitter] extend 'replies' option (#1254)

Allow setting 'replies' to '"self"' to only download from self-replies.

											
										
										
											3 years ago
+								        # 'replies' to self (#1254)
 								        ("https://twitter.com/i/web/status/1424882930803908612", {
 								            "options": (("replies", "self"),),
 								            "count": 4,
-												[twitter] expand t.co links in user descriptions (#1532, #1787)

											
										
										
											3 years ago
+								            "keyword": {"user": {
 								                "description": "re:business email-- rhettaro.bloom@gmail.com "
 								                               "patreon- http://patreon.com/Princecanary",
 								                "url": "http://princecanary.tumblr.com",
 								            }},
-												[twitter] extend 'replies' option (#1254)

Allow setting 'replies' to '"self"' to only download from self-replies.

											
										
										
											3 years ago
+								        }),
 								        ("https://twitter.com/i/web/status/1424898916156284928", {
 								            "options": (("replies", "self"),),
 								            "count": 0,
 								        }),
-												[twitter] change some defaults

- 'retweets' option: true -> false
- 'quoted' option  : true -> false

  i.e. disable downloading tweets from other user's timelines by default

- search directory:
    '["{category}", "Search", "{search}"]' ->
    '["{category}", "{user[name]}"]'

  i.e. change it to the same as other twitter extractors (#1308)

											
										
										
											3 years ago
+								        # "quoted" option (#854)
-												[twitter] add option to filter media from quoted tweets (#854)

											
										
										
											4 years ago
+								        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
-												[twitter] change some defaults

- 'retweets' option: true -> false
- 'quoted' option  : true -> false

  i.e. disable downloading tweets from other user's timelines by default

- search directory:
    '["{category}", "Search", "{search}"]' ->
    '["{category}", "{user[name]}"]'

  i.e. change it to the same as other twitter extractors (#1308)

											
										
										
											3 years ago
+								            "options": (("quoted", True),),
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											4 years ago
+								            "pattern": r"https://pbs\.twimg\.com/media/Ea[KG].+=jpg",
-												[twitter] add option to filter media from quoted tweets (#854)

											
										
										
											4 years ago
+								            "count": 8,
 								        }),
-												[twitter] change some defaults

- 'retweets' option: true -> false
- 'quoted' option  : true -> false

  i.e. disable downloading tweets from other user's timelines by default

- search directory:
    '["{category}", "Search", "{search}"]' ->
    '["{category}", "{user[name]}"]'

  i.e. change it to the same as other twitter extractors (#1308)

											
										
										
											3 years ago
+								        # quoted tweet (#526, #854)
-												[twitter] add option to filter media from quoted tweets (#854)

											
										
										
											4 years ago
+								        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											4 years ago
+								            "pattern": r"https://pbs\.twimg\.com/media/EaK.+=jpg",
-												[twitter] add option to filter media from quoted tweets (#854)

											
										
										
											4 years ago
+								            "count": 4,
-												[twitter] handle quoted tweets (#526)

… and categorize them as retweets

											
										
										
											5 years ago
+								        }),
-												[twitter] add option to extract TwitPic embeds (#579)

											
										
										
											5 years ago
+								        # TwitPic embeds (#579)
 								        ("https://twitter.com/i/web/status/112900228289540096", {
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            "options": (("twitpic", True), ("cards", False)),
-												[twitter] add option to extract TwitPic embeds (#579)

											
										
										
											5 years ago
+								            "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
 								            "count": 3,
 								        }),
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								        # Nitter tweet (#890)
-												[twitter] add support for nitter.net URLs in pattern (#890)

Please note that URLs are only "translated"; all requests are still
always made via the Twitter API.
											
										
										
											4 years ago
+								        ("https://nitter.net/ed1conf/status/1163841619336007680", {
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											4 years ago
+								            "url": "4a9ea898b14d3c112f98562d0df75c9785e239d9",
-												[twitter] add support for nitter.net URLs in pattern (#890)

Please note that URLs are only "translated"; all requests are still
always made via the Twitter API.
											
										
										
											4 years ago
+								            "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
 								        }),
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								        # Twitter card (#1005)
 								        ("https://twitter.com/billboard/status/1306599586602135555", {
 								            "options": (("cards", True),),
-												[twitter] update GraphQL endpoint & fix width/height entries

											
										
										
											4 years ago
+								            "pattern": r"https://pbs.twimg.com/card_img/\d+/",
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											4 years ago
+								        }),
-												[twitter] support "image_carousel_website" unified cards

											
										
										
											3 years ago
+								        # unified_card with image_carousel_website
 								        ("https://twitter.com/doax_vv_staff/status/1479438945662685184", {
 								            "options": (("cards", True),),
 								            "pattern": r"https://pbs\.twimg\.com/media/F.+=png",
 								            "count": 6,
 								        }),
-												[twitter] fix several errors (#2212, #2216, #2225)

- fix Tweets with deleted quotes
- fix suspended Tweets without 'legacy' entry
- fix unified_cards without 'type'

											
										
										
											3 years ago
+								        # unified_card without type
 								        ("https://twitter.com/i/web/status/1466183847628865544", {
 								            "count": 0,
 								        }),
-												[twitter] extend 'retweets' option (closes #1026)

Setting 'retweets' to '"original"' will use metadata from the
original retweeted Tweets, and not from the Retweet entry.

											
										
										
											4 years ago
+								        # original retweets (#1026)
 								        ("https://twitter.com/jessica_3978/status/1296304589591810048", {
 								            "options": (("retweets", "original"),),
 								            "count": 2,
 								            "keyword": {
-												[twitter] set 'retweet_id' for original retweets (#1481)

											
										
										
											3 years ago
+								                "tweet_id"  : 1296296016002547713,
 								                "retweet_id": 1296296016002547713,
 								                "date"      : "dt:2020-08-20 04:00:28",
-												[twitter] extend 'retweets' option (closes #1026)

Setting 'retweets' to '"original"' will use metadata from the
original retweeted Tweets, and not from the Retweet entry.

											
										
										
											4 years ago
+								            },
 								        }),
-												[twitter] add option to download all media from a conversation

(fixes #1319)

											
										
										
											4 years ago
+								        # all Tweets from a conversation (#1319)
 								        ("https://twitter.com/BlankArts_/status/1323314488611872769", {
 								            "options": (("conversations", True),),
 								            "count": ">= 50",
 								        }),
-												[twitter] add missing retweet media entities (fixes #1555)

from the original tweets

											
										
										
											3 years ago
+								        # retweet with missing media entities (#1555)
 								        ("https://twitter.com/morino_ya/status/1392763691599237121", {
-												[twitter] change some defaults

- 'retweets' option: true -> false
- 'quoted' option  : true -> false

  i.e. disable downloading tweets from other user's timelines by default

- search directory:
    '["{category}", "Search", "{search}"]' ->
    '["{category}", "{user[name]}"]'

  i.e. change it to the same as other twitter extractors (#1308)

											
										
										
											3 years ago
+								            "options": (("retweets", True),),
-												[twitter] add missing retweet media entities (fixes #1555)

from the original tweets

											
										
										
											3 years ago
+								            "count": 4,
 								        }),
-												[twitter] fix several errors (#2212, #2216, #2225)

- fix Tweets with deleted quotes
- fix suspended Tweets without 'legacy' entry
- fix unified_cards without 'type'

											
										
										
											3 years ago
+								        # deleted quote tweet (#2225)
 								        ("https://twitter.com/i/web/status/1460044411165888515", {
 								            "count": 0,
 								        }),
-												simplify extractor constants

- single strings for URL patterns
- tuples instead of lists for 'directory_fmt' and 'test'
- single-tuple tests where applicable

											
										
										
											6 years ago
+								    )
-												[twitter] add extractor

											
										
										
											8 years ago
 								    def __init__(self, match):
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											6 years ago
+								        TwitterExtractor.__init__(self, match)
 								        self.tweet_id = match.group(2)
-												[twitter] add extractor

											
										
										
											8 years ago
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											6 years ago
+								    def tweets(self):
-												[twitter] add option to download all media from a conversation

(fixes #1319)

											
										
										
											4 years ago
+								        if self.config("conversations", False):
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								            return self.api.tweet_detail(self.tweet_id)
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
 								        tweets = []
 								        tweet_id = self.tweet_id
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        for tweet in self.api.tweet_detail(tweet_id):
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            if tweet["rest_id"] == tweet_id or \
 								                    tweet.get("_retweet_id_str") == tweet_id:
 								                tweets.append(tweet)
 								                tweet_id = tweet["legacy"].get("quoted_status_id_str")
 								                if not tweet_id:
 								                    break
 								        return tweets
-												[twitter] handle API rate limits (#526)

											
										
										
											5 years ago
-												[twitter] add extractor for direct image links (closes #1417)

											
										
										
											4 years ago
+								class TwitterImageExtractor(Extractor):
 								    category = "twitter"
 								    subcategory = "image"
 								    pattern = r"https?://pbs\.twimg\.com/media/([\w-]+)(?:\?format=|\.)(\w+)"
 								    test = (
-												[twitter] fix extractor for direct image links (fixes #2030)

											
										
										
											3 years ago
+								        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG?format=jpg&name=orig", {
 								            "options": (("size", "4096x4096,orig"),),
 								            "url": "cb3042a6f6826923da98f0d2b66c427e9385114c",
 								        }),
-												[twitter] add extractor for direct image links (closes #1417)

											
										
										
											4 years ago
+								        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG.jpg:orig"),
 								    )
 								    def __init__(self, match):
 								        Extractor.__init__(self, match)
 								        self.id, self.fmt = match.groups()
-												[twitter] fix extractor for direct image links (fixes #2030)

											
										
										
											3 years ago
+								        TwitterExtractor._init_sizes(self)
-												[twitter] add extractor for direct image links (closes #1417)

											
										
										
											4 years ago
 								    def items(self):
-												[twitter] remove old-style URLs from image fallback lists

											
										
										
											3 years ago
+								        base = "https://pbs.twimg.com/media/{}?format={}&name=".format(
 								            self.id, self.fmt)
-												[twitter] add extractor for direct image links (closes #1417)

											
										
										
											4 years ago
 								        data = {
 								            "filename": self.id,
 								            "extension": self.fmt,
-												[twitter] fix extractor for direct image links (fixes #2030)

											
										
										
											3 years ago
+								            "_fallback": TwitterExtractor._image_fallback(self, base),
-												[twitter] add extractor for direct image links (closes #1417)

											
										
										
											4 years ago
+								        }
 								        yield Message.Directory, data
-												[twitter] fix extractor for direct image links (fixes #2030)

											
										
										
											3 years ago
+								        yield Message.Url, base + self._size_image, data
-												[twitter] add extractor for direct image links (closes #1417)

											
										
										
											4 years ago
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								class TwitterAPI():
 								    def __init__(self, extractor):
 								        self.extractor = extractor
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											4 years ago
 								        self.root = "https://twitter.com/i/api"
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								        self.headers = {
 								            "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
 								                             "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
 								                             "4FA33AGWWjCpTnA",
 								            "x-guest-token": None,
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											4 years ago
+								            "x-twitter-auth-type": None,
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								            "x-twitter-client-language": "en",
 								            "x-twitter-active-user": "yes",
 								            "x-csrf-token": None,
 								            "Referer": "https://twitter.com/",
 								        }
-												[twitter] add 'event' extractor (closes #2109)

											
										
										
											3 years ago
+								        self.params = {
 								            "include_profile_interstitial_type": "1",
 								            "include_blocking": "1",
 								            "include_blocked_by": "1",
 								            "include_followed_by": "1",
 								            "include_want_retweets": "1",
 								            "include_mute_edge": "1",
 								            "include_can_dm": "1",
 								            "include_can_media_tag": "1",
 								            "include_ext_has_nft_avatar": "1",
 								            "skip_status": "1",
 								            "cards_platform": "Web-12",
 								            "include_cards": "1",
 								            "include_ext_alt_text": "true",
 								            "include_quote_count": "true",
 								            "include_reply_count": "1",
 								            "tweet_mode": "extended",
 								            "include_entities": "true",
 								            "include_user_entities": "true",
 								            "include_ext_media_color": "true",
 								            "include_ext_media_availability": "true",
 								            "include_ext_sensitive_media_warning": "true",
 								            "send_error_codes": "true",
 								            "simple_quoted_tweet": "true",
 								            "count": "100",
 								            "cursor": None,
 								            "ext": "mediaStats,highlightedLabel,hasNftAvatar,"
 								                   "voiceInfo,superFollowMetadata",
 								        }
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        self.variables = {
 								            "includePromotedContent": False,
 								            "withSuperFollowsUserFields": True,
 								            "withBirdwatchPivots": False,
 								            "withDownvotePerspective": False,
 								            "withReactionsMetadata": False,
 								            "withReactionsPerspective": False,
 								            "withSuperFollowsTweetFields": True,
 								            "withClientEventToken": False,
 								            "withBirdwatchNotes": False,
 								            "withVoice": True,
 								            "withV2Timeline": False,
 								            "__fs_interactive_text": False,
 								            "__fs_dont_mention_me_view_api_enabled": False,
 								        }
 								        self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
 								        cookies = extractor.session.cookies
 								        cookiedomain = extractor.cookiedomain
 								        # CSRF
 								        csrf_token = cookies.get("ct0", domain=cookiedomain)
 								        if not csrf_token:
 								            csrf_token = util.generate_token()
 								            cookies.set("ct0", csrf_token, domain=cookiedomain)
 								        self.headers["x-csrf-token"] = csrf_token
 								        if cookies.get("auth_token", domain=cookiedomain):
 								            # logged in
 								            self.headers["x-twitter-auth-type"] = "OAuth2Session"
 								        else:
 								            # guest
 								            guest_token = self._guest_token()
 								            cookies.set("gt", guest_token, domain=cookiedomain)
 								            self.headers["x-guest-token"] = guest_token
 								    def tweet_detail(self, tweet_id):
 								        """Request the 'TweetDetail' GraphQL endpoint for 'tweet_id'

 								        Builds the endpoint-specific variables and returns whatever
 								        self._pagination_tweets() yields for them.
 								        """
 								        query = {
 								            "focalTweetId": tweet_id,
 								            "with_rux_injections": False,
 								            "withCommunity": True,
 								            "withQuickPromoteEligibilityTweetFields": True,
 								            "withBirdwatchNotes": False,
 								        }
 								        # NOTE(review): presumably the key path to the timeline
 								        # inside the API response -- confirm in _pagination_tweets()
 								        path = ("threaded_conversation_with_injections",)
 								        return self._pagination_tweets(
 								            "/graphql/aD0-HB47XIOxiBl5kTkX5Q/TweetDetail", query, path)
 								    def user_tweets(self, screen_name):
 								        """Run a 'UserTweets' GraphQL query for the given user

 								        'screen_name' is resolved to a user ID through
 								        _user_id_by_screen_name(); the result of _pagination_tweets()
 								        is returned unchanged.
 								        """
 								        endpoint = "/graphql/LNhjy8t3XpIrBYM-ms7sPQ/UserTweets"
 								        variables = {
 								            "userId": self._user_id_by_screen_name(screen_name),
 								            "count": 100,
 								            "withQuickPromoteEligibilityTweetFields": True,
 								        }
 								        return self._pagination_tweets(endpoint, variables)
 								    def user_tweets_and_replies(self, screen_name):
 								        """Run a 'UserTweetsAndReplies' GraphQL query for the given user

 								        'screen_name' is resolved to a user ID through
 								        _user_id_by_screen_name(); the result of _pagination_tweets()
 								        is returned unchanged.
 								        """
 								        endpoint = "/graphql/Vg5aF036K40ST3FWvnvRGA/UserTweetsAndReplies"
 								        variables = {
 								            "userId": self._user_id_by_screen_name(screen_name),
 								            "count": 100,
 								            "withCommunity": True,
 								        }
 								        return self._pagination_tweets(endpoint, variables)
 								    def user_media(self, screen_name):
 								        """Run a 'UserMedia' GraphQL query for the given user

 								        'screen_name' is resolved to a user ID through
 								        _user_id_by_screen_name(); the result of _pagination_tweets()
 								        is returned unchanged.
 								        """
 								        endpoint = "/graphql/Hl6C7ac051l_QBe3HjGz_A/UserMedia"
 								        variables = {
 								            "userId": self._user_id_by_screen_name(screen_name),
 								            "count": 100,
 								        }
 								        return self._pagination_tweets(endpoint, variables)
 								    def user_likes(self, screen_name):
 								        """Run a 'Likes' GraphQL query for the given user

 								        'screen_name' is resolved to a user ID through
 								        _user_id_by_screen_name(); the result of _pagination_tweets()
 								        is returned unchanged.
 								        """
 								        endpoint = "/graphql/smISlRVSnz-GaU_XpU_akw/Likes"
 								        variables = {
 								            "userId": self._user_id_by_screen_name(screen_name),
 								            "count": 100,
 								        }
 								        return self._pagination_tweets(endpoint, variables)
 								    def user_bookmarks(self):
 								        """Run a 'Bookmarks' GraphQL query

 								        No user is passed in the query variables. The result of
 								        _pagination_tweets() is returned unchanged.
 								        """
 								        endpoint = "/graphql/yKNebSjZKbo2tOd-Qdc7Xg/Bookmarks"
 								        variables = {
 								            "count": 100,
 								        }
 								        # presumably the keys leading to the timeline inside the
 								        # response; verify against _pagination_tweets()
 								        return self._pagination_tweets(
 								            endpoint, variables, ("bookmark_timeline", "timeline"))
 								    def list_latest_tweets_timeline(self, list_id):
 								        """Run a 'ListLatestTweetsTimeline' GraphQL query for 'list_id'

 								        The result of _pagination_tweets() is returned unchanged.
 								        """
 								        endpoint = "/graphql/RxUL5UHi4Msxt_P9O1729w/ListLatestTweetsTimeline"
 								        variables = {
 								            "listId": list_id,
 								            "count": 100,
 								        }
 								        # presumably the keys leading to the timeline inside the
 								        # response; verify against _pagination_tweets()
 								        return self._pagination_tweets(
 								            endpoint, variables, ("list", "tweets_timeline", "timeline"))
-												[twitter] add 'event' extractor (closes #2109)

											
										
										
											3 years ago
+								    def search_adaptive(self, query):
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								        endpoint = "/2/search/adaptive.json"
-												[twitter] add 'event' extractor (closes #2109)

											
										
										
											3 years ago
+								        params = self.params.copy()
 								        params["q"] = query
 								        params["tweet_search_mode"] = "live"
 								        params["query_source"] = "typed_query"
 								        params["pc"] = "1"
 								        params["spelling_corrections"] = "1"
 								        return self._pagination_legacy(endpoint, params)
 								    def live_event_timeline(self, event_id):
 								        """Query the '/2/live_event/timeline/' endpoint of 'event_id'

 								        Works on a copy of self.params and returns whatever
 								        _pagination_legacy() produces for the request.
 								        """
 								        url_path = "/2/live_event/timeline/{}.json".format(event_id)
 								        query = self.params.copy()
 								        query.update((
 								            ("timeline_id", "recap"),
 								            ("urt", "true"),
 								            ("get_annotations", "true"),
 								        ))
 								        return self._pagination_legacy(url_path, query)
 								    def live_event(self, event_id):
 								        """Return the 'live_events' entry of 'event_id'

 								        Sends a single request through _call() with a copy of
 								        self.params and picks the event object out of the response.
 								        """
 								        url_path = "/1.1/live_event/1/{}/timeline.json".format(event_id)
 								        query = self.params.copy()
 								        query.update((("count", "0"), ("urt", "true")))
 								        response = self._call(url_path, query)
 								        events = response["twitter_objects"]["live_events"]
 								        return events[event_id]
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
-												[twitter] add 'list' extractor (#1096)

											
										
										
											4 years ago
+								    def list_by_rest_id(self, list_id):
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								        endpoint = "/graphql/BWEhzAk7k8TwbU4lKH2dpw/ListByRestId"
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        params = {"variables": self._json_dumps({
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            "listId": list_id,
 								            "withSuperFollowsUserFields": True,
 								        })}
-												[twitter] add 'list' extractor (#1096)

											
										
										
											4 years ago
+								        try:
 								            return self._call(endpoint, params)["data"]["list"]
 								        except KeyError:
 								            raise exception.NotFoundError("list")
-												[twitter] add extractor for followed users (#1337)

https://twitter.com/USER/following or
https://twitter.com/id:USERID/following

											
										
										
											4 years ago
+								    def list_members(self, list_id):
-												[twitter] update query hashes

											
										
										
											3 years ago
+								        endpoint = "/graphql/kk9RQtSa2sc-4_9figZVBw/ListMembers"
-												[twitter] add extractor for followed users (#1337)

https://twitter.com/USER/following or
https://twitter.com/id:USERID/following

											
										
										
											4 years ago
+								        variables = {
 								            "listId": list_id,
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            "count": 100,
 								            "withSafetyModeUserFields": True,
-												[twitter] add extractor for followed users (#1337)

https://twitter.com/USER/following or
https://twitter.com/id:USERID/following

											
										
										
											4 years ago
+								        }
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								        return self._pagination_users(
 								            endpoint, variables, ("list", "members_timeline", "timeline"))
-												[twitter] add extractor for followed users (#1337)

https://twitter.com/USER/following or
https://twitter.com/id:USERID/following

											
										
										
											4 years ago
 								    def user_following(self, screen_name):
 								        """Run a 'Following' GraphQL query for the given user

 								        'screen_name' is resolved to a user ID through
 								        _user_id_by_screen_name(); the result of _pagination_users()
 								        is returned unchanged.
 								        """
 								        endpoint = "/graphql/kz464_e4MAOXc3bGOA9kow/Following"
 								        variables = {
 								            "userId": self._user_id_by_screen_name(screen_name),
 								            "count": 100,
 								        }
 								        return self._pagination_users(endpoint, variables)
-												[twitter] add extractor for followed users (#1337)

https://twitter.com/USER/following or
https://twitter.com/id:USERID/following

											
										
										
											4 years ago
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								    def user_by_screen_name(self, screen_name):
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								        endpoint = "/graphql/7mjxD3-C6BxitPMVQ6w0-Q/UserByScreenName"
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        params = {"variables": self._json_dumps({
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            "screen_name": screen_name,
 								            "withSafetyModeUserFields": True,
 								            "withSuperFollowsUserFields": True,
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        })}
-												[twitter] improve error handling

- handle accounts without 'rest_id'
- handle timelines with empty 'instructions'

											
										
										
											3 years ago
+								        return self._call(endpoint, params)["data"]["user"]["result"]
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											4 years ago
+								    def _user_id_by_screen_name(self, screen_name):
 								        if screen_name.startswith("id:"):
 								            return screen_name[3:]
-												[twitter] improve error handling

- handle accounts without 'rest_id'
- handle timelines with empty 'instructions'

											
										
										
											3 years ago
 								        user = ()
 								        try:
 								            user = self.user_by_screen_name(screen_name)
 								            return user["rest_id"]
 								        except KeyError:
 								            if "unavailable_message" in user:
 								                raise exception.NotFoundError("{} ({})".format(
 								                    user["unavailable_message"].get("text"),
 								                    user.get("reason")), False)
 								            else:
 								                raise exception.NotFoundError("user")
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											4 years ago
-												[twitter] move '_guest_token()' into TwitterAPI class

											
										
										
											4 years ago
+								    @cache(maxage=3600)
 								    def _guest_token(self):
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											4 years ago
+								        root = "https://api.twitter.com"
 								        endpoint = "/1.1/guest/activate.json"
-												[twitter] ensure guest tokens are returned as string (#1665)

											
										
										
											3 years ago
+								        return str(self._call(endpoint, None, root, "POST")["guest_token"])
-												[twitter] move '_guest_token()' into TwitterAPI class

											
										
										
											4 years ago
-												[twitter] improve error handling

											
										
										
											3 years ago
+								    def _call(self, endpoint, params, root=None, method="GET", warning=True):
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											4 years ago
+								        if root is None:
 								            root = self.root
-												[twitter] improve error message formatting

											
										
										
											4 years ago
-												[twitter] improve and fix retry after hitting rate limit

- replace recursive call with infinite loop
- fix function arguments for recursive call

											
										
										
											4 years ago
+								        while True:
 								            response = self.extractor.request(
 								                root + endpoint, method=method, params=params,
 								                headers=self.headers, fatal=None)
 								            # update 'x-csrf-token' header (#1170)
 								            csrf_token = response.cookies.get("ct0")
 								            if csrf_token:
 								                self.headers["x-csrf-token"] = csrf_token
-												[twitter] warn about suspended accounts etc (closes #1759)

											
										
										
											3 years ago
+								            data = response.json()
 								            if "errors" in data:
 								                try:
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								                    errors = ", ".join(e["message"] for e in data["errors"])
-												[twitter] warn about suspended accounts etc (closes #1759)

											
										
										
											3 years ago
+								                except Exception:
-												[twitter] distinguish between fatal & nonfatal errors (#2020)

only show a warning for nonfatal errors
and do not raise a StopExtraction exception

											
										
										
											3 years ago
+								                    errors = data["errors"]
-												[twitter] add option to log out when blocked (#1719)

											
										
										
											3 years ago
+								            else:
-												[twitter] distinguish between fatal & nonfatal errors (#2020)

only show a warning for nonfatal errors
and do not raise a StopExtraction exception

											
										
										
											3 years ago
+								                errors = ""
-												[twitter] warn about suspended accounts etc (closes #1759)

											
										
										
											3 years ago
-												[twitter] improve and fix retry after hitting rate limit

- replace recursive call with infinite loop
- fix function arguments for recursive call

											
										
										
											4 years ago
+								            if response.status_code < 400:
-												[twitter] add option to log out when blocked (#1719)

											
										
										
											3 years ago
+								                # success
-												[twitter] improve error handling

											
										
										
											3 years ago
+								                if errors and warning:
 								                    self.extractor.log.warning(errors)
-												[twitter] warn about suspended accounts etc (closes #1759)

											
										
										
											3 years ago
+								                return data
-												[twitter] add option to log out when blocked (#1719)

											
										
										
											3 years ago
-												[twitter] improve and fix retry after hitting rate limit

- replace recursive call with infinite loop
- fix function arguments for recursive call

											
										
										
											4 years ago
+								            if response.status_code == 429:
-												[twitter] add option to log out when blocked (#1719)

											
										
										
											3 years ago
+								                # rate limit exceeded
-												[twitter] improve and fix retry after hitting rate limit

- replace recursive call with infinite loop
- fix function arguments for recursive call

											
										
										
											4 years ago
+								                until = response.headers.get("x-rate-limit-reset")
 								                seconds = None if until else 60
 								                self.extractor.wait(until=until, seconds=seconds)
 								                continue
-												[twitter] add option to log out when blocked (#1719)

											
										
										
											3 years ago
 								            if response.status_code == 401 and \
-												[twitter] distinguish between fatal & nonfatal errors (#2020)

only show a warning for nonfatal errors
and do not raise a StopExtraction exception

											
										
										
											3 years ago
+								                    "have been blocked from viewing" in errors:
-												[twitter] add option to log out when blocked (#1719)

											
										
										
											3 years ago
+								                # account blocked
-												[twitter] fix 'logout' (#1719)

delete 'auth_token' cookie and cookies.txt path

											
										
										
											3 years ago
+								                extr = self.extractor
-												[twitter] add option to log out when blocked (#1719)

											
										
										
											3 years ago
+								                if self.headers["x-twitter-auth-type"] and \
 								                        extr.config("logout"):
 								                    guest_token = self._guest_token()
 								                    extr.session.cookies.set(
 								                        "gt", guest_token, domain=extr.cookiedomain)
-												[twitter] fix 'logout' (#1719)

delete 'auth_token' cookie and cookies.txt path

											
										
										
											3 years ago
+								                    extr._cookiefile = None
 								                    del extr.session.cookies["auth_token"]
-												[twitter] add option to log out when blocked (#1719)

											
										
										
											3 years ago
+								                    self.headers["x-guest-token"] = guest_token
 								                    self.headers["x-twitter-auth-type"] = None
 								                    extr.log.info("Retrying API request as guest")
 								                    continue
 								            # error
-												[twitter] improve and fix retry after hitting rate limit

- replace recursive call with infinite loop
- fix function arguments for recursive call

											
										
										
											4 years ago
+								            raise exception.StopExtraction(
-												[twitter] distinguish between fatal & nonfatal errors (#2020)

only show a warning for nonfatal errors
and do not raise a StopExtraction exception

											
										
										
											3 years ago
+								                "%s %s (%s)", response.status_code, response.reason, errors)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
-												[twitter] add 'event' extractor (closes #2109)

											
										
										
											3 years ago
+								    def _pagination_legacy(self, endpoint, params):
-												[twitter] extend 'retweets' option (closes #1026)

Setting 'retweets' to '"original"' will use metadata from the
original retweeted Tweets, and not from the Retweet entry.

											
										
										
											4 years ago
+								        original_retweets = (self.extractor.retweets == "original")
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											5 years ago
 								        while True:
-												[twitter] improve pagination

											
										
										
											4 years ago
+								            cursor = tweet = None
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								            data = self._call(endpoint, params)
-												[twitter] improve pagination

											
										
										
											4 years ago
 								            instr = data["timeline"]["instructions"]
 								            if not instr:
 								                return
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											4 years ago
+								            tweet_ids = []
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											5 years ago
+								            tweets = data["globalObjects"]["tweets"]
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								            users = data["globalObjects"]["users"]
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											4 years ago
+								            # collect tweet IDs and cursor value
-												[twitter] improve pagination

											
										
										
											4 years ago
+								            for entry in instr[0]["addEntries"]["entries"]:
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											4 years ago
+								                entry_startswith = entry["entryId"].startswith
 								                if entry_startswith(("tweet-", "sq-I-t-")):
 								                    tweet_ids.append(
 								                        entry["content"]["item"]["content"]["tweet"]["id"])
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											4 years ago
+								                elif entry_startswith("homeConversation-"):
 								                    tweet_ids.extend(
 								                        entry["content"]["timelineModule"]["metadata"]
 								                        ["conversationMetadata"]["allTweetIds"][::-1])
 								                elif entry_startswith(("cursor-bottom-", "sq-cursor-bottom")):
-												[twitter] improve pagination

											
										
										
											4 years ago
+								                    cursor = entry["content"]["operation"]["cursor"]
-												[twitter] add 'event' extractor (closes #2109)

											
										
										
											3 years ago
+								                    if not cursor.get("stopOnEmptyResponse", True):
-												[twitter] improve pagination

											
										
										
											4 years ago
+								                        # keep going even if there are no tweets
 								                        tweet = True
 								                    cursor = cursor["value"]
-												[twitter] add option to download all media from a conversation

(fixes #1319)

											
										
										
											4 years ago
+								                elif entry_startswith("conversationThread-"):
 								                    tweet_ids.extend(
 								                        item["entryId"][6:]
 								                        for item in entry["content"]["timelineModule"]["items"]
 								                        if item["entryId"].startswith("tweet-")
 								                    )
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											4 years ago
+								            # process tweets
 								            for tweet_id in tweet_ids:
 								                try:
 								                    tweet = tweets[tweet_id]
 								                except KeyError:
 								                    self.extractor.log.debug("Skipping %s (deleted)", tweet_id)
 								                    continue
 								                if "retweeted_status_id_str" in tweet:
 								                    retweet = tweets.get(tweet["retweeted_status_id_str"])
 								                    if original_retweets:
 								                        if not retweet:
 								                            continue
-												[twitter] set 'retweet_id' for original retweets (#1481)

											
										
										
											3 years ago
+								                        retweet["retweeted_status_id_str"] = retweet["id_str"]
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											4 years ago
+								                        retweet["_retweet_id_str"] = tweet["id_str"]
 								                        tweet = retweet
 								                    elif retweet:
 								                        tweet["author"] = users[retweet["user_id_str"]]
-												[twitter] add missing retweet media entities (fixes #1555)

from the original tweets

											
										
										
											3 years ago
+								                        if "extended_entities" in retweet and \
 								                                "extended_entities" not in tweet:
 								                            tweet["extended_entities"] = \
 								                                retweet["extended_entities"]
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											4 years ago
+								                tweet["user"] = users[tweet["user_id_str"]]
 								                yield tweet
 								                if "quoted_status_id_str" in tweet:
 								                    quoted = tweets.get(tweet["quoted_status_id_str"])
 								                    if quoted:
-												[twitter] fix issue when filtering quote tweets (#1792)

When a user quotes his own Tweet and that Tweet gets filtered by
'"quoted": false', it could also get filtered when it appeared later
as regular Tweet.

											
										
										
											3 years ago
+								                        quoted = quoted.copy()
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											4 years ago
+								                        quoted["author"] = users[quoted["user_id_str"]]
 								                        quoted["user"] = tweet["user"]
-												[twitter] add 'quote_by' metadata field (#1481)

Only present for tweets quoted by another tweet.
Represents the tweet_id of said tweet quoting this one.

											
										
										
											3 years ago
+								                        quoted["quoted_by_id_str"] = tweet["id_str"]
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											4 years ago
+								                        yield quoted
 								            # update cursor value
-												[twitter] improve pagination

											
										
										
											4 years ago
+								            if "replaceEntry" in instr[-1] :
 								                cursor = (instr[-1]["replaceEntry"]["entry"]
 								                          ["content"]["operation"]["cursor"]["value"])
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
-												[twitter] improve pagination

											
										
										
											4 years ago
+								            if not cursor or not tweet:
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											5 years ago
+								                return
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											4 years ago
+								            params["cursor"] = cursor
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											4 years ago
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which Twitter has not used since
summer 2021, are finally going to be phased out, it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								    def _pagination_tweets(self, endpoint, variables, path=None):
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        variables.update(self.variables)
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								        original_retweets = (self.extractor.retweets == "original")
 								        pinned_tweet = self.extractor.pinned
 								        while True:
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								            params = {"variables": self._json_dumps(variables)}
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            data = self._call(endpoint, params)["data"]
-												[twitter] improve error handling

											
										
										
											3 years ago
+								            try:
 								                if path is None:
 								                    instructions = (data["user"]["result"]["timeline"]
 								                                    ["timeline"]["instructions"])
 								                else:
 								                    for key in path:
 								                        data = data[key]
 								                    instructions = data["instructions"]
-												[twitter] improve error handling

- handle accounts without 'rest_id'
- handle timelines with empty 'instructions'

											
										
										
											3 years ago
 								                entries = instructions[0]["entries"]
 								            except (KeyError, IndexError):
-												[twitter] improve error handling

											
										
										
											3 years ago
+								                return
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
-												[twitter] improve error handling

- handle accounts without 'rest_id'
- handle timelines with empty 'instructions'

											
										
										
											3 years ago
+								            tweets = []
 								            tweet = cursor = None
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            if pinned_tweet:
 								                pinned_tweet = False
 								                if instructions[-1]["type"] == "TimelinePinEntry":
-												[twitter] fix pinned tweets (#2216)

caused by the changes in dffa440edef9be1e169ef1e2d6bc0a492493ffce

											
										
										
											3 years ago
+								                    tweets.append(instructions[-1]["entry"])
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
-												[twitter] improve error handling

- handle accounts without 'rest_id'
- handle timelines with empty 'instructions'

											
										
										
											3 years ago
+								            for entry in entries:
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								                esw = entry["entryId"].startswith
 								                if esw("tweet-"):
-												[twitter] improve handling of deleted tweets (#2212)

											
										
										
											3 years ago
+								                    tweets.append(entry)
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								                elif esw("homeConversation-"):
-												[twitter] improve handling of deleted tweets (#2212)

											
										
										
											3 years ago
+								                    tweets.extend(entry["content"]["items"])
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								                elif esw("conversationthread-"):
-												[twitter] improve handling of deleted tweets (#2212)

											
										
										
											3 years ago
+								                    tweets.extend(entry["content"]["items"])
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								                elif esw("cursor-bottom-"):
 								                    cursor = entry["content"]
-												[twitter] add 'event' extractor (closes #2109)

											
										
										
											3 years ago
+								                    if not cursor.get("stopOnEmptyResponse", True):
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								                        # keep going even if there are no tweets
 								                        tweet = True
-												[twitter] improve handling of deleted tweets (#2212)

											
										
										
											3 years ago
+								                    cursor = cursor.get("value")
-												[twitter] fix several errors (#2212, #2216, #2225)

- fix Tweets with deleted quotes
- fix suspended Tweets without 'legacy' entry
- fix unified_cards without 'type'

											
										
										
											3 years ago
+								            for entry in tweets:
-												[twitter] improve handling of deleted tweets (#2212)

											
										
										
											3 years ago
+								                try:
-												[twitter] fix several errors (#2212, #2216, #2225)

- fix Tweets with deleted quotes
- fix suspended Tweets without 'legacy' entry
- fix unified_cards without 'type'

											
										
										
											3 years ago
+								                    tweet = ((entry.get("content") or entry["item"])
-												[twitter] improve handling of deleted tweets (#2212)

											
										
										
											3 years ago
+								                             ["itemContent"]["tweet_results"]["result"])
-												[twitter] fix several errors (#2212, #2216, #2225)

- fix Tweets with deleted quotes
- fix suspended Tweets without 'legacy' entry
- fix unified_cards without 'type'

											
										
										
											3 years ago
+								                    legacy = tweet["legacy"]
-												[twitter] improve handling of deleted tweets (#2212)

											
										
										
											3 years ago
+								                except KeyError:
 								                    self.extractor.log.debug(
 								                        "Skipping %s (deleted)",
-												[twitter] fix several errors (#2212, #2216, #2225)

- fix Tweets with deleted quotes
- fix suspended Tweets without 'legacy' entry
- fix unified_cards without 'type'

											
										
										
											3 years ago
+								                        (entry.get("entryId") or "").rpartition("-")[2])
-												[twitter] improve handling of deleted tweets (#2212)

											
										
										
											3 years ago
+								                    continue
 								                if "retweeted_status_result" in legacy:
 								                    retweet = legacy["retweeted_status_result"]["result"]
 								                    if original_retweets:
-												[twitter] fix deleted/invalid retweets (#2225)

											
										
										
											3 years ago
+								                        try:
 								                            retweet["legacy"]["retweeted_status_id_str"] = \
 								                                retweet["rest_id"]
 								                            retweet["_retweet_id_str"] = tweet["rest_id"]
 								                            tweet = retweet
 								                        except KeyError:
-												[twitter] improve handling of deleted tweets (#2212)

											
										
										
											3 years ago
+								                            continue
-												[twitter] fix deleted/invalid retweets (#2225)

											
										
										
											3 years ago
+								                    else:
 								                        try:
 								                            legacy["retweeted_status_id_str"] = \
 								                                retweet["rest_id"]
 								                            legacy["author"] = \
 								                                retweet["core"]["user_results"]["result"]
 								                            if "extended_entities" in retweet["legacy"] and \
 								                                    "extended_entities" not in legacy:
 								                                legacy["extended_entities"] = \
 								                                    retweet["legacy"]["extended_entities"]
 								                        except KeyError:
 								                            pass
-												[twitter] improve handling of deleted tweets (#2212)

											
										
										
											3 years ago
+								                yield tweet
 								                if "quoted_status_result" in tweet:
-												[twitter] fix several errors (#2212, #2216, #2225)

- fix Tweets with deleted quotes
- fix suspended Tweets without 'legacy' entry
- fix unified_cards without 'type'

											
										
										
											3 years ago
+								                    try:
 								                        quoted = tweet["quoted_status_result"]["result"]
 								                        quoted["legacy"]["author"] = \
 								                            quoted["core"]["user_results"]["result"]
 								                        quoted["core"] = tweet["core"]
 								                        quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"]
 								                        yield quoted
 								                    except KeyError:
 								                        self.extractor.log.debug(
 								                            "Skipping quote of %s (deleted)",
 								                            tweet.get("rest_id"))
 								                        continue
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
-												[twitter] improve error handling

- handle accounts without 'rest_id'
- handle timelines with empty 'instructions'

											
										
										
											3 years ago
+								            if not tweet or not cursor:
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								                return
 								            variables["cursor"] = cursor
 								    def _pagination_users(self, endpoint, variables, path=None):
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								        variables.update(self.variables)
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											4 years ago
+								        while True:
 								            cursor = entry = stop = None
-												[twitter] simplify

- use dict with common GraphQL variables
- reduce 'variables' size with custom JSON encoder instance
- centralise TwitterAPI() creation

											
										
										
											3 years ago
+								            params = {"variables": self._json_dumps(variables)}
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								            data = self._call(endpoint, params)["data"]
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											4 years ago
-												[twitter] improve error handling

											
										
										
											3 years ago
+								            try:
 								                if path is None:
 								                    instructions = (data["user"]["result"]["timeline"]
 								                                    ["timeline"]["instructions"])
 								                else:
 								                    for key in path:
 								                        data = data[key]
 								                    instructions = data["instructions"]
 								            except KeyError:
 								                return
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											4 years ago
 								            for instr in instructions:
 								                if instr["type"] == "TimelineAddEntries":
 								                    for entry in instr["entries"]:
 								                        if entry["entryId"].startswith("user-"):
-												[twitter] update to GraphQL API (#2212)

The old REST API endpoints, which were not used by Twitter since
summer 2021, are going to finally be phased out it seems, with
'/2/timeline/profile/USERID.json' being the first one.

Only Twitter's search doesn't have a GraphQL interface yet.

											
										
										
											3 years ago
+								                            user = (entry["content"]["itemContent"]
 								                                    ["user_results"]["result"])
 								                            if "rest_id" in user:
 								                                yield user
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											4 years ago
+								                        elif entry["entryId"].startswith("cursor-bottom-"):
 								                            cursor = entry["content"]["value"]
 								                elif instr["type"] == "TimelineTerminateTimeline":
 								                    if instr["direction"] == "Bottom":
 								                        stop = True
 								            if stop or not cursor or not entry:
 								                return
 								            variables["cursor"] = cursor