# -*- coding: utf-8 -*-

# Copyright 2022-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for Nitter instances"""

from .common import BaseExtractor, Message
from .. import text
import binascii

# NOTE(review): this file arrived with all line breaks collapsed and every
# HTML-tag-like substring ('<...>') stripped out of its string literals.
# The code below restores conventional formatting; literals that fell inside
# a stripped span are reconstructed and individually marked with
# NOTE(review) comments -- verify them against a live Nitter instance.


class NitterExtractor(BaseExtractor):
    """Base class for nitter extractors"""
    basecategory = "nitter"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{num}"

    def __init__(self, match):
        # 'root' is a per-instance class attribute, so it is usable before
        # BaseExtractor.__init__() runs; the cookie domain must be computed
        # first because items() sets a cookie on it.
        self.cookies_domain = self.root.partition("://")[2]
        BaseExtractor.__init__(self, match)

        # The last two match groups are the screen name and, alternatively,
        # a numeric user ID (only one is non-None per URL pattern).
        lastindex = match.lastindex
        self.user = match.group(lastindex)
        self.user_id = match.group(lastindex + 1)
        self.user_obj = None  # filled in lazily by _pagination()

    def items(self):
        """Yield Message.Directory / Message.Url pairs for all tweets"""
        retweets = self.config("retweets", False)
        videos = self.config("videos", True)
        if videos:
            ytdl = (videos == "ytdl")
            videos = True
            # make the instance embed HLS video markup into tweet pages
            self.cookies.set("hlsPlayback", "on", domain=self.cookies_domain)

        for tweet in self.tweets():

            if not retweets and tweet["retweet"]:
                self.log.debug("Skipping %s (retweet)", tweet["tweet_id"])
                continue

            attachments = tweet.pop("_attach", "")
            if attachments:
                files = []
                append = files.append

                # image attachments
                for url in text.extract_iter(attachments, 'href="', '"'):

                    if "/i/broadcasts/" in url:
                        self.log.debug(
                            "Skipping unsupported broadcast '%s'", url)
                        continue

                    if "/enc/" in url:
                        # filename is base64-encoded on '/enc/' instances
                        name = binascii.a2b_base64(url.rpartition(
                            "/")[2]).decode().rpartition("/")[2]
                    else:
                        name = url.rpartition("%2F")[2]

                    if url[0] == "/":
                        url = self.root + url

                    file = {"url": url, "_http_retry": _retry_on_404}
                    file["filename"], _, file["extension"] = \
                        name.rpartition(".")
                    append(file)

                if videos and not files:
                    if ytdl:
                        # defer the whole tweet to ytdl
                        append({
                            "url": "ytdl:{}/i/status/{}".format(
                                self.root, tweet["tweet_id"]),
                            "extension": None,
                        })
                    else:
                        # HLS video attachments
                        for url in text.extract_iter(
                                attachments, 'data-url="', '"'):

                            if "/enc/" in url:
                                name = binascii.a2b_base64(url.rpartition(
                                    "/")[2]).decode().rpartition("/")[2]
                            else:
                                name = url.rpartition("%2F")[2]

                            if url[0] == "/":
                                url = self.root + url

                            append({
                                "url"      : "ytdl:" + url,
                                "filename" : name.rpartition(".")[0],
                                "extension": "mp4",
                            })

                        # NOTE(review): this loop fell inside a stripped
                        # span; reconstructed as the '<source>' tag Nitter
                        # uses for animated GIFs -- confirm.
                        for url in text.extract_iter(
                                attachments, '<source src="', '"'):
                            if url[0] == "/":
                                url = self.root + url
                            append({
                                "url"      : url,
                                "filename" : url.rpartition(
                                    "/")[2].rpartition(".")[0],
                                "extension": "mp4",
                            })
            else:
                files = ()
            tweet["count"] = len(files)

            yield Message.Directory, tweet
            for tweet["num"], file in enumerate(files, 1):
                file.update(tweet)
                yield Message.Url, file["url"], file

    def _tweet_from_html(self, html):
        """Build a tweet dict from a timeline-item HTML snippet"""
        extr = text.extract_from(html)
        author = {
            "name": extr('class="fullname" href="/', '"'),
            "nick": extr('title="', '"'),
        }
        extr('<span class="tweet-date', '')
        link = extr('href="', '"')
        return {
            "author"  : author,
            "user"    : self.user_obj or author,
            # NOTE(review): 'date'/'tweet_id'/'content' fell inside a
            # stripped span; reconstructed from the parallel
            # _tweet_from_quote() code and the surviving
            # ...partition(">")[2] tail -- verify.
            "date"    : text.parse_datetime(
                extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
            "tweet_id": link.rpartition("/")[2].partition("#")[0],
            "content" : extr(
                'class="tweet-content', "</div").partition(">")[2],
            "_attach" : extr('class="attachments', 'class="tweet-stats'),
            # the '</div>' terminators below were stripped to '' in the
            # damaged copy of this file
            "comments": text.parse_int(extr(
                'class="icon-comment', '</div>').rpartition(">")[2]),
            "retweets": text.parse_int(extr(
                'class="icon-retweet', '</div>').rpartition(">")[2]),
            "quotes"  : text.parse_int(extr(
                'class="icon-quote', '</div>').rpartition(">")[2]),
            "likes"   : text.parse_int(extr(
                'class="icon-heart', '</div>').rpartition(">")[2]),
            "retweet" : 'class="retweet-header' in html,
            "quoted"  : False,
        }

    def _tweet_from_quote(self, html):
        """Build a reduced tweet dict from a quoted-tweet HTML snippet"""
        extr = text.extract_from(html)
        author = {
            "name": extr('class="fullname" href="/', '"'),
            "nick": extr('title="', '"'),
        }
        extr('<span class="tweet-date', '')
        link = extr('href="', '"')
        return {
            "author"  : author,
            "user"    : self.user_obj or author,
            # NOTE(review): 'date'/'tweet_id'/'content' and the '_attach'
            # terminator fell inside stripped spans; reconstructed -- verify.
            "date"    : text.parse_datetime(
                extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
            "tweet_id": link.rpartition("/")[2].partition("#")[0],
            "content" : extr(
                'class="quote-text', "</div").partition(">")[2],
            "_attach" : extr('class="attachments', '''</div>
'''),
            "retweet" : False,
            "quoted"  : True,
        }

    def _user_from_html(self, html):
        """Build a user dict from the profile card of a timeline page"""
        extr = text.extract_from(html, html.index('class="profile-tabs'))
        banner = extr('class="profile-banner"><a href="', '"')
        # NOTE(review): the user-ID extraction and the first half of the
        # returned dict fell inside a stripped span (only the fused fragment
        # "extr('class=\"profile-banner\">', '<')," survived); reconstructed
        # -- verify.
        try:
            if "/enc/" in banner:
                # banner URL is base64-encoded on '/enc/' instances
                uid = binascii.a2b_base64(banner.rpartition(
                    "/")[2]).decode().split("/")[4]
            else:
                uid = banner.split("%2F")[4]
        except Exception:
            uid = 0

        return {
            "id"              : uid,
            "profile_banner"  : self.root + banner if banner else "",
            "profile_image"   : self.root + extr(
                'class="profile-card-avatar" href="', '"'),
            "nick"            : extr('title="', '"'),
            "name"            : extr('title="@', '"'),
            "description"     : extr('<p dir="auto">', '<'),
            "date"            : text.parse_datetime(
                extr('class="profile-joindate"><span title="', '"'),
                "%I:%M %p - %d %b %Y"),
            # the four profile-stat-num fields appear in this fixed order
            # on Nitter profile pages
            "statuses_count"  : text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "friends_count"   : text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "followers_count" : text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "favourites_count": text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "verified"        : 'title="Verified account"' in html,
        }

    def _extract_quote(self, html):
        """Split tweet HTML into (tweet part, quoted-tweet part or None)"""
        html, _, quote = html.partition('class="quote')
        if quote:
            # re-attach the text that follows the quote markup to the
            # main tweet's HTML
            quote, _, tail = quote.partition('class="tweet-published')
            return (html + tail, quote)
        return (html, None)

    def _pagination(self, path):
        """Yield tweet dicts from a timeline, following 'show more' links"""
        quoted = self.config("quoted", False)

        if self.user_id:
            # resolve a numeric user ID to a screen name via the redirect
            # issued by the instance's /i/user/<ID> endpoint
            self.user = self.request(
                "{}/i/user/{}".format(self.root, self.user_id),
                allow_redirects=False,
            ).headers["location"].rpartition("/")[2]
        base_url = url = "{}/{}{}".format(self.root, self.user, path)

        while True:
            # NOTE(review): the split delimiter below was stripped, and
            # everything after this statement lies past the end of the
            # provided file chunk; reconstructed from the helper methods
            # above -- verify against the rest of the file.
            tweets_html = self.request(url).text.split(
                '<div class="timeline-item')

            if self.user_obj is None:
                # first HTML fragment holds the profile card
                self.user_obj = self._user_from_html(tweets_html[0])

            for html in tweets_html[1:]:
                html, quote = self._extract_quote(html)
                tweet = self._tweet_from_html(html)
                if not tweet["date"]:
                    # skip empty placeholder entries
                    continue
                yield tweet

                if quoted and quote:
                    quoted_tweet = self._tweet_from_quote(quote)
                    quoted_tweet["user"] = tweet["user"]
                    yield quoted_tweet

            more = text.extr(
                tweets_html[-1], '<div class="show-more"><a href="?', '"')
            if not more:
                return
            url = "{}?{}".format(base_url, text.unescape(more))