From 34a7fab0e2be42d10489b858be6f8218d1f0745a Mon Sep 17 00:00:00 2001
From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com>
Date: Mon, 6 Mar 2023 19:51:25 +0800
Subject: [PATCH 01/20] [generic] add support for IDNs (internationalized
 domain name)

---
 gallery_dl/extractor/directlink.py |  4 ++++
 gallery_dl/extractor/generic.py    | 26 ++++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py
index 8b90250f..4827be52 100644
--- a/gallery_dl/extractor/directlink.py
+++ b/gallery_dl/extractor/directlink.py
@@ -44,6 +44,10 @@ class DirectlinkExtractor(Extractor):
         ("https://post-phinf.pstatic.net/MjAxOTA1MjlfMTQ4/MDAxNTU5MTI2NjcyNTkw"
          ".JUzkGb4V6dj9DXjLclrOoqR64uDxHFUO5KDriRdKpGwg.88mCtd4iT1NHlpVKSCaUpP"
          "mZPiDgT8hmQdQ5K_gYyu0g.JPEG/2.JPG"),
+        # internationalized domain name
+        ("https://räksmörgås.josefsson.org/raksmorgas.jpg", {
+            "content": "f7e00768ab009c969e70d775047cdd302ca51762",
+        }),
     )
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 9292da3d..d4276e62 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -26,12 +26,34 @@ class GenericExtractor(Extractor):
     # Based on: https://tools.ietf.org/html/rfc3986#appendix-B
     pattern += r"""
         (?P<scheme>https?://)?   # optional http(s) scheme
-        (?P<domain>[-\w\.]+)     # required domain
+        (?P<domain>[^/?#]+)      # required domain
         (?P<path>/[^?#]*)?       # optional path
         (?:\?(?P<query>[^#]*))?  # optional query
         (?:\#(?P<fragment>.*))?  # optional fragment
     """
 
+    test = (
+        ("generic:https://www.nongnu.org/lzip/", {
+            "count": 1,
+            "content": "40be5c77773d3e91db6e1c5df720ee30afb62368",
+            "keyword": {
+                "description": "Lossless data compressor",
+                "imageurl": "https://www.nongnu.org/lzip/lzip.png",
+                "keywords": "lzip, clzip, plzip, lzlib, LZMA, bzip2, "
+                            "gzip, data compression, GNU, free software",
+                "pageurl": "https://www.nongnu.org/lzip/",
+            },
+        }),
+        # internationalized domain name
+        ("generic:https://räksmörgås.josefsson.org/", {
+            "count": 2,
+            "pattern": "^https://räksmörgås.josefsson.org/",
+        }),
+        ("generic:https://en.wikipedia.org/Main_Page"),
+        ("generic:https://example.org/path/to/file?que=1?&ry=2/#fragment"),
+        ("generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E"),
+    )
+
     def __init__(self, match):
         """Init."""
         Extractor.__init__(self, match)
@@ -56,7 +78,7 @@ class GenericExtractor(Extractor):
         self.root = self.scheme + match.group('domain')
 
     def items(self):
-        """Get page, extract metadata & images, yield them in suitable messages.
+ """Get page, extract metadata & images, yield them in suitable messages Adapted from common.GalleryExtractor.items() From c489aecb3e6fe95f7da16ef5e7a0386495b40e79 Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Tue, 7 Mar 2023 15:51:01 +0800 Subject: [PATCH 02/20] [deviantart] add support for fxdeviantart.com URLs fxdeviantart.com is a service that fixes embeds on Discord, similar to fxtwitter.com --- gallery_dl/extractor/deviantart.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 87ac9d2f..37475df2 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -21,8 +21,8 @@ import re BASE_PATTERN = ( r"(?:https?://)?(?:" - r"(?:www\.)?deviantart\.com/(?!watch/)([\w-]+)|" - r"(?!www\.)([\w-]+)\.deviantart\.com)" + r"(?:www\.)?(?:fx)?deviantart\.com/(?!watch/)([\w-]+)|" + r"(?!www\.)([\w-]+)\.(?:fx)?deviantart\.com)" ) @@ -997,7 +997,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor): subcategory = "deviation" archive_fmt = "g_{_username}_{index}.{extension}" pattern = (BASE_PATTERN + r"/(art|journal)/(?:[^/?#]+-)?(\d+)" - r"|(?:https?://)?(?:www\.)?deviantart\.com/" + r"|(?:https?://)?(?:www\.)?(?:fx)?deviantart\.com/" r"(?:view/|deviation/|view(?:-full)?\.php/*\?(?:[^#]+&)?id=)" r"(\d+)" # bare deviation ID without slug r"|(?:https?://)?fav\.me/d([0-9a-z]+)") # base36 @@ -1091,6 +1091,9 @@ class DeviantartDeviationExtractor(DeviantartExtractor): # old /view/ URLs from the Wayback Machine ("https://www.deviantart.com/view.php?id=14864502"), ("http://www.deviantart.com/view-full.php?id=100842"), + + ("https://www.fxdeviantart.com/zzz/art/zzz-1234567890"), + ("https://www.fxdeviantart.com/view/1234567890"), ) skip = Extractor.skip From 9abcb2b6e5a918ab5ea7ed99d67b9d9f8b83acf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 8 Mar 2023 17:19:59 +0100 Subject: [PATCH 03/20] update headers and ciphers for '"browser": "chrome"' --- gallery_dl/extractor/common.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 4cefa1c9..8024be9f 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -791,15 +791,21 @@ HTTP_HEADERS = { ("TE", "trailers"), ), "chrome": ( + ("Connection", "keep-alive"), ("Upgrade-Insecure-Requests", "1"), ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, " - "like Gecko) Chrome/92.0.4515.131 Safari/537.36"), + "like Gecko) Chrome/111.0.0.0 Safari/537.36"), ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," - "image/webp,image/apng,*/*;q=0.8"), + "image/avif,image/webp,image/apng,*/*;q=0.8," + "application/signed-exchange;v=b3;q=0.7"), ("Referer", None), + ("Sec-Fetch-Site", "same-origin"), + ("Sec-Fetch-Mode", "no-cors"), + ("Sec-Fetch-Dest", "empty"), ("Accept-Encoding", None), ("Accept-Language", "en-US,en;q=0.9"), - ("Cookie", None), + ("cookie", None), + ("content-length", None), ), } @@ -838,8 +844,7 @@ SSL_CIPHERS = { "AES128-GCM-SHA256:" "AES256-GCM-SHA384:" "AES128-SHA:" - "AES256-SHA:" - "DES-CBC3-SHA" + "AES256-SHA" ), } From 9037128315c25d970a73edcdb820fd5349141b15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 8 Mar 2023 18:33:19 +0100 Subject: [PATCH 04/20] [twitter] fix some 'original' retweets not downloading (#3744) --- gallery_dl/extractor/twitter.py | 2 ++ 1 
file changed, 2 insertions(+)

diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 03adc15b..29b4ac35 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1440,6 +1440,8 @@ class TwitterAPI():
 
         if "retweeted_status_result" in legacy:
             retweet = legacy["retweeted_status_result"]["result"]
+            if "tweet" in retweet:
+                retweet = retweet["tweet"]
             if original_retweets:
                 try:
                     retweet["legacy"]["retweeted_status_id_str"] = \

From 4883420e673c68b146511909707e6754c17f2fcd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Thu, 9 Mar 2023 22:25:23 +0100
Subject: [PATCH 05/20] [generic] revert pattern change

---
 gallery_dl/extractor/directlink.py | 3 ++-
 gallery_dl/extractor/generic.py    | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py
index 4827be52..e85eb8db 100644
--- a/gallery_dl/extractor/directlink.py
+++ b/gallery_dl/extractor/directlink.py
@@ -46,7 +46,8 @@ class DirectlinkExtractor(Extractor):
          "mZPiDgT8hmQdQ5K_gYyu0g.JPEG/2.JPG"),
         # internationalized domain name
         ("https://räksmörgås.josefsson.org/raksmorgas.jpg", {
-            "content": "f7e00768ab009c969e70d775047cdd302ca51762",
+            "url": "a65667f670b194afbd1e3ea5e7a78938d36747da",
+            "keyword": "fd5037fe86eebd4764e176cbaf318caec0f700be",
         }),
     )
 
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index d4276e62..99992837 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -26,7 +26,7 @@ class GenericExtractor(Extractor):
     # Based on: https://tools.ietf.org/html/rfc3986#appendix-B
     pattern += r"""
         (?P<scheme>https?://)?   # optional http(s) scheme
-        (?P<domain>[^/?#]+)      # required domain
+        (?P<domain>[-\w\.]+)     # required domain
         (?P<path>/[^?#]*)?       # optional path
         (?:\?(?P<query>[^#]*))?  # optional query
         (?:\#(?P<fragment>.*))?
# optional fragment From 67ec91cdbd680035eedea70941579036838eb799 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 9 Mar 2023 23:30:15 +0100 Subject: [PATCH 06/20] [downloader:http] change '_http_retry' to accept a Python function and rename '_http_retry_codes' to '_http_retry' (#3569) --- gallery_dl/downloader/http.py | 12 ++++-------- gallery_dl/extractor/nitter.py | 9 +++++---- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index f14af249..e977320f 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -100,13 +100,6 @@ class HttpDownloader(DownloaderBase): adjust_extension = kwdict.get( "_http_adjust_extension", self.adjust_extension) - codes = kwdict.get("_http_retry_codes") - if codes: - retry_codes = list(self.retry_codes) - retry_codes += codes - else: - retry_codes = self.retry_codes - if self.part and not metadata: pathfmt.part_enable(self.partdir) @@ -167,7 +160,10 @@ class HttpDownloader(DownloaderBase): break else: msg = "'{} {}' for '{}'".format(code, response.reason, url) - if code in retry_codes or 500 <= code < 600: + if code in self.retry_codes or 500 <= code < 600: + continue + retry = kwdict.get("_http_retry") + if retry and retry(response): continue self.log.warning(msg) return False diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index f9c6abfc..f5248397 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -59,10 +59,7 @@ class NitterExtractor(BaseExtractor): if url[0] == "/": url = self.root + url - file = { - "url": url, - "_http_retry_codes": (404,), - } + file = {"url": url, "_http_retry": _retry_on_404} file["filename"], _, file["extension"] = \ name.rpartition(".") append(file) @@ -468,3 +465,7 @@ class NitterTweetExtractor(NitterExtractor): quoted["user"] = tweet["user"] return (tweet, quoted) return (tweet,) + + +def _retry_on_404(response): + return response.status_code == 404 From 817fc0fbd1c1d0921c51b23a3b0fef6bb26f2284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 9 Mar 2023 23:47:25 +0100 Subject: [PATCH 07/20] [nitter] remove nitter.pussthecat.org "Shutdown" --- docs/supportedsites.md | 6 ------ gallery_dl/extractor/nitter.py | 24 ++++++++---------------- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 220ce0f2..0397ff59 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1179,12 +1179,6 @@ Consider all sites to be NSFW unless otherwise known. 
Media Files, Replies, Search Results, Tweets - - Nitter.pussthecat.org - https://nitter.pussthecat.org/ - Media Files, Replies, Search Results, Tweets - - Nitter.1d4.us https://nitter.1d4.us/ diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index f5248397..9b69694a 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -217,10 +217,6 @@ BASE_PATTERN = NitterExtractor.update({ "root": "https://nitter.lacontrevoie.fr", "pattern": r"nitter\.lacontrevoie\.fr", }, - "nitter.pussthecat.org": { - "root": "https://nitter.pussthecat.org", - "pattern": r"nitter\.pussthecat\.org", - }, "nitter.1d4.us": { "root": "https://nitter.1d4.us", "pattern": r"nitter\.1d4\.us", @@ -280,13 +276,12 @@ class NitterTweetsExtractor(NitterExtractor): }, }, }), - ("https://nitter.pussthecat.org/i/user/2976459548", { - "url": "c740a2683db2c8ed2f350afc0494475c4444025b", - "pattern": r"https://nitter.pussthecat\.org/pic/orig" + ("https://nitter.lacontrevoie.fr/supernaturepics", { + "url": "54f4b55f2099dcc248f3fb7bfacf1349e08d8e2d", + "pattern": r"https://nitter\.lacontrevoie\.fr/pic/orig" r"/media%2FCGMNYZvW0AIVoom\.jpg", "range": "1", }), - ("https://nitter.lacontrevoie.fr/supernaturepics"), ("https://nitter.1d4.us/supernaturepics"), ("https://nitter.kavin.rocks/id:2976459548"), ("https://nitter.unixfox.eu/supernaturepics"), @@ -306,7 +301,6 @@ class NitterRepliesExtractor(NitterExtractor): "range": "1-20", }), ("https://nitter.lacontrevoie.fr/supernaturepics/with_replies"), - ("https://nitter.pussthecat.org/supernaturepics/with_replies"), ("https://nitter.1d4.us/supernaturepics/with_replies"), ("https://nitter.kavin.rocks/id:2976459548/with_replies"), ("https://nitter.unixfox.eu/i/user/2976459548/with_replies"), @@ -331,7 +325,6 @@ class NitterMediaExtractor(NitterExtractor): "range": "1-20", }), ("https://nitter.lacontrevoie.fr/supernaturepics/media"), - ("https://nitter.pussthecat.org/supernaturepics/media"), ("https://nitter.1d4.us/supernaturepics/media"), ("https://nitter.unixfox.eu/i/user/2976459548/media"), ) @@ -350,7 +343,6 @@ class NitterSearchExtractor(NitterExtractor): "range": "1-20", }), ("https://nitter.lacontrevoie.fr/supernaturepics/search"), - ("https://nitter.pussthecat.org/supernaturepics/search"), ("https://nitter.1d4.us/supernaturepics/search"), ("https://nitter.kavin.rocks/id:2976459548/search"), ("https://nitter.unixfox.eu/i/user/2976459548/search"), @@ -372,7 +364,7 @@ class NitterTweetExtractor(NitterExtractor): "url": "3f2b64e175bf284aa672c3bb53ed275e470b919a", "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab", "keyword": { - "comments": 16, + "comments": 19, "content": "Big Wedeene River, Canada", "count": 1, "date": "dt:2015-05-29 17:40:00", @@ -396,9 +388,9 @@ class NitterTweetExtractor(NitterExtractor): "url": "9c51b3a4a1114535eb9b168bba97ad95db0d59ff", }), # video - ("https://nitter.pussthecat.org/i/status/1065692031626829824", { - "pattern": r"ytdl:https://nitter.pussthecat.org/video" - r"/B875137EDC8FF/https%3A%2F%2Fvideo.twimg.com%2F" + ("https://nitter.lacontrevoie.fr/i/status/1065692031626829824", { + "pattern": r"ytdl:https://nitter\.lacontrevoie\.fr/video" + r"/[0-9A-F]{10,}/https%3A%2F%2Fvideo.twimg.com%2F" r"ext_tw_video%2F1065691868439007232%2Fpu%2Fpl%2F" r"nv8hUQC1R0SjhzcZ.m3u8%3Ftag%3D5", "keyword": { @@ -443,7 +435,7 @@ class NitterTweetExtractor(NitterExtractor): "count": 0, }), # "Misleading" content - ("https://nitter.pussthecat.org/i/status/1486373748911575046", { + 
("https://nitter.lacontrevoie.fr/i/status/1486373748911575046", { "count": 4, }), # age-restricted (#2354) From 4235d412c4c19a0be3b8f03582f7ab0fe58654a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 10 Mar 2023 22:08:10 +0100 Subject: [PATCH 08/20] implement 'actions' continuation of d37e7f48 but more versatile and extendable Example: "actions": [ # change debug messages to info ["debug", "level ~info"], # change exit status to a non-zero value ["info:^No results for", "status |= 1"], # exit with status 2 on 429 ["warning:429", "exit 2"], # restart extractor when no cookies found ["warning:^[Nn]o .*cookies", "restart"] ] --- gallery_dl/actions.py | 112 ++++++++++++++++++++++++++++++++++++++++++ gallery_dl/job.py | 22 +++------ gallery_dl/output.py | 41 +++++++--------- 3 files changed, 139 insertions(+), 36 deletions(-) create mode 100644 gallery_dl/actions.py diff --git a/gallery_dl/actions.py b/gallery_dl/actions.py new file mode 100644 index 00000000..15ca31ec --- /dev/null +++ b/gallery_dl/actions.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +""" """ + +import re +import sys +import logging +import operator +from . import util, exception + + +def parse(actionspec): + if isinstance(actionspec, dict): + actionspec = actionspec.items() + + actions = {} + actions[logging.DEBUG] = actions_d = [] + actions[logging.INFO] = actions_i = [] + actions[logging.WARNING] = actions_w = [] + actions[logging.ERROR] = actions_e = [] + + for event, spec in actionspec: + level, _, pattern = event.partition(":") + type, _, args = spec.partition(" ") + action = (re.compile(pattern).search, ACTIONS[type](args)) + + level = level.strip() + if not level or level == "*": + actions_d.append(action) + actions_i.append(action) + actions_w.append(action) + actions_e.append(action) + else: + + actions[_level_to_int(level)].append(action) + + return actions + + +def _level_to_int(level): + try: + return logging._nameToLevel[level] + except KeyError: + return int(level) + + +def action_print(opts): + def _print(_): + print(opts) + return _print + + +def action_status(opts): + op, value = re.match(r"\s*([&|^=])=?\s*(\d+)", opts).groups() + + op = { + "&": operator.and_, + "|": operator.or_, + "^": operator.xor, + "=": lambda x, y: y, + }[op] + + value = int(value) + + def _status(args): + args["job"].status = op(args["job"].status, value) + return _status + + +def action_level(opts): + level = _level_to_int(opts.lstrip(" ~=")) + + def _level(args): + args["level"] = level + return _level + + +def action_wait(opts): + def _wait(args): + input("Press Enter to continue") + return _wait + + +def action_restart(opts): + return util.raises(exception.RestartExtraction) + + +def action_exit(opts): + try: + opts = int(opts) + except ValueError: + pass + + def _exit(args): + sys.exit(opts) + return _exit + + +ACTIONS = { + "print" : action_print, + "status" : action_status, + "level" : action_level, + "restart": action_restart, + "wait" : action_wait, + "exit" : action_exit, +} diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 4c4e9259..a64c040f 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -6,7 +6,6 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-import re import sys import errno import logging @@ -33,15 +32,11 @@ class Job(): self.kwdict = {} self.status = 0 - hooks = extr.config("hooks") - if hooks: - if isinstance(hooks, dict): - hooks = hooks.items() - self._wrap_logger = self._wrap_logger_hooks - self._logger_hooks = [ - (re.compile(pattern).search, hook) - for pattern, hook in hooks - ] + actions = extr.config("actions") + if actions: + from .actions import parse + self._logger_actions = parse(actions) + self._wrap_logger = self._wrap_logger_actions path_proxy = output.PathfmtProxy(self) self._logger_extra = { @@ -211,11 +206,10 @@ class Job(): return self._wrap_logger(logging.getLogger(name)) def _wrap_logger(self, logger): - return output.LoggerAdapter(logger, self._logger_extra) + return output.LoggerAdapter(logger, self) - def _wrap_logger_hooks(self, logger): - return output.LoggerAdapterEx( - logger, self._logger_extra, self) + def _wrap_logger_actions(self, logger): + return output.LoggerAdapterActions(logger, self) def _write_unsupported(self, url): if self.ulog: diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 7d74b699..1d53851d 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -12,7 +12,7 @@ import shutil import logging import functools import unicodedata -from . import config, util, formatter, exception +from . import config, util, formatter # -------------------------------------------------------------------- @@ -39,9 +39,9 @@ class LoggerAdapter(): """Trimmed-down version of logging.LoggingAdapter""" __slots__ = ("logger", "extra") - def __init__(self, logger, extra): + def __init__(self, logger, job): self.logger = logger - self.extra = extra + self.extra = job._logger_extra def debug(self, msg, *args, **kwargs): if self.logger.isEnabledFor(logging.DEBUG): @@ -64,12 +64,12 @@ class LoggerAdapter(): self.logger._log(logging.ERROR, msg, args, **kwargs) -class LoggerAdapterEx(): +class LoggerAdapterActions(): - def __init__(self, logger, extra, job): + def __init__(self, logger, job): self.logger = logger - self.extra = extra - self.job = job + self.extra = job._logger_extra + self.actions = job._logger_actions self.debug = functools.partial(self.log, logging.DEBUG) self.info = functools.partial(self.log, logging.INFO) @@ -79,24 +79,21 @@ class LoggerAdapterEx(): def log(self, level, msg, *args, **kwargs): if args: msg = msg % args - args = None - - for search, action in self.job._logger_hooks: - match = search(msg) - if match: - if action == "wait+restart": - kwargs["extra"] = self.extra - self.logger._log(level, msg, args, **kwargs) - input("Press Enter to continue") - raise exception.RestartExtraction() - elif action.startswith("~"): - level = logging._nameToLevel[action[1:]] - elif action.startswith("|"): - self.job.status |= int(action[1:]) + + actions = self.actions[level] + if actions: + args = self.extra.copy() + args["level"] = level + + for cond, action in actions: + if cond(msg): + action(args) + + level = args["level"] if self.logger.isEnabledFor(level): kwargs["extra"] = self.extra - self.logger._log(level, msg, args, **kwargs) + self.logger._log(level, msg, (), **kwargs) class PathfmtProxy(): From a14a2d6e5963d6213e732da9b277731e898036aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 11 Mar 2023 21:05:28 +0100 Subject: [PATCH 09/20] release version 1.25.0 --- CHANGELOG.md | 67 +++++++++++++++++++++++++++++++++++++++++++ README.rst | 4 +-- gallery_dl/version.py | 2 +- 3 files changed, 70 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md 
b/CHANGELOG.md
index 3beecbbc..5d805c26 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,72 @@
 # Changelog
 
+## 1.25.0 - 2023-03-11
+### Changes
+- [e621] split `e621` extractors from `danbooru` module ([#3425](https://github.com/mikf/gallery-dl/issues/3425))
+- [deviantart] remove mature scraps warning ([#3691](https://github.com/mikf/gallery-dl/issues/3691))
+- [deviantart] use `/collections/all` endpoint for favorites ([#3666](https://github.com/mikf/gallery-dl/issues/3666), [#3668](https://github.com/mikf/gallery-dl/issues/3668))
+- [newgrounds] update default image and audio archive IDs to prevent ID overlap ([#3681](https://github.com/mikf/gallery-dl/issues/3681))
+- rename `--ignore-config` to `--config-ignore`
+### Extractors
+- [catbox] add `file` extractor ([#3570](https://github.com/mikf/gallery-dl/issues/3570))
+- [deviantart] add `search` extractor ([#538](https://github.com/mikf/gallery-dl/issues/538), [#1264](https://github.com/mikf/gallery-dl/issues/1264), [#2954](https://github.com/mikf/gallery-dl/issues/2954), [#2970](https://github.com/mikf/gallery-dl/issues/2970), [#3577](https://github.com/mikf/gallery-dl/issues/3577))
+- [deviantart] add `gallery-search` extractor ([#1695](https://github.com/mikf/gallery-dl/issues/1695))
+- [deviantart] support `fxdeviantart.com` URLs ([#3740](https://github.com/mikf/gallery-dl/issues/3740))
+- [e621] implement `notes` and `pools` metadata extraction ([#3425](https://github.com/mikf/gallery-dl/issues/3425))
+- [gelbooru] add `favorite` extractor ([#3704](https://github.com/mikf/gallery-dl/issues/3704))
+- [imagetwist] support `phun.imagetwist.com` and `imagehaha.com` domains ([#3622](https://github.com/mikf/gallery-dl/issues/3622))
+- [instagram] add `user` metadata field ([#3107](https://github.com/mikf/gallery-dl/issues/3107))
+- [manganelo] update and fix metadata extraction
+- [manganelo] support mobile-only chapters
+- [mangasee] extract `author` and `genre` metadata ([#3703](https://github.com/mikf/gallery-dl/issues/3703))
+- [misskey] add `misskey` extractors ([#3717](https://github.com/mikf/gallery-dl/issues/3717))
+- [pornpics] add `gallery` and `search` extractors ([#263](https://github.com/mikf/gallery-dl/issues/263), [#3544](https://github.com/mikf/gallery-dl/issues/3544), [#3654](https://github.com/mikf/gallery-dl/issues/3654))
+- [redgifs] support v3 URLs ([#3588](https://github.com/mikf/gallery-dl/issues/3588), [#3589](https://github.com/mikf/gallery-dl/issues/3589))
+- [redgifs] add `collection` extractors ([#3427](https://github.com/mikf/gallery-dl/issues/3427), [#3662](https://github.com/mikf/gallery-dl/issues/3662))
+- [shopify] support ohpolly.com ([#440](https://github.com/mikf/gallery-dl/issues/440), [#3596](https://github.com/mikf/gallery-dl/issues/3596))
+- [szurubooru] add `tag` and `post` extractors ([#3583](https://github.com/mikf/gallery-dl/issues/3583), [#3713](https://github.com/mikf/gallery-dl/issues/3713))
+- [twitter] add `transform` option
+### Options
+- [postprocessor:metadata] add `sort` and `separators` options
+- [postprocessor:exec] implement archive options ([#3584](https://github.com/mikf/gallery-dl/issues/3584))
+- add `--config-create` command-line option ([#2333](https://github.com/mikf/gallery-dl/issues/2333))
+- add `--config-toml` command-line option to load config files in TOML format
+- add `output.stdout`, `output.stdin`, and `output.stderr` options ([#1621](https://github.com/mikf/gallery-dl/issues/1621), [#2152](https://github.com/mikf/gallery-dl/issues/2152), [#2529](https://github.com/mikf/gallery-dl/issues/2529))
+- add `hash_md5` and `hash_sha1` functions ([#3679](https://github.com/mikf/gallery-dl/issues/3679))
+- implement `globals` option to enable defining custom functions for `eval` statements
+- implement `archive-pragma` option to use SQLite PRAGMA statements
+- implement `actions` to trigger events on logging messages ([#3338](https://github.com/mikf/gallery-dl/issues/3338), [#3630](https://github.com/mikf/gallery-dl/issues/3630))
+- implement ability to load external extractor classes
+  - `-X/--extractors` command-line options
+  - `extractor.modules-sources` config option
+### Fixes
+- [bunkr] fix extraction ([#3636](https://github.com/mikf/gallery-dl/issues/3636), [#3655](https://github.com/mikf/gallery-dl/issues/3655))
+- [danbooru] send gallery-dl User-Agent ([#3665](https://github.com/mikf/gallery-dl/issues/3665))
+- [deviantart] fix crash when handling deleted deviations in status updates ([#3656](https://github.com/mikf/gallery-dl/issues/3656))
+- [fanbox] fix crash with missing images ([#3673](https://github.com/mikf/gallery-dl/issues/3673))
+- [imagefap] update `gallery` URLs ([#3595](https://github.com/mikf/gallery-dl/issues/3595))
+- [imagefap] fix infinite pagination loop ([#3594](https://github.com/mikf/gallery-dl/issues/3594))
+- [imagefap] fix metadata extraction
+- [oauth] use default name for browsers without `name` attribute
+- [pinterest] unescape search terms ([#3621](https://github.com/mikf/gallery-dl/issues/3621))
+- [pixiv] fix `--write-tags` for `"tags": "original"` ([#3675](https://github.com/mikf/gallery-dl/issues/3675))
+- [poipiku] warn about incorrect passwords ([#3646](https://github.com/mikf/gallery-dl/issues/3646))
+- [reddit] update `videos` option ([#3712](https://github.com/mikf/gallery-dl/issues/3712))
+- [soundgasm] rewrite ([#3578](https://github.com/mikf/gallery-dl/issues/3578))
+- [telegraph] fix extraction when images are not in `<figure>
` elements ([#3590](https://github.com/mikf/gallery-dl/issues/3590)) +- [tumblr] raise more detailed errors for dashboard-only blogs ([#3628](https://github.com/mikf/gallery-dl/issues/3628)) +- [twitter] fix some `original` retweets not downloading ([#3744](https://github.com/mikf/gallery-dl/issues/3744)) +- [ytdl] fix `--parse-metadata` ([#3663](https://github.com/mikf/gallery-dl/issues/3663)) +- [downloader:ytdl] prevent exception on empty results +### Improvements +- [downloader:http] use `time.monotonic()` +- [downloader:http] update `_http_retry` to accept a Python function ([#3569](https://github.com/mikf/gallery-dl/issues/3569)) +- [postprocessor:metadata] speed up JSON encoding +- replace `json.loads/dumps` with direct calls to `JSONDecoder.decode/JSONEncoder.encode` +- improve `option.Formatter` performance +### Removals +- [nitter] remove `nitter.pussthecat.org` + ## 1.24.5 - 2023-01-28 ### Additions - [booru] add `url` option diff --git a/README.rst b/README.rst index ed4afa53..c980bced 100644 --- a/README.rst +++ b/README.rst @@ -69,9 +69,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 19e49be0..494b7f5f 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.25.0-dev" +__version__ = "1.25.0" From f7ce33c85c550307dd505b68240e9f64498aee65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 13 Mar 2023 12:04:24 +0100 Subject: [PATCH 10/20] [output] set 'errors=replace' for output streams (#3765) fixes regression from e480a933 --- gallery_dl/output.py | 11 +++++------ gallery_dl/version.py | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 1d53851d..4f2ee269 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -270,16 +270,15 @@ else: def configure_standard_streams(): for name in ("stdout", "stderr", "stdin"): - options = config.get(("output",), name) - if not options: - continue - stream = getattr(sys, name, None) if not stream: continue - if isinstance(options, str): - options = {"encoding": options, "errors": "replace"} + options = config.get(("output",), name) + if not options: + options = {"errors": "replace"} + elif isinstance(options, str): + options = {"errors": "replace", "encoding": options} elif not options.get("errors"): options["errors"] = "replace" diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 494b7f5f..5d4b3709 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
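A short sketch of what the new default amounts to for an unconfigured
stream, assuming Python >= 3.7 (io.TextIOWrapper.reconfigure):

    import sys

    # roughly the effect of options = {"errors": "replace"}
    sys.stdout.reconfigure(errors="replace")

A plain string option such as "stdout": "utf-8" now expands to
{"encoding": "utf-8", "errors": "replace"}, so encoding errors are
replaced instead of raising in either form.
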
-__version__ = "1.25.0" +__version__ = "1.25.1-dev" From e7898936dfa294c62fcb00c47aa129a2ebb73399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 14 Mar 2023 23:01:36 +0100 Subject: [PATCH 11/20] add link to 'Get cookies.txt LOCALLY' to README --- README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index c980bced..2f97eb7a 100644 --- a/README.rst +++ b/README.rst @@ -285,7 +285,8 @@ This can be done via the option in your configuration file by specifying - | the path to a Mozilla/Netscape format cookies.txt file exported by a browser addon - | (e.g. `Export Cookies `__ for Firefox) + | (e.g. `Get cookies.txt LOCALLY `__ for Chrome, + `Export Cookies `__ for Firefox) - | a list of name-value pairs gathered from your browser's web developer tools | (in `Chrome `__, From 17bd053d94f816b5fbf3afd6d3d700b5c6e582b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 15 Mar 2023 14:28:03 +0100 Subject: [PATCH 12/20] [hiperdex] fix extraction (#3768) --- docs/supportedsites.md | 2 +- gallery_dl/extractor/hiperdex.py | 34 +++++++++++--------------------- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0397ff59..22e848a5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -309,7 +309,7 @@ Consider all sites to be NSFW unless otherwise known. Hiperdex - https://1sthiperdex.com/ + https://hiperdex.com/ Artists, Chapters, Manga diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index d61c139b..98641110 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://1sthiperdex.com/""" +"""Extractors for https://hiperdex.com/""" from .common import ChapterExtractor, MangaExtractor from .. import text @@ -20,7 +20,7 @@ BASE_PATTERN = (r"((?:https?://)?(?:www\.)?" 
class HiperdexBase(): """Base class for hiperdex extractors""" category = "hiperdex" - root = "https://1sthiperdex.com" + root = "https://hiperdex.com" @memcache(keyarg=1) def manga_data(self, manga, page=None): @@ -31,7 +31,7 @@ class HiperdexBase(): return { "manga" : text.unescape(extr( - "", "<").rpartition("&")[0].strip()), + "<title>", "<").rpartition(" - ")[0].strip()), "score" : text.parse_float(extr( 'id="averagerate">', '<')), "author" : text.remove_html(extr( @@ -65,10 +65,10 @@ class HiperdexBase(): class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): - """Extractor for manga chapters from 1sthiperdex.com""" + """Extractor for manga chapters from hiperdex.com""" pattern = BASE_PATTERN + r"(/manga/([^/?#]+)/([^/?#]+))" test = ( - ("https://1sthiperdex.com/manga/domestic-na-kanojo/154-5/", { + ("https://hiperdex.com/manga/domestic-na-kanojo/154-5/", { "pattern": r"https://(1st)?hiperdex\d?.(com|net|info)" r"/wp-content/uploads/WP-manga/data" r"/manga_\w+/[0-9a-f]{32}/\d+\.webp", @@ -86,7 +86,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): "type" : "Manga", }, }), - ("https://hiperdex.com/manga/domestic-na-kanojo/154-5/"), + ("https://1sthiperdex.com/manga/domestic-na-kanojo/154-5/"), ("https://hiperdex2.com/manga/domestic-na-kanojo/154-5/"), ("https://hiperdex.net/manga/domestic-na-kanojo/154-5/"), ("https://hiperdex.info/manga/domestic-na-kanojo/154-5/"), @@ -109,11 +109,11 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): - """Extractor for manga from 1sthiperdex.com""" + """Extractor for manga from hiperdex.com""" chapterclass = HiperdexChapterExtractor pattern = BASE_PATTERN + r"(/manga/([^/?#]+))/?$" test = ( - ("https://1sthiperdex.com/manga/youre-not-that-special/", { + ("https://hiperdex.com/manga/youre-not-that-special/", { "count": 51, "pattern": HiperdexChapterExtractor.pattern, "keyword": { @@ -130,7 +130,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): "type" : "Manhwa", }, }), - ("https://hiperdex.com/manga/youre-not-that-special/"), + ("https://1sthiperdex.com/manga/youre-not-that-special/"), ("https://hiperdex2.com/manga/youre-not-that-special/"), ("https://hiperdex.net/manga/youre-not-that-special/"), ("https://hiperdex.info/manga/youre-not-that-special/"), @@ -145,19 +145,9 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): self.manga_data(self.manga, page) results = [] - shortlink = text.extr(page, "rel='shortlink' href='", "'") - data = { - "action" : "manga_get_reading_nav", - "manga" : shortlink.rpartition("=")[2], - "chapter" : "", - "volume_id": "", - "style" : "list", - "type" : "manga", - } - url = self.root + "/wp-admin/admin-ajax.php" - page = self.request(url, method="POST", data=data).text - - for url in text.extract_iter(page, 'data-redirect="', '"'): + for html in text.extract_iter( + page, '<li class="wp-manga-chapter', '</li>'): + url = text.extr(html, 'href="', '"') chapter = url.rpartition("/")[2] results.append((url, self.chapter_data(chapter))) From b756dc13aa5c6ca244a3740a09a22bc5205a2bf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 15 Mar 2023 14:58:55 +0100 Subject: [PATCH 13/20] [gelbooru] warn about missing cookies for favorites (#3704) and add docstring so it shows up in --list-extractors --- gallery_dl/extractor/gelbooru.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py 
index 80b0ae14..a4f62992 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -56,6 +56,9 @@ class GelbooruBase(): params["tags"] = "{} id:<{}".format(self.tags, post["id"]) def _pagination_html(self, params): + if self.cookiedomain and not self._check_cookies(self.cookienames): + self.log.warning("no 'user_id' or 'pass_hash' cookies set") + url = self.root + "/index.php" params["pid"] = self.page_start * self.per_page @@ -158,7 +161,10 @@ class GelbooruPoolExtractor(GelbooruBase, class GelbooruFavoriteExtractor(GelbooruBase, gelbooru_v02.GelbooruV02FavoriteExtractor): + """Extractor for gelbooru favorites""" pattern = BASE_PATTERN + r"page=favorites&s=view&id=(\d+)" + cookiedomain = "gelbooru.com" + cookienames = ("user_id", "pass_hash") test = ("https://gelbooru.com/index.php?page=favorites&s=view&id=12345",) From dcb8af659a204f357cb110787716edf4bf8c79be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 15 Mar 2023 18:10:53 +0100 Subject: [PATCH 14/20] [gelbooru] extract favorites without needing cookies (#3704) TODO: fix --range --- gallery_dl/extractor/gelbooru.py | 44 ++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index a4f62992..586987c6 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -22,17 +22,19 @@ class GelbooruBase(): basecategory = "booru" root = "https://gelbooru.com" - def _api_request(self, params): + def _api_request(self, params, key="post"): + if "s" not in params: + params["s"] = "post" params["api_key"] = self.api_key params["user_id"] = self.user_id - url = self.root + "/index.php?page=dapi&s=post&q=index&json=1" + url = self.root + "/index.php?page=dapi&q=index&json=1" data = self.request(url, params=params).json() - if "post" not in data: + if key not in data: return () - posts = data["post"] + posts = data[key] if not isinstance(posts, list): return (posts,) return posts @@ -56,9 +58,6 @@ class GelbooruBase(): params["tags"] = "{} id:<{}".format(self.tags, post["id"]) def _pagination_html(self, params): - if self.cookiedomain and not self._check_cookies(self.cookienames): - self.log.warning("no 'user_id' or 'pass_hash' cookies set") - url = self.root + "/index.php" params["pid"] = self.page_start * self.per_page @@ -162,10 +161,35 @@ class GelbooruPoolExtractor(GelbooruBase, class GelbooruFavoriteExtractor(GelbooruBase, gelbooru_v02.GelbooruV02FavoriteExtractor): """Extractor for gelbooru favorites""" + per_page = 100 pattern = BASE_PATTERN + r"page=favorites&s=view&id=(\d+)" - cookiedomain = "gelbooru.com" - cookienames = ("user_id", "pass_hash") - test = ("https://gelbooru.com/index.php?page=favorites&s=view&id=12345",) + test = ("https://gelbooru.com/index.php?page=favorites&s=view&id=279415", { + "count": 3, + }) + + def posts(self): + # get number of favorites + params = { + "s" : "favorite", + "id" : self.favorite_id, + "limit": "1" + } + count = self._api_request(params, "@attributes")[0]["count"] + + # paginate over them in reverse + params["pid"] = count // self.per_page + params["limit"] = self.per_page + + while True: + favs = self._api_request(params, "favorite") + + favs.reverse() + for fav in favs: + yield from self._api_request({"id": fav["favorite"]}) + + params["pid"] -= 1 + if params["pid"] < 0: + return class GelbooruPostExtractor(GelbooruBase, From a1ca2404f97d5fc334bc0fc2a3c03c0046307dcb Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 16 Mar 2023 18:37:00 +0100 Subject: [PATCH 15/20] add 'globals' instead of overwriting the default (#3773) --- docs/configuration.rst | 14 ++++++-------- gallery_dl/__init__.py | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index fbb0416b..190e8981 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -4740,15 +4740,13 @@ Type Example * ``"~/.local/share/gdl-globals.py"`` * ``"gdl-globals"`` -Default - The ``GLOBALS`` dict in - `util.py <../gallery_dl/util.py>`__ Description - Path to or name of an - `importable <https://docs.python.org/3/reference/import.html>`__ - Python module whose namespace gets used as an alternative - |globals parameter|__ - for compiled Python expressions. + | Path to or name of an + `importable <https://docs.python.org/3/reference/import.html>`__ + Python module, + | whose namespace, + in addition to the ``GLOBALS`` dict in `util.py <../gallery_dl/util.py>`__, + gets used as |globals parameter|__ for compiled Python expressions. .. |globals parameter| replace:: ``globals`` parameter .. __: https://docs.python.org/3/library/functions.html#eval diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 116ca5dd..a430f131 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -120,7 +120,7 @@ def main(): # eval globals path = config.get((), "globals") if path: - util.GLOBALS = util.import_file(path).__dict__ + util.GLOBALS.update(util.import_file(path).__dict__) # loglevels output.configure_logging(args.loglevel) From 3dcabc97edc05b33e08b91064583bf4686746c17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 17 Mar 2023 19:25:53 +0100 Subject: [PATCH 16/20] [twitter] update API endpoints and parameters --- gallery_dl/extractor/twitter.py | 142 ++++++++++++++++++++++---------- 1 file changed, 98 insertions(+), 44 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 29b4ac35..a9c157bb 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -951,6 +951,10 @@ class TwitterAPI(): self.extractor = extractor self.root = "https://api.twitter.com" + self._nsfw_warning = True + self._syndication = self.extractor.syndication + self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode + cookies = extractor.session.cookies cookiedomain = extractor.cookiedomain @@ -966,6 +970,7 @@ class TwitterAPI(): auth_token = cookies.get("auth_token", domain=cookiedomain) self.headers = { + "Accept": "*/*", "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu" "4FA33AGWWjCpTnA", @@ -1019,73 +1024,112 @@ class TwitterAPI(): "collab_control,vibe", } self.variables = { - "includePromotedContent": False, - "withSuperFollowsUserFields": True, - "withBirdwatchPivots": False, "withDownvotePerspective": False, "withReactionsMetadata": False, "withReactionsPerspective": False, - "withSuperFollowsTweetFields": True, - "withClientEventToken": False, - "withBirdwatchNotes": False, - "withVoice": True, - "withV2Timeline": False, - "__fs_interactive_text": False, - "__fs_dont_mention_me_view_api_enabled": False, } - - self._nsfw_warning = True - self._syndication = self.extractor.syndication - self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode + self.features = { + 
"responsive_web_twitter_blue_verified_badge_is_enabled": True, + "responsive_web_graphql_exclude_directive_enabled": True, + "verified_phone_label_enabled": False, + "responsive_web_graphql_skip_user_profile_" + "image_extensions_enabled": False, + "responsive_web_graphql_timeline_navigation_enabled": True, + } + self.features_pagination = { + "responsive_web_twitter_blue_verified_badge_is_enabled": True, + "responsive_web_graphql_exclude_directive_enabled": True, + "verified_phone_label_enabled": False, + "responsive_web_graphql_timeline_navigation_enabled": True, + "responsive_web_graphql_skip_user_profile_" + "image_extensions_enabled": False, + "tweetypie_unmention_optimization_enabled": True, + "vibe_api_enabled": True, + "responsive_web_edit_tweet_api_enabled": True, + "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True, + "view_counts_everywhere_api_enabled": True, + "longform_notetweets_consumption_enabled": True, + "tweet_awards_web_tipping_enabled": False, + "freedom_of_speech_not_reach_fetch_enabled": False, + "standardized_nudges_misinfo": True, + "tweet_with_visibility_results_prefer_gql_" + "limited_actions_policy_enabled": False, + "interactive_text_enabled": True, + "responsive_web_text_conversations_enabled": False, + "longform_notetweets_richtext_consumption_enabled": False, + "responsive_web_enhance_cards_enabled": False, + } def tweet_detail(self, tweet_id): - endpoint = "/graphql/ItejhtHVxU7ksltgMmyaLA/TweetDetail" + endpoint = "/graphql/zXaXQgfyR4GxE21uwYQSyA/TweetDetail" variables = { "focalTweetId": tweet_id, + "referrer": "profile", "with_rux_injections": False, + "includePromotedContent": True, "withCommunity": True, "withQuickPromoteEligibilityTweetFields": True, "withBirdwatchNotes": False, + "withSuperFollowsUserFields": True, + "withSuperFollowsTweetFields": True, + "withVoice": True, + "withV2Timeline": True, } return self._pagination_tweets( - endpoint, variables, ("threaded_conversation_with_injections",)) + endpoint, variables, ("threaded_conversation_with_injections_v2",)) def user_tweets(self, screen_name): - endpoint = "/graphql/WZT7sCTrLvSOaWOXLDsWbQ/UserTweets" + endpoint = "/graphql/9rys0A7w1EyqVd2ME0QCJg/UserTweets" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, + "includePromotedContent": True, "withQuickPromoteEligibilityTweetFields": True, + "withVoice": True, + "withV2Timeline": True, } return self._pagination_tweets(endpoint, variables) def user_tweets_and_replies(self, screen_name): - endpoint = "/graphql/t4wEKVulW4Mbv1P0kgxTEw/UserTweetsAndReplies" + endpoint = "/graphql/ehMCHF3Mkgjsfz_aImqOsg/UserTweetsAndReplies" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, + "includePromotedContent": True, "withCommunity": True, + "withVoice": True, + "withV2Timeline": True, } return self._pagination_tweets(endpoint, variables) def user_media(self, screen_name): - endpoint = "/graphql/nRybED9kRbN-TOWioHq1ng/UserMedia" + endpoint = "/graphql/MA_EP2a21zpzNWKRkaPBMg/UserMedia" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, + "includePromotedContent": False, + "withClientEventToken": False, + "withBirdwatchNotes": False, + "withVoice": True, + "withV2Timeline": True, } return self._pagination_tweets(endpoint, variables) def user_likes(self, screen_name): - endpoint = "/graphql/9MSTt44HoGjVFSg_u3rHDw/Likes" + endpoint = "/graphql/XbHBYpgURwtklXj8NNxTDw/Likes" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, + 
"includePromotedContent": False, + "withClientEventToken": False, + "withBirdwatchNotes": False, + "withVoice": True, + "withV2Timeline": True, } return self._pagination_tweets(endpoint, variables) def user_bookmarks(self): - endpoint = "/graphql/uKP9v_I31k0_VSBmlpq2Xg/Bookmarks" + endpoint = "/graphql/Xq0wQSWHlcfnXARLJGqTxg/Bookmarks" variables = { "count": 100, } @@ -1093,7 +1137,7 @@ class TwitterAPI(): endpoint, variables, ("bookmark_timeline", "timeline"), False) def list_latest_tweets_timeline(self, list_id): - endpoint = "/graphql/z3l-EHlx-fyg8OvGO4JN8A/ListLatestTweetsTimeline" + endpoint = "/graphql/FDI9EiIp54KxEOWGiv3B4A/ListLatestTweetsTimeline" variables = { "listId": list_id, "count": 100, @@ -1128,18 +1172,21 @@ class TwitterAPI(): ["twitter_objects"]["live_events"][event_id]) def list_by_rest_id(self, list_id): - endpoint = "/graphql/BWEhzAk7k8TwbU4lKH2dpw/ListByRestId" - params = {"variables": self._json_dumps({ - "listId": list_id, - "withSuperFollowsUserFields": True, - })} + endpoint = "/graphql/KlGpwq5CAt9tCfHkV2mwYQ/ListByRestId" + params = { + "variables": self._json_dumps({ + "listId": list_id, + "withSuperFollowsUserFields": True, + }), + "features": self._json_dumps(self.features), + } try: return self._call(endpoint, params)["data"]["list"] except KeyError: raise exception.NotFoundError("list") def list_members(self, list_id): - endpoint = "/graphql/snESM0DPs3c7M1SBm4rvVw/ListMembers" + endpoint = "/graphql/XsAJX17RLgLYU8GALIWg2g/ListMembers" variables = { "listId": list_id, "count": 100, @@ -1149,29 +1196,34 @@ class TwitterAPI(): endpoint, variables, ("list", "members_timeline", "timeline")) def user_following(self, screen_name): - endpoint = "/graphql/mIwX8GogcobVlRwlgpHNYA/Following" + endpoint = "/graphql/vTZwBbd_gz6aI8v6Wze21A/Following" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, + "includePromotedContent": False, } return self._pagination_users(endpoint, variables) def user_by_rest_id(self, rest_id): - endpoint = "/graphql/I5nvpI91ljifos1Y3Lltyg/UserByRestId" - params = {"variables": self._json_dumps({ - "userId": rest_id, - "withSafetyModeUserFields": True, - "withSuperFollowsUserFields": True, - })} + endpoint = "/graphql/QPSxc9lxrmrwnBzYkJI8eA/UserByRestId" + params = { + "variables": self._json_dumps({ + "userId": rest_id, + "withSafetyModeUserFields": True, + }), + "features": self._json_dumps(self.features), + } return self._call(endpoint, params)["data"]["user"]["result"] def user_by_screen_name(self, screen_name): - endpoint = "/graphql/7mjxD3-C6BxitPMVQ6w0-Q/UserByScreenName" - params = {"variables": self._json_dumps({ - "screen_name": screen_name, - "withSafetyModeUserFields": True, - "withSuperFollowsUserFields": True, - })} + endpoint = "/graphql/nZjSkpOpSL5rWyIVdsKeLA/UserByScreenName" + params = { + "variables": self._json_dumps({ + "screen_name": screen_name, + "withSafetyModeUserFields": True, + }), + "features": self._json_dumps(self.features), + } return self._call(endpoint, params)["data"]["user"]["result"] def _user_id_by_screen_name(self, screen_name): @@ -1344,12 +1396,13 @@ class TwitterAPI(): pinned_tweet = extr.pinned while True: - params = {"variables": self._json_dumps(variables)} + params = {"variables": self._json_dumps(variables), + "features" : self._json_dumps(self.features_pagination)} data = self._call(endpoint, params)["data"] try: if path is None: - instructions = (data["user"]["result"]["timeline"] + instructions = (data["user"]["result"]["timeline_v2"] ["timeline"]["instructions"]) 
else: instructions = data @@ -1490,7 +1543,8 @@ class TwitterAPI(): while True: cursor = entry = stop = None - params = {"variables": self._json_dumps(variables)} + params = {"variables": self._json_dumps(variables), + "features" : self._json_dumps(self.features_pagination)} data = self._call(endpoint, params)["data"] try: From b68094d3267c20632f15294e4b7a85bb1de492ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 17 Mar 2023 19:36:07 +0100 Subject: [PATCH 17/20] [twitter] support 'note_tweet's --- gallery_dl/extractor/twitter.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index a9c157bb..0465d806 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -248,11 +248,15 @@ class TwitterExtractor(Extractor): author = tweet["user"] author = self._transform_user(author) + if "note_tweet" in tweet: + note = tweet["note_tweet"]["note_tweet_results"]["result"] + else: + note = None + if "legacy" in tweet: tweet = tweet["legacy"] tget = tweet.get - entities = tweet["entities"] tdata = { "tweet_id" : text.parse_int(tweet["id_str"]), "retweet_id" : text.parse_int( @@ -272,6 +276,8 @@ class TwitterExtractor(Extractor): "retweet_count" : tget("retweet_count"), } + entities = note["entity_set"] if note else tweet["entities"] + hashtags = entities.get("hashtags") if hashtags: tdata["hashtags"] = [t["text"] for t in hashtags] @@ -284,7 +290,8 @@ class TwitterExtractor(Extractor): "nick": u["name"], } for u in mentions] - content = text.unescape(tget("full_text") or tget("text") or "") + content = text.unescape( + note["text"] if note else tget("full_text") or tget("text") or "") urls = entities.get("urls") if urls: for url in urls: @@ -803,6 +810,23 @@ class TwitterTweetExtractor(TwitterExtractor): r"\?format=(jpg|png)&name=orig$", "range": "1-2", }), + # note tweet with long 'content' + ("https://twitter.com/i/web/status/1629193457112686592", { + "keyword": { + "content": """\ +BREAKING - DEADLY LIES: Independent researchers at Texas A&M University have \ +just contradicted federal government regulators, saying that toxic air \ +pollutants in East Palestine, Ohio, could pose long-term risks. \n\nThe \ +Washington Post writes, "Three weeks after the toxic train derailment in \ +Ohio, an analysis of Environmental Protection Agency data has found nine air \ +pollutants at levels that could raise long-term health concerns in and around \ +East Palestine, according to an independent analysis. \n\n\"The analysis by \ +Texas A&M University seems to contradict statements by state and federal \ +regulators that air near the crash site is completely safe, despite residents \ +complaining about rashes, breathing problems and other health effects." 
\ +Your reaction.""", + }, + }), ) def __init__(self, match): From 2bb937014f1cf7b08409eafe9fc13d124fec665b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 17 Mar 2023 20:54:35 +0100 Subject: [PATCH 18/20] [twitter] fall back to legacy /media endpoint when not logged in --- gallery_dl/extractor/twitter.py | 37 ++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 0465d806..43b39c59 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -993,6 +993,9 @@ class TwitterAPI(): auth_token = cookies.get("auth_token", domain=cookiedomain) + if not auth_token: + self.user_media = self.user_media_legacy + self.headers = { "Accept": "*/*", "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" @@ -1139,6 +1142,26 @@ class TwitterAPI(): } return self._pagination_tweets(endpoint, variables) + def user_media_legacy(self, screen_name): + endpoint = "/graphql/nRybED9kRbN-TOWioHq1ng/UserMedia" + variables = { + "userId": self._user_id_by_screen_name(screen_name), + "count": 100, + "includePromotedContent": False, + "withSuperFollowsUserFields": True, + "withBirdwatchPivots": False, + "withSuperFollowsTweetFields": True, + "withClientEventToken": False, + "withBirdwatchNotes": False, + "withVoice": True, + "withV2Timeline": False, + "__fs_interactive_text": False, + "__fs_dont_mention_me_view_api_enabled": False, + } + return self._pagination_tweets( + endpoint, variables, ("user", "result", "timeline", "timeline"), + features=False) + def user_likes(self, screen_name): endpoint = "/graphql/XbHBYpgURwtklXj8NNxTDw/Likes" variables = { @@ -1413,15 +1436,18 @@ class TwitterAPI(): params["cursor"] = cursor def _pagination_tweets(self, endpoint, variables, - path=None, stop_tweets=True): + path=None, stop_tweets=True, features=True): extr = self.extractor variables.update(self.variables) original_retweets = (extr.retweets == "original") pinned_tweet = extr.pinned + params = {"variables": None} + if features: + params["features"] = self._json_dumps(self.features_pagination) + while True: - params = {"variables": self._json_dumps(variables), - "features" : self._json_dumps(self.features_pagination)} + params["variables"] = self._json_dumps(variables) data = self._call(endpoint, params)["data"] try: @@ -1564,11 +1590,12 @@ class TwitterAPI(): def _pagination_users(self, endpoint, variables, path=None): variables.update(self.variables) + params = {"variables": None, + "features" : self._json_dumps(self.features_pagination)} while True: cursor = entry = stop = None - params = {"variables": self._json_dumps(variables), - "features" : self._json_dumps(self.features_pagination)} + params["variables"] = self._json_dumps(variables) data = self._call(endpoint, params)["data"] try: From 00f0233b2890ddf68bc9887aa5714c838cb12203 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 17 Mar 2023 23:16:52 +0100 Subject: [PATCH 19/20] [postprocessor:metadata] add 'skip' option (#3786) --- docs/configuration.rst | 10 ++++++++ gallery_dl/postprocessor/metadata.py | 4 ++++ test/test_postprocessor.py | 35 ++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index 190e8981..e2390c06 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -4411,6 +4411,16 @@ Description i.e. 
fields whose name starts with an underscore. +metadata.skip +------------- +Type + ``bool`` +Default + ``false`` +Description + Do not overwrite already existing files. + + metadata.archive ---------------- Type diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index 9667a413..714f4fef 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -87,6 +87,7 @@ class MetadataPP(PostProcessor): self.omode = options.get("open", omode) self.encoding = options.get("encoding", "utf-8") self.private = options.get("private", False) + self.skip = options.get("skip", False) def run(self, pathfmt): archive = self.archive @@ -96,6 +97,9 @@ class MetadataPP(PostProcessor): directory = self._directory(pathfmt) path = directory + self._filename(pathfmt) + if self.skip and os.path.exists(path): + return + try: with open(path, self.omode, encoding=self.encoding) as fp: self.write(fp, pathfmt.kwdict) diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index 650bf596..c78d7b03 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -428,11 +428,46 @@ class MetadataTest(BasePostprocessorTest): self.assertNotIn("baz", pdict["bar"]) self.assertEqual(kwdict["bar"], pdict["bar"]) + # no errors for deleted/undefined fields self._trigger() self.assertNotIn("foo", pdict) self.assertNotIn("baz", pdict["bar"]) self.assertEqual(kwdict["bar"], pdict["bar"]) + def test_metadata_option_skip(self): + self._create({"skip": True}) + + with patch("builtins.open", mock_open()) as m, \ + patch("os.path.exists") as e: + e.return_value = True + self._trigger() + + self.assertTrue(e.called) + self.assertTrue(not m.called) + self.assertTrue(not len(self._output(m))) + + with patch("builtins.open", mock_open()) as m, \ + patch("os.path.exists") as e: + e.return_value = False + self._trigger() + + self.assertTrue(e.called) + self.assertTrue(m.called) + self.assertGreater(len(self._output(m)), 0) + + path = self.pathfmt.realdirectory + "file.ext.json" + m.assert_called_once_with(path, "w", encoding="utf-8") + + def test_metadata_option_skip_false(self): + self._create({"skip": False}) + + with patch("builtins.open", mock_open()) as m, \ + patch("os.path.exists") as e: + self._trigger() + + self.assertTrue(not e.called) + self.assertTrue(m.called) + @staticmethod def _output(mock): return "".join( From 72f1f16eb2b360b4ca592116f10e66fc21ccc093 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 18 Mar 2023 15:19:25 +0100 Subject: [PATCH 20/20] [weibo] support 'mix_media_info' entries (#3793) --- gallery_dl/extractor/weibo.py | 44 ++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 68bd1366..388ee035 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -79,6 +79,18 @@ class WeiboExtractor(Extractor): def _extract_status(self, status, files): append = files.append + if "mix_media_info" in status: + for item in status["mix_media_info"]["items"]: + type = item.get("type") + if type == "video": + if self.videos: + append(self._extract_video(item["data"]["media_info"])) + elif type == "pic": + append(item["data"]["largest"].copy()) + else: + self.log.warning("Unknown media type '%s'", type) + return + pic_ids = status.get("pic_ids") if pic_ids: pics = status["pic_infos"] @@ -100,18 +112,20 @@ class WeiboExtractor(Extractor): else: 
append(pic["largest"].copy()) - if "page_info" in status and self.videos: - try: - media = max(status["page_info"]["media_info"]["playback_list"], - key=lambda m: m["meta"]["quality_index"]) - except KeyError: - pass - except ValueError: - info = status["page_info"]["media_info"] - append({"url": (info.get("stream_url_hd") or - info["stream_url"])}) - else: - append(media["play_info"].copy()) + if "page_info" in status: + info = status["page_info"] + if "media_info" in info and self.videos: + append(self._extract_video(info["media_info"])) + + def _extract_video(self, info): + try: + media = max(info["playback_list"], + key=lambda m: m["meta"]["quality_index"]) + except Exception: + return {"url": (info.get("stream_url_hd") or + info["stream_url"])} + else: + return media["play_info"].copy() def _status_by_id(self, status_id): url = "{}/ajax/statuses/show?id={}".format(self.root, status_id) @@ -380,7 +394,7 @@ class WeiboStatusExtractor(WeiboExtractor): }), # missing 'playback_list' (#2792) ("https://weibo.com/2909128931/4409545658754086", { - "count": 9, + "count": 10, }), # empty 'playback_list' (#3301) ("https://weibo.com/1501933722/4142890299009993", { @@ -389,6 +403,10 @@ class WeiboStatusExtractor(WeiboExtractor): r"=0&ps=1CwnkDw1GXwCQx.+&KID=unistore,video", "count": 1, }), + # mix_media_info (#3793) + ("https://weibo.com/2427303621/MxojLlLgQ", { + "count": 9, + }), ("https://m.weibo.cn/status/4339748116375525"), ("https://m.weibo.cn/5746766133/4339748116375525"), )