From 50e2ebaff051fab31fd78e9b9842a5f9cc3b9510 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Sep 2023 20:58:38 +0200 Subject: [PATCH 001/344] [danbooru] support 'donmai.moe' URLs --- gallery_dl/extractor/danbooru.py | 5 +++-- test/results/danbooru.py | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index f69b0e25..56d81e5b 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -150,7 +150,8 @@ class DanbooruExtractor(BaseExtractor): BASE_PATTERN = DanbooruExtractor.update({ "danbooru": { "root": None, - "pattern": r"(?:danbooru|hijiribe|sonohara|safebooru)\.donmai\.us", + "pattern": r"(?:(?:danbooru|hijiribe|sonohara|safebooru)\.donmai\.us" + r"|donmai\.moe)", }, "atfbooru": { "root": "https://booru.allthefallen.moe", @@ -158,7 +159,7 @@ BASE_PATTERN = DanbooruExtractor.update({ }, "aibooru": { "root": None, - "pattern": r"(?:safe.)?aibooru\.online", + "pattern": r"(?:safe\.)?aibooru\.online", }, "booruvar": { "root": "https://booru.borvar.art", diff --git a/test/results/danbooru.py b/test/results/danbooru.py index c64c693e..1bebf1ca 100644 --- a/test/results/danbooru.py +++ b/test/results/danbooru.py @@ -50,6 +50,12 @@ __tests__ = ( "#class" : danbooru.DanbooruTagExtractor, }, +{ + "#url" : "https://donmai.moe/posts?tags=bonocho", + "#category": ("Danbooru", "danbooru", "tag"), + "#class" : danbooru.DanbooruTagExtractor, +}, + { "#url" : "https://danbooru.donmai.us/pools/7659", "#category": ("Danbooru", "danbooru", "pool"), From 1d2fd0b831583855a331859fbd4c1029206ca96a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 23 Sep 2023 00:05:26 +0200 Subject: [PATCH 002/344] [pillowfort] extract 'b2_lg_url' media (#4570) --- gallery_dl/extractor/pillowfort.py | 2 +- test/results/pillowfort.py | 81 ++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 1 deletion(-) diff --git 
a/gallery_dl/extractor/pillowfort.py b/gallery_dl/extractor/pillowfort.py index 8cfefa97..65bd6b68 100644 --- a/gallery_dl/extractor/pillowfort.py +++ b/gallery_dl/extractor/pillowfort.py @@ -56,7 +56,7 @@ class PillowfortExtractor(Extractor): post["num"] = 0 for file in files: - url = file["url"] + url = file["url"] or file.get("b2_lg_url") if not url: continue diff --git a/test/results/pillowfort.py b/test/results/pillowfort.py index a7c73beb..b07a7dee 100644 --- a/test/results/pillowfort.py +++ b/test/results/pillowfort.py @@ -63,8 +63,88 @@ __tests__ = ( "username" : "Staff", }, +{ + "#url" : "https://www.pillowfort.social/posts/1124584", + "#comment" : "'b2_lg_url' media URL (#4570)", + "#category": ("", "pillowfort", "post"), + "#class" : pillowfort.PillowfortPostExtractor, + "#pattern" : r"https://img2\.pillowfort\.social/posts/c8e834bc09e6_Brandee\.png", + "#count" : 1, + + "avatar_frame" : None, + "avatar_id" : None, + "avatar_url" : "https://img3.pillowfort.social/avatars/000/037/139/original/437.jpg?1545015697", + "b2_lg_url" : "https://img2.pillowfort.social/posts/c8e834bc09e6_Brandee.png", + "b2_sm_url" : "https://img2.pillowfort.social/posts/c8e834bc09e6_Brandee_small.png", + "cached_tag_list": "art, digital art, mermaid, mermaids, underwater, seaweed, illustration, speed paint", + "col" : 0, + "comm_screening_status": "not_applicable", + "commentable" : True, + "comments_count": 0, + "community_id" : None, + "concealed_comment_warning": None, + "content" : "

Sea Bed

", + "created_at" : r"re:2020-02-.+", + "currentuser_default_avatar_url": None, + "currentuser_multi_avi": None, + "date" : "dt:2020-02-29 17:09:03", + "deleted" : None, + "deleted_at" : None, + "deleted_by_mod": None, + "deleted_for_flag_id": None, + "embed_code" : None, + "extension" : "png", + "filename" : "Brandee", + "hash" : "c8e834bc09e6", + "id" : 720167, + "last_activity" : r"re:2020-02-.+", + "last_activity_elapsed": r"re:\d+ months", + "last_edited_at": None, + "likes_count" : 8, + "media_type" : "picture", + "nsfw" : False, + "num" : 1, + "original_post_id": None, + "original_post_user_id": None, + "pic_row_last" : 1, + "picture_content_type": None, + "picture_file_name": None, + "picture_file_size": None, + "picture_updated_at": None, + "post_id" : 1124584, + "post_type" : "picture", + "privacy" : "public", + "reblog_copy_info": [], + "rebloggable" : True, + "reblogged_from_post_id": None, + "reblogged_from_user_id": None, + "reblogs_count" : int, + "row" : 1, + "small_image_url": None, + "tag_list" : None, + "tags" : [ + "art", + "digital art", + "mermaid", + "mermaids", + "underwater", + "seaweed", + "illustration", + "speed paint" + ], + "time_elapsed" : r"re:\d+ months", + "timestamp" : str, + "title" : "", + "updated_at" : r"re:2020-02-.+", + "url" : "", + "user_concealed": None, + "user_id" : 37201, + "username" : "Maclanahan", +}, + { "#url" : "https://www.pillowfort.social/posts/1557500", + "#comment" : "'external' option", "#category": ("", "pillowfort", "post"), "#class" : pillowfort.PillowfortPostExtractor, "#options" : { @@ -76,6 +156,7 @@ __tests__ = ( { "#url" : "https://www.pillowfort.social/posts/1672518", + "#comment" : "'inline' option", "#category": ("", "pillowfort", "post"), "#class" : pillowfort.PillowfortPostExtractor, "#options" : {"inline": True}, From 1e31fce37b116b845ea6e39b26871d9538f45b7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 23 Sep 2023 00:11:01 +0200 Subject: [PATCH 003/344] 
[pillowfort] support '/tagged/' URLs (#4570) --- gallery_dl/extractor/pillowfort.py | 2 +- test/results/pillowfort.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/pillowfort.py b/gallery_dl/extractor/pillowfort.py index 65bd6b68..ff591fb1 100644 --- a/gallery_dl/extractor/pillowfort.py +++ b/gallery_dl/extractor/pillowfort.py @@ -132,7 +132,7 @@ class PillowfortPostExtractor(PillowfortExtractor): class PillowfortUserExtractor(PillowfortExtractor): """Extractor for all posts of a pillowfort user""" subcategory = "user" - pattern = BASE_PATTERN + r"/(?!posts/)([^/?#]+)" + pattern = BASE_PATTERN + r"/(?!posts/)([^/?#]+(?:/tagged/[^/?#]+)?)" example = "https://www.pillowfort.social/USER" def posts(self): diff --git a/test/results/pillowfort.py b/test/results/pillowfort.py index b07a7dee..fea09746 100644 --- a/test/results/pillowfort.py +++ b/test/results/pillowfort.py @@ -172,4 +172,12 @@ __tests__ = ( "#count" : 15, }, +{ + "#url" : "https://www.pillowfort.social/Staff/tagged/funding", + "#category": ("", "pillowfort", "user"), + "#class" : pillowfort.PillowfortUserExtractor, + "#pattern" : r"https://img\d+\.pillowfort\.social/posts/", + "#count" : 6, +}, + ) From d7aac9fc06d0bb1338d8d8a8f6cd2d080fd2489e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 23 Sep 2023 00:13:05 +0200 Subject: [PATCH 004/344] [reddit] ignore '/message/compose' URLs (#4482) --- gallery_dl/extractor/reddit.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 4a15eff3..7af8d322 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -115,6 +115,9 @@ class RedditExtractor(Extractor): continue if url[0] == "/": url = "https://www.reddit.com" + url + if url.startswith( + "https://www.reddit.com/message/compose"): + continue match = match_submission(url) if match: From 642998504db8511c773b10ce22e4904b91999428 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 23 Sep 2023 17:54:53 +0200 Subject: [PATCH 005/344] [tests] support 'range()' for #count and metadata checks --- test/test_results.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/test/test_results.py b/test/test_results.py index 512ca219..f9ec9aeb 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -48,6 +48,13 @@ class TestExtractorResults(unittest.TestCase): for url, exc in cls._skipped: print('- {} ("{}")'.format(url, exc)) + def assertRange(self, value, range, msg=None): + if range.step > 1: + self.assertIn(value, range, msg=msg) + else: + self.assertLessEqual(value, range.stop, msg=msg) + self.assertGreaterEqual(value, range.start, msg=msg) + def _run_test(self, result): result.pop("#comment", None) only_matching = (len(result) <= 3) @@ -129,12 +136,15 @@ class TestExtractorResults(unittest.TestCase): if "#count" in result: count = result["#count"] + len_urls = len(tjob.url_list) if isinstance(count, str): self.assertRegex(count, r"^ *(==|!=|<|<=|>|>=) *\d+ *$") - expr = "{} {}".format(len(tjob.url_list), count) + expr = "{} {}".format(len_urls, count) self.assertTrue(eval(expr), msg=expr) + elif isinstance(count, range): + self.assertRange(len_urls, count) else: # assume integer - self.assertEqual(len(tjob.url_list), count) + self.assertEqual(len_urls, count) if "#pattern" in result: self.assertGreater(len(tjob.url_list), 0) @@ -159,6 +169,8 @@ class TestExtractorResults(unittest.TestCase): self._test_kwdict(value, test) elif isinstance(test, type): self.assertIsInstance(value, test, msg=key) + elif isinstance(test, range): + self.assertRange(value, test, msg=key) elif isinstance(test, list): subtest = False for idx, item in enumerate(test): From dbd820d7c5922d57c6f9dab20a365bbe6bd4d693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 23 Sep 2023 19:26:54 +0200 Subject: [PATCH 006/344] [tests] allow checking for exact 
URL results --- test/results/imgur.py | 22 ++++++++++++++++++++++ test/test_results.py | 6 ++++++ 2 files changed, 28 insertions(+) diff --git a/test/results/imgur.py b/test/results/imgur.py index 4fedc600..9024bad2 100644 --- a/test/results/imgur.py +++ b/test/results/imgur.py @@ -16,6 +16,7 @@ __tests__ = ( "#class" : imgur.ImgurImageExtractor, "#sha1_url" : "6f2dcfb86815bdd72808c313e5f715610bc7b9b2", "#sha1_content": "0c8768055e4e20e7c7259608b67799171b691140", + "#urls" : "https://i.imgur.com/21yMxCS.png", "account_id" : 0, "comment_count" : int, @@ -159,6 +160,27 @@ __tests__ = ( "#category": ("", "imgur", "album"), "#class" : imgur.ImgurAlbumExtractor, "#sha1_url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563", + "#urls" : ( + "https://i.imgur.com/693j2Kr.jpg", + "https://i.imgur.com/ZNalkAC.jpg", + "https://i.imgur.com/lMox9Ek.jpg", + "https://i.imgur.com/6PryGOv.jpg", + "https://i.imgur.com/ecasnH2.jpg", + "https://i.imgur.com/NlJDmFG.jpg", + "https://i.imgur.com/aCwKs8S.jpg", + "https://i.imgur.com/Oz4rpxo.jpg", + "https://i.imgur.com/hE93Xsn.jpg", + "https://i.imgur.com/A5uBLSx.jpg", + "https://i.imgur.com/zZghWiD.jpg", + "https://i.imgur.com/ALV4fYV.jpg", + "https://i.imgur.com/FDd90t9.jpg", + "https://i.imgur.com/Txw37NO.jpg", + "https://i.imgur.com/DcLw7Cl.jpg", + "https://i.imgur.com/a4VChy8.jpg", + "https://i.imgur.com/auCwCig.jpg", + "https://i.imgur.com/Z8VihIb.jpg", + "https://i.imgur.com/6WDRFne.jpg", + ), "album" : { "account_id" : 0, diff --git a/test/test_results.py b/test/test_results.py index f9ec9aeb..0eec92a2 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -151,6 +151,12 @@ class TestExtractorResults(unittest.TestCase): for url in tjob.url_list: self.assertRegex(url, result["#pattern"]) + if "#urls" in result: + expected = result["#urls"] + if isinstance(expected, str): + expected = (expected,) + self.assertSequenceEqual(tjob.url_list, expected) + metadata = {k: v for k, v in result.items() if k[0] != "#"} if metadata: for 
kwdict in tjob.kwdict_list: From 20d1683c47af81f024b8d8bc09894f54e2a393b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 24 Sep 2023 14:45:34 +0200 Subject: [PATCH 007/344] [deviantart] fix JWT replacement (#293, #4548, #4563) And again, a huge thank you to @Ironchest337 for discovering this. --- docs/configuration.rst | 4 +--- gallery_dl/extractor/deviantart.py | 7 +++++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 369407b8..9260c916 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1354,14 +1354,12 @@ extractor.deviantart.jwt Type ``bool`` Default - ``false`` + ``true`` Description Update `JSON Web Tokens `__ (the ``token`` URL parameter) of otherwise non-downloadable, low-resolution images to be able to download them in full resolution. - Note: This got patched by DeviantArt on 2023-09-19 and no longer works. - extractor.deviantart.mature --------------------------- diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 6ceb9371..9c0814cf 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -42,7 +42,7 @@ class DeviantartExtractor(Extractor): self.offset = 0 def _init(self): - self.jwt = self.config("jwt", False) + self.jwt = self.config("jwt", True) self.flat = self.config("flat", True) self.extra = self.config("extra", False) self.original = self.config("original", True) @@ -355,6 +355,9 @@ class DeviantartExtractor(Extractor): if not sep: return + # 'images-wixmp' returns 401 errors, but just 'wixmp' still works + url = url.replace("//images-wixmp", "//wixmp", 1) + # header = b'{"typ":"JWT","alg":"none"}' payload = ( b'{"sub":"urn:app:","iss":"urn:app:","obj":[[{"path":"/f/' + @@ -367,7 +370,7 @@ class DeviantartExtractor(Extractor): "{}?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.{}.".format( url, # base64 of 'header' is precomputed as 'eyJ0eX...' 
- # binascii.a2b_base64(header).rstrip(b"=\n").decode(), + # binascii.b2a_base64(header).rstrip(b"=\n").decode(), binascii.b2a_base64(payload).rstrip(b"=\n").decode()) ) From 0c5d8b1505bc0a165ee5077e62f3a113544c9c92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 24 Sep 2023 17:36:05 +0200 Subject: [PATCH 008/344] [deviantart] re-add 'quality' option and 'intermediary' transform --- docs/configuration.rst | 13 +++++++++++++ docs/gallery-dl.conf | 3 +++ gallery_dl/extractor/deviantart.py | 16 ++++++++++++++++ test/results/deviantart.py | 11 ++++++++++- 4 files changed, 42 insertions(+), 1 deletion(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 9260c916..2cb28981 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1427,6 +1427,19 @@ Description when a `refresh token `__ is provided. +extractor.deviantart.quality +---------------------------- +Type + ``integer`` +Default + ``100`` +Description + JPEG quality level of newer images for which + an original file download is not available. + + Note: Only has an effect when `deviantart.jwt `__ is disabled. 
+ + extractor.deviantart.refresh-token ---------------------------------- Type diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 6345ded7..2eac0a1c 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -75,6 +75,7 @@ "client-id": null, "client-secret": null, "refresh-token": null, + "auto-watch": false, "auto-unwatch": false, "comments": false, @@ -84,11 +85,13 @@ "group": true, "include": "gallery", "journals": "html", + "jwt": true, "mature": true, "metadata": false, "original": true, "pagination": "api", "public": true, + "quality": 100, "wait-min": 0 }, "e621": diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 9c0814cf..eb1ad302 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -45,6 +45,7 @@ class DeviantartExtractor(Extractor): self.jwt = self.config("jwt", True) self.flat = self.config("flat", True) self.extra = self.config("extra", False) + self.quality = self.config("quality", "100") self.original = self.config("original", True) self.comments = self.config("comments", False) @@ -59,6 +60,9 @@ class DeviantartExtractor(Extractor): else: self.unwatch = None + if self.quality: + self.quality = ",q_{}".format(self.quality) + if self.original != "image": self._update_content = self._update_content_default else: @@ -125,6 +129,18 @@ class DeviantartExtractor(Extractor): self._update_content(deviation, content) elif self.jwt: self._update_token(deviation, content) + elif content["src"].startswith("https://images-wixmp-"): + if deviation["index"] <= 790677560: + # https://github.com/r888888888/danbooru/issues/4069 + intermediary, count = re.subn( + r"(/f/[^/]+/[^/]+)/v\d+/.*", + r"/intermediary\1", content["src"], 1) + if count: + deviation["_fallback"] = (content["src"],) + content["src"] = intermediary + if self.quality: + content["src"] = re.sub( + r",q_\d+", self.quality, content["src"], 1) yield self.commit(deviation, content) diff --git 
a/test/results/deviantart.py b/test/results/deviantart.py index 876ccfeb..91c6a0d7 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -524,7 +524,16 @@ __tests__ = ( "#comment" : "wixmp URL rewrite", "#category": ("", "deviantart", "deviation"), "#class" : deviantart.DeviantartDeviationExtractor, - "#pattern" : r"https://images-wixmp-\w+\.wixmp\.com/f/[^/]+/[^.]+\.jpg\?token=", + "#pattern" : r"https://wixmp-\w+\.wixmp\.com/f/[^/]+/[^.]+\.jpg\?token=", +}, + +{ + "#url" : "https://www.deviantart.com/citizenfresh/art/Hverarond-789295466", + "#comment" : "wixmp URL rewrite /intermediary/", + "#category": ("", "deviantart", "deviation"), + "#class" : deviantart.DeviantartDeviationExtractor, + "#options" : {"jwt": False}, + "#pattern" : r"https://images-wixmp-\w+\.wixmp\.com/intermediary/f/[^/]+/[^.]+\.jpg", }, { From bb39779e1ae14f3dee5490237ce69ce7f1dbb74b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 24 Sep 2023 19:15:35 +0200 Subject: [PATCH 009/344] [deviantart] use private tokens for 'is_mature' posts (#4563) --- gallery_dl/extractor/deviantart.py | 38 +++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index eb1ad302..bf78033c 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -348,7 +348,11 @@ class DeviantartExtractor(Extractor): yield url, folder def _update_content_default(self, deviation, content): - public = False if "premium_folder_data" in deviation else None + if "premium_folder_data" in deviation or deviation.get("is_mature"): + public = False + else: + public = None + data = self.api.deviation_download(deviation["deviationid"], public) content.update(data) deviation["is_original"] = True @@ -382,6 +386,7 @@ class DeviantartExtractor(Extractor): ) deviation["_fallback"] = (content["src"],) + deviation["is_original"] = True content["src"] = ( 
"{}?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.{}.".format( url, @@ -1077,7 +1082,12 @@ class DeviantartOAuthAPI(): def deviation(self, deviation_id, public=None): """Query and return info about a single Deviation""" endpoint = "/deviation/" + deviation_id + deviation = self._call(endpoint, public=public) + if deviation.get("is_mature") and public is None and \ + self.refresh_token_key: + deviation = self._call(endpoint, public=False) + if self.metadata: self._metadata((deviation,)) if self.folders: @@ -1233,8 +1243,12 @@ class DeviantartOAuthAPI(): return data if not fatal and status != 429: return None - if data.get("error_description") == "User not found.": + + error = data.get("error_description") + if error == "User not found.": raise exception.NotFoundError("user or group") + if error == "Deviation not downloadable.": + raise exception.AuthorizationError() self.log.debug(response.text) msg = "API responded with {} {}".format( @@ -1258,6 +1272,17 @@ class DeviantartOAuthAPI(): self.log.error(msg) return data + def _switch_tokens(self, results, params): + if len(results) < params["limit"]: + return True + + if not self.extractor.jwt: + for item in results: + if item.get("is_mature"): + return True + + return False + def _pagination(self, endpoint, params, extend=True, public=None, unpack=False, key="results"): warn = True @@ -1276,7 +1301,7 @@ class DeviantartOAuthAPI(): results = [item["journal"] for item in results if "journal" in item] if extend: - if public and len(results) < params["limit"]: + if public and self._switch_tokens(results, params): if self.refresh_token_key: self.log.debug("Switching to private access token") public = False @@ -1284,9 +1309,10 @@ class DeviantartOAuthAPI(): elif data["has_more"] and warn: warn = False self.log.warning( - "Private deviations detected! Run 'gallery-dl " - "oauth:deviantart' and follow the instructions to " - "be able to access them.") + "Private or mature deviations detected! 
" + "Run 'gallery-dl oauth:deviantart' and follow the " + "instructions to be able to access them.") + # "statusid" cannot be used instead if results and "deviationid" in results[0]: if self.metadata: From 66613c3a323dd228531429bb4423c2d78cdbd935 Mon Sep 17 00:00:00 2001 From: HRXN Date: Mon, 25 Sep 2023 02:24:44 +0200 Subject: [PATCH 010/344] [reddit] ignore '/message/compose' URLs without www subdomain --- gallery_dl/extractor/reddit.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 7af8d322..aa8c6b81 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -115,8 +115,10 @@ class RedditExtractor(Extractor): continue if url[0] == "/": url = "https://www.reddit.com" + url - if url.startswith( - "https://www.reddit.com/message/compose"): + if (url.startswith( + "https://www.reddit.com/message/compose") or + url.startswith( + "https://reddit.com/message/compose")): continue match = match_submission(url) From ec91eeb7ef950cd50234ec1a63474c2ce244fc7b Mon Sep 17 00:00:00 2001 From: HRXN Date: Tue, 26 Sep 2023 14:00:29 +0200 Subject: [PATCH 011/344] Update gallery_dl/extractor/reddit.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mike Fährmann --- gallery_dl/extractor/reddit.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index aa8c6b81..85533126 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -115,10 +115,10 @@ class RedditExtractor(Extractor): continue if url[0] == "/": url = "https://www.reddit.com" + url - if (url.startswith( - "https://www.reddit.com/message/compose") or - url.startswith( - "https://reddit.com/message/compose")): + if url.startswith(( + "https://www.reddit.com/message/compose", + "https://reddit.com/message/compose", + )): continue match = 
match_submission(url) From 7150c4c76c6a64a9dc7a1ebf7137826bc9b73ee9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 26 Sep 2023 22:07:50 +0200 Subject: [PATCH 012/344] fix imports when using the gallery_dl directory as argument (#4581) --- gallery_dl/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/__main__.py b/gallery_dl/__main__.py index 441009fe..9832190d 100644 --- a/gallery_dl/__main__.py +++ b/gallery_dl/__main__.py @@ -9,7 +9,7 @@ import sys -if __package__ is None and not hasattr(sys, "frozen"): +if not __package__ and not hasattr(sys, "frozen"): import os.path path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.realpath(path)) From be17103e21fd1c58575bf66bd7a8ac651889354d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 28 Sep 2023 21:37:58 +0200 Subject: [PATCH 013/344] [regifs] support 'order' parameter for user URLs (#4583) --- gallery_dl/extractor/redgifs.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 55ba2028..e246405a 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -89,14 +89,20 @@ class RedgifsUserExtractor(RedgifsExtractor): """Extractor for redgifs user profiles""" subcategory = "user" directory_fmt = ("{category}", "{userName}") - pattern = r"(?:https?://)?(?:\w+\.)?redgifs\.com/users/([^/?#]+)/?$" + pattern = (r"(?:https?://)?(?:\w+\.)?redgifs\.com/users/([^/?#]+)/?" 
+ r"(?:\?([^#]+))?$") example = "https://www.redgifs.com/users/USER" + def __init__(self, match): + RedgifsExtractor.__init__(self, match) + self.query = match.group(2) + def metadata(self): return {"userName": self.key} def gifs(self): - return self.api.user(self.key) + order = text.parse_query(self.query).get("order") + return self.api.user(self.key, order or "new") class RedgifsCollectionExtractor(RedgifsExtractor): @@ -208,7 +214,7 @@ class RedgifsAPI(): endpoint = "/v2/gallery/" + gallery_id return self._call(endpoint) - def user(self, user, order="best"): + def user(self, user, order="new"): endpoint = "/v2/users/{}/search".format(user.lower()) params = {"order": order} return self._pagination(endpoint, params) From bd3f7a5bbc8da534e6e794a17d59702f9094942d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 28 Sep 2023 21:56:09 +0200 Subject: [PATCH 014/344] [tests] support one regex per URL for #pattern --- test/results/redgifs.py | 20 ++++++++++++++++++-- test/test_results.py | 9 +++++++-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/test/results/redgifs.py b/test/results/redgifs.py index 1e76e60a..669f2a23 100644 --- a/test/results/redgifs.py +++ b/test/results/redgifs.py @@ -9,15 +9,31 @@ from gallery_dl.extractor import redgifs __tests__ = ( { - "#url" : "https://www.redgifs.com/users/Natalifiction", + "#url" : "https://www.redgifs.com/users/mmj", "#category": ("", "redgifs", "user"), "#class" : redgifs.RedgifsUserExtractor, "#pattern" : r"https://\w+\.redgifs\.com/[\w-]+\.mp4", - "#count" : ">= 100", + "#count" : range(50, 60), +}, + +{ + "#url" : "https://www.redgifs.com/users/mmj?order=old", + "#comment" : "'order' URL parameter (#4583)", + "#category": ("", "redgifs", "user"), + "#class" : redgifs.RedgifsUserExtractor, + "#range" : "1-5", + "#patterns": ( + r"https://thumbs\d+\.redgifs\.com/ShoddyOilyHarlequinbug\.mp4", + r"https://thumbs\d+\.redgifs\.com/UnevenPrestigiousKilldeer\.mp4", + 
r"https://thumbs\d+\.redgifs\.com/EveryShockingFlickertailsquirrel\.mp4", + r"https://thumbs\d+\.redgifs\.com/NegativeWarlikeAmericancurl\.mp4", + r"https://thumbs\d+\.redgifs\.com/PopularTerribleFritillarybutterfly\.mp4", + ), }, { "#url" : "https://v3.redgifs.com/users/lamsinka89", + "#comment" : "'v3' subdomain (#3588, #3589)", "#category": ("", "redgifs", "user"), "#class" : redgifs.RedgifsUserExtractor, "#pattern" : r"https://\w+\.redgifs\.com/[\w-]+\.(mp4|jpg)", diff --git a/test/test_results.py b/test/test_results.py index 0eec92a2..483d6968 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -148,8 +148,13 @@ class TestExtractorResults(unittest.TestCase): if "#pattern" in result: self.assertGreater(len(tjob.url_list), 0) - for url in tjob.url_list: - self.assertRegex(url, result["#pattern"]) + pattern = result["#pattern"] + if isinstance(pattern, str): + for url in tjob.url_list: + self.assertRegex(url, pattern, msg="#pattern") + else: + for url, pat in zip(tjob.url_list, pattern): + self.assertRegex(url, pat, msg="#pattern") if "#urls" in result: expected = result["#urls"] From 4477808d1c563d4cbeeeb2838cc1f4940e8de386 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 30 Sep 2023 17:02:57 +0200 Subject: [PATCH 015/344] fix symlink resolution in __main__.py adapt ytdl order --- gallery_dl/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/__main__.py b/gallery_dl/__main__.py index 9832190d..db3a34e8 100644 --- a/gallery_dl/__main__.py +++ b/gallery_dl/__main__.py @@ -11,8 +11,8 @@ import sys if not __package__ and not hasattr(sys, "frozen"): import os.path - path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - sys.path.insert(0, os.path.realpath(path)) + path = os.path.realpath(os.path.abspath(__file__)) + sys.path.insert(0, os.path.dirname(os.path.dirname(path))) import gallery_dl From b92645cd370226d4bb1868b0847e525d42a0bddf Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 30 Sep 2023 18:05:12 +0200 Subject: [PATCH 016/344] [bunkr] fix extraction (#4514, #4532, #4529, #4540) --- gallery_dl/extractor/bunkr.py | 53 ++++++++++++++++++----------------- test/results/bunkr.py | 15 ++++++---- 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index dc48090a..5509f5a8 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -38,36 +38,37 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): page, "").partition(">")[2]) count, _, size = info[1].split(None, 2) - # files - cdn = None - files = [] - append = files.append - pos = page.index('class="grid-images') - for url in text.extract_iter(page, ' 2 else "", - "count" : len(files), + "count" : len(urls), } + + def _extract_files(self, urls): + for url in urls: + if url.startswith("/"): + try: + page = self.request(self.root + text.unescape(url)).text + if url[1] == "v": + url = text.extr(page, '-QjgneIQv\.png", + "#urls" : "https://i-burger.bunkr.ru/test-テスト-\"&>-QjgneIQv.png", "#sha1_content": "0c8768055e4e20e7c7259608b67799171b691140", "album_id" : "Lktg9Keq", @@ -29,7 +29,7 @@ __tests__ = ( "#comment" : "mp4 (#2239)", "#category": ("lolisafe", "bunkr", "album"), "#class" : bunkr.BunkrAlbumExtractor, - "#pattern" : r"https://media-files\.bunkr\.ru/_-RnHoW69L\.mp4", + "#urls" : "https://burger.bunkr.ru/_-RnHoW69L.mp4", "#sha1_content": "80e61d1dbc5896ae7ef9a28734c747b28b320471", }, @@ -38,14 +38,17 @@ __tests__ = ( "#comment" : "cdn4", "#category": ("lolisafe", "bunkr", "album"), "#class" : bunkr.BunkrAlbumExtractor, - "#pattern" : r"https://(cdn|media-files)4\.bunkr\.ru/", + "#urls" : ( + "https://media-files4.bunkr.ru/video-wFO9FtxG.mp4", + "https://i4.bunkr.ru/image-sZrQUeOx.jpg", + ), "#sha1_content": "da29aae371b7adc8c5ef8e6991b66b69823791e8", "album_id" : "iXTTc1o2", "album_name" : "test2", "album_size" : "691.1 KB", "count" : 2, - 
"description": "072022", + "description": "", "filename" : r"re:video-wFO9FtxG|image-sZrQUeOx", "id" : r"re:wFO9FtxG|sZrQUeOx", "name" : r"re:video|image", @@ -57,8 +60,8 @@ __tests__ = ( "#comment" : "cdn12 .ru TLD (#4147)", "#category": ("lolisafe", "bunkr", "album"), "#class" : bunkr.BunkrAlbumExtractor, - "#pattern" : r"https://(cdn12.bunkr.ru|media-files12.bunkr.la)/\w+", - "#count" : 8, + "#pattern" : r"https://(i-)?meatballs.bunkr.ru/\w+", + "#count" : 10, }, { From c7bd9925d93aa23d9c31f1834372162e7584ff34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 30 Sep 2023 21:00:55 +0200 Subject: [PATCH 017/344] [tests] use fallback URLs for content tests (#3163) --- test/results/tumblr.py | 11 +++++++++++ test/test_results.py | 17 +++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/test/results/tumblr.py b/test/results/tumblr.py index 99883578..01d8de73 100644 --- a/test/results/tumblr.py +++ b/test/results/tumblr.py @@ -219,10 +219,21 @@ __tests__ = ( "#comment" : "high-quality images (#1344)", "#category": ("", "tumblr", "post"), "#class" : tumblr.TumblrPostExtractor, + "#exception" : exception.NotFoundError, "#count" : 2, "#sha1_content": "6bc19a42787e46e1bba2ef4aeef5ca28fcd3cd34", }, +{ + "#url" : "https://k-eke.tumblr.com/post/185341184856", + "#comment" : "wrong extension returned by api (#3095)", + "#category": ("", "tumblr", "post"), + "#class" : tumblr.TumblrPostExtractor, + "#options" : {"retries": 0}, + "#urls" : "https://64.media.tumblr.com/5e9d760aba24c65beaf0e72de5aae4dd/tumblr_psj5yaqV871t1ig6no1_1280.gif", + "#sha1_content": "3508d894b6cc25e364d182a8e1ff370d706965fb", +}, + { "#url" : "https://mikf123.tumblr.com/image/689860196535762944", "#category": ("", "tumblr", "post"), diff --git a/test/test_results.py b/test/test_results.py index 483d6968..46c29290 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -222,6 +222,8 @@ class ResultJob(job.DownloadJob): if content: self.fileobj = 
TestPathfmt(self.content_hash) + else: + self._update_content = lambda url, kwdict: None self.format_directory = TestFormatter( "".join(self.extractor.directory_fmt)).format_map @@ -269,10 +271,17 @@ class ResultJob(job.DownloadJob): self.archive_hash.update(archive_id.encode()) def _update_content(self, url, kwdict): - if self.content: - scheme = url.partition(":")[0] - self.fileobj.kwdict = kwdict - self.get_downloader(scheme).download(url, self.fileobj) + self.fileobj.kwdict = kwdict + + downloader = self.get_downloader(url.partition(":")[0]) + if downloader.download(url, self.fileobj): + return + + for num, url in enumerate(kwdict.get("_fallback") or (), 1): + self.log.warning("Trying fallback URL #%d", num) + downloader = self.get_downloader(url.partition(":")[0]) + if downloader.download(url, self.fileobj): + return class TestPathfmt(): From 27da3f2958ab37f95e9e57cca8aacad16afa4455 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 1 Oct 2023 13:31:23 +0200 Subject: [PATCH 018/344] [tests] re-implement filtering by basecategory --- test/test_results.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/test_results.py b/test/test_results.py index 46c29290..61524183 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -407,7 +407,13 @@ def generate_tests(): category, _, subcategory = sys.argv[1].partition(":") del sys.argv[1:] - tests = results.category(category) + if category.startswith("+"): + basecategory = category[1:].lower() + tests = [t for t in results.all() + if t["#category"][0].lower() == basecategory] + else: + tests = results.category(category) + if subcategory: tests = [t for t in tests if t["#category"][-1] == subcategory] else: From 0b150d45db5ac5636083ea3df0bf2dd282678e54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 1 Oct 2023 13:52:00 +0200 Subject: [PATCH 019/344] [tests] add 'msg' arguments to assert statements --- test/test_results.py | 39 
+++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/test/test_results.py b/test/test_results.py index 61524183..4fb22c74 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -72,13 +72,13 @@ class TestExtractorResults(unittest.TestCase): content = ("#sha1_content" in result) tjob = ResultJob(result["#url"], content=content) - self.assertEqual(result["#class"], tjob.extractor.__class__) + self.assertEqual(result["#class"], tjob.extractor.__class__, "#class") if only_matching: return if "#exception" in result: - with self.assertRaises(result["#exception"]): + with self.assertRaises(result["#exception"], msg="#exception"): tjob.run() return @@ -98,8 +98,7 @@ class TestExtractorResults(unittest.TestCase): self.assertEqual( len(set(tjob.archive_list)), len(tjob.archive_list), - "archive-id uniqueness", - ) + msg="archive-id uniqueness") if tjob.queue: # test '_extractor' entries @@ -113,41 +112,44 @@ class TestExtractorResults(unittest.TestCase): else: # test 'extension' entries for kwdict in tjob.kwdict_list: - self.assertIn("extension", kwdict) + self.assertIn("extension", kwdict, msg="#extension") # test extraction results if "#sha1_url" in result: self.assertEqual( - result["#sha1_url"], tjob.url_hash.hexdigest()) + result["#sha1_url"], + tjob.url_hash.hexdigest(), + msg="#sha1_url") if "#sha1_content" in result: expected = result["#sha1_content"] digest = tjob.content_hash.hexdigest() if isinstance(expected, str): - self.assertEqual( - expected, digest, "content") + self.assertEqual(expected, digest, msg="#sha1_content") else: # iterable - self.assertIn( - digest, expected, "content") + self.assertIn(digest, expected, msg="#sha1_content") if "#sha1_metadata" in result: self.assertEqual( - result["#sha1_metadata"], tjob.kwdict_hash.hexdigest()) + result["#sha1_metadata"], + tjob.kwdict_hash.hexdigest(), + "#sha1_metadata") if "#count" in result: count = result["#count"] len_urls = len(tjob.url_list) if 
isinstance(count, str): - self.assertRegex(count, r"^ *(==|!=|<|<=|>|>=) *\d+ *$") + self.assertRegex( + count, r"^ *(==|!=|<|<=|>|>=) *\d+ *$", msg="#count") expr = "{} {}".format(len_urls, count) self.assertTrue(eval(expr), msg=expr) elif isinstance(count, range): - self.assertRange(len_urls, count) + self.assertRange(len_urls, count, msg="#count") else: # assume integer - self.assertEqual(len_urls, count) + self.assertEqual(len_urls, count, msg="#count") if "#pattern" in result: - self.assertGreater(len(tjob.url_list), 0) + self.assertGreater(len(tjob.url_list), 0, msg="#pattern") pattern = result["#pattern"] if isinstance(pattern, str): for url in tjob.url_list: @@ -159,8 +161,9 @@ class TestExtractorResults(unittest.TestCase): if "#urls" in result: expected = result["#urls"] if isinstance(expected, str): - expected = (expected,) - self.assertSequenceEqual(tjob.url_list, expected) + self.assertEqual(tjob.url_list[0], expected, msg="#urls") + else: + self.assertSequenceEqual(tjob.url_list, expected, msg="#urls") metadata = {k: v for k, v in result.items() if k[0] != "#"} if metadata: @@ -173,7 +176,7 @@ class TestExtractorResults(unittest.TestCase): key = key[1:] if key not in kwdict: continue - self.assertIn(key, kwdict) + self.assertIn(key, kwdict, msg=key) value = kwdict[key] if isinstance(test, dict): From eb230e4b77238b4043e4d8ccd28dfd55cdae7fc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 1 Oct 2023 13:55:17 +0200 Subject: [PATCH 020/344] [nsfwalbum] disable Referer headers by default (#4598) --- gallery_dl/extractor/common.py | 13 +++++++------ gallery_dl/extractor/nsfwalbum.py | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 72c7b7c6..0d67df77 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -32,13 +32,14 @@ class Extractor(): directory_fmt = ("{category}",) filename_fmt = "{filename}.{extension}" 
archive_fmt = "" + root = "" cookies_domain = "" + referer = True + tls12 = True browser = None - root = "" request_interval = 0.0 request_interval_min = 0.0 request_timestamp = 0.0 - tls12 = True def __init__(self, match): self.log = logging.getLogger(self.category) @@ -310,10 +311,10 @@ class Extractor(): else: headers["Accept-Encoding"] = "gzip, deflate" - custom_referer = self.config("referer", True) - if custom_referer: - if isinstance(custom_referer, str): - headers["Referer"] = custom_referer + referer = self.config("referer", self.referer) + if referer: + if isinstance(referer, str): + headers["Referer"] = referer elif self.root: headers["Referer"] = self.root + "/" diff --git a/gallery_dl/extractor/nsfwalbum.py b/gallery_dl/extractor/nsfwalbum.py index 7229bde5..27de15ec 100644 --- a/gallery_dl/extractor/nsfwalbum.py +++ b/gallery_dl/extractor/nsfwalbum.py @@ -20,6 +20,7 @@ class NsfwalbumAlbumExtractor(GalleryExtractor): filename_fmt = "{album_id}_{num:>03}_{id}.{extension}" directory_fmt = ("{category}", "{album_id} {title}") archive_fmt = "{id}" + referer = False pattern = r"(?:https?://)?(?:www\.)?nsfwalbum\.com(/album/(\d+))" example = "https://nsfwalbum.com/album/12345" From 482f002e1f524627abbb391b2def55568fa6877f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 2 Oct 2023 15:37:17 +0200 Subject: [PATCH 021/344] [nsfwalbum] detect '/error.jpg' images (#4598) --- gallery_dl/extractor/nsfwalbum.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/nsfwalbum.py b/gallery_dl/extractor/nsfwalbum.py index 27de15ec..eb5d31f3 100644 --- a/gallery_dl/extractor/nsfwalbum.py +++ b/gallery_dl/extractor/nsfwalbum.py @@ -72,8 +72,8 @@ class NsfwalbumAlbumExtractor(GalleryExtractor): @staticmethod def _validate_response(response): - return not response.request.url.endswith( - ("/no_image.jpg", "/placeholder.png")) + return not response.url.endswith( + ("/no_image.jpg", "/placeholder.png", 
"/error.jpg")) @staticmethod def _annihilate(value, base=6): From aa77fda78c9264dd2b7170fc1bffe726e5825599 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 2 Oct 2023 20:29:10 +0200 Subject: [PATCH 022/344] [instagram] better error message for invalid users (#4606) --- gallery_dl/extractor/instagram.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index bc6ad8ac..c7041833 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -710,7 +710,8 @@ class InstagramRestAPI(): def user_by_name(self, screen_name): endpoint = "/v1/users/web_profile_info/" params = {"username": screen_name} - return self._call(endpoint, params=params)["data"]["user"] + return self._call( + endpoint, params=params, notfound="user")["data"]["user"] @memcache(keyarg=1) def user_by_id(self, user_id): From 84fbbd96aadcb4c93354365fd1fd2312c0c95418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 2 Oct 2023 20:41:25 +0200 Subject: [PATCH 023/344] [shimmie2] remove 'meme.museum' --- docs/supportedsites.md | 6 ----- gallery_dl/extractor/shimmie2.py | 4 ---- scripts/supportedsites.py | 1 - test/results/mememuseum.py | 38 -------------------------------- 4 files changed, 49 deletions(-) delete mode 100644 test/results/mememuseum.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 111cef65..dafd0f27 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1285,12 +1285,6 @@ Consider all sites to be NSFW unless otherwise known. 
Shimmie2 Instances - - meme.museum - https://meme.museum/ - Posts, Tag Searches - - Loudbooru https://loudbooru.com/ diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py index 99dc9034..912e6013 100644 --- a/gallery_dl/extractor/shimmie2.py +++ b/gallery_dl/extractor/shimmie2.py @@ -66,10 +66,6 @@ class Shimmie2Extractor(BaseExtractor): INSTANCES = { - "mememuseum": { - "root": "https://meme.museum", - "pattern": r"meme\.museum", - }, "loudbooru": { "root": "https://loudbooru.com", "pattern": r"loudbooru\.com", diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 59d15192..7dd69c56 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -85,7 +85,6 @@ CATEGORY_MAP = { "mangaread" : "MangaRead", "mangasee" : "MangaSee", "mastodon.social": "mastodon.social", - "mememuseum" : "meme.museum", "myhentaigallery": "My Hentai Gallery", "myportfolio" : "Adobe Portfolio", "naverwebtoon" : "NaverWebtoon", diff --git a/test/results/mememuseum.py b/test/results/mememuseum.py deleted file mode 100644 index 6a93d3f2..00000000 --- a/test/results/mememuseum.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. 
- -from gallery_dl.extractor import shimmie2 - - -__tests__ = ( -{ - "#url" : "https://meme.museum/post/list/animated/1", - "#category": ("shimmie2", "mememuseum", "tag"), - "#class" : shimmie2.Shimmie2TagExtractor, - "#pattern" : r"https://meme\.museum/_images/\w+/\d+%20-%20", - "#count" : ">= 30", -}, - -{ - "#url" : "https://meme.museum/post/view/10243", - "#category": ("shimmie2", "mememuseum", "post"), - "#class" : shimmie2.Shimmie2PostExtractor, - "#pattern" : r"https://meme\.museum/_images/105febebcd5ca791ee332adc49971f78/10243%20-%20g%20beard%20open_source%20richard_stallman%20stallman%20tagme%20text\.jpg", - "#sha1_content": "45565f3f141fc960a8ae1168b80e718a494c52d2", - - "extension" : "jpg", - "file_url" : "https://meme.museum/_images/105febebcd5ca791ee332adc49971f78/10243%20-%20g%20beard%20open_source%20richard_stallman%20stallman%20tagme%20text.jpg", - "filename" : "10243 - g beard open_source richard_stallman stallman tagme text", - "height" : 451, - "id" : 10243, - "md5" : "105febebcd5ca791ee332adc49971f78", - "size" : 0, - "subcategory": "post", - "tags" : "/g/ beard open_source richard_stallman stallman tagme text", - "width" : 480, -}, - -) From 64dbc58a5a31f492079f4d3769e8bce71498ee8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 3 Oct 2023 14:58:52 +0200 Subject: [PATCH 024/344] [deviantart] update Eclipse API endpoints 2 (#4615) --- gallery_dl/extractor/deviantart.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index bf78033c..94210967 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1422,12 +1422,14 @@ class DeviantartEclipseAPI(): self.csrf_token = None def deviation_extended_fetch(self, deviation_id, user, kind=None): - endpoint = "/_napi/da-browse/shared_api/deviation/extended_fetch" + endpoint = "/_puppy/dadeviation/init" params = { - "deviationid" : 
deviation_id, - "username" : user, - "type" : kind, - "include_session": "false", + "deviationid" : deviation_id, + "username" : user, + "type" : kind, + "include_session" : "false", + "expand" : "deviation.related", + "da_minor_version": "20230710", } return self._call(endpoint, params) @@ -1455,7 +1457,7 @@ class DeviantartEclipseAPI(): return self._pagination(endpoint, params) def search_deviations(self, params): - endpoint = "/_napi/da-browse/api/networkbar/search/deviations" + endpoint = "/_puppy/dabrowse/search/deviations" return self._pagination(endpoint, params, key="deviations") def user_info(self, user, expand=False): From 85357c1ef8dcb2543d1e183ebcd1e8c7e332b086 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 3 Oct 2023 21:22:01 +0200 Subject: [PATCH 025/344] release version 1.26.0 --- CHANGELOG.md | 117 ++++++++++++++++++++++++++++++++++++++++++ README.rst | 4 +- gallery_dl/version.py | 2 +- 3 files changed, 120 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 53034fa5..a4ce4baf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,122 @@ # Changelog +## 1.26.0 - 2023-10-03 +- ### Extractors + #### Additions + - [behance] add `date` metadata field ([#4417](https://github.com/mikf/gallery-dl/issues/4417)) + - [danbooru] support `booru.borvar.art` ([#4096](https://github.com/mikf/gallery-dl/issues/4096)) + - [danbooru] support `donmai.moe` + - [deviantart] add `is_original` metadata field ([#4559](https://github.com/mikf/gallery-dl/issues/4559)) + - [e621] support `e6ai.net` ([#4320](https://github.com/mikf/gallery-dl/issues/4320)) + - [exhentai] add `fav` option ([#4409](https://github.com/mikf/gallery-dl/issues/4409)) + - [gelbooru_v02] support `xbooru.com` ([#4493](https://github.com/mikf/gallery-dl/issues/4493)) + - [instagram] add `following` extractor ([#1848](https://github.com/mikf/gallery-dl/issues/1848)) + - [pillowfort] support `/tagged/` URLs 
([#4570](https://github.com/mikf/gallery-dl/issues/4570)) + - [pornhub] add `gif` support ([#4463](https://github.com/mikf/gallery-dl/issues/4463)) + - [reddit] add `previews` option ([#4322](https://github.com/mikf/gallery-dl/issues/4322)) + - [redgifs] add `niches` extractor ([#4311](https://github.com/mikf/gallery-dl/issues/4311), [#4312](https://github.com/mikf/gallery-dl/issues/4312)) + - [redgifs] support `order` parameter for user URLs ([#4583](https://github.com/mikf/gallery-dl/issues/4583)) + - [twitter] add `user` extractor and `include` option ([#4275](https://github.com/mikf/gallery-dl/issues/4275)) + - [twitter] add `tweet-endpoint` option ([#4307](https://github.com/mikf/gallery-dl/issues/4307)) + - [twitter] add `date_original` metadata for retweets ([#4337](https://github.com/mikf/gallery-dl/issues/4337), [#4443](https://github.com/mikf/gallery-dl/issues/4443)) + - [twitter] extract `source` metadata ([#4459](https://github.com/mikf/gallery-dl/issues/4459)) + - [twitter] support `x.com` URLs ([#4452](https://github.com/mikf/gallery-dl/issues/4452)) + #### Improvements + - include `Referer` header in all HTTP requests ([#4490](https://github.com/mikf/gallery-dl/issues/4490), [#4518](https://github.com/mikf/gallery-dl/issues/4518)) + (can be disabled with `referer` option) + - [behance] show errors for mature content ([#4417](https://github.com/mikf/gallery-dl/issues/4417)) + - [deviantart] re-add `quality` option and `/intermediary/` transform + - [fantia] improve metadata extraction ([#4126](https://github.com/mikf/gallery-dl/issues/4126)) + - [instagram] better error messages for invalid users ([#4606](https://github.com/mikf/gallery-dl/issues/4606)) + - [mangadex] support multiple values for `lang` ([#4093](https://github.com/mikf/gallery-dl/issues/4093)) + - [mastodon] support `/@USER/following` URLs ([#4608](https://github.com/mikf/gallery-dl/issues/4608)) + - [moebooru] match search URLs with empty `tags` 
([#4354](https://github.com/mikf/gallery-dl/issues/4354)) + - [pillowfort] extract `b2_lg_url` media ([#4570](https://github.com/mikf/gallery-dl/issues/4570)) + - [reddit] improve comment metadata ([#4482](https://github.com/mikf/gallery-dl/issues/4482)) + - [reddit] ignore `/message/compose` URLs ([#4482](https://github.com/mikf/gallery-dl/issues/4482), [#4581](https://github.com/mikf/gallery-dl/issues/4581)) + - [redgifs] provide `collection` metadata as separate field ([#4508](https://github.com/mikf/gallery-dl/issues/4508)) + - [redgifs] match `gfycat` image URLs ([#4558](https://github.com/mikf/gallery-dl/issues/4558)) + - [twitter] improve error messages for single Tweets ([#4369](https://github.com/mikf/gallery-dl/issues/4369)) + #### Fixes + - [acidimg] fix extraction + - [architizer] fix extraction ([#4537](https://github.com/mikf/gallery-dl/issues/4537)) + - [behance] fix and update `user` extractor ([#4417](https://github.com/mikf/gallery-dl/issues/4417)) + - [behance] fix cookie usage ([#4417](https://github.com/mikf/gallery-dl/issues/4417)) + - [behance] handle videos without `renditions` ([#4523](https://github.com/mikf/gallery-dl/issues/4523)) + - [bunkr] fix media domain for `cdn9` ([#4386](https://github.com/mikf/gallery-dl/issues/4386), [#4412](https://github.com/mikf/gallery-dl/issues/4412)) + - [bunkr] fix extracting `.wmv` files ([#4419](https://github.com/mikf/gallery-dl/issues/4419)) + - [bunkr] fix media domain for `cdn-pizza.bunkr.ru` ([#4489](https://github.com/mikf/gallery-dl/issues/4489)) + - [bunkr] fix extraction ([#4514](https://github.com/mikf/gallery-dl/issues/4514), [#4532](https://github.com/mikf/gallery-dl/issues/4532), [#4529](https://github.com/mikf/gallery-dl/issues/4529), [#4540](https://github.com/mikf/gallery-dl/issues/4540)) + - [deviantart] fix full resolution URLs for non-downloadable images ([#293](https://github.com/mikf/gallery-dl/issues/293), [#4548](https://github.com/mikf/gallery-dl/issues/4548), 
[#4563](https://github.com/mikf/gallery-dl/issues/4563)) + - [deviantart] fix shortened URLs ([#4316](https://github.com/mikf/gallery-dl/issues/4316)) + - [deviantart] fix search ([#4384](https://github.com/mikf/gallery-dl/issues/4384)) + - [deviantart] update Eclipse API endpoints ([#4553](https://github.com/mikf/gallery-dl/issues/4553), [#4615](https://github.com/mikf/gallery-dl/issues/4615)) + - [deviantart] use private tokens for `is_mature` posts ([#4563](https://github.com/mikf/gallery-dl/issues/4563)) + - [flickr] update default API credentials ([#4332](https://github.com/mikf/gallery-dl/issues/4332)) + - [giantessbooru] fix extraction ([#4373](https://github.com/mikf/gallery-dl/issues/4373)) + - [hiperdex] fix crash for titles containing Unicode characters ([#4325](https://github.com/mikf/gallery-dl/issues/4325)) + - [hiperdex] fix `manga` metadata + - [imagefap] fix pagination ([#3013](https://github.com/mikf/gallery-dl/issues/3013)) + - [imagevenue] fix extraction ([#4473](https://github.com/mikf/gallery-dl/issues/4473)) + - [instagram] fix private posts with long shortcodes ([#4362](https://github.com/mikf/gallery-dl/issues/4362)) + - [instagram] fix video preview archive IDs ([#2135](https://github.com/mikf/gallery-dl/issues/2135), [#4455](https://github.com/mikf/gallery-dl/issues/4455)) + - [instagram] handle exceptions due to missing media ([#4555](https://github.com/mikf/gallery-dl/issues/4555)) + - [issuu] fix extraction ([#4420](https://github.com/mikf/gallery-dl/issues/4420)) + - [jpgfish] update domain to `jpg1.su` ([#4494](https://github.com/mikf/gallery-dl/issues/4494)) + - [kemonoparty] update `favorite` API endpoint ([#4522](https://github.com/mikf/gallery-dl/issues/4522)) + - [lensdump] fix extraction ([#4352](https://github.com/mikf/gallery-dl/issues/4352)) + - [mangakakalot] update domain + - [reddit] fix `preview.redd.it` URLs ([#4470](https://github.com/mikf/gallery-dl/issues/4470)) + - [patreon] fix extraction 
([#4547](https://github.com/mikf/gallery-dl/issues/4547)) + - [pixiv] handle errors for private novels ([#4481](https://github.com/mikf/gallery-dl/issues/4481)) + - [pornhub] fix extraction ([#4301](https://github.com/mikf/gallery-dl/issues/4301)) + - [pururin] fix extraction ([#4375](https://github.com/mikf/gallery-dl/issues/4375)) + - [subscribestar] fix preview detection ([#4468](https://github.com/mikf/gallery-dl/issues/4468)) + - [twitter] fix crash on private user ([#4349](https://github.com/mikf/gallery-dl/issues/4349)) + - [twitter] fix `TweetWithVisibilityResults` ([#4369](https://github.com/mikf/gallery-dl/issues/4369)) + - [twitter] fix crash when `sortIndex` is undefined ([#4499](https://github.com/mikf/gallery-dl/issues/4499)) + - [zerochan] fix `tags` extraction ([#4315](https://github.com/mikf/gallery-dl/issues/4315), [#4319](https://github.com/mikf/gallery-dl/issues/4319)) + #### Removals + - [gfycat] remove module + - [shimmie2] remove `meme.museum` +- ### Post Processors + #### Changes + - update `finalize` events + - add `finalize-error` and `finalize-success` events that trigger + depending on whether error(s) did or did not happen + - change `finalize` to always trigger regardless of error status + #### Additions + - add `python` post processor + - add `prepare-after` event ([#4083](https://github.com/mikf/gallery-dl/issues/4083)) + - [ugoira] add `"framerate": "uniform"` ([#4421](https://github.com/mikf/gallery-dl/issues/4421)) + #### Improvements + - [ugoira] extend `ffmpeg-output` ([#4421](https://github.com/mikf/gallery-dl/issues/4421)) + #### Fixes + - [ugoira] restore `libx264-prevent-odd` ([#4407](https://github.com/mikf/gallery-dl/issues/4407)) + - [ugoira] fix high frame rates ([#4421](https://github.com/mikf/gallery-dl/issues/4421)) +- ### Downloaders + #### Fixes + - [http] close connection when file already exists ([#4403](https://github.com/mikf/gallery-dl/issues/4403)) +- ### Options + #### Additions + - support `parent>child` 
categories for child extractor options, + for example an `imgur` album from a `reddit` thread with `reddit>imgur` + - implement `subconfigs` option ([#4440](https://github.com/mikf/gallery-dl/issues/4440)) + - add `"ascii+"` as a special `path-restrict` value ([#4371](https://github.com/mikf/gallery-dl/issues/4371)) + #### Removals + - remove `pyopenssl` option +- ### Tests + #### Improvements + - move extractor results into their own, separate files ([#4504](https://github.com/mikf/gallery-dl/issues/4504)) + - include fallback URLs in content tests ([#3163](https://github.com/mikf/gallery-dl/issues/3163)) + - various test method improvements +- ### Miscellaneous + #### Fixes + - [formatter] use value of last alternative ([#4492](https://github.com/mikf/gallery-dl/issues/4492)) + - fix imports when running `__main__.py` ([#4581](https://github.com/mikf/gallery-dl/issues/4581)) + - fix symlink resolution in `__main__.py` + - fix default Firefox user agent string + ## 1.25.8 - 2023-07-15 ### Changes - update default User-Agent header to Firefox 115 ESR diff --git a/README.rst b/README.rst index 51e239c1..14cfb095 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 39cfbd1c..d06d9d6e 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.26.0-dev" +__version__ = "1.26.0" From 3438a3098dae41232ee8a3224a11733a47edcefe Mon Sep 17 00:00:00 2001 From: Nahida Date: Wed, 4 Oct 2023 10:34:02 +0800 Subject: [PATCH 026/344] [twitter] add possible_sensitive field --- gallery_dl/extractor/twitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 3895c74c..553cf6e4 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -305,6 +305,7 @@ class TwitterExtractor(Extractor): "author" : author, "user" : self._user or author, "lang" : tweet["lang"], + "possibly_sensitive" : tweet["possibly_sensitive"], "source" : text.extr(source, ">", "<"), "favorite_count": tget("favorite_count"), "quote_count" : tget("quote_count"), From efaab4fbfa0071403c6e31565527b4d422332823 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 4 Oct 2023 22:59:25 +0200 Subject: [PATCH 027/344] [twitter] fix crash due to missing 'source' (#4620) regression caused by 06aaedde --- gallery_dl/extractor/twitter.py | 1 + gallery_dl/version.py | 2 +- test/results/twitter.py | 18 ++++++++---------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 3895c74c..ab5afd0a 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -451,6 +451,7 @@ class TwitterExtractor(Extractor): "id_str": id_str, "lang": None, "user": user, + "source": "><", "entities": {}, "extended_entities": { "media": [ diff --git a/gallery_dl/version.py b/gallery_dl/version.py index d06d9d6e..ef2afe69 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.26.0" +__version__ = "1.26.1-dev" diff --git a/test/results/twitter.py b/test/results/twitter.py index 4094bc94..fa95a046 100644 --- a/test/results/twitter.py +++ b/test/results/twitter.py @@ -254,10 +254,9 @@ __tests__ = ( "#category": ("", "twitter", "tweet"), "#class" : twitter.TwitterTweetExtractor, - "content": r"""re:Gear up for #PokemonSwordShieldEX with special Mystery Gifts! - -You’ll be able to receive four Galarian form Pokémon with Hidden Abilities, plus some very useful items. It’s our \(Mystery\) Gift to you, Trainers! - + "source" : "Sprinklr", + "content": r"""re:Gear up for #PokemonSwordShieldEX with special Mystery Gifts! \n +You’ll be able to receive four Galarian form Pokémon with Hidden Abilities, plus some very useful items. It’s our \(Mystery\) Gift to you, Trainers! \n ❓🎁➡️ """, }, @@ -482,8 +481,9 @@ You’ll be able to receive four Galarian form Pokémon with Hidden Abilities, p "#comment" : "age-restricted (#2354)", "#category": ("", "twitter", "tweet"), "#class" : twitter.TwitterTweetExtractor, - "#options" : {"syndication": True}, - "#count" : 1, + "#exception": exception.AuthorizationError, + "#options" : {"syndication": True}, + "#count" : 1, "date": "dt:2022-02-13 20:10:09", }, @@ -513,10 +513,8 @@ You’ll be able to receive four Galarian form Pokémon with Hidden Abilities, p "#category": ("", "twitter", "tweet"), "#class" : twitter.TwitterTweetExtractor, - "content": """BREAKING - DEADLY LIES: Independent researchers at Texas A&M University have just contradicted federal government regulators, saying that toxic air pollutants in East Palestine, Ohio, could pose long-term risks. - -The Washington Post writes, "Three weeks after the toxic train derailment in Ohio, an analysis of Environmental Protection Agency data has found nine air pollutants at levels that could raise long-term health concerns in and around East Palestine, according to an independent analysis. 
- + "content": """BREAKING - DEADLY LIES: Independent researchers at Texas A&M University have just contradicted federal government regulators, saying that toxic air pollutants in East Palestine, Ohio, could pose long-term risks. \n +The Washington Post writes, "Three weeks after the toxic train derailment in Ohio, an analysis of Environmental Protection Agency data has found nine air pollutants at levels that could raise long-term health concerns in and around East Palestine, according to an independent analysis. \n "The analysis by Texas A&M University seems to contradict statements by state and federal regulators that air near the crash site is completely safe, despite residents complaining about rashes, breathing problems and other health effects." Your reaction.""", }, From b846f56c3a4017c0faee6feae95336a8ae412ad9 Mon Sep 17 00:00:00 2001 From: HRXN Date: Thu, 5 Oct 2023 12:18:58 +0200 Subject: [PATCH 028/344] [imgbb] Fix `user` extraction, add `displayname` --- gallery_dl/extractor/imgbb.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index 1b741802..282f8be7 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -124,12 +124,14 @@ class ImgbbAlbumExtractor(ImgbbExtractor): self.page_url = "https://ibb.co/album/" + self.album_id def metadata(self, page): - album, pos = text.extract(page, '"og:title" content="', '"') - user , pos = text.extract(page, 'rel="author">', '<', pos) + album , pos = text.extract(page, '"og:title" content="', '"') + user , pos = text.extract(page, ',"username":"', '"', pos) + displayname, pos = text.extract(page, '"user":{"name":"', '"') return { - "album_id" : self.album_id, - "album_name": text.unescape(album), - "user" : user.lower() if user else "", + "album_id" : self.album_id, + "album_name" : text.unescape(album), + "user" : user.lower() if user else "", + "displayname": displayname if 
displayname else "", } def images(self, page): @@ -158,7 +160,11 @@ class ImgbbUserExtractor(ImgbbExtractor): self.page_url = "https://{}.imgbb.com/".format(self.user) def metadata(self, page): - return {"user": self.user} + displayname, pos = text.extract(page, '"user":{"name":"', '"') + return { + "user" : self.user, + "displayname": displayname if displayname else "", + } def images(self, page): user = text.extr(page, '.obj.resource={"id":"', '"') @@ -185,11 +191,11 @@ class ImgbbImageExtractor(ImgbbExtractor): image = { "id" : self.image_id, - "title" : text.unescape(extr('"og:title" content="', '"')), + "title" : text.unescape(extr('"og:title" content="', ' hosted at ImgBB"')), "url" : extr('"og:image" content="', '"'), "width" : text.parse_int(extr('"og:image:width" content="', '"')), "height": text.parse_int(extr('"og:image:height" content="', '"')), - "user" : extr('rel="author">', '<').lower(), + "user" : extr(',"username":"', '"').lower(), } image["extension"] = text.ext_from_url(image["url"]) From 63db54b9054b1d18aeaa789c636f065d0fa0ef07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 9 Oct 2023 09:41:38 +0200 Subject: [PATCH 029/344] [patreon] update 'campaign_id' path (#4639) --- gallery_dl/extractor/patreon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 729ceaf1..6ac9a83b 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -277,7 +277,7 @@ class PatreonCreatorExtractor(PatreonExtractor): try: data = self._extract_bootstrap(page) - campaign_id = data["creator"]["data"]["id"] + campaign_id = data["campaign"]["data"]["id"] except (KeyError, ValueError): raise exception.NotFoundError("creator") From 15f940819b1749130ccdc03cf5cbee97b75c690a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 9 Oct 2023 11:20:10 +0200 Subject: [PATCH 030/344] [newgrounds] support 'art-image' files 
(#4642) --- gallery_dl/extractor/newgrounds.py | 4 +++- test/results/newgrounds.py | 33 ++++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 1bcc915d..d754e653 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -57,7 +57,8 @@ class NewgroundsExtractor(Extractor): yield Message.Url, url, text.nameext_from_url(url, post) for num, url in enumerate(text.extract_iter( - post["_comment"], 'data-smartload-src="', '"'), 1): + post["_images"] + post["_comment"], + 'data-smartload-src="', '"'), 1): post["num"] = num post["_index"] = "{}_{:>02}".format(post["index"], num) url = text.ensure_http_scheme(url) @@ -135,6 +136,7 @@ class NewgroundsExtractor(Extractor): extr = text.extract_from(page) data = extract_data(extr, post_url) + data["_images"] = extr('
').partition(">")[2] data["comment"] = text.unescape(text.remove_html( diff --git a/test/results/newgrounds.py b/test/results/newgrounds.py index 3dac724d..1f956f19 100644 --- a/test/results/newgrounds.py +++ b/test/results/newgrounds.py @@ -12,17 +12,17 @@ __tests__ = ( "#url" : "https://www.newgrounds.com/art/view/tomfulp/ryu-is-hawt", "#category": ("", "newgrounds", "image"), "#class" : newgrounds.NewgroundsImageExtractor, - "#sha1_url" : "57f182bcbbf2612690c3a54f16ffa1da5105245e", + "#urls" : "https://art.ngfiles.com/images/1993000/1993615_4474_tomfulp_ryu-is-hawt.44f81090378ae9c257a5e46a8e17cc4d.gif?f1695674895", "#sha1_content": "8f395e08333eb2457ba8d8b715238f8910221365", "artist" : ["tomfulp"], - "comment" : r"re:Consider this the bottom threshold for ", + "comment" : "Consider this the bottom threshold for scouted artists.In fact consider it BELOW the bottom threshold.", "date" : "dt:2009-06-04 14:44:05", - "description": r"re:Consider this the bottom threshold for ", + "description": "Consider this the bottom threshold for scouted artists. In fact consider it BELOW the bottom threshold. 
", "favorites" : int, - "filename" : "94_tomfulp_ryu-is-hawt", + "filename" : "1993615_4474_tomfulp_ryu-is-hawt.44f81090378ae9c257a5e46a8e17cc4d", "height" : 476, - "index" : 94, + "index" : 1993615, "rating" : "e", "score" : float, "tags" : [ @@ -39,15 +39,32 @@ __tests__ = ( "#url" : "https://art.ngfiles.com/images/0/94_tomfulp_ryu-is-hawt.gif", "#category": ("", "newgrounds", "image"), "#class" : newgrounds.NewgroundsImageExtractor, - "#sha1_url": "57f182bcbbf2612690c3a54f16ffa1da5105245e", + "#urls" : "https://art.ngfiles.com/images/1993000/1993615_4474_tomfulp_ryu-is-hawt.44f81090378ae9c257a5e46a8e17cc4d.gif?f1695674895", }, { "#url" : "https://www.newgrounds.com/art/view/sailoryon/yon-dream-buster", + "#comment" : "embedded file in 'comments' (#1033)", "#category": ("", "newgrounds", "image"), "#class" : newgrounds.NewgroundsImageExtractor, - "#count" : 2, - "#sha1_url": "84eec95e663041a80630df72719f231e157e5f5d", + "#urls" : ( + "https://art.ngfiles.com/images/1438000/1438673_sailoryon_yon-dream-buster.jpg?f1601058173", + "https://art.ngfiles.com/comments/172000/iu_172374_7112211.jpg", + ), +}, + +{ + "#url" : "https://www.newgrounds.com/art/view/zedrinbot/lewd-animation-tutorial", + "#comment" : "extra files in 'art-image-row' elements (#4642)", + "#category": ("", "newgrounds", "image"), + "#class" : newgrounds.NewgroundsImageExtractor, + "#urls" : ( + "https://art.ngfiles.com/images/5091000/5091275_45067_zedrinbot_untitled-5091275.0a9d27ed2bc265a7e89478ed6ad6f86f.gif?f1696187399", + "https://art.ngfiles.com/medium_views/5091000/5091275_45071_zedrinbot_untitled-5091275.6fdc62eaef43528fb1c9bda624d30a3d.webp?f1696187437", + "https://art.ngfiles.com/medium_views/5091000/5091275_45070_zedrinbot_untitled-5091275.0d7334746374465bd448908b88d1f810.webp?f1696187435", + "https://art.ngfiles.com/medium_views/5091000/5091275_45072_zedrinbot_untitled-5091275.6fdc62eaef43528fb1c9bda624d30a3d.webp?f1696187438", + 
"https://art.ngfiles.com/medium_views/5091000/5091275_45073_zedrinbot_untitled-5091275.20aa05c1cd22fd058e8c68ce58f5a302.webp?f1696187439", + ), }, { From 9a008523acad9afc569876b23bb3bf67e7be6db4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 9 Oct 2023 11:44:14 +0200 Subject: [PATCH 031/344] [hentaifoundry] fix '.swf' file downloads (#4641) --- gallery_dl/extractor/hentaifoundry.py | 22 ++++++++++++---- test/results/hentaifoundry.py | 38 +++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 4c02000c..8ba23c2a 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -72,13 +72,11 @@ class HentaifoundryExtractor(Extractor): extr = text.extract_from(page, page.index('id="picBox"')) data = { + "index" : text.parse_int(path.rsplit("/", 2)[1]), "title" : text.unescape(extr('class="imageTitle">', '<')), "artist" : text.unescape(extr('/profile">', '<')), - "width" : text.parse_int(extr('width="', '"')), - "height" : text.parse_int(extr('height="', '"')), - "index" : text.parse_int(path.rsplit("/", 2)[1]), - "src" : text.urljoin(self.root, text.unescape(extr( - 'src="', '"'))), + "_body" : extr( + '
Description
', '') .replace("\r\n", "\n"), "", "")), @@ -92,6 +90,20 @@ class HentaifoundryExtractor(Extractor): ">Tags ", "
")), } + body = data["_body"] + if " Date: Mon, 9 Oct 2023 12:50:10 +0200 Subject: [PATCH 032/344] [pp:exec] support more replacement fields for '--exec' (#4633) - {_directory} - {_filename} - {_path} (alias for {}) --- docs/options.md | 10 ++++++---- gallery_dl/option.py | 5 ++++- gallery_dl/postprocessor/exec.py | 17 ++++++++++++----- test/test_postprocessor.py | 8 ++++++-- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/docs/options.md b/docs/options.md index b0abcf85..2486cbfe 100644 --- a/docs/options.md +++ b/docs/options.md @@ -121,11 +121,13 @@ --write-tags Write image tags to separate text files --mtime-from-date Set file modification times according to 'date' metadata - --exec CMD Execute CMD for each downloaded file. Example: - --exec "convert {} {}.png && rm {}" + --exec CMD Execute CMD for each downloaded file. Supported + replacement fields are {} or {_path}, + {_directory}, {_filename}. Example: --exec + "convert {} {}.png && rm {}" --exec-after CMD Execute CMD after all files were downloaded - successfully. Example: --exec-after "cd {} && - convert * ../doc.pdf" + successfully. Example: --exec-after "cd + {_directory} && convert * ../doc.pdf" -P, --postprocessor NAME Activate the specified post processor -O, --postprocessor-option OPT Additional '=' post processor diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 08e6e701..1982b71d 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -510,6 +510,8 @@ def build_parser(): dest="postprocessors", metavar="CMD", action=AppendCommandAction, const={"name": "exec"}, help=("Execute CMD for each downloaded file. " + "Supported replacement fields are " + "{} or {_path}, {_directory}, {_filename}. " "Example: --exec \"convert {} {}.png && rm {}\""), ) postprocessor.add_argument( @@ -518,7 +520,8 @@ def build_parser(): action=AppendCommandAction, const={ "name": "exec", "event": "finalize"}, help=("Execute CMD after all files were downloaded successfully. 
" - "Example: --exec-after \"cd {} && convert * ../doc.pdf\""), + "Example: --exec-after \"cd {_directory} " + "&& convert * ../doc.pdf\""), ) postprocessor.add_argument( "-P", "--postprocessor", diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py index afa828c0..e7ed2f69 100644 --- a/gallery_dl/postprocessor/exec.py +++ b/gallery_dl/postprocessor/exec.py @@ -12,6 +12,7 @@ from .common import PostProcessor from .. import util, formatter import subprocess import os +import re if util.WINDOWS: @@ -32,6 +33,7 @@ class ExecPP(PostProcessor): args = options["command"] if isinstance(args, str): self.args = args + self._sub = re.compile(r"\{(_directory|_filename|_path|)\}").sub execute = self.exec_string else: self.args = [formatter.parse(arg) for arg in args] @@ -69,11 +71,8 @@ class ExecPP(PostProcessor): if archive and archive.check(pathfmt.kwdict): return - if pathfmt.realpath: - args = self.args.replace("{}", quote(pathfmt.realpath)) - else: - args = self.args.replace("{}", quote(pathfmt.realdirectory)) - + self.pathfmt = pathfmt + args = self._sub(self._replace, self.args) self._exec(args, True) if archive: @@ -90,5 +89,13 @@ class ExecPP(PostProcessor): self.log.debug("Running '%s'", args) subprocess.Popen(args, shell=shell) + def _replace(self, match): + name = match.group(1) + if name == "_directory": + return quote(self.pathfmt.realdirectory) + if name == "_filename": + return quote(self.pathfmt.filename) + return quote(self.pathfmt.realpath) + __postprocessor__ = ExecPP diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index c00144e7..b64df882 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -168,7 +168,7 @@ class ExecTest(BasePostprocessorTest): def test_command_string(self): self._create({ - "command": "echo {} && rm {};", + "command": "echo {} {_path} {_directory} {_filename} && rm {};", }) with patch("subprocess.Popen") as p: @@ -178,7 +178,11 @@ class 
ExecTest(BasePostprocessorTest): self._trigger(("after",)) p.assert_called_once_with( - "echo {0} && rm {0};".format(self.pathfmt.realpath), shell=True) + "echo {0} {0} {1} {2} && rm {0};".format( + self.pathfmt.realpath, + self.pathfmt.realdirectory, + self.pathfmt.filename), + shell=True) i.wait.assert_called_once_with() def test_command_list(self): From a2daa9befe094d06033c1e908e15d2fbbeba750e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 9 Oct 2023 15:29:17 +0200 Subject: [PATCH 033/344] [imgbb] fix flake8 and username order --- gallery_dl/extractor/imgbb.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index 282f8be7..c3341443 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -125,13 +125,13 @@ class ImgbbAlbumExtractor(ImgbbExtractor): def metadata(self, page): album , pos = text.extract(page, '"og:title" content="', '"') - user , pos = text.extract(page, ',"username":"', '"', pos) - displayname, pos = text.extract(page, '"user":{"name":"', '"') + displayname, pos = text.extract(page, '"user":{"name":"', '"', pos) + username , pos = text.extract(page, ',"username":"', '"', pos) return { "album_id" : self.album_id, "album_name" : text.unescape(album), - "user" : user.lower() if user else "", - "displayname": displayname if displayname else "", + "user" : username.lower() if username else "", + "displayname": displayname or "", } def images(self, page): @@ -161,9 +161,10 @@ class ImgbbUserExtractor(ImgbbExtractor): def metadata(self, page): displayname, pos = text.extract(page, '"user":{"name":"', '"') + username , pos = text.extract(page, ',"username":"', '"', pos) return { - "user" : self.user, - "displayname": displayname if displayname else "", + "user" : username or self.user, + "displayname": displayname or "", } def images(self, page): @@ -191,7 +192,8 @@ class ImgbbImageExtractor(ImgbbExtractor): 
image = { "id" : self.image_id, - "title" : text.unescape(extr('"og:title" content="', ' hosted at ImgBB"')), + "title" : text.unescape(extr( + '"og:title" content="', ' hosted at ImgBB"')), "url" : extr('"og:image" content="', '"'), "width" : text.parse_int(extr('"og:image:width" content="', '"')), "height": text.parse_int(extr('"og:image:height" content="', '"')), From 95c280c59b66b4553dd9289fe2f5aaae45f6dd95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 9 Oct 2023 15:33:25 +0200 Subject: [PATCH 034/344] [imgbb] update pagination end condition (#4626) --- gallery_dl/extractor/imgbb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index c3341443..bf7bd8f9 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -99,7 +99,7 @@ class ImgbbExtractor(Extractor): for img in text.extract_iter(page, "data-object='", "'"): yield util.json_loads(text.unquote(img)) if data: - if params["seek"] == data["seekEnd"]: + if not data["seekEnd"] or params["seek"] == data["seekEnd"]: return params["seek"] = data["seekEnd"] params["page"] += 1 From f3d6aaff134e5c9f7fd4c49cc8669128fae71799 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 9 Oct 2023 15:39:09 +0200 Subject: [PATCH 035/344] [twitter] rename to 'sensitive'; use 'tget()' --- gallery_dl/extractor/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 553cf6e4..6237a8b9 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -305,8 +305,8 @@ class TwitterExtractor(Extractor): "author" : author, "user" : self._user or author, "lang" : tweet["lang"], - "possibly_sensitive" : tweet["possibly_sensitive"], "source" : text.extr(source, ">", "<"), + "sensitive" : tget("possibly_sensitive"), "favorite_count": tget("favorite_count"), "quote_count" : 
tget("quote_count"), "reply_count" : tget("reply_count"), From 8bb7243c1020c07108064ba1b950ac941fe0c376 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 10 Oct 2023 18:33:01 +0200 Subject: [PATCH 036/344] [reddit] fix wrong previews (#4649) caused by a failed comment URL using the main submission's preview as fallback 14af15bd 4963bb9b --- gallery_dl/extractor/reddit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 85533126..cd2ba3d2 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -125,7 +125,8 @@ class RedditExtractor(Extractor): if match: extra.append(match.group(1)) elif not match_user(url) and not match_subreddit(url): - if previews and "preview" in data: + if previews and "comment" not in data and \ + "preview" in data: data["_fallback"] = self._previews(data) yield Message.Queue, text.unescape(url), data if "_fallback" in data: From d194ea68a93ae74d3cc9bfa5d4c12e5621149e4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 12 Oct 2023 21:16:42 +0200 Subject: [PATCH 037/344] [cookies] open cookie databases in read-only mode bypasses the need to copy the entire database file might solve #4195 --- gallery_dl/cookies.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index c5c5667b..20959a8f 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -47,7 +47,7 @@ def load_cookies(cookiejar, browser_specification): def load_cookies_firefox(cookiejar, profile=None, container=None, domain=None): path, container_id = _firefox_cookies_database(profile, container) - with DatabaseCopy(path) as db: + with DatabaseConnection(path) as db: sql = ("SELECT name, value, host, path, isSecure, expiry " "FROM moz_cookies") @@ -100,7 +100,7 @@ def load_cookies_chrome(cookiejar, browser_name, profile=None, 
path = _chrome_cookies_database(profile, config) _log_debug("Extracting cookies from %s", path) - with DatabaseCopy(path) as db: + with DatabaseConnection(path) as db: db.text_factory = bytes decryptor = get_cookie_decryptor( config["directory"], config["keyring"], keyring) @@ -814,7 +814,7 @@ class DataParser: self.skip_to(len(self._data), description) -class DatabaseCopy(): +class DatabaseConnection(): def __init__(self, path): self.path = path @@ -822,13 +822,26 @@ class DatabaseCopy(): self.directory = None def __enter__(self): + try: + # https://www.sqlite.org/uri.html#the_uri_path + path = self.path.replace("?", "%3f").replace("#", "%23") + if util.WINDOWS: + path = "/" + os.path.abspath(path) + + uri = "file:{}?mode=ro&immutable=1".format(path) + self.database = sqlite3.connect( + uri, uri=True, isolation_level=None, check_same_thread=False) + return self.database + except Exception: + _log_debug("Falling back to temporary database copy") + try: self.directory = tempfile.TemporaryDirectory(prefix="gallery-dl-") path_copy = os.path.join(self.directory.name, "copy.sqlite") shutil.copyfile(self.path, path_copy) - self.database = db = sqlite3.connect( + self.database = sqlite3.connect( path_copy, isolation_level=None, check_same_thread=False) - return db + return self.database except BaseException: if self.directory: self.directory.cleanup() @@ -836,7 +849,8 @@ class DatabaseCopy(): def __exit__(self, exc, value, tb): self.database.close() - self.directory.cleanup() + if self.directory: + self.directory.cleanup() def Popen_communicate(*args): From 2974b8e3c82b2a53cb6bc4829f5e13f248fc8d70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 12 Oct 2023 21:32:41 +0200 Subject: [PATCH 038/344] [moebooru] add 'metadata' option (#4646) for extended 'pool' metadata --- docs/configuration.rst | 12 ++++++++++++ gallery_dl/extractor/moebooru.py | 5 +++++ test/results/konachan.py | 10 ++++++++++ test/results/yandere.py | 21 +++++++++++++++++++++ 4 
files changed, 48 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index 2cb28981..5988d7be 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2247,6 +2247,18 @@ Description Fetch media from replies to other notes. +extractor.[moebooru].pool.metadata +---------------------------------- +Type + ``bool`` +Default + ``false`` +Description + Extract extended ``pool`` metadata. + + Note: Not supported by all ``moebooru`` instances. + + extractor.newgrounds.flash -------------------------- Type diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py index 145dd51d..e97d2736 100644 --- a/gallery_dl/extractor/moebooru.py +++ b/gallery_dl/extractor/moebooru.py @@ -124,6 +124,11 @@ class MoebooruPoolExtractor(MoebooruExtractor): self.pool_id = match.group(match.lastindex) def metadata(self): + if self.config("metadata"): + url = "{}/pool/show/{}.json".format(self.root, self.pool_id) + pool = self.request(url).json() + pool.pop("posts", None) + return {"pool": pool} return {"pool": text.parse_int(self.pool_id)} def posts(self): diff --git a/test/results/konachan.py b/test/results/konachan.py index 576acdaa..ed4d2d78 100644 --- a/test/results/konachan.py +++ b/test/results/konachan.py @@ -5,6 +5,7 @@ # published by the Free Software Foundation. 
from gallery_dl.extractor import moebooru +from gallery_dl import exception __tests__ = ( @@ -54,6 +55,15 @@ __tests__ = ( "#sha1_content": "cf0546e38a93c2c510a478f8744e60687b7a8426", }, +{ + "#url" : "https://konachan.com/pool/show/95", + "#comment" : "'metadata' option (#4646)", + "#category": ("moebooru", "konachan", "pool"), + "#class" : moebooru.MoebooruPoolExtractor, + "#options" : {"metadata": True}, + "#exception": exception.HttpError, +}, + { "#url" : "https://konachan.net/pool/show/95", "#category": ("moebooru", "konachan", "pool"), diff --git a/test/results/yandere.py b/test/results/yandere.py index 3b4d9a96..74194bb4 100644 --- a/test/results/yandere.py +++ b/test/results/yandere.py @@ -62,6 +62,27 @@ __tests__ = ( "#sha1_content": "2a35b9d6edecce11cc2918c6dce4de2198342b68", }, +{ + "#url" : "https://yande.re/pool/show/318", + "#comment" : "'metadata' option (#4646)", + "#category": ("moebooru", "yandere", "pool"), + "#class" : moebooru.MoebooruPoolExtractor, + "#options" : {"metadata": True}, + "#count" : 3, + + "pool": { + "created_at" : "2008-12-13T15:56:10.728Z", + "description": "Dengeki Hime's posts are in pool #97.", + "id" : 318, + "is_public" : True, + "name" : "Galgame_Mag_08", + "post_count" : 3, + "updated_at" : "2012-03-11T14:31:00.935Z", + "user_id" : 1305, + }, + +}, + { "#url" : "https://yande.re/post/popular_by_month?month=6&year=2014", "#category": ("moebooru", "yandere", "popular"), From 992e86ec94bebabe2ba90142572f65ef7a792edd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 12 Oct 2023 21:37:10 +0200 Subject: [PATCH 039/344] [deviantart] disable 'jwt' (#4652) --- docs/configuration.rst | 4 +++- docs/gallery-dl.conf | 2 +- gallery_dl/extractor/deviantart.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 5988d7be..2c6c6809 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1354,12 +1354,14 @@ extractor.deviantart.jwt 
Type ``bool`` Default - ``true`` + ``false`` Description Update `JSON Web Tokens `__ (the ``token`` URL parameter) of otherwise non-downloadable, low-resolution images to be able to download them in full resolution. + Note: No longer functional as of 2023-10-11 + extractor.deviantart.mature --------------------------- diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 2eac0a1c..9f126524 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -85,7 +85,7 @@ "group": true, "include": "gallery", "journals": "html", - "jwt": true, + "jwt": false, "mature": true, "metadata": false, "original": true, diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 94210967..69953adb 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -42,7 +42,7 @@ class DeviantartExtractor(Extractor): self.offset = 0 def _init(self): - self.jwt = self.config("jwt", True) + self.jwt = self.config("jwt", False) self.flat = self.config("flat", True) self.extra = self.config("extra", False) self.quality = self.config("quality", "100") From a9c3442d4e669f446c0af36fbd22712fe82244d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 12 Oct 2023 21:40:10 +0200 Subject: [PATCH 040/344] [deviantart] add a couple 'deactivated account' test URLs --- test/results/deviantart.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/test/results/deviantart.py b/test/results/deviantart.py index 91c6a0d7..82513ee3 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -93,6 +93,27 @@ __tests__ = ( "#count" : 1, }, +{ + "#url" : "https://www.deviantart.com/AlloyRabbit/gallery", + "#comment" : "deactivated account", + "#category": ("", "deviantart", "gallery"), + "#class" : deviantart.DeviantartGalleryExtractor, +}, + +{ + "#url" : "https://www.deviantart.com/Shydude/gallery", + "#comment" : "deactivated account", + "#category": ("", "deviantart", 
"gallery"), + "#class" : deviantart.DeviantartGalleryExtractor, +}, + +{ + "#url" : "https://www.deviantart.com/zapor666/gallery", + "#comment" : "deactivated account", + "#category": ("", "deviantart", "gallery"), + "#class" : deviantart.DeviantartGalleryExtractor, +}, + { "#url" : "https://www.deviantart.com/yakuzafc/gallery", "#comment" : "group", @@ -702,6 +723,13 @@ __tests__ = ( "#count" : 12, }, +{ + "#url" : "https://www.deviantart.com/chain-man/gallery/scraps", + "#comment" : "deactivated account" + "#category": ("", "deviantart", "scraps"), + "#class" : deviantart.DeviantartScrapsExtractor, +}, + { "#url" : "https://www.deviantart.com/shimoda7/gallery/?catpath=scraps", "#category": ("", "deviantart", "scraps"), From 2d4170276250e3ce85a0e7a86ab124dd7eddc9f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 12 Oct 2023 22:07:11 +0200 Subject: [PATCH 041/344] [deviantart] implement '"group": "skip"' (#4630) --- docs/configuration.rst | 10 +++++++++- gallery_dl/extractor/deviantart.py | 22 ++++++++++++++-------- test/results/deviantart.py | 12 +++++++++++- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 2c6c6809..64d1a0af 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1306,13 +1306,21 @@ Description extractor.deviantart.group -------------------------- Type - ``bool`` + * ``bool`` + * ``string`` Default ``true`` Description Check whether the profile name in a given URL belongs to a group or a regular user. + When disabled, assume every given profile name + belongs to a regular user. 
+ + Special values: + + * ``"skip"``: Skip groups + extractor.deviantart.include ---------------------------- diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 69953adb..78d8473a 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -91,14 +91,20 @@ class DeviantartExtractor(Extractor): return True def items(self): - if self.user and self.config("group", True): - profile = self.api.user_profile(self.user) - self.group = not profile - if self.group: - self.subcategory = "group-" + self.subcategory - self.user = self.user.lower() - else: - self.user = profile["user"]["username"] + if self.user: + group = self.config("group", True) + if group: + profile = self.api.user_profile(self.user) + if profile: + self.user = profile["user"]["username"] + self.group = False + elif group == "skip": + self.log.info("Skipping group '%s'", self.user) + raise exception.StopExtraction() + else: + self.subcategory = "group-" + self.subcategory + self.user = self.user.lower() + self.group = True for deviation in self.deviations(): if isinstance(deviation, tuple): diff --git a/test/results/deviantart.py b/test/results/deviantart.py index 82513ee3..e7ca59bb 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -123,6 +123,16 @@ __tests__ = ( "#count" : ">= 15", }, +{ + "#url" : "https://www.deviantart.com/yakuzafc/gallery", + "#comment" : "'group': 'skip' (#4630)", + "#category" : ("", "deviantart", "gallery"), + "#class" : deviantart.DeviantartGalleryExtractor, + "#options" : {"group": "skip"}, + "#exception": exception.StopExtraction, + "#count" : 0, +}, + { "#url" : "https://www.deviantart.com/justatest235723/gallery", "#comment" : "'folders' option (#276)", @@ -725,7 +735,7 @@ __tests__ = ( { "#url" : "https://www.deviantart.com/chain-man/gallery/scraps", - "#comment" : "deactivated account" + "#comment" : "deactivated account", "#category": ("", "deviantart", "scraps"), "#class" : 
deviantart.DeviantartScrapsExtractor, }, From 833dce141f3fe8b0bbb3a441c37bfd8c68989551 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 13 Oct 2023 20:06:39 +0200 Subject: [PATCH 042/344] [fantia] add 'content_count' and 'content_num' metadata fields (#4627) --- gallery_dl/extractor/fantia.py | 7 ++++++- test/results/fantia.py | 8 ++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index f1d51e21..8a59e096 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -42,7 +42,11 @@ class FantiaExtractor(Extractor): post = self._get_post_data(post_id) post["num"] = 0 - for content in self._get_post_contents(post): + contents = self._get_post_contents(post) + post["content_count"] = len(contents) + post["content_num"] = 0 + + for content in contents: files = self._process_content(post, content) yield Message.Directory, post @@ -131,6 +135,7 @@ class FantiaExtractor(Extractor): post["content_filename"] = content.get("filename") or "" post["content_id"] = content["id"] post["content_comment"] = content.get("comment") or "" + post["content_num"] += 1 post["plan"] = content["plan"] or self._empty_plan files = [] diff --git a/test/results/fantia.py b/test/results/fantia.py index 706b89d6..70773fb0 100644 --- a/test/results/fantia.py +++ b/test/results/fantia.py @@ -27,12 +27,12 @@ __tests__ = ( "#pattern" : r"https://(c\.fantia\.jp/uploads/post/file/1166373/|cc\.fantia\.jp/uploads/post_content_photo/file/732549[01]|fantia\.jp/posts/1166373/album_image\?)", "blogpost_text" : r"re:^$|This is a test.\n\nThis is a test.\n\n|Link to video:\nhttps://www.youtube.com/watch\?v=5SSdvNcAagI\n\nhtml img from another site:\n\n\n\n\n\n", - "comment" : """ - -""", + "comment" : "\n\n", "content_category": r"re:thumb|blog|photo_gallery", "content_comment" : str, + "content_count" : 5, "content_filename": r"re:|", + "content_num" : range(1, 5), 
"content_title" : r"re:Test (Blog Content \d+|Image Gallery)|thumb", "date" : "dt:2022-03-09 16:46:12", "fanclub_id" : 356320, @@ -56,7 +56,7 @@ __tests__ = ( "#url" : "https://fantia.jp/posts/508363", "#category": ("", "fantia", "post"), "#class" : fantia.FantiaPostExtractor, - "#count" : 6, + "#count" : 0, "post_title": "zunda逆バニーでおしりコッショリ", "tags" : list, From c4c4e4d2f4bc0d9a7135b63c50886d8f308c4ef3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 13 Oct 2023 19:52:08 +0200 Subject: [PATCH 043/344] [newgrounds] improve 'art-image' extraction (#4642) - download files in original resolution - replace .webp with extension of first file --- gallery_dl/extractor/newgrounds.py | 15 ++++++++++++++- test/results/newgrounds.py | 27 +++++++++++++++++++++------ 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index d754e653..b119c966 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -56,13 +56,26 @@ class NewgroundsExtractor(Extractor): yield Message.Directory, post yield Message.Url, url, text.nameext_from_url(url, post) + ext = post["extension"] for num, url in enumerate(text.extract_iter( post["_images"] + post["_comment"], 'data-smartload-src="', '"'), 1): post["num"] = num post["_index"] = "{}_{:>02}".format(post["index"], num) url = text.ensure_http_scheme(url) - yield Message.Url, url, text.nameext_from_url(url, post) + text.nameext_from_url(url, post) + + if "_fallback" in post: + del post["_fallback"] + + if "/comments/" not in url: + url = url.replace("/medium_views/", "/images/", 1) + if post["extension"] == "webp": + post["_fallback"] = (url,) + post["extension"] = ext + url = url.replace(".webp", "." 
+ ext) + + yield Message.Url, url, post else: self.log.warning( "Unable to get download URL for '%s'", post_url) diff --git a/test/results/newgrounds.py b/test/results/newgrounds.py index 1f956f19..991ab006 100644 --- a/test/results/newgrounds.py +++ b/test/results/newgrounds.py @@ -55,15 +55,30 @@ __tests__ = ( { "#url" : "https://www.newgrounds.com/art/view/zedrinbot/lewd-animation-tutorial", - "#comment" : "extra files in 'art-image-row' elements (#4642)", + "#comment" : "extra files in 'art-image-row' elements - WebP to GIF (#4642)", "#category": ("", "newgrounds", "image"), "#class" : newgrounds.NewgroundsImageExtractor, "#urls" : ( "https://art.ngfiles.com/images/5091000/5091275_45067_zedrinbot_untitled-5091275.0a9d27ed2bc265a7e89478ed6ad6f86f.gif?f1696187399", - "https://art.ngfiles.com/medium_views/5091000/5091275_45071_zedrinbot_untitled-5091275.6fdc62eaef43528fb1c9bda624d30a3d.webp?f1696187437", - "https://art.ngfiles.com/medium_views/5091000/5091275_45070_zedrinbot_untitled-5091275.0d7334746374465bd448908b88d1f810.webp?f1696187435", - "https://art.ngfiles.com/medium_views/5091000/5091275_45072_zedrinbot_untitled-5091275.6fdc62eaef43528fb1c9bda624d30a3d.webp?f1696187438", - "https://art.ngfiles.com/medium_views/5091000/5091275_45073_zedrinbot_untitled-5091275.20aa05c1cd22fd058e8c68ce58f5a302.webp?f1696187439", + "https://art.ngfiles.com/images/5091000/5091275_45071_zedrinbot_untitled-5091275.6fdc62eaef43528fb1c9bda624d30a3d.gif?f1696187437", + "https://art.ngfiles.com/images/5091000/5091275_45070_zedrinbot_untitled-5091275.0d7334746374465bd448908b88d1f810.gif?f1696187435", + "https://art.ngfiles.com/images/5091000/5091275_45072_zedrinbot_untitled-5091275.6fdc62eaef43528fb1c9bda624d30a3d.gif?f1696187438", + "https://art.ngfiles.com/images/5091000/5091275_45073_zedrinbot_untitled-5091275.20aa05c1cd22fd058e8c68ce58f5a302.gif?f1696187439", + ), +}, + +{ + "#url" : "https://www.newgrounds.com/art/view/zedrinbot/nazrin-tanlines", + "#comment" : "extra files in 
'art-image-row' elements - native PNG files (#4642)", + "#category": ("", "newgrounds", "image"), + "#class" : newgrounds.NewgroundsImageExtractor, + "#urls" : ( + "https://art.ngfiles.com/images/5009000/5009916_14628_zedrinbot_nazrin-tanlines.265f7b6beec5855a349e2646e90cbc01.png?f1695698131", + "https://art.ngfiles.com/images/5009000/5009916_14632_zedrinbot_nazrin-tanlines.40bd62fbf5875806cda6b004b348114a.png?f1695698148", + "https://art.ngfiles.com/images/5009000/5009916_14634_zedrinbot_nazrin-tanlines.40bd62fbf5875806cda6b004b348114a.png?f1695698148", + "https://art.ngfiles.com/images/5009000/5009916_14633_zedrinbot_nazrin-tanlines.40bd62fbf5875806cda6b004b348114a.png?f1695698148", + "https://art.ngfiles.com/images/5009000/5009916_14635_zedrinbot_nazrin-tanlines.6a7aa4fd63e5f8077ad29314568246cc.png?f1695698149", + "https://art.ngfiles.com/images/5009000/5009916_14636_zedrinbot_nazrin-tanlines.6a7aa4fd63e5f8077ad29314568246cc.png?f1695698149", ), }, @@ -73,7 +88,7 @@ __tests__ = ( "#category": ("", "newgrounds", "image"), "#class" : newgrounds.NewgroundsImageExtractor, "#options" : {"username": None}, - "#count" : 1, + "#count" : 0, }, { From 13ce3a9acb2b8bc66fb8a1f51d4b40a6d2f084fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 13 Oct 2023 23:03:39 +0200 Subject: [PATCH 044/344] [warosu] fix extraction (#4634) --- gallery_dl/extractor/warosu.py | 34 +++++++++---------- test/results/warosu.py | 62 +++++++++++++++++++++++++++++++--- 2 files changed, 74 insertions(+), 22 deletions(-) diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index 6f152ed7..8e6b842a 100644 --- a/gallery_dl/extractor/warosu.py +++ b/gallery_dl/extractor/warosu.py @@ -47,7 +47,7 @@ class WarosuThreadExtractor(Extractor): def metadata(self, page): boardname = text.extr(page, "", "") - title = text.extr(page, 'filetitle" itemprop="name">', '<') + title = text.unescape(text.extr(page, "class=filetitle>", "<")) return { "board" : self.board, 
"board_name": boardname.rpartition(" - ")[2], @@ -57,39 +57,37 @@ class WarosuThreadExtractor(Extractor): def posts(self, page): """Build a list of all post objects""" - page = text.extr(page, '
', '') - needle = '
' + page = text.extr(page, "
") + needle = "
" return [self.parse(post) for post in page.split(needle)] def parse(self, post): """Build post object by extracting data from an HTML post""" data = self._extract_post(post) - if "File:" in post: + if " File:" in post: self._extract_image(post, data) part = data["image"].rpartition("/")[2] data["tim"], _, data["extension"] = part.partition(".") data["ext"] = "." + data["extension"] return data - @staticmethod - def _extract_post(post): + def _extract_post(self, post): extr = text.extract_from(post) return { - "no" : extr('id="p', '"'), - "name": extr('', ""), - "time": extr(''), - "now" : extr("", "<"), + "no" : extr("id=p", ">"), + "name": extr("class=postername>", "<").strip(), + "time": extr("class=posttime title=", "000>"), + "now" : extr("", "<").strip(), "com" : text.unescape(text.remove_html(extr( - '

', '

' - ).strip())), + "
", "
").strip())), } - @staticmethod - def _extract_image(post, data): + def _extract_image(self, post, data): extr = text.extract_from(post) - data["fsize"] = extr("File: ", ", ") + data["fsize"] = extr(" File: ", ", ") data["w"] = extr("", "x") data["h"] = extr("", ", ") - data["filename"] = text.unquote(extr("", "<").rpartition(".")[0]) - extr("
", "") - data["image"] = "https:" + extr('", "") + data["image"] = self.root + extr("") diff --git a/test/results/warosu.py b/test/results/warosu.py index c9273de7..e476b508 100644 --- a/test/results/warosu.py +++ b/test/results/warosu.py @@ -12,17 +12,71 @@ __tests__ = ( "#url" : "https://warosu.org/jp/thread/16656025", "#category": ("", "warosu", "thread"), "#class" : warosu.WarosuThreadExtractor, - "#sha1_url" : "889d57246ed67e491e5b8f7f124e50ea7991e770", - "#sha1_metadata": "c00ea4c5460c5986994f17bb8416826d42ca57c0", + "#urls" : ( + "https://warosu.org/data/jp/img/0166/56/1488487280004.png", + "https://warosu.org/data/jp/img/0166/56/1488493239417.png", + "https://warosu.org/data/jp/img/0166/56/1488493636725.jpg", + "https://warosu.org/data/jp/img/0166/56/1488493700040.jpg", + "https://warosu.org/data/jp/img/0166/56/1488499585168.jpg", + "https://warosu.org/data/jp/img/0166/56/1488530851199.jpg", + "https://warosu.org/data/jp/img/0166/56/1488536072155.jpg", + "https://warosu.org/data/jp/img/0166/56/1488603426484.png", + "https://warosu.org/data/jp/img/0166/56/1488647021253.jpg", + "https://warosu.org/data/jp/img/0166/56/1488866825031.jpg", + "https://warosu.org/data/jp/img/0166/56/1489094956868.jpg", + ), }, { "#url" : "https://warosu.org/jp/thread/16658073", "#category": ("", "warosu", "thread"), "#class" : warosu.WarosuThreadExtractor, - "#sha1_url" : "4500cf3184b067424fd9883249bd543c905fbecd", - "#sha1_metadata": "7534edf4ec51891dbf44d775b73fbbefd52eec71", "#sha1_content" : "d48df0a701e6599312bfff8674f4aa5d4fb8db1c", + "#urls" : "https://warosu.org/data/jp/img/0166/58/1488521824388.jpg", + "#count" : 1, + + "board" : "jp", + "board_name": "Otaku Culture", + "com" : "Is this canon?", + "ext" : ".jpg", + "extension" : "jpg", + "filename" : "sadako-vs-kayako-movie-review", + "fsize" : "55 KB", + "h" : 675, + "image" : "https://warosu.org/data/jp/img/0166/58/1488521824388.jpg", + "name" : "Anonymous", + "no" : 16658073, + "now" : "Fri Mar 3 01:17:04 2017", + 
"thread" : "16658073", + "tim" : 1488521824388, + "time" : 1488503824, + "title" : "Is this canon?", + "w" : 450, +}, + +{ + "#url" : "https://warosu.org/ic/thread/4604652", + "#category": ("", "warosu", "thread"), + "#class" : warosu.WarosuThreadExtractor, + "#pattern" : r"https://warosu\.org/data/ic/img/0046/04/1590\d{9}\.jpg", + "#count" : 133, + + "board" : "ic", + "board_name": "Artwork/Critique", + "com" : str, + "ext" : ".jpg", + "filename" : str, + "fsize" : str, + "h" : range(200, 3507), + "image" : r"re:https://warosu\.org/data/ic/img/0046/04/1590\d+\.jpg", + "name" : "re:Anonymous|Dhe Specky Spider-Man", + "no" : range(4604652, 4620000), + "now" : r"re:\w\w\w \w\w\w \d\d \d\d:\d\d:\d\d 2020", + "thread" : "4604652", + "tim" : range(1590430159651, 1590755510488), + "time" : range(1590415759, 1590755510), + "title" : "American Classic Comic Artists", + "w" : range(200, 3000), }, ) From 830a48bca41d1f56b44082b34b75225c5bddf90b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 13 Oct 2023 23:04:47 +0200 Subject: [PATCH 045/344] [fantia] bad workaround for 833dce14 (#4627) at least this makes "filter": "content_num == content_count+1" with "event": "post-after" work --- gallery_dl/extractor/fantia.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index 8a59e096..4a67695f 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -63,6 +63,8 @@ class FantiaExtractor(Extractor): post["content_filename"] or file["file_url"], post) yield Message.Url, file["file_url"], post + post["content_num"] += 1 + def posts(self): """Return post IDs""" From 1671a90077771649fbd92924e95e1c058f64b7e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 14 Oct 2023 14:48:40 +0200 Subject: [PATCH 046/344] use Python 3.12 for GitHub Actions --- .github/workflows/executables.yml | 4 ++-- .github/workflows/tests.yml | 2 +- 2 files changed, 3 
insertions(+), 3 deletions(-) diff --git a/.github/workflows/executables.yml b/.github/workflows/executables.yml index e55ae82a..03251c86 100644 --- a/.github/workflows/executables.yml +++ b/.github/workflows/executables.yml @@ -11,12 +11,12 @@ jobs: matrix: os: ["windows-latest", "macOS-latest"] architecture: ["x64"] - python-version: ["3.11"] + python-version: ["3.12"] python-packages: [""] include: - os: "ubuntu-latest" architecture: "x64" - python-version: "3.11" + python-version: "3.12" python-packages: "secretstorage" - os: "windows-2019" architecture: "x86" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 168edb9b..46539068 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "pypy3.9"] + python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"] steps: - uses: actions/checkout@v3 From c6a3892210a7084183057071eea5c16d0f880a4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 14 Oct 2023 20:55:39 +0200 Subject: [PATCH 047/344] [imgbb] update username extraction (#4626) --- gallery_dl/extractor/imgbb.py | 33 +++++++++++++++-------- test/results/imgbb.py | 50 ++++++++++++++++++++++++----------- 2 files changed, 57 insertions(+), 26 deletions(-) diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index bf7bd8f9..6c0684ed 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -84,6 +84,13 @@ class ImgbbExtractor(Extractor): raise exception.AuthenticationError() return self.cookies + def _extract_resource(self, page): + return util.json_loads(text.extr( + page, "CHV.obj.resource=", "};") + "}") + + def _extract_user(self, page): + return self._extract_resource(page).get("user") or {} + def _pagination(self, page, endpoint, params): data = None seek, pos = text.extract(page, 
'data-seek="', '"') @@ -124,14 +131,14 @@ class ImgbbAlbumExtractor(ImgbbExtractor): self.page_url = "https://ibb.co/album/" + self.album_id def metadata(self, page): - album , pos = text.extract(page, '"og:title" content="', '"') - displayname, pos = text.extract(page, '"user":{"name":"', '"', pos) - username , pos = text.extract(page, ',"username":"', '"', pos) + album = text.extr(page, '"og:title" content="', '"') + user = self._extract_user(page) return { "album_id" : self.album_id, "album_name" : text.unescape(album), - "user" : username.lower() if username else "", - "displayname": displayname or "", + "user" : user.get("username") or "", + "user_id" : user.get("id") or "", + "displayname": user.get("name") or "", } def images(self, page): @@ -160,11 +167,11 @@ class ImgbbUserExtractor(ImgbbExtractor): self.page_url = "https://{}.imgbb.com/".format(self.user) def metadata(self, page): - displayname, pos = text.extract(page, '"user":{"name":"', '"') - username , pos = text.extract(page, ',"username":"', '"', pos) + user = self._extract_user(page) return { - "user" : username or self.user, - "displayname": displayname or "", + "user" : user.get("username") or self.user, + "user_id" : user.get("id") or "", + "displayname": user.get("name") or "", } def images(self, page): @@ -188,7 +195,9 @@ class ImgbbImageExtractor(ImgbbExtractor): def items(self): url = "https://ibb.co/" + self.image_id - extr = text.extract_from(self.request(url).text) + page = self.request(url).text + extr = text.extract_from(page) + user = self._extract_user(page) image = { "id" : self.image_id, @@ -197,7 +206,9 @@ class ImgbbImageExtractor(ImgbbExtractor): "url" : extr('"og:image" content="', '"'), "width" : text.parse_int(extr('"og:image:width" content="', '"')), "height": text.parse_int(extr('"og:image:height" content="', '"')), - "user" : extr(',"username":"', '"').lower(), + "user" : user.get("username") or "", + "user_id" : user.get("id") or "", + "displayname": user.get("name") or 
"", } image["extension"] = text.ext_from_url(image["url"]) diff --git a/test/results/imgbb.py b/test/results/imgbb.py index adc1dcf9..b2351d0f 100644 --- a/test/results/imgbb.py +++ b/test/results/imgbb.py @@ -13,18 +13,34 @@ __tests__ = ( "#url" : "https://ibb.co/album/i5PggF", "#category": ("", "imgbb", "album"), "#class" : imgbb.ImgbbAlbumExtractor, - "#range" : "1-80", - "#sha1_url" : "70afec9fcc3a6de62a6b644b487d892d8d47cf1a", - "#sha1_metadata": "569e1d88ebdd27655387559cdf1cd526a3e1ab69", + "#patten" : r"https://i\.ibb\.co/\w{7}/[\w-]+\.jpg", + "#count" : 91, + "#sha1_url" : "efe7e5a76531436e3b82c87e4ebd34c4dfeb484c", + "#sha1_metadata": "f1ab5492adb6333409f3367566a6dd7110537e21", + + "album_id" : "i5PggF", + "album_name" : "British Scrap Book", + "extension" : "jpg", + "id" : "re:^\w{7}$", + "title" : str, + "url" : r"re:https://i\.ibb\.co/\w{7}/[\w-]+\.jpg", + "user" : "folkie", + "user_id" : "GvFMGK", + "displayname": "Folkie", + "width" : range(501, 1034), + "height" : range(335, 768), + "size" : range(74758, 439037), }, { "#url" : "https://ibb.co/album/i5PggF?sort=title_asc", + "#comment" : "'sort' query argument", "#category": ("", "imgbb", "album"), "#class" : imgbb.ImgbbAlbumExtractor, - "#range" : "1-80", - "#sha1_url" : "afdf5fc95d8e09d77e8f44312f3e9b843987bb5a", - "#sha1_metadata": "f090e14d0e5f7868595082b2c95da1309c84872d", + "#patten" : r"https://i\.ibb\.co/\w{7}/[\w-]+\.jpg", + "#count" : 91, + "#sha1_url" : "cde36552cc132a27178f22a1b9aceaa4df7e1575", + "#sha1_metadata": "b98bbb7671e31ebf9c7585fb9fc691b71bcdb546", }, { @@ -34,7 +50,9 @@ __tests__ = ( "#class" : imgbb.ImgbbAlbumExtractor, "#sha1_url": "ac0abcfcb89f4df6adc2f7e4ff872f3b03ef1bc7", - "user": "", + "displayname": "", + "user" : "", + "user_id" : "", }, { @@ -49,7 +67,7 @@ __tests__ = ( "#url" : "https://folkie.imgbb.com", "#category": ("", "imgbb", "user"), "#class" : imgbb.ImgbbUserExtractor, - "#pattern" : r"https?://i\.ibb\.co/\w+/[^/?#]+", + "#patten" : 
r"https://i\.ibb\.co/\w{7}/[\w-]+\.jpg", "#range" : "1-80", }, @@ -60,13 +78,15 @@ __tests__ = ( "#pattern" : r"https://i\.ibb\.co/g3kvx80/Arundel-Ireeman-5\.jpg", "#sha1_content": "c5a0965178a8b357acd8aa39660092918c63795e", - "id" : "fUqh5b", - "title" : "Arundel Ireeman 5", - "url" : "https://i.ibb.co/g3kvx80/Arundel-Ireeman-5.jpg", - "width" : 960, - "height" : 719, - "user" : "folkie", - "extension": "jpg", + "id" : "fUqh5b", + "title" : "Arundel Ireeman 5", + "url" : "https://i.ibb.co/g3kvx80/Arundel-Ireeman-5.jpg", + "width" : 960, + "height" : 719, + "user" : "folkie", + "user_id" : "GvFMGK", + "displayname": "Folkie", + "extension" : "jpg", }, ) From 6dfe200ae4a93b8096b6ed87728fdff58319fbbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 15 Oct 2023 19:43:57 +0200 Subject: [PATCH 048/344] [kemonoparty] support discord URLs with channel IDs (#4662) --- gallery_dl/extractor/kemonoparty.py | 46 ++++++++++++++++++----------- test/results/kemonoparty.py | 21 +++++++++++++ 2 files changed, 49 insertions(+), 18 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 894c6719..d8ce4f79 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -10,7 +10,7 @@ from .common import Extractor, Message from .. 
import text, exception -from ..cache import cache +from ..cache import cache, memcache import itertools import re @@ -205,6 +205,12 @@ class KemonopartyExtractor(Extractor): }) return dms + @memcache(keyarg=1) + def _discord_channels(self, server): + url = "{}/api/discord/channels/lookup?q={}".format( + self.root, server) + return self.request(url).json() + def _validate(response): return (response.headers["content-length"] != "9" or @@ -270,11 +276,29 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): def __init__(self, match): KemonopartyExtractor.__init__(self, match) - _, _, self.server, self.channel, self.channel_name = match.groups() + _, _, self.server, self.channel_id, self.channel = match.groups() + self.channel_name = "" def items(self): self._prepare_ddosguard_cookies() + if self.channel_id: + self.channel_name = self.channel + else: + if self.channel.isdecimal() and len(self.channel) >= 16: + key = "id" + else: + key = "name" + + for channel in self._discord_channels(self.server): + if channel[key] == self.channel: + break + else: + raise exception.NotFoundError("channel") + + self.channel_id = channel["id"] + self.channel_name = channel["name"] + find_inline = re.compile( r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)" r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall @@ -319,17 +343,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): yield Message.Url, url, post def posts(self): - if self.channel is None: - url = "{}/api/discord/channels/lookup?q={}".format( - self.root, self.server) - for channel in self.request(url).json(): - if channel["name"] == self.channel_name: - self.channel = channel["id"] - break - else: - raise exception.NotFoundError("channel") - - url = "{}/api/discord/channel/{}".format(self.root, self.channel) + url = "{}/api/discord/channel/{}".format(self.root, self.channel_id) params = {"skip": 0} while True: @@ -352,11 +366,7 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor): 
self.server = match.group(3) def items(self): - url = "{}/api/discord/channels/lookup?q={}".format( - self.root, self.server) - channels = self.request(url).json() - - for channel in channels: + for channel in self._discord_channels(self.server): url = "{}/discord/server/{}/channel/{}#{}".format( self.root, self.server, channel["id"], channel["name"]) channel["_extractor"] = KemonopartyDiscordExtractor diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index f5419f02..005ade4d 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -155,12 +155,33 @@ __tests__ = ( "#class" : kemonoparty.KemonopartyPostExtractor, }, +{ + "#url" : "https://kemono.party/discord/server/488668827274444803#608504710906904576", + "#category": ("", "kemonoparty", "discord"), + "#class" : kemonoparty.KemonopartyDiscordExtractor, + "#count" : 4, + + "channel" : "608504710906904576", + "channel_name": "finish-work", +}, + { "#url" : "https://kemono.party/discord/server/488668827274444803#finish-work", "#category": ("", "kemonoparty", "discord"), "#class" : kemonoparty.KemonopartyDiscordExtractor, "#count" : 4, + "channel" : "608504710906904576", + "channel_name": "finish-work", +}, + +{ + "#url" : "https://kemono.party/discord/server/488668827274444803/channel/608504710906904576#finish-work", + "#category": ("", "kemonoparty", "discord"), + "#class" : kemonoparty.KemonopartyDiscordExtractor, + "#count" : 4, + + "channel" : "608504710906904576", "channel_name": "finish-work", }, From ade8347eadd4b633b7b053b174c56961f14a0961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 15 Oct 2023 19:54:28 +0200 Subject: [PATCH 049/344] [kemonoparty] fix DM dates --- gallery_dl/extractor/kemonoparty.py | 7 ++++--- test/results/kemonoparty.py | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index d8ce4f79..6f6ba70d 100644 --- 
a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -197,11 +197,12 @@ class KemonopartyExtractor(Extractor): dms = [] for dm in text.extract_iter(page, ""): + footer = text.extr(dm, "") dms.append({ - "body": text.unescape(text.extract( + "body": text.unescape(text.extr( dm, "
", "
Date: Mon, 16 Oct 2023 16:51:30 +0530 Subject: [PATCH 050/344] [redgifs] fix 'niches' extraction --- gallery_dl/extractor/redgifs.py | 15 +++++++++++---- test/results/redgifs.py | 8 ++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index e246405a..6185acb6 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -146,11 +146,17 @@ class RedgifsCollectionsExtractor(RedgifsExtractor): class RedgifsNichesExtractor(RedgifsExtractor): """Extractor for redgifs niches""" subcategory = "niches" - pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/niches/([^/?#]+)" + pattern = (r"(?:https?://)?(?:www\.)?redgifs\.com/niches/([^/?#]+)/?" + r"(?:\?([^#]+))?$") example = "https://www.redgifs.com/niches/NAME" + def __init__(self, match): + RedgifsExtractor.__init__(self, match) + self.query = match.group(2) + def gifs(self): - return self.api.niches(self.key) + order = text.parse_query(self.query).get("order") + return self.api.niches(self.key, order or "new") class RedgifsSearchExtractor(RedgifsExtractor): @@ -232,9 +238,10 @@ class RedgifsAPI(): endpoint = "/v2/users/{}/collections".format(user) return self._pagination(endpoint, key="collections") - def niches(self, niche): + def niches(self, niche, order): endpoint = "/v2/niches/{}/gifs".format(niche) - return self._pagination(endpoint) + params = {"count": 30, "order": order} + return self._pagination(endpoint, params) def search(self, params): endpoint = "/v2/gifs/search" diff --git a/test/results/redgifs.py b/test/results/redgifs.py index 669f2a23..0febcf53 100644 --- a/test/results/redgifs.py +++ b/test/results/redgifs.py @@ -67,19 +67,19 @@ __tests__ = ( }, { - "#url" : "https://www.redgifs.com/niches/boobs", + "#url" : "https://www.redgifs.com/niches/just-boobs", "#category": ("", "redgifs", "niches"), "#class" : redgifs.RedgifsNichesExtractor, - "#pattern" : r"https://\w+\.redgifs\.com/[\w-]+\.mp4", + 
"#pattern" : r"https://\w+\.redgifs\.com/[\w-]+\.(mp4|jpg)", "#range" : "1-20", "#count" : 20, }, { - "#url" : "https://www.redgifs.com/niches/ass", + "#url" : "https://www.redgifs.com/niches/thick-booty", "#category": ("", "redgifs", "niches"), "#class" : redgifs.RedgifsNichesExtractor, - "#pattern" : r"https://\w+\.redgifs\.com/[\w-]+\.mp4", + "#pattern" : r"https://\w+\.redgifs\.com/[\w-]+\.(mp4|jpg)", "#range" : "1-20", "#count" : 20, }, From 2911ed12409b39f77f32456f5cb40c1fef3a6ab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 15 Oct 2023 23:50:36 +0200 Subject: [PATCH 051/344] [chevereto] add generic extractors (#4664) - support jpgfish - support pixl.li / pixl.is (#3179, #4357) --- docs/supportedsites.md | 22 +++++-- gallery_dl/extractor/__init__.py | 2 +- gallery_dl/extractor/chevereto.py | 105 ++++++++++++++++++++++++++++++ gallery_dl/extractor/jpgfish.py | 105 ------------------------------ test/results/jpgfish.py | 92 ++++++++++++++------------ test/results/pixl.py | 63 ++++++++++++++++++ 6 files changed, 234 insertions(+), 155 deletions(-) create mode 100644 gallery_dl/extractor/chevereto.py delete mode 100644 gallery_dl/extractor/jpgfish.py create mode 100644 test/results/pixl.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index dafd0f27..6d4fd9c0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -427,12 +427,6 @@ Consider all sites to be NSFW unless otherwise known.
- - - - - - @@ -998,6 +992,22 @@ Consider all sites to be NSFW unless otherwise known. + + + + + + + + + + + + + + + + diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 3abe74b6..1c1473a0 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -28,6 +28,7 @@ modules = [ "blogger", "bunkr", "catbox", + "chevereto", "comicvine", "cyberdrop", "danbooru", @@ -73,7 +74,6 @@ modules = [ "issuu", "itaku", "itchio", - "jpgfish", "jschan", "kabeuchi", "keenspot", diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py new file mode 100644 index 00000000..f7824e2f --- /dev/null +++ b/gallery_dl/extractor/chevereto.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Chevereto galleries""" + +from .common import BaseExtractor, Message +from .. import text + + +class CheveretoExtractor(BaseExtractor): + """Base class for chevereto extractors""" + basecategory = "chevereto" + directory_fmt = ("{category}", "{user}", "{album}",) + archive_fmt = "{id}" + + def __init__(self, match): + BaseExtractor.__init__(self, match) + self.path = match.group(match.lastindex) + + def _pagination(self, url): + while url: + page = self.request(url).text + + for item in text.extract_iter( + page, '
<') + + +BASE_PATTERN = CheveretoExtractor.update({ + "jpgfish": { + "root": "https://jpg2.su", + "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)", + }, + "pixl": { + "root": "https://pixl.li", + "pattern": r"pixl\.(?:li|is)", + }, +}) + + +class CheveretoImageExtractor(CheveretoExtractor): + """Extractor for chevereto Images""" + subcategory = "image" + pattern = BASE_PATTERN + r"(/im(?:g|age)/[^/?#]+)" + example = "https://jpg2.su/img/TITLE.ID" + + def items(self): + url = self.root + self.path + extr = text.extract_from(self.request(url).text) + + image = { + "id" : self.path.rpartition(".")[2], + "url" : extr('"), ">", "<"), + "user" : extr('username: "', '"'), + } + + text.nameext_from_url(image["url"], image) + yield Message.Directory, image + yield Message.Url, image["url"], image + + +class CheveretoAlbumExtractor(CheveretoExtractor): + """Extractor for chevereto Albums""" + subcategory = "album" + pattern = BASE_PATTERN + r"(/a(?:lbum)?/[^/?#]+(?:/sub)?)" + example = "https://jpg2.su/album/TITLE.ID" + + def items(self): + url = self.root + self.path + data = {"_extractor": CheveretoImageExtractor} + + if self.path.endswith("/sub"): + albums = self._pagination(url) + else: + albums = (url,) + + for album in albums: + for image in self._pagination(album): + yield Message.Queue, image, data + + +class CheveretoUserExtractor(CheveretoExtractor): + """Extractor for chevereto Users""" + subcategory = "user" + pattern = BASE_PATTERN + r"(/(?!img|image|a(?:lbum)?)[^/?#]+(?:/albums)?)" + example = "https://jpg2.su/USER" + + def items(self): + url = self.root + self.path + + if self.path.endswith("/albums"): + data = {"_extractor": CheveretoAlbumExtractor} + else: + data = {"_extractor": CheveretoImageExtractor} + + for url in self._pagination(url): + yield Message.Queue, url, data diff --git a/gallery_dl/extractor/jpgfish.py b/gallery_dl/extractor/jpgfish.py deleted file mode 100644 index 8862a7b7..00000000 --- a/gallery_dl/extractor/jpgfish.py +++ /dev/null 
@@ -1,105 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://jpg1.su/""" - -from .common import Extractor, Message -from .. import text - -BASE_PATTERN = r"(?:https?://)?jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)" - - -class JpgfishExtractor(Extractor): - """Base class for jpgfish extractors""" - category = "jpgfish" - root = "https://jpg1.su" - directory_fmt = ("{category}", "{user}", "{album}",) - archive_fmt = "{id}" - - def _pagination(self, url): - while url: - page = self.request(url).text - - for item in text.extract_iter( - page, '
<')[0] - - -class JpgfishImageExtractor(JpgfishExtractor): - """Extractor for jpgfish Images""" - subcategory = "image" - pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))" - example = "https://jpg1.su/img/TITLE.ID" - - def __init__(self, match): - JpgfishExtractor.__init__(self, match) - self.path, self.image_id = match.groups() - - def items(self): - url = "{}/img/{}".format(self.root, self.path) - extr = text.extract_from(self.request(url).text) - - image = { - "id" : self.image_id, - "url" : extr('"), ">", "<")[0] or "", - "user" : extr('username: "', '"'), - } - - text.nameext_from_url(image["url"], image) - yield Message.Directory, image - yield Message.Url, image["url"], image - - -class JpgfishAlbumExtractor(JpgfishExtractor): - """Extractor for jpgfish Albums""" - subcategory = "album" - pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?" - example = "https://jpg1.su/album/TITLE.ID" - - def __init__(self, match): - JpgfishExtractor.__init__(self, match) - self.album, self.sub_albums = match.groups() - - def items(self): - url = "{}/a/{}".format(self.root, self.album) - data = {"_extractor": JpgfishImageExtractor} - - if self.sub_albums: - albums = self._pagination(url + "/sub") - else: - albums = (url,) - - for album in albums: - for image in self._pagination(album): - yield Message.Queue, image, data - - -class JpgfishUserExtractor(JpgfishExtractor): - """Extractor for jpgfish Users""" - subcategory = "user" - pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?" 
- example = "https://jpg1.su/USER" - - def __init__(self, match): - JpgfishExtractor.__init__(self, match) - self.user, self.albums = match.groups() - - def items(self): - url = "{}/{}".format(self.root, self.user) - - if self.albums: - url += "/albums" - data = {"_extractor": JpgfishAlbumExtractor} - else: - data = {"_extractor": JpgfishImageExtractor} - - for url in self._pagination(url): - yield Message.Queue, url, data diff --git a/test/results/jpgfish.py b/test/results/jpgfish.py index 5aa4a126..bf35bf7a 100644 --- a/test/results/jpgfish.py +++ b/test/results/jpgfish.py @@ -4,15 +4,15 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -from gallery_dl.extractor import jpgfish +from gallery_dl.extractor import chevereto __tests__ = ( { - "#url" : "https://jpg1.su/img/funnymeme.LecXGS", - "#category": ("", "jpgfish", "image"), - "#class" : jpgfish.JpgfishImageExtractor, - "#pattern" : r"https://simp3\.jpg\.church/images/funnymeme\.jpg", + "#url" : "https://jpg2.su/img/funnymeme.LecXGS", + "#category": ("chevereto", "jpgfish", "image"), + "#class" : chevereto.CheveretoImageExtractor, + "#urls" : "https://simp3.jpg.church/images/funnymeme.jpg", "#sha1_content": "098e5e9b17ad634358426e0ffd1c93871474d13c", "album" : "", @@ -25,125 +25,131 @@ __tests__ = ( { "#url" : "https://jpg.church/img/auCruA", - "#category": ("", "jpgfish", "image"), - "#class" : jpgfish.JpgfishImageExtractor, + "#category": ("chevereto", "jpgfish", "image"), + "#class" : chevereto.CheveretoImageExtractor, "#pattern" : r"https://simp2\.jpg\.church/hannahowo_00457\.jpg", "album": "401-500", }, +{ + "#url" : "https://jpg1.su/img/funnymeme.LecXGS", + "#category": ("chevereto", "jpgfish", "image"), + "#class" : chevereto.CheveretoImageExtractor, +}, + { "#url" : "https://jpeg.pet/img/funnymeme.LecXGS", - "#category": ("", "jpgfish", "image"), - "#class" : jpgfish.JpgfishImageExtractor, + "#category": ("chevereto", "jpgfish", "image"), 
+ "#class" : chevereto.CheveretoImageExtractor, }, { "#url" : "https://jpg.pet/img/funnymeme.LecXGS", - "#category": ("", "jpgfish", "image"), - "#class" : jpgfish.JpgfishImageExtractor, + "#category": ("chevereto", "jpgfish", "image"), + "#class" : chevereto.CheveretoImageExtractor, }, { "#url" : "https://jpg.fishing/img/funnymeme.LecXGS", - "#category": ("", "jpgfish", "image"), - "#class" : jpgfish.JpgfishImageExtractor, + "#category": ("chevereto", "jpgfish", "image"), + "#class" : chevereto.CheveretoImageExtractor, }, { "#url" : "https://jpg.fish/img/funnymeme.LecXGS", - "#category": ("", "jpgfish", "image"), - "#class" : jpgfish.JpgfishImageExtractor, + "#category": ("chevereto", "jpgfish", "image"), + "#class" : chevereto.CheveretoImageExtractor, }, { "#url" : "https://jpg.church/img/funnymeme.LecXGS", - "#category": ("", "jpgfish", "image"), - "#class" : jpgfish.JpgfishImageExtractor, + "#category": ("chevereto", "jpgfish", "image"), + "#class" : chevereto.CheveretoImageExtractor, }, { "#url" : "https://jpg1.su/album/CDilP/?sort=date_desc&page=1", - "#category": ("", "jpgfish", "album"), - "#class" : jpgfish.JpgfishAlbumExtractor, + "#category": ("chevereto", "jpgfish", "album"), + "#class" : chevereto.CheveretoAlbumExtractor, "#count" : 2, }, { "#url" : "https://jpg.fishing/a/gunggingnsk.N9OOI", - "#category": ("", "jpgfish", "album"), - "#class" : jpgfish.JpgfishAlbumExtractor, + "#category": ("chevereto", "jpgfish", "album"), + "#class" : chevereto.CheveretoAlbumExtractor, "#count" : 114, }, { "#url" : "https://jpg.fish/a/101-200.aNJ6A/", - "#category": ("", "jpgfish", "album"), - "#class" : jpgfish.JpgfishAlbumExtractor, + "#category": ("chevereto", "jpgfish", "album"), + "#class" : chevereto.CheveretoAlbumExtractor, "#count" : 100, }, { "#url" : "https://jpg.church/a/hannahowo.aNTdH/sub", - "#category": ("", "jpgfish", "album"), - "#class" : jpgfish.JpgfishAlbumExtractor, + "#category": ("chevereto", "jpgfish", "album"), + "#class" : 
chevereto.CheveretoAlbumExtractor, "#count" : 606, }, { "#url" : "https://jpeg.pet/album/CDilP/?sort=date_desc&page=1", - "#category": ("", "jpgfish", "album"), - "#class" : jpgfish.JpgfishAlbumExtractor, + "#category": ("chevereto", "jpgfish", "album"), + "#class" : chevereto.CheveretoAlbumExtractor, }, { "#url" : "https://jpg.pet/album/CDilP/?sort=date_desc&page=1", - "#category": ("", "jpgfish", "album"), - "#class" : jpgfish.JpgfishAlbumExtractor, + "#category": ("chevereto", "jpgfish", "album"), + "#class" : chevereto.CheveretoAlbumExtractor, }, { "#url" : "https://jpg1.su/exearco", - "#category": ("", "jpgfish", "user"), - "#class" : jpgfish.JpgfishUserExtractor, + "#category": ("chevereto", "jpgfish", "user"), + "#class" : chevereto.CheveretoUserExtractor, "#count" : 3, }, { "#url" : "https://jpg.church/exearco/albums", - "#category": ("", "jpgfish", "user"), - "#class" : jpgfish.JpgfishUserExtractor, + "#category": ("chevereto", "jpgfish", "user"), + "#class" : chevereto.CheveretoUserExtractor, "#count" : 1, }, { "#url" : "https://jpeg.pet/exearco", - "#category": ("", "jpgfish", "user"), - "#class" : jpgfish.JpgfishUserExtractor, + "#category": ("chevereto", "jpgfish", "user"), + "#class" : chevereto.CheveretoUserExtractor, }, { "#url" : "https://jpg.pet/exearco", - "#category": ("", "jpgfish", "user"), - "#class" : jpgfish.JpgfishUserExtractor, + "#category": ("chevereto", "jpgfish", "user"), + "#class" : chevereto.CheveretoUserExtractor, }, { "#url" : "https://jpg.fishing/exearco", - "#category": ("", "jpgfish", "user"), - "#class" : jpgfish.JpgfishUserExtractor, + "#category": ("chevereto", "jpgfish", "user"), + "#class" : chevereto.CheveretoUserExtractor, }, { "#url" : "https://jpg.fish/exearco", - "#category": ("", "jpgfish", "user"), - "#class" : jpgfish.JpgfishUserExtractor, + "#category": ("chevereto", "jpgfish", "user"), + "#class" : chevereto.CheveretoUserExtractor, }, { "#url" : "https://jpg.church/exearco", - "#category": ("", "jpgfish", 
"user"), - "#class" : jpgfish.JpgfishUserExtractor, + "#category": ("chevereto", "jpgfish", "user"), + "#class" : chevereto.CheveretoUserExtractor, }, ) diff --git a/test/results/pixl.py b/test/results/pixl.py new file mode 100644 index 00000000..e82353ee --- /dev/null +++ b/test/results/pixl.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import chevereto + + +__tests__ = ( +{ + "#url" : "https://pixl.li/image/894x1023-1c8d6dd3b1b0cd4b0d286b229157a7de.z3DwHB", + "#category": ("chevereto", "pixl", "image"), + "#class" : chevereto.CheveretoImageExtractor, + "#urls" : "https://i.pixl.li/894x1023_1c8d6dd3b1b0cd4b0d286b229157a7de.jpg", + "#sha1_content": "3279b86d0ac42348c703770c4781ecdc300fc13c", + + "album": "", + "extension": "jpg", + "filename": "894x1023_1c8d6dd3b1b0cd4b0d286b229157a7de", + "id": "z3DwHB", + "url": "https://i.pixl.li/894x1023_1c8d6dd3b1b0cd4b0d286b229157a7de.jpg", + "user": "matafaka1", +}, + +{ + "#url" : "https://pixl.is/image/894x1023-1c8d6dd3b1b0cd4b0d286b229157a7de.z3DwHB", + "#category": ("chevereto", "pixl", "image"), + "#class" : chevereto.CheveretoImageExtractor, +}, + +{ + "#url" : "https://pixl.li/album/estelasaubi.D0bJf", + "#category": ("chevereto", "pixl", "album"), + "#class" : chevereto.CheveretoAlbumExtractor, + "#pattern" : chevereto.CheveretoImageExtractor.pattern, + "#count" : 173, +}, + +{ + "#url" : "https://pixl.li/mjstik", + "#category": ("chevereto", "pixl", "user"), + "#class" : chevereto.CheveretoUserExtractor, + "#pattern" : chevereto.CheveretoImageExtractor.pattern, + "#range" : "1-20", + "#count" : 20, +}, + +{ + "#url" : "https://pixl.li/mjstik/albums", + "#category": ("chevereto", "pixl", "user"), + "#class" : chevereto.CheveretoUserExtractor, + "#pattern" : chevereto.CheveretoAlbumExtractor.pattern, + 
"#count" : 285, +}, + +{ + "#url" : "https://pixl.is/renford/albums", + "#category": ("chevereto", "pixl", "user"), + "#class" : chevereto.CheveretoUserExtractor, +}, + +) From 390d14dbccbadbf49eda97d7cb4f6df1d2c873cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 16 Oct 2023 18:14:30 +0200 Subject: [PATCH 052/344] [chevereto] support 'img.kiwi' and 'deltaporno.com' (#4664, #1381) --- docs/supportedsites.md | 12 ++++++++ gallery_dl/extractor/chevereto.py | 8 +++++ scripts/supportedsites.py | 2 ++ test/results/deltaporno.py | 41 +++++++++++++++++++++++++ test/results/imgkiwi.py | 51 +++++++++++++++++++++++++++++++ 5 files changed, 114 insertions(+) create mode 100644 test/results/deltaporno.py create mode 100644 test/results/imgkiwi.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 6d4fd9c0..16187315 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1007,6 +1007,18 @@ Consider all sites to be NSFW unless otherwise known.
+ + + + + + + + + + + + diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py index f7824e2f..21166bdb 100644 --- a/gallery_dl/extractor/chevereto.py +++ b/gallery_dl/extractor/chevereto.py @@ -42,6 +42,14 @@ BASE_PATTERN = CheveretoExtractor.update({ "root": "https://pixl.li", "pattern": r"pixl\.(?:li|is)", }, + "imgkiwi": { + "root": "https://img.kiwi", + "pattern": r"img\.kiwi", + }, + "deltaporno": { + "root": "https://gallery.deltaporno.com", + "pattern": r"gallery\.deltaporno\.com", + }, }) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 7dd69c56..470b629d 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -35,6 +35,7 @@ CATEGORY_MAP = { "bbc" : "BBC", "comicvine" : "Comic Vine", "coomerparty" : "Coomer", + "deltaporno" : "DeltaPorno", "deviantart" : "DeviantArt", "drawfriends" : "Draw Friends", "dynastyscans" : "Dynasty Reader", @@ -65,6 +66,7 @@ CATEGORY_MAP = { "imgbb" : "ImgBB", "imgbox" : "imgbox", "imagechest" : "ImageChest", + "imgkiwi" : "IMG.Kiwi", "imgth" : "imgth", "imgur" : "imgur", "joyreactor" : "JoyReactor", diff --git a/test/results/deltaporno.py b/test/results/deltaporno.py new file mode 100644 index 00000000..5bc4307b --- /dev/null +++ b/test/results/deltaporno.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import chevereto + + +__tests__ = ( +{ + "#url" : "https://gallery.deltaporno.com/image/7af6c7e241c600cd83dffdb22d4a1bb83336ede3b04bf406b36abf7b6f7dc4d8.8Gchu", + "#category": ("chevereto", "deltaporno", "image"), + "#class" : chevereto.CheveretoImageExtractor, + "#urls" : "https://gallery.deltaporno.com/images/2023/02/16/7af6c7e241c600cd83dffdb22d4a1bb83336ede3b04bf406b36abf7b6f7dc4d82e43ec48389730d4.jpg", + "#sha1_content": "f7e2a138b00c0742ccd77ab4031703bd8cc5b5a7", + + "album" : "Urmumanddad321 nude", + "extension": "jpg", + "filename" : "7af6c7e241c600cd83dffdb22d4a1bb83336ede3b04bf406b36abf7b6f7dc4d82e43ec48389730d4", + "id" : "8Gchu", + "url" : "https://gallery.deltaporno.com/images/2023/02/16/7af6c7e241c600cd83dffdb22d4a1bb83336ede3b04bf406b36abf7b6f7dc4d82e43ec48389730d4.jpg", + "user" : "delta", +}, + +{ + "#url" : "https://gallery.deltaporno.com/album/urmumanddad321-nude.RqCYu", + "#category": ("chevereto", "deltaporno", "album"), + "#class" : chevereto.CheveretoAlbumExtractor, + "#pattern" : chevereto.CheveretoImageExtractor.pattern, + "#count" : 28, + "#sha1_url": "fab8121bce72a9db2d1ed1e7520317a7a454d6c5", +}, + +{ + "#url" : "https://gallery.deltaporno.com/delta", + "#category": ("chevereto", "deltaporno", "user"), + "#class" : chevereto.CheveretoUserExtractor, +}, + +) diff --git a/test/results/imgkiwi.py b/test/results/imgkiwi.py new file mode 100644 index 00000000..16a4aa96 --- /dev/null +++ b/test/results/imgkiwi.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import chevereto + + +__tests__ = ( +{ + "#url" : "https://img.kiwi/image/79de2c41-70f9-4a87-bd6d-00fe9997c0c4.JR2wZz", + "#category": ("chevereto", "imgkiwi", "image"), + "#class" : chevereto.CheveretoImageExtractor, + "#urls" : "https://img.kiwi/images/2023/02/28/11ac1ebf28a2eae8265026b28e9c4413.jpg", + "#sha1_content": "9ea704a77e2038b9008350682cfad53a614a60bd", + + "album" : "Kins3y Wolansk1", + "extension": "jpg", + "filename" : "11ac1ebf28a2eae8265026b28e9c4413", + "id" : "JR2wZz", + "url" : "https://img.kiwi/images/2023/02/28/11ac1ebf28a2eae8265026b28e9c4413.jpg", + "user" : "johnirl", +}, + +{ + "#url" : "https://img.kiwi/album/kins3y-wolansk1.8Jxc", + "#category": ("chevereto", "imgkiwi", "album"), + "#class" : chevereto.CheveretoAlbumExtractor, + "#pattern" : chevereto.CheveretoImageExtractor.pattern, + "#count" : 19, +}, + +{ + "#url" : "https://img.kiwi/johnirl", + "#category": ("chevereto", "imgkiwi", "user"), + "#class" : chevereto.CheveretoUserExtractor, + "#pattern" : chevereto.CheveretoImageExtractor.pattern, + "#range" : "1-20", + "#count" : 20, +}, + +{ + "#url" : "https://img.kiwi/johnirl/albums", + "#category": ("chevereto", "imgkiwi", "user"), + "#class" : chevereto.CheveretoUserExtractor, + "#pattern" : chevereto.CheveretoAlbumExtractor.pattern, + "#count" : 50, +}, + +) From a1977a698eaf466fc5c23c122ed5488b5f815b1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 16 Oct 2023 18:16:48 +0200 Subject: [PATCH 053/344] [tests] fix spurious failures in '_assert_isotime()' --- test/test_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_extractor.py b/test/test_extractor.py index 9387f5b6..29ccf97f 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -238,7 +238,7 @@ class TestExtractorWait(unittest.TestCase): until = datetime.fromtimestamp(until) o = self._isotime_to_seconds(output) u = self._isotime_to_seconds(until.time().isoformat()[:8]) 
- self.assertLess(o-u, 1.0) + self.assertLessEqual(o-u, 1.0) @staticmethod def _isotime_to_seconds(isotime): From 9bc5ad47846fed3a1ad4768110d144adcb50f9b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 17 Oct 2023 19:23:48 +0200 Subject: [PATCH 054/344] [tests] implement 'len:' --- test/test_results.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test_results.py b/test/test_results.py index 4fb22c74..f275bbfc 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -201,6 +201,9 @@ class TestExtractorResults(unittest.TestCase): self.assertEqual(str(value), test[3:], msg=key) elif test.startswith("type:"): self.assertEqual(type(value).__name__, test[5:], msg=key) + elif test.startswith("len:"): + self.assertIsInstance(value, (list, tuple), msg=key) + self.assertEqual(len(value), int(test[4:]), msg=key) else: self.assertEqual(value, test, msg=key) else: From bfdc07632aa38cb5cd1390dd47cc0b65900c29a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 17 Oct 2023 19:09:52 +0200 Subject: [PATCH 055/344] [deviantart] expand nested comment replies (#4653) --- gallery_dl/extractor/deviantart.py | 40 +++++++++++++++++++++++++----- test/results/deviantart.py | 14 +++++++---- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 78d8473a..2c37ef12 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -234,7 +234,7 @@ class DeviantartExtractor(Extractor): if self.comments: deviation["comments"] = ( - self.api.comments(deviation["deviationid"], target="deviation") + self._extract_comments(deviation["deviationid"], "deviation") if deviation["stats"]["comments"] else () ) @@ -401,6 +401,28 @@ class DeviantartExtractor(Extractor): binascii.b2a_base64(payload).rstrip(b"=\n").decode()) ) + def _extract_comments(self, target_id, target_type="deviation"): + results = None + comment_ids = 
[None] + + while comment_ids: + comments = self.api.comments( + target_id, target_type, comment_ids.pop()) + + if results: + results.extend(comments) + else: + results = comments + + # parent comments, i.e. nodes with at least one child + parents = {c["parentid"] for c in comments} + # comments with more than one reply + replies = {c["commentid"] for c in comments if c["replies"]} + # add comment UUIDs with replies that are not parent to any node + comment_ids.extend(replies - parents) + + return results + def _limited_request(self, url, **kwargs): """Limits HTTP requests to one every 2 seconds""" kwargs["fatal"] = None @@ -704,7 +726,7 @@ class DeviantartStatusExtractor(DeviantartExtractor): deviation["stats"] = {"comments": comments_count} if self.comments: deviation["comments"] = ( - self.api.comments(deviation["statusid"], target="status") + self._extract_comments(deviation["statusid"], "status") if comments_count else () ) @@ -1078,11 +1100,17 @@ class DeviantartOAuthAPI(): "mature_content": self.mature} return self._pagination_list(endpoint, params) - def comments(self, id, target, offset=0): + def comments(self, target_id, target_type="deviation", + comment_id=None, offset=0): """Fetch comments posted on a target""" - endpoint = "/comments/{}/{}".format(target, id) - params = {"maxdepth": "5", "offset": offset, "limit": 50, - "mature_content": self.mature} + endpoint = "/comments/{}/{}".format(target_type, target_id) + params = { + "commentid" : comment_id, + "maxdepth" : "5", + "offset" : offset, + "limit" : 50, + "mature_content": self.mature, + } return self._pagination_list(endpoint, params=params, key="thread") def deviation(self, deviation_id, public=None): diff --git a/test/results/deviantart.py b/test/results/deviantart.py index e7ca59bb..ea8773d2 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -547,15 +547,20 @@ __tests__ = ( "#options" : {"comments": True}, "#pattern" : 
r"https://wixmp-[^.]+\.wixmp\.com/f/.+/.+\.jpg\?token=.+", - "comments": list, + "comments": "len:44", }, { - "#url" : "https://www.deviantart.com/citizenfresh/art/Hverarond-789295466", - "#comment" : "wixmp URL rewrite", + "#url" : "https://www.deviantart.com/justatest235723/art/Blue-811519058", + "#comment" : "nested comments (#4653)", "#category": ("", "deviantart", "deviation"), "#class" : deviantart.DeviantartDeviationExtractor, - "#pattern" : r"https://wixmp-\w+\.wixmp\.com/f/[^/]+/[^.]+\.jpg\?token=", + "#options" : { + "original": False, + "comments": True, + }, + + "comments": "len:20", }, { @@ -563,7 +568,6 @@ __tests__ = ( "#comment" : "wixmp URL rewrite /intermediary/", "#category": ("", "deviantart", "deviation"), "#class" : deviantart.DeviantartDeviationExtractor, - "#options" : {"jwt": False}, "#pattern" : r"https://images-wixmp-\w+\.wixmp\.com/intermediary/f/[^/]+/[^.]+\.jpg", }, From 6b22af9720f4599eb6e2cb524f6f7aa57846ad17 Mon Sep 17 00:00:00 2001 From: Klion Xu Date: Thu, 19 Oct 2023 10:32:59 +0800 Subject: [PATCH 056/344] [kemonoparty] update API endpoint (#4676) --- gallery_dl/extractor/kemonoparty.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 6f6ba70d..e92a0b9d 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -208,7 +208,7 @@ class KemonopartyExtractor(Extractor): @memcache(keyarg=1) def _discord_channels(self, server): - url = "{}/api/discord/channels/lookup?q={}".format( + url = "{}/api/v1/discord/channels/lookup/{}".format( self.root, server) return self.request(url).json() @@ -228,7 +228,7 @@ class KemonopartyUserExtractor(KemonopartyExtractor): _, _, service, user_id, offset = match.groups() self.subcategory = service KemonopartyExtractor.__init__(self, match) - self.api_url = "{}/api/{}/user/{}".format(self.root, service, user_id) + self.api_url = 
"{}/api/v1/{}/user/{}".format(self.root, service, user_id) self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) self.offset = text.parse_int(offset) @@ -256,7 +256,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor): _, _, service, user_id, post_id = match.groups() self.subcategory = service KemonopartyExtractor.__init__(self, match) - self.api_url = "{}/api/{}/user/{}/post/{}".format( + self.api_url = "{}/api/v1/{}/user/{}/post/{}".format( self.root, service, user_id, post_id) self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) @@ -344,7 +344,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): yield Message.Url, url, post def posts(self): - url = "{}/api/discord/channel/{}".format(self.root, self.channel_id) + url = "{}/api/v1/discord/channel/{}".format(self.root, self.channel_id) params = {"skip": 0} while True: From dc1c2139b16edbdaf92dda3de80af462196ab72d Mon Sep 17 00:00:00 2001 From: Klion Xu Date: Thu, 19 Oct 2023 10:54:08 +0800 Subject: [PATCH 057/344] fix line too long --- gallery_dl/extractor/kemonoparty.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index e92a0b9d..43582121 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -228,7 +228,8 @@ class KemonopartyUserExtractor(KemonopartyExtractor): _, _, service, user_id, offset = match.groups() self.subcategory = service KemonopartyExtractor.__init__(self, match) - self.api_url = "{}/api/v1/{}/user/{}".format(self.root, service, user_id) + self.api_url = "{}/api/v1/{}/user/{}".format( + self.root, service, user_id) self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) self.offset = text.parse_int(offset) @@ -344,7 +345,8 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): yield Message.Url, url, post def posts(self): - url = "{}/api/v1/discord/channel/{}".format(self.root, self.channel_id) + url = 
"{}/api/v1/discord/channel/{}".format( + self.root, self.channel_id) params = {"skip": 0} while True: From c9a976d8a62c0e90abcee2d09dafc8a43fea543b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 19 Oct 2023 17:36:16 +0200 Subject: [PATCH 058/344] [kemonoparty] various updates and fixes (#4676, #4681) - fix pagination - fix 'date' metadata - fix discord channel API endpoint --- gallery_dl/extractor/kemonoparty.py | 27 ++++++++------------------- test/results/kemonoparty.py | 12 ++++++------ 2 files changed, 14 insertions(+), 25 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 43582121..21160c6a 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -70,8 +70,7 @@ class KemonopartyExtractor(Extractor): self.root, post["service"], post["user"], post["id"]) post["_http_headers"] = headers post["date"] = text.parse_datetime( - post["published"] or post["added"], - "%a, %d %b %Y %H:%M:%S %Z") + post["published"] or post["added"], "%Y-%m-%dT%H:%M:%S") if username: post["username"] = username if comments: @@ -208,7 +207,7 @@ class KemonopartyExtractor(Extractor): @memcache(keyarg=1) def _discord_channels(self, server): - url = "{}/api/v1/discord/channels/lookup/{}".format( + url = "{}/api/v1/discord/channel/lookup/{}".format( self.root, server) return self.request(url).json() @@ -241,10 +240,9 @@ class KemonopartyUserExtractor(KemonopartyExtractor): posts = self.request(url, params=params).json() yield from posts - cnt = len(posts) - if cnt < 25: - return - params["o"] += cnt + if len(posts) < 50: + break + params["o"] += 50 class KemonopartyPostExtractor(KemonopartyExtractor): @@ -262,8 +260,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor): self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) def posts(self): - posts = self.request(self.api_url).json() - return (posts[0],) if len(posts) > 1 else posts + return 
(self.request(self.api_url).json(),) class KemonopartyDiscordExtractor(KemonopartyExtractor): @@ -325,7 +322,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): post["channel_name"] = self.channel_name post["date"] = text.parse_datetime( - post["published"], "%a, %d %b %Y %H:%M:%S %Z") + post["published"], "%Y-%m-%dT%H:%M:%S.%f") post["count"] = len(files) yield Message.Directory, post @@ -348,15 +345,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): url = "{}/api/v1/discord/channel/{}".format( self.root, self.channel_id) params = {"skip": 0} - - while True: - posts = self.request(url, params=params).json() - yield from posts - - cnt = len(posts) - if cnt < 25: - break - params["skip"] += cnt + return self.request(url, params=params).json() class KemonopartyDiscordServerExtractor(KemonopartyExtractor): diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index 059ac79f..83806930 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -12,8 +12,7 @@ __tests__ = ( "#url" : "https://kemono.party/fanbox/user/6993449", "#category": ("", "kemonoparty", "fanbox"), "#class" : kemonoparty.KemonopartyUserExtractor, - "#range" : "1-25", - "#count" : 25, + "#count" : 847, }, { @@ -21,8 +20,8 @@ __tests__ = ( "#comment" : "'max-posts' option, 'o' query parameter (#1674)", "#category": ("", "kemonoparty", "patreon"), "#class" : kemonoparty.KemonopartyUserExtractor, - "#options" : {"max-posts": 25}, - "#count" : "< 100", + "#options" : {"max-posts": 100}, + "#count" : range(200, 300), }, { @@ -44,7 +43,7 @@ __tests__ = ( "#pattern" : r"https://kemono.party/data/21/0f/210f35388e28bbcf756db18dd516e2d82ce75[0-9a-f]+\.jpg", "#sha1_content": "900949cefc97ab8dc1979cc3664785aac5ba70dd", - "added" : "Wed, 06 May 2020 20:28:02 GMT", + "added" : "2020-05-06T20:28:02.302000", "content" : str, "count" : 1, "date" : "dt:2019-08-11 02:09:04", @@ -55,7 +54,7 @@ __tests__ = ( "hash" : 
"210f35388e28bbcf756db18dd516e2d82ce758e0d32881eeee76d43e1716d382", "id" : "506575", "num" : 1, - "published" : "Sun, 11 Aug 2019 02:09:04 GMT", + "published" : "2019-08-11T02:09:04", "service" : "fanbox", "shared_file": False, "subcategory": "fanbox", @@ -183,6 +182,7 @@ __tests__ = ( "channel" : "608504710906904576", "channel_name": "finish-work", + "date" : "type:datetime", }, { From 174191cb79f1d22c095d499c7fadbf7ff0587da0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 19 Oct 2023 21:57:27 +0200 Subject: [PATCH 059/344] [kemonoparty] restore discord pagination (#4676) --- gallery_dl/extractor/kemonoparty.py | 11 +++++++++-- test/results/kemonoparty.py | 12 ++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 21160c6a..be51b36a 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -344,8 +344,15 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): def posts(self): url = "{}/api/v1/discord/channel/{}".format( self.root, self.channel_id) - params = {"skip": 0} - return self.request(url, params=params).json() + params = {"o": 0} + + while True: + posts = self.request(url, params=params).json() + yield from posts + + if len(posts) < 150: + break + params["o"] += 150 class KemonopartyDiscordServerExtractor(KemonopartyExtractor): diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index 83806930..61a3fc89 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -185,6 +185,18 @@ __tests__ = ( "date" : "type:datetime", }, +{ + "#url" : "https://kemono.party/discord/server/818188637329031199#818343747275456522", + "#comment" : "pagination", + "#category": ("", "kemonoparty", "discord"), + "#class" : kemonoparty.KemonopartyDiscordExtractor, + "#range" : "1-250", + "#count" : 250, + + "channel" : "818343747275456522", + "channel_name": "wraith-sfw-gallery", 
+}, + { "#url" : "https://kemono.su/discord/server/256559665620451329/channel/462437519519383555#", "#category": ("", "kemonoparty", "discord"), From aaf539009b63ab34599cf88d42c92e7a54e88395 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 19 Oct 2023 22:32:51 +0200 Subject: [PATCH 060/344] [kemonoparty] initial support for post revisions (#4498, #4597) - single revision https://kemono.party/SERVICE/user/12345/post/12345/revision/12345 - all revisions https://kemono.party/SERVICE/user/12345/post/12345/revisions --- gallery_dl/extractor/kemonoparty.py | 22 +++++++++++++++++--- test/results/kemonoparty.py | 32 +++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index be51b36a..9b9e0821 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -211,6 +211,10 @@ class KemonopartyExtractor(Extractor): self.root, server) return self.request(url).json() + @memcache(keyarg=1) + def _post_revisions(self, url): + return self.request(url + "/revisions").json() + def _validate(response): return (response.headers["content-length"] != "9" or @@ -248,11 +252,12 @@ class KemonopartyUserExtractor(KemonopartyExtractor): class KemonopartyPostExtractor(KemonopartyExtractor): """Extractor for a single kemono.party post""" subcategory = "post" - pattern = USER_PATTERN + r"/post/([^/?#]+)" + pattern = USER_PATTERN + r"/post/([^/?#]+)(/revisions?(?:/(\d*))?)?" 
example = "https://kemono.party/SERVICE/user/12345/post/12345" def __init__(self, match): - _, _, service, user_id, post_id = match.groups() + _, _, service, user_id, post_id, self.revision, self.revision_id = \ + match.groups() self.subcategory = service KemonopartyExtractor.__init__(self, match) self.api_url = "{}/api/v1/{}/user/{}/post/{}".format( @@ -260,7 +265,18 @@ class KemonopartyPostExtractor(KemonopartyExtractor): self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) def posts(self): - return (self.request(self.api_url).json(),) + if not self.revision: + return (self.request(self.api_url).json(),) + + revs = self._post_revisions(self.api_url) + if not self.revision_id: + return revs + + for rev in revs: + if str(rev["revision_id"]) == self.revision_id: + return (rev,) + + raise exception.NotFoundError("revision") class KemonopartyDiscordExtractor(KemonopartyExtractor): diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index 61a3fc89..b0f272f3 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -5,6 +5,7 @@ # published by the Free Software Foundation. 
from gallery_dl.extractor import kemonoparty +from gallery_dl import exception __tests__ = ( @@ -154,6 +155,37 @@ __tests__ = ( "#class" : kemonoparty.KemonopartyPostExtractor, }, +{ + "#url" : "https://kemono.party/patreon/user/3161935/post/68231671/revision/134996", + "#comment" : "revisions (#4498)", + "#category": ("", "kemonoparty", "patreon"), + "#class" : kemonoparty.KemonopartyPostExtractor, + "#urls" : "https://kemono.party/data/88/52/88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86.jpg", + + "revision_id": 134996, +}, + +{ + "#url" : "https://kemono.party/patreon/user/3161935/post/68231671/revisions", + "#comment" : "revisions (#4498)", + "#category": ("", "kemonoparty", "patreon"), + "#class" : kemonoparty.KemonopartyPostExtractor, + "#pattern" : r"https://kemono\.party/data/88/52/88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86\.jpg", + "#count" : 9, + "#archive" : False, + + "revision_id": range(134996, 3052965), +}, + + +{ + "#url" : "https://kemono.party/patreon/user/3161935/post/68231671/revision/12345", + "#comment" : "revisions (#4498)", + "#category": ("", "kemonoparty", "patreon"), + "#class" : kemonoparty.KemonopartyPostExtractor, + "#exception": exception.NotFoundError, +}, + { "#url" : "https://kemono.party/discord/server/488668827274444803#608504710906904576", "#category": ("", "kemonoparty", "discord"), From 6e830ffc9e6bf23378fcd6da146909c38d58409b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 19 Oct 2023 23:06:06 +0200 Subject: [PATCH 061/344] [kemonoparty] support post searches (#3385, #4057) --- gallery_dl/extractor/kemonoparty.py | 8 ++++---- test/results/kemonoparty.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 9b9e0821..2f524c6f 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -224,21 +224,21 @@ def 
_validate(response): class KemonopartyUserExtractor(KemonopartyExtractor): """Extractor for all posts from a kemono.party user listing""" subcategory = "user" - pattern = USER_PATTERN + r"/?(?:\?o=(\d+))?(?:$|[?#])" + pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|[?#])" example = "https://kemono.party/SERVICE/user/12345" def __init__(self, match): - _, _, service, user_id, offset = match.groups() + _, _, service, user_id, self.query = match.groups() self.subcategory = service KemonopartyExtractor.__init__(self, match) self.api_url = "{}/api/v1/{}/user/{}".format( self.root, service, user_id) self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) - self.offset = text.parse_int(offset) def posts(self): url = self.api_url - params = {"o": self.offset} + params = text.parse_query(self.query) + params["o"] = text.parse_int(params.get("o")) while True: posts = self.request(url, params=params).json() diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index b0f272f3..6594c4b2 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -25,6 +25,19 @@ __tests__ = ( "#count" : range(200, 300), }, +{ + "#url" : "https://kemono.party/fanbox/user/6993449?q=お蔵入りになった", + "#comment" : "search / 'q' query parameter (#3385, #4057)", + "#category": ("", "kemonoparty", "fanbox"), + "#class" : kemonoparty.KemonopartyUserExtractor, + "#urls" : ( + "https://kemono.party/data/ef/7b/ef7b4398a2f4ada597421fd3c116cff86e85695911f7cd2a459b0e566b864e46.png", + "https://kemono.party/data/73/e6/73e615f6645b9d1af6329448601673c9275f07fd11eb37670c97e307e29a9ee9.png", + ), + + "id": "8779", +}, + { "#url" : "https://kemono.su/subscribestar/user/alcorart", "#category": ("", "kemonoparty", "subscribestar"), From 0d52b775cb06ea88f6ae2279885f73b1d6fe31f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 20 Oct 2023 00:16:59 +0200 Subject: [PATCH 062/344] [kemonoparty] add 'revisions' option (#4498, #4597) --- 
docs/configuration.rst | 14 +++++++++++++- gallery_dl/extractor/kemonoparty.py | 27 +++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 64d1a0af..25b0ad9c 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2072,7 +2072,19 @@ Type Default ``false`` Description - Extract ``username`` metadata + Extract ``username`` metadata. + + +extractor.kemonoparty.revisions +------------------------------- +Type + ``bool`` +Default + ``false`` +Description + Extract post revisions. + + Note: This requires 1 additional HTTP request per post. extractor.khinsider.format diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 2f524c6f..1596cfb1 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -239,10 +239,24 @@ class KemonopartyUserExtractor(KemonopartyExtractor): url = self.api_url params = text.parse_query(self.query) params["o"] = text.parse_int(params.get("o")) + revisions = self.config("revisions") while True: posts = self.request(url, params=params).json() - yield from posts + + if revisions: + for post in posts: + post["revision_id"] = 0 + yield post + post_url = "{}/post/{}".format(self.api_url, post["id"]) + try: + revs = self._post_revisions(post_url) + except exception.HttpError: + pass + else: + yield from revs + else: + yield from posts if len(posts) < 50: break @@ -266,7 +280,16 @@ class KemonopartyPostExtractor(KemonopartyExtractor): def posts(self): if not self.revision: - return (self.request(self.api_url).json(),) + post = self.request(self.api_url).json() + if self.config("revisions"): + post["revision_id"] = 0 + try: + revs = self._post_revisions(self.api_url) + except exception.HttpError: + pass + else: + return itertools.chain((post,), revs) + return (post,) revs = self._post_revisions(self.api_url) if not self.revision_id: From b2c3db3e24d7360de34cc0ad4cd80e623be85cd3 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 20 Oct 2023 15:22:44 +0200 Subject: [PATCH 063/344] [bunkr] add extractor for media URLs (#4684) --- gallery_dl/extractor/bunkr.py | 34 ++++++++++++++++++++++++++++++++- test/results/bunkr.py | 36 ++++++++++++++++++++++++++++++++--- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 5509f5a8..d5e63b22 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -12,6 +12,8 @@ from .lolisafe import LolisafeAlbumExtractor from .. import text from urllib.parse import urlsplit, urlunsplit +BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)" + MEDIA_DOMAIN_OVERRIDES = { "cdn9.bunkr.ru" : "c9.bunkr.ru", "cdn12.bunkr.ru": "media-files12.bunkr.la", @@ -28,7 +30,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): """Extractor for bunkrr.su albums""" category = "bunkr" root = "https://bunkrr.su" - pattern = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)/a/([^/?#]+)" + pattern = BASE_PATTERN + r"/a/([^/?#]+)" example = "https://bunkrr.su/a/ID" def fetch_album(self, album_id): @@ -72,3 +74,33 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): url = urlunsplit((scheme, domain, path, query, fragment)) yield {"file": text.unescape(url)} + + +class BunkrMediaExtractor(LolisafeAlbumExtractor): + """Extractor for bunkrr.su media links""" + category = "bunkr" + subcategory = "media" + root = "https://bunkrr.su" + directory_fmt = ("{category}",) + pattern = BASE_PATTERN + r"/[vi]/([^/?#]+)" + example = "https://bunkrr.su/v/FILENAME" + + def fetch_album(self, album_id): + try: + path = urlsplit(self.url).path + page = self.request(self.root + path).text + if path[1] == "v": + url = text.extr(page, ' Date: Fri, 20 Oct 2023 17:30:23 +0200 Subject: [PATCH 064/344] [bunkr] fix '/d/' file URLs (#4685) --- gallery_dl/extractor/bunkr.py | 30 +++++++++++++++--------------- 1 file changed, 15 
insertions(+), 15 deletions(-) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index d5e63b22..26123b8b 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -55,11 +55,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): for url in urls: if url.startswith("/"): try: - page = self.request(self.root + text.unescape(url)).text - if url[1] == "v": - url = text.extr(page, ' Date: Fri, 20 Oct 2023 17:39:50 +0200 Subject: [PATCH 065/344] [4chanarchives] disable Referer headers by default (#4686) --- gallery_dl/extractor/4chanarchives.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/4chanarchives.py b/gallery_dl/extractor/4chanarchives.py index f018d3ec..27ac7c55 100644 --- a/gallery_dl/extractor/4chanarchives.py +++ b/gallery_dl/extractor/4chanarchives.py @@ -20,6 +20,7 @@ class _4chanarchivesThreadExtractor(Extractor): directory_fmt = ("{category}", "{board}", "{thread} - {title}") filename_fmt = "{no}-{filename}.{extension}" archive_fmt = "{board}_{thread}_{no}" + referer = False pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)/thread/(\d+)" example = "https://4chanarchives.com/board/a/thread/12345/" From b52fd91ac6ba4e80c87a084360ef9c4444ebf8fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 21 Oct 2023 13:20:35 +0200 Subject: [PATCH 066/344] [sankaku] support '/posts/' URLs (#4688) --- gallery_dl/extractor/sankaku.py | 2 +- test/results/sankaku.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 745a351b..dc355112 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -143,7 +143,7 @@ class SankakuPostExtractor(SankakuExtractor): """Extractor for single posts from sankaku.app""" subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/post/show/([0-9a-f]+)" + pattern = BASE_PATTERN + 
r"/post(?:s|/show)/([0-9a-f]+)" example = "https://sankaku.app/post/show/12345" def __init__(self, match): diff --git a/test/results/sankaku.py b/test/results/sankaku.py index e4fbf5c1..9a1738a7 100644 --- a/test/results/sankaku.py +++ b/test/results/sankaku.py @@ -147,6 +147,18 @@ __tests__ = ( "md5": "f8ba89043078f0e4be2d9c46550b840a", }, +{ + "#url" : "https://chan.sankakucomplex.com/posts/f8ba89043078f0e4be2d9c46550b840a", + "#comment" : "/posts/ instead of /post/show/ (#4688)", + "#category": ("booru", "sankaku", "post"), + "#class" : sankaku.SankakuPostExtractor, + "#pattern" : r"https://s\.sankakucomplex\.com/data/f8/ba/f8ba89043078f0e4be2d9c46550b840a\.jpg", + "#count" : 1, + + "id" : 33195194, + "md5": "f8ba89043078f0e4be2d9c46550b840a", +}, + { "#url" : "https://chan.sankakucomplex.com/post/show/360451", "#category": ("booru", "sankaku", "post"), From 7958ab1946c00fea832432817dda38c8da85f4fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 21 Oct 2023 13:22:55 +0200 Subject: [PATCH 067/344] [newgrounds] support 'imageData' files (#4642) --- gallery_dl/extractor/newgrounds.py | 67 +++++++++++++++++++++--------- test/results/newgrounds.py | 11 +++++ 2 files changed, 58 insertions(+), 20 deletions(-) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index b119c966..a6971e84 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -54,27 +54,30 @@ class NewgroundsExtractor(Extractor): if metadata: post.update(metadata) yield Message.Directory, post + post["num"] = 0 yield Message.Url, url, text.nameext_from_url(url, post) - ext = post["extension"] - for num, url in enumerate(text.extract_iter( - post["_images"] + post["_comment"], - 'data-smartload-src="', '"'), 1): - post["num"] = num - post["_index"] = "{}_{:>02}".format(post["index"], num) + if "_multi" in post: + for data in post["_multi"]: + post["num"] += 1 + post["_index"] = "{}_{:>02}".format( + 
post["index"], post["num"]) + post.update(data) + url = data["image"] + + text.nameext_from_url(url, post) + yield Message.Url, url, post + + if "_fallback" in post: + del post["_fallback"] + + for url in text.extract_iter( + post["_comment"], 'data-smartload-src="', '"'): + post["num"] += 1 + post["_index"] = "{}_{:>02}".format( + post["index"], post["num"]) url = text.ensure_http_scheme(url) text.nameext_from_url(url, post) - - if "_fallback" in post: - del post["_fallback"] - - if "/comments/" not in url: - url = url.replace("/medium_views/", "/images/", 1) - if post["extension"] == "webp": - post["_fallback"] = (url,) - post["extension"] = ext - url = url.replace(".webp", "." + ext) - yield Message.Url, url, post else: self.log.warning( @@ -149,7 +152,6 @@ class NewgroundsExtractor(Extractor): extr = text.extract_from(page) data = extract_data(extr, post_url) - data["_images"] = extr('
').partition(">")[2] data["comment"] = text.unescape(text.remove_html( @@ -168,8 +170,7 @@ class NewgroundsExtractor(Extractor): data["post_url"] = post_url return data - @staticmethod - def _extract_image_data(extr, url): + def _extract_image_data(self, extr, url): full = text.extract_from(util.json_loads(extr( '"full_image_text":', '});'))) data = { @@ -187,8 +188,34 @@ class NewgroundsExtractor(Extractor): index = data["url"].rpartition("/")[2].partition("_")[0] data["index"] = text.parse_int(index) data["_index"] = index + + image_data = extr("let imageData =", "\n];") + if image_data: + data["_multi"] = self._extract_images_multi(image_data) + else: + art_images = extr('
Date: Sat, 21 Oct 2023 13:23:45 +0200 Subject: [PATCH 068/344] [cookies] include exception in fallback warning --- gallery_dl/cookies.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 20959a8f..416cc9a1 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -832,8 +832,9 @@ class DatabaseConnection(): self.database = sqlite3.connect( uri, uri=True, isolation_level=None, check_same_thread=False) return self.database - except Exception: - _log_debug("Falling back to temporary database copy") + except Exception as exc: + _log_debug("Falling back to temporary database copy (%s: %s)", + exc.__class__.__name__, exc) try: self.directory = tempfile.TemporaryDirectory(prefix="gallery-dl-") From 95a74be2a5190f31f1ac7c914ca2e129332633cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 21 Oct 2023 16:32:52 +0200 Subject: [PATCH 069/344] release version 1.26.1 --- CHANGELOG.md | 40 ++++++++++++++++++++++++++++++++++++++++ README.rst | 4 ++-- docs/supportedsites.md | 2 +- gallery_dl/version.py | 2 +- 4 files changed, 44 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a4ce4baf..34607f2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,45 @@ # Changelog +## 1.26.1 - 2023-10-21 +### Extractors +#### Additions +- [bunkr] add extractor for media URLs ([#4684](https://github.com/mikf/gallery-dl/issues/4684)) +- [chevereto] add generic extractors for `chevereto` sites ([#4664](https://github.com/mikf/gallery-dl/issues/4664)) + - `deltaporno.com` ([#1381](https://github.com/mikf/gallery-dl/issues/1381)) + - `img.kiwi` + - `jpgfish` + - `pixl.li` ([#3179](https://github.com/mikf/gallery-dl/issues/3179), [#4357](https://github.com/mikf/gallery-dl/issues/4357)) +- [deviantart] implement `"group": "skip"` ([#4630](https://github.com/mikf/gallery-dl/issues/4630)) +- [fantia] add `content_count` and `content_num` metadata fields 
([#4627](https://github.com/mikf/gallery-dl/issues/4627)) +- [imgbb] add `displayname` and `user_id` metadata ([#4626](https://github.com/mikf/gallery-dl/issues/4626)) +- [kemonoparty] support post revisions; add `revisions` option ([#4498](https://github.com/mikf/gallery-dl/issues/4498), [#4597](https://github.com/mikf/gallery-dl/issues/4597)) +- [kemonoparty] support searches ([#3385](https://github.com/mikf/gallery-dl/issues/3385), [#4057](https://github.com/mikf/gallery-dl/issues/4057)) +- [kemonoparty] support discord URLs with channel IDs ([#4662](https://github.com/mikf/gallery-dl/issues/4662)) +- [moebooru] add `metadata` option ([#4646](https://github.com/mikf/gallery-dl/issues/4646)) +- [newgrounds] support multi-image posts ([#4642](https://github.com/mikf/gallery-dl/issues/4642)) +- [sankaku] support `/posts/` URLs ([#4688](https://github.com/mikf/gallery-dl/issues/4688)) +- [twitter] add `sensitive` metadata field ([#4619](https://github.com/mikf/gallery-dl/issues/4619)) +#### Fixes +- [4chanarchives] disable Referer headers by default ([#4686](https://github.com/mikf/gallery-dl/issues/4686)) +- [bunkr] fix `/d/` file URLs ([#4685](https://github.com/mikf/gallery-dl/issues/4685)) +- [deviantart] expand nested comment replies ([#4653](https://github.com/mikf/gallery-dl/issues/4653)) +- [deviantart] disable `jwt` ([#4652](https://github.com/mikf/gallery-dl/issues/4652)) +- [hentaifoundry] fix `.swf` file downloads ([#4641](https://github.com/mikf/gallery-dl/issues/4641)) +- [imgbb] fix `user` metadata extraction ([#4626](https://github.com/mikf/gallery-dl/issues/4626)) +- [imgbb] update pagination end condition ([#4626](https://github.com/mikf/gallery-dl/issues/4626)) +- [kemonoparty] update API endpoints ([#4676](https://github.com/mikf/gallery-dl/issues/4676), [#4677](https://github.com/mikf/gallery-dl/issues/4677)) +- [patreon] update `campaign_id` path ([#4639](https://github.com/mikf/gallery-dl/issues/4639)) +- [reddit] fix wrong previews 
([#4649](https://github.com/mikf/gallery-dl/issues/4649)) +- [redgifs] fix `niches` extraction ([#4666](https://github.com/mikf/gallery-dl/issues/4666), [#4667](https://github.com/mikf/gallery-dl/issues/4667)) +- [twitter] fix crash due to missing `source` ([#4620](https://github.com/mikf/gallery-dl/issues/4620)) +- [warosu] fix extraction ([#4634](https://github.com/mikf/gallery-dl/issues/4634)) +### Post Processors +#### Additions +- support `{_filename}`, `{_directory}`, and `{_path}` replacement fields for `--exec` ([#4633](https://github.com/mikf/gallery-dl/issues/4633)) +### Miscellaneous +#### Improvements +- avoid temporary copies with `--cookies-from-browser` by opening cookie databases in read-only mode + ## 1.26.0 - 2023-10-03 - ### Extractors #### Additions diff --git a/README.rst b/README.rst index 14cfb095..207b68ec 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 16187315..3924cd39 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -112,7 +112,7 @@ Consider all sites to be NSFW unless otherwise known.
- + diff --git a/gallery_dl/version.py b/gallery_dl/version.py index ef2afe69..593cffab 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.26.1-dev" +__version__ = "1.26.1" From b68aad3dab1aed8184abe46180f62003c75c6e8c Mon Sep 17 00:00:00 2001 From: inty Date: Sat, 21 Oct 2023 19:19:22 +0000 Subject: [PATCH 070/344] [reddit] implement Reddit Mobile share links --- docs/supportedsites.md | 2 +- gallery_dl/extractor/reddit.py | 23 +++++++++++++++++++++++ test/results/reddit.py | 8 ++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3924cd39..9ca6c705 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -712,7 +712,7 @@ Consider all sites to be NSFW unless otherwise known. - + diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index cd2ba3d2..c0bf5b3e 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -292,6 +292,29 @@ class RedditImageExtractor(Extractor): yield Message.Url, url, data +class RedditRedirectExtractor(Extractor): + """Extractor for personalized share URLs produced by the mobile app""" + category = "reddit" + subcategory = "redirect" + pattern = (r"(?:https?://)?(?:" + r"(?:\w+\.)?reddit\.com/(?:(?:r)/([^/?#]+)))" + r"/s/([a-zA-Z0-9]{10})") + example = "https://www.reddit.com/r/SUBREDDIT/s/abc456GHIJ" + + def __init__(self, match): + Extractor.__init__(self, match) + self.subreddit = match.group(1) + self.share_url = match.group(2) + + def items(self): + url = "https://www.reddit.com/r/" + self.subreddit + "/s/" + \ + self.share_url + data = {"_extractor": RedditSubmissionExtractor} + response = self.request(url, method="HEAD", allow_redirects=False, + notfound="submission") + yield Message.Queue, response.headers["Location"], data + + class RedditAPI(): 
"""Interface for the Reddit API diff --git a/test/results/reddit.py b/test/results/reddit.py index 330ef5b6..8a4359cf 100644 --- a/test/results/reddit.py +++ b/test/results/reddit.py @@ -240,4 +240,12 @@ __tests__ = ( "#pattern" : r"^https://i\.redd\.it/00af44lpn0u51\.jpg$", }, +{ + "#url" : "https://www.reddit.com/r/analog/s/hKrTTvFVwZ", + "#comment" : "Mobile share URL", + "#category": ("", "reddit", "redirect"), + "#class" : reddit.RedditRedirectExtractor, + "#pattern" : r"^https://www\.reddit\.com/r/analog/comments/179exao/photographing_the_recent_annular_eclipse_with_a", +}, + ) From c0714d5585b40bc6c0047569a2783ff7c53048bd Mon Sep 17 00:00:00 2001 From: enduser420 <91022934+enduser420@users.noreply.github.com> Date: Tue, 24 Oct 2023 23:05:28 +0530 Subject: [PATCH 071/344] [4archive] add 'thread' and 'board' extractors --- gallery_dl/extractor/4archive.py | 110 +++++++++++++++++++++++++++++++ gallery_dl/extractor/__init__.py | 1 + test/results/4archive.py | 62 +++++++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 gallery_dl/extractor/4archive.py create mode 100644 test/results/4archive.py diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py new file mode 100644 index 00000000..b04a2fda --- /dev/null +++ b/gallery_dl/extractor/4archive.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://4archive.org/""" + +from .common import Extractor, Message +from .. 
import text + + +class _4archiveThreadExtractor(Extractor): + """Extractor for 4archive threads""" + category = "4archive" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} {title}") + filename_fmt = "{no} {filename}.{extension}" + archive_fmt = "{board}_{thread}_{no}" + pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)/thread/(\d+)" + root = "https://4archive.org" + example = "https://4archive.org/board/a/thread/12345/" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "{}/board/{}/thread/{}".format( + self.root, self.board, self.thread) + page = self.request(url).text + data = self.metadata(page) + posts = self.posts(page) + + if not data["title"]: + data["title"] = posts[0]["com"][:50] + + for post in posts: + post.update(data) + post["time"] = text.parse_int(post["date"].timestamp()) + yield Message.Directory, data + if "url" in post: + yield Message.Url, post["url"], text.nameext_from_url( + post["filename"], post) + + def metadata(self, page): + return { + "board" : self.board, + "thread": text.parse_int(self.thread), + "title" : text.unescape(text.extr( + page, 'class="subject">', "")) + } + + def posts(self, page): + return [ + self.parse(post) + for post in page.split('class="postContainer')[1:] + ] + + @staticmethod + def parse(post): + extr = text.extract_from(post) + data = { + "name": extr('class="name">', ""), + "date": text.parse_datetime( + extr('class="dateTime postNum" >', "<").strip(), + "%Y-%m-%d %H:%M:%S"), + "no" : text.parse_int(extr('href="#p', '"')), + } + if 'class="file"' in post: + extr('class="fileText"', ">File: ").strip()[1:], + "size" : text.parse_bytes(extr(" (", ", ")[:-1]), + "width" : text.parse_int(extr("", "x")), + "height" : text.parse_int(extr("", "px")), + }) + extr("
", "
"))) + return data + + +class _4archiveBoardExtractor(Extractor): + """Extractor for 4archive boards""" + category = "4archive" + subcategory = "board" + pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)(?:/(\d+))?/?$" + root = "https://4archive.org" + example = "https://4archive.org/board/a/" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board = match.group(1) + self.num = text.parse_int(match.group(2), 1) + + def items(self): + data = {"_extractor": _4archiveThreadExtractor} + while True: + url = "{}/board/{}/{}".format(self.root, self.board, self.num) + page = self.request(url).text + if 'class="thread"' not in page: + return + for thread in text.extract_iter(page, 'class="thread" id="t', '"'): + url = "{}/board/{}/thread/{}".format( + self.root, self.board, thread) + yield Message.Queue, url, data + self.num += 1 diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 1c1473a0..22e4fe34 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -15,6 +15,7 @@ modules = [ "35photo", "3dbooru", "4chan", + "4archive", "4chanarchives", "500px", "8chan", diff --git a/test/results/4archive.py b/test/results/4archive.py new file mode 100644 index 00000000..9b5934a7 --- /dev/null +++ b/test/results/4archive.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +gallery_dl = __import__("gallery_dl.extractor.4archive") +_4archive = getattr(gallery_dl.extractor, "4archive") +import datetime + + +__tests__ = ( +{ + "#url" : "https://4archive.org/board/u/thread/2397221", + "#category": ("", "4archive", "thread"), + "#class" : _4archive._4archiveThreadExtractor, + "#pattern" : r"https://i\.imgur\.com/\w{7}\.\w+$", + "#count" : 16, + + "board" : "u", + "com" : str, + "date" : datetime.datetime, + "name" : "Anonymous", + "no" : range(2397221, 2418158), + "thread": 2397221, + "time" : int, + "title" : "best anime", + "url" : str, + "width" : int, + "height": int, + "size" : int, +}, + +{ + "#url" : "https://4archive.org/board/jp/thread/17611798", + "#category": ("", "4archive", "thread"), + "#class" : _4archive._4archiveThreadExtractor, + "#pattern" : r"https://i\.imgur\.com/\w{7}\.\w+$", + "#count" : 85, +}, + +{ + "#url" : "https://4archive.org/board/u", + "#category": ("", "4archive", "board"), + "#class" : _4archive._4archiveBoardExtractor, + "#pattern" : _4archive._4archiveThreadExtractor.pattern, + "#board" : "u", + "#range" : "1-20", + "#count" : 20, +}, + +{ + "#url" : "https://4archive.org/board/jp/10", + "#category": ("", "4archive", "board"), + "#class" : _4archive._4archiveBoardExtractor, + "#pattern" : _4archive._4archiveThreadExtractor.pattern, + "#board" : "jp", + "#range" : "1-50", + "#count" : 50, +} + +) From 31dbbffc0ba820157e8f6d3186ebdeb2e20c185a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 25 Oct 2023 16:45:27 +0200 Subject: [PATCH 072/344] =?UTF-8?q?[twitter]=20cache=20'user=5Fby=5F?= =?UTF-8?q?=E2=80=A6'=20results=20(#4719)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- gallery_dl/extractor/twitter.py | 4 +++- gallery_dl/version.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 61e871ef..52d962eb 100644 --- 
a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -10,7 +10,7 @@ from .common import Extractor, Message from .. import text, util, exception -from ..cache import cache +from ..cache import cache, memcache import itertools import json import re @@ -1194,6 +1194,7 @@ class TwitterAPI(): } return self._pagination_users(endpoint, variables) + @memcache(keyarg=1) def user_by_rest_id(self, rest_id): endpoint = "/graphql/1YAM811Q8Ry4XyPpJclURQ/UserByRestId" features = self.features.copy() @@ -1207,6 +1208,7 @@ class TwitterAPI(): } return self._call(endpoint, params)["data"]["user"]["result"] + @memcache(keyarg=1) def user_by_screen_name(self, screen_name): endpoint = "/graphql/XA6F1nJELYg65hxOC2Ekmg/UserByScreenName" params = { diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 593cffab..29f1d055 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.26.1" +__version__ = "1.26.2-dev" From 12a800ce21c859df80b101c86070aec75a35f693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 25 Oct 2023 17:18:06 +0200 Subject: [PATCH 073/344] [patreon] improve 'campaign_id' handling (#4699, #4715) - add ways to directly specify a 'campaign_id' - 'campaign-id' config option - 'c' or 'campaign_id' URL query parameter - more descriptive error messages - show 'campaign_id' value in debug log --- docs/configuration.rst | 14 +++++++++ gallery_dl/extractor/patreon.py | 50 ++++++++++++++++++++++----------- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 25b0ad9c..23cc8f5b 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2440,6 +2440,20 @@ Description Note: This requires 1 additional HTTP request per post. 
+extractor.patreon.campaign-id +----------------------------- +Type + ``string`` +Default + ``"auto"`` +Description + Alternative way of specifying the ``campaign_id`` value of a creator + in case the automatic extraction method no longer functions. + + Another way of specifying this value is using a ``c`` or ``campaign_id`` + URL query parameter, e.g. ``https://www.patreon.com/NAME?c=12345``. + + extractor.patreon.files ----------------------- Type diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 6ac9a83b..0975992f 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -267,34 +267,52 @@ class PatreonCreatorExtractor(PatreonExtractor): def posts(self): query = text.parse_query(self.query) + campaign_id = self._get_campaign_id(query) + filters = self._get_filters(query) - creator_id = query.get("u") - if creator_id: - url = "{}/user/posts?u={}".format(self.root, creator_id) + self.log.debug("campaign_id: %s", campaign_id) + + url = self._build_url("posts", ( + "&filter[campaign_id]=" + campaign_id + + "&filter[contains_exclusive_posts]=true" + "&filter[is_draft]=false" + filters + + "&sort=" + query.get("sort", "-published_at") + )) + return self._pagination(url) + + def _get_campaign_id(self, query): + campaign_id = self.config("campaign-id") + if campaign_id and campaign_id != "auto": + return str(campaign_id) + + campaign_id = query.get("c") or query.get("campaign_id") + if campaign_id: + return campaign_id + + user_id = query.get("u") + if user_id: + url = "{}/user/posts?u={}".format(self.root, user_id) else: url = "{}/{}/posts".format(self.root, self.creator) page = self.request(url, notfound="creator").text try: + data = None data = self._extract_bootstrap(page) - campaign_id = data["campaign"]["data"]["id"] - except (KeyError, ValueError): - raise exception.NotFoundError("creator") - - filters = "".join( + return data["campaign"]["data"]["id"] + except (KeyError, ValueError) as exc: +
self.log.debug(data) + raise exception.StopExtraction( + "Unable to extract campaign ID (%s: %s)", + exc.__class__.__name__, exc) + + def _get_filters(self, query): + return "".join( "&filter[{}={}".format(key[8:], text.escape(value)) for key, value in query.items() if key.startswith("filters[") ) - url = self._build_url("posts", ( - "&filter[campaign_id]=" + campaign_id + - "&filter[contains_exclusive_posts]=true" - "&filter[is_draft]=false" + filters + - "&sort=" + query.get("sort", "-published_at") - )) - return self._pagination(url) - class PatreonUserExtractor(PatreonExtractor): """Extractor for media from creators supported by you""" From 1042278bec329c8300ee96f4f22c6ed08f0b2b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 25 Oct 2023 17:47:03 +0200 Subject: [PATCH 074/344] [misskey] support 'misskey.design' (#4713) --- docs/supportedsites.md | 6 ++++ gallery_dl/extractor/misskey.py | 4 +++ test/results/misskeydesign.py | 53 +++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) create mode 100644 test/results/misskeydesign.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3924cd39..49d1444b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1185,6 +1185,12 @@ Consider all sites to be NSFW unless otherwise known.
+ + + + + + diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py index 95b83b62..5385f8aa 100644 --- a/gallery_dl/extractor/misskey.py +++ b/gallery_dl/extractor/misskey.py @@ -70,6 +70,10 @@ BASE_PATTERN = MisskeyExtractor.update({ "root": "https://misskey.io", "pattern": r"misskey\.io", }, + "misskey.design": { + "root": "https://misskey.design", + "pattern": r"misskey\.design", + }, "lesbian.energy": { "root": "https://lesbian.energy", "pattern": r"lesbian\.energy", diff --git a/test/results/misskeydesign.py b/test/results/misskeydesign.py new file mode 100644 index 00000000..f12be9ff --- /dev/null +++ b/test/results/misskeydesign.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import misskey + + +__tests__ = ( +{ + "#url" : "https://misskey.design/@machina_3D", + "#category": ("misskey", "misskey.design", "user"), + "#class" : misskey.MisskeyUserExtractor, + "#pattern" : r"https://file\.misskey\.design/post/[\w-]{36}\.\w+", + "#range" : "1-50", + "#count" : 50, +}, + +{ + "#url" : "https://misskey.design/@blooddj@pawoo.net", + "#category": ("misskey", "misskey.design", "user"), + "#class" : misskey.MisskeyUserExtractor, + "#count" : 7, +}, + +{ + "#url" : "https://misskey.design/@kujyo_t/following", + "#category": ("misskey", "misskey.design", "following"), + "#class" : misskey.MisskeyFollowingExtractor, + "#count" : ">= 250", +}, + +{ + "#url" : "https://misskey.design/notes/9jva1danjc", + "#category": ("misskey", "misskey.design", "note"), + "#class" : misskey.MisskeyNoteExtractor, + "#urls" : "https://file.misskey.design/post/a8d27901-24e1-42ab-b8a6-1e09c98c6f55.webp", +}, + +{ + "#url" : "https://misskey.design/my/favorites", + "#category": ("misskey", "misskey.design", "favorite"), + "#class" : 
misskey.MisskeyFavoriteExtractor, +}, + +{ + "#url" : "https://misskey.design/api/i/favorites", + "#category": ("misskey", "misskey.design", "favorite"), + "#class" : misskey.MisskeyFavoriteExtractor, +}, + +) From acb713b95a6ddcb8553af31e86e2ae0466cda6e9 Mon Sep 17 00:00:00 2001 From: enduser420 <91022934+enduser420@users.noreply.github.com> Date: Wed, 25 Oct 2023 23:08:45 +0530 Subject: [PATCH 075/344] [4archive] update --- gallery_dl/extractor/4archive.py | 11 ++++++----- test/results/4archive.py | 2 -- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py index b04a2fda..d1983697 100644 --- a/gallery_dl/extractor/4archive.py +++ b/gallery_dl/extractor/4archive.py @@ -7,7 +7,7 @@ """Extractors for https://4archive.org/""" from .common import Extractor, Message -from .. import text +from .. import text, util class _4archiveThreadExtractor(Extractor): @@ -17,8 +17,9 @@ class _4archiveThreadExtractor(Extractor): directory_fmt = ("{category}", "{board}", "{thread} {title}") filename_fmt = "{no} {filename}.{extension}" archive_fmt = "{board}_{thread}_{no}" - pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)/thread/(\d+)" root = "https://4archive.org" + referer = False + pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)/thread/(\d+)" example = "https://4archive.org/board/a/thread/12345/" def __init__(self, match): @@ -37,8 +38,8 @@ class _4archiveThreadExtractor(Extractor): for post in posts: post.update(data) - post["time"] = text.parse_int(post["date"].timestamp()) - yield Message.Directory, data + post["time"] = int(util.datetime_to_timestamp(post["date"])) + yield Message.Directory, post if "url" in post: yield Message.Url, post["url"], text.nameext_from_url( post["filename"], post) @@ -87,8 +88,8 @@ class _4archiveBoardExtractor(Extractor): """Extractor for 4archive boards""" category = "4archive" subcategory = "board" - pattern = 
r"(?:https?://)?4archive\.org/board/([^/?#]+)(?:/(\d+))?/?$" root = "https://4archive.org" + pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)(?:/(\d+))?/?$" example = "https://4archive.org/board/a/" def __init__(self, match): diff --git a/test/results/4archive.py b/test/results/4archive.py index 9b5934a7..ec90b929 100644 --- a/test/results/4archive.py +++ b/test/results/4archive.py @@ -44,7 +44,6 @@ __tests__ = ( "#category": ("", "4archive", "board"), "#class" : _4archive._4archiveBoardExtractor, "#pattern" : _4archive._4archiveThreadExtractor.pattern, - "#board" : "u", "#range" : "1-20", "#count" : 20, }, @@ -54,7 +53,6 @@ __tests__ = ( "#category": ("", "4archive", "board"), "#class" : _4archive._4archiveBoardExtractor, "#pattern" : _4archive._4archiveThreadExtractor.pattern, - "#board" : "jp", "#range" : "1-50", "#count" : 50, } From d2874c77249dbdd74dc89e07f0054a23a32381fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 25 Oct 2023 20:11:14 +0200 Subject: [PATCH 076/344] [4archive] docs/supportedsites --- docs/supportedsites.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3924cd39..ecffc70e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -31,6 +31,12 @@ Consider all sites to be NSFW unless otherwise known. 
+ + + + + + From 75dec71253f5b08b7200adf2bde0344750e40b14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 26 Oct 2023 18:02:31 +0200 Subject: [PATCH 077/344] [idolcomplex] disable Referer headers by default (#4726) --- gallery_dl/extractor/idolcomplex.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index 16e40970..b7b6ef10 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -22,6 +22,7 @@ class IdolcomplexExtractor(SankakuExtractor): cookies_domain = "idol.sankakucomplex.com" cookies_names = ("login", "pass_hash") root = "https://" + cookies_domain + referer = False request_interval = 5.0 def __init__(self, match): From 3bbaa875f1749bb06a3a2d47fad07d0dbd48e911 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 26 Oct 2023 21:50:18 +0200 Subject: [PATCH 078/344] [kemonoparty] fix parsing of non-standard 'dates' (#4676) --- gallery_dl/extractor/kemonoparty.py | 13 +++++++++---- test/results/kemonoparty.py | 10 ++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 1596cfb1..76e940c7 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -69,8 +69,9 @@ class KemonopartyExtractor(Extractor): headers["Referer"] = "{}/{}/user/{}/post/{}".format( self.root, post["service"], post["user"], post["id"]) post["_http_headers"] = headers - post["date"] = text.parse_datetime( - post["published"] or post["added"], "%Y-%m-%dT%H:%M:%S") + post["date"] = self._parse_datetime( + post["published"] or post["added"]) + if username: post["username"] = username if comments: @@ -205,6 +206,11 @@ class KemonopartyExtractor(Extractor): }) return dms + def _parse_datetime(self, date_string): + if len(date_string) > 19: + date_string = date_string[:19] + return 
text.parse_datetime(date_string, "%Y-%m-%dT%H:%M:%S") + @memcache(keyarg=1) def _discord_channels(self, server): url = "{}/api/v1/discord/channel/lookup/{}".format( @@ -360,8 +366,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): "name": path, "type": "inline", "hash": ""}) post["channel_name"] = self.channel_name - post["date"] = text.parse_datetime( - post["published"], "%Y-%m-%dT%H:%M:%S.%f") + post["date"] = self._parse_datetime(post["published"]) post["count"] = len(files) yield Message.Directory, post diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index 6594c4b2..06f62c82 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -199,6 +199,16 @@ __tests__ = ( "#exception": exception.NotFoundError, }, +{ + "#url" : "https://kemono.su/patreon/user/6298789/post/69764693", + "#comment" : "'published' metadata with extra microsecond data", + "#category": ("", "kemonoparty", "patreon"), + "#class" : kemonoparty.KemonopartyPostExtractor, + + "date" : "dt:2022-07-29 21:12:11", + "published": "2022-07-29T21:12:11.483000", +}, + { "#url" : "https://kemono.party/discord/server/488668827274444803#608504710906904576", "#category": ("", "kemonoparty", "discord"), From d0effcae20c7fc82bafdf26cae2919ea2d223ba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 26 Oct 2023 22:26:38 +0200 Subject: [PATCH 079/344] [kemonoparty] add 'revision_index' metadata field (#4727) --- gallery_dl/extractor/kemonoparty.py | 18 ++++++++++++++---- test/results/kemonoparty.py | 8 +++++--- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 76e940c7..631ba266 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -219,7 +219,14 @@ class KemonopartyExtractor(Extractor): @memcache(keyarg=1) def _post_revisions(self, url): - return self.request(url + "/revisions").json() + revs = 
self.request(url + "/revisions").json() + + idx = len(revs) + for rev in revs: + rev["revision_index"] = idx + idx -= 1 + + return revs def _validate(response): @@ -253,13 +260,15 @@ class KemonopartyUserExtractor(KemonopartyExtractor): if revisions: for post in posts: post["revision_id"] = 0 - yield post post_url = "{}/post/{}".format(self.api_url, post["id"]) try: revs = self._post_revisions(post_url) except exception.HttpError: - pass + post["revision_index"] = 1 + yield post else: + post["revision_index"] = len(revs) + 1 + yield post yield from revs else: yield from posts @@ -292,8 +301,9 @@ class KemonopartyPostExtractor(KemonopartyExtractor): try: revs = self._post_revisions(self.api_url) except exception.HttpError: - pass + post["revision_index"] = 1 else: + post["revision_index"] = len(revs) + 1 return itertools.chain((post,), revs) return (post,) diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index 06f62c82..ad94a496 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -169,13 +169,14 @@ __tests__ = ( }, { - "#url" : "https://kemono.party/patreon/user/3161935/post/68231671/revision/134996", + "#url" : "https://kemono.su/patreon/user/3161935/post/68231671/revision/142470", "#comment" : "revisions (#4498)", "#category": ("", "kemonoparty", "patreon"), "#class" : kemonoparty.KemonopartyPostExtractor, - "#urls" : "https://kemono.party/data/88/52/88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86.jpg", + "#urls" : "https://kemono.su/data/88/52/88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86.jpg", - "revision_id": 134996, + "revision_id": 142470, + "revision_index": 2, }, { @@ -188,6 +189,7 @@ __tests__ = ( "#archive" : False, "revision_id": range(134996, 3052965), + "revision_index": range(1, 9), }, From 969be65d0b886a956d9b6ac84d315ff38b228b65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 27 Oct 2023 15:33:00 +0200 Subject: [PATCH 080/344] [instagram] 
update API headers --- gallery_dl/extractor/instagram.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index c7041833..b0789be1 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -778,13 +778,15 @@ class InstagramRestAPI(): kwargs["headers"] = { "Accept" : "*/*", "X-CSRFToken" : extr.csrf_token, - "X-Instagram-AJAX": "1006242110", "X-IG-App-ID" : "936619743392459", - "X-ASBD-ID" : "198387", + "X-ASBD-ID" : "129477", "X-IG-WWW-Claim" : extr.www_claim, "X-Requested-With": "XMLHttpRequest", - "Alt-Used" : "www.instagram.com", + "Connection" : "keep-alive", "Referer" : extr.root + "/", + "Sec-Fetch-Dest" : "empty", + "Sec-Fetch-Mode" : "cors", + "Sec-Fetch-Site" : "same-origin", } return extr.request(url, **kwargs).json() From 218295a4c6de1207ca250db7f2a807a6ba17eb6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 27 Oct 2023 17:58:02 +0200 Subject: [PATCH 081/344] [twitter] fix avatars without 'date' information (#4696) --- gallery_dl/extractor/twitter.py | 7 +++++-- test/results/twitter.py | 15 +++++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 52d962eb..63653c1f 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -287,8 +287,11 @@ class TwitterExtractor(Extractor): date = text.parse_timestamp( ((tweet_id >> 22) + 1288834974657) // 1000) else: - date = text.parse_datetime( - tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") + try: + date = text.parse_datetime( + tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") + except Exception: + date = util.NONE tget = tweet.get tdata = { diff --git a/test/results/twitter.py b/test/results/twitter.py index fa95a046..6f9efbba 100644 --- a/test/results/twitter.py +++ b/test/results/twitter.py @@ -5,7 +5,7 @@ # published by the Free Software 
Foundation. from gallery_dl.extractor import twitter -from gallery_dl import exception +from gallery_dl import util, exception __tests__ = ( @@ -522,7 +522,7 @@ The Washington Post writes, "Three weeks after the toxic train derailment in Ohi "#url" : "https://twitter.com/supernaturepics/photo", "#category": ("", "twitter", "avatar"), "#class" : twitter.TwitterAvatarExtractor, - "#pattern" : r"https://pbs\.twimg\.com/profile_images/554585280938659841/FLVAlX18\.jpeg", + "#urls" : "https://pbs.twimg.com/profile_images/554585280938659841/FLVAlX18.jpeg", "date" : "dt:2015-01-12 10:26:49", "extension": "jpeg", @@ -537,6 +537,17 @@ The Washington Post writes, "Three weeks after the toxic train derailment in Ohi "#count" : 0, }, +{ + "#url" : "https://twitter.com/i_n_u/photo", + "#comment" : "old avatar with small ID and no valid 'date' (#4696)", + "#category": ("", "twitter", "avatar"), + "#class" : twitter.TwitterAvatarExtractor, + "#urls" : "https://pbs.twimg.com/profile_images/2946444489/32028c6affdab425e037ff5a6bf77c1d.jpeg", + + "date" : util.NONE, + "tweet_id" : 2946444489, +}, + { "#url" : "https://twitter.com/supernaturepics/header_photo", "#category": ("", "twitter", "background"), From fd36eafe32898c4563b01da42fa181a4e83823ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 27 Oct 2023 23:01:43 +0200 Subject: [PATCH 082/344] [twitter] restore truncated retweet text (#3430, #4690) --- gallery_dl/extractor/twitter.py | 50 +++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 63653c1f..cc8b8f62 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -272,28 +272,23 @@ class TwitterExtractor(Extractor): author = tweet["user"] author = self._transform_user(author) - if "note_tweet" in tweet: - note = tweet["note_tweet"]["note_tweet_results"]["result"] - else: - note = None - - source = 
tweet["source"] - if "legacy" in tweet: - tweet = tweet["legacy"] + legacy = tweet["legacy"] + else: + legacy = tweet + tget = legacy.get - tweet_id = int(tweet["id_str"]) + tweet_id = int(legacy["id_str"]) if tweet_id >= 300000000000000: date = text.parse_timestamp( ((tweet_id >> 22) + 1288834974657) // 1000) else: try: date = text.parse_datetime( - tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") + legacy["created_at"], "%a %b %d %H:%M:%S %z %Y") except Exception: date = util.NONE - tget = tweet.get tdata = { "tweet_id" : tweet_id, "retweet_id" : text.parse_int( @@ -307,8 +302,8 @@ class TwitterExtractor(Extractor): "date" : date, "author" : author, "user" : self._user or author, - "lang" : tweet["lang"], - "source" : text.extr(source, ">", "<"), + "lang" : legacy["lang"], + "source" : text.extr(tweet["source"], ">", "<"), "sensitive" : tget("possibly_sensitive"), "favorite_count": tget("favorite_count"), "quote_count" : tget("quote_count"), @@ -316,7 +311,13 @@ class TwitterExtractor(Extractor): "retweet_count" : tget("retweet_count"), } - entities = note["entity_set"] if note else tweet["entities"] + if "note_tweet" in tweet: + note = tweet["note_tweet"]["note_tweet_results"]["result"] + content = note["text"] + entities = note["entity_set"] + else: + content = tget("full_text") or tget("text") or "" + entities = legacy["entities"] hashtags = entities.get("hashtags") if hashtags: @@ -330,8 +331,7 @@ class TwitterExtractor(Extractor): "nick": u["name"], } for u in mentions] - content = text.unescape( - note["text"] if note else tget("full_text") or tget("text") or "") + content = text.unescape(content) urls = entities.get("urls") if urls: for url in urls: @@ -339,11 +339,13 @@ class TwitterExtractor(Extractor): txt, _, tco = content.rpartition(" ") tdata["content"] = txt if tco.startswith("https://t.co/") else content - if "in_reply_to_screen_name" in tweet: - tdata["reply_to"] = tweet["in_reply_to_screen_name"] - if "quoted_by" in tweet: - tdata["quote_by"] = 
tweet["quoted_by"] + if "in_reply_to_screen_name" in legacy: + tdata["reply_to"] = legacy["in_reply_to_screen_name"] + if "quoted_by" in legacy: + tdata["quote_by"] = legacy["quoted_by"] if tdata["retweet_id"]: + tdata["content"] = "RT @{}: {}".format( + author["name"], tdata["content"]) tdata["date_original"] = text.parse_timestamp( ((tdata["retweet_id"] >> 22) + 1288834974657) // 1000) @@ -1532,15 +1534,21 @@ class TwitterAPI(): retweet["core"]["user_results"]["result"] rtlegacy = retweet["legacy"] + + if "note_tweet" in retweet: + tweet["note_tweet"] = retweet["note_tweet"] + if "extended_entities" in rtlegacy and \ "extended_entities" not in legacy: legacy["extended_entities"] = \ rtlegacy["extended_entities"] + if "withheld_scope" in rtlegacy and \ "withheld_scope" not in legacy: legacy["withheld_scope"] = \ rtlegacy["withheld_scope"] - legacy["full_text"] = rtlegacy["full_text"] + + legacy["full_text"] = rtlegacy["full_text"] except KeyError: pass From 28ada11cbaf2f5c0024b72df9956b2e916ec751d Mon Sep 17 00:00:00 2001 From: Tobias Hellmann <10066140+Tobi823@users.noreply.github.com> Date: Sat, 28 Oct 2023 22:27:26 +0200 Subject: [PATCH 083/344] Try to parse newer HTTP response from Patreon --- gallery_dl/extractor/patreon.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 0975992f..6e05afd5 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -249,8 +249,14 @@ class PatreonExtractor(Extractor): return [genmap[ft] for ft in filetypes] def _extract_bootstrap(self, page): - return util.json_loads(text.extr( - page, "window.patreon.bootstrap,", "});") + "}") + if "window.patreon.bootstrap," in page: + return util.json_loads(text.extr( + page, "window.patreon.bootstrap,", "});") + "}") + elif 'window.patreon = {"bootstrap":' in page: + return util.json_loads(text.extr( + page, 'window.patreon = {"bootstrap":', '},"apiServer"') + "}") 
+ else: + raise Exception("unknown HTML and JS structure") class PatreonCreatorExtractor(PatreonExtractor): From c9a2be36d494f7a9ff6bac846b38532af8d609ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 29 Oct 2023 13:48:42 +0100 Subject: [PATCH 084/344] [sankaku] support '/posts/' tag search URLs (#4740) --- gallery_dl/extractor/sankaku.py | 2 +- test/results/sankaku.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index dc355112..bebea2a2 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -87,7 +87,7 @@ class SankakuTagExtractor(SankakuExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/?\?([^#]*)" + pattern = BASE_PATTERN + r"(?:/posts)?/?\?([^#]*)" example = "https://sankaku.app/?tags=TAG" def __init__(self, match): diff --git a/test/results/sankaku.py b/test/results/sankaku.py index 9a1738a7..37330f26 100644 --- a/test/results/sankaku.py +++ b/test/results/sankaku.py @@ -53,6 +53,20 @@ __tests__ = ( "#class" : sankaku.SankakuTagExtractor, }, +{ + "#url" : "https://chan.sankakucomplex.com/posts?tags=TAG", + "#comment" : "'/posts' in tag search URL (#4740)", + "#category": ("booru", "sankaku", "tag"), + "#class" : sankaku.SankakuTagExtractor, +}, + +{ + "#url" : "https://chan.sankakucomplex.com/ja/posts/?tags=あえいおう", + "#comment" : "'/posts' in tag search URL (#4740)", + "#category": ("booru", "sankaku", "tag"), + "#class" : sankaku.SankakuTagExtractor, +}, + { "#url" : "https://chan.sankakucomplex.com/?tags=bonocho+a+b+c+d", "#comment" : "error on five or more tags", From e46efbd5b58168d02e29f85f8174bb8f84fa2ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 29 Oct 2023 15:32:11 +0100 Subject: [PATCH 085/344] prevent crash when 'stdout.line_buffering' is not defined (#642) --- 
gallery_dl/output.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 4f2ee269..9508ff33 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -254,14 +254,14 @@ def stderr_write_flush(s): sys.stderr.flush() -if sys.stdout.line_buffering: +if getattr(sys.stdout, "line_buffering", None): def stdout_write(s): sys.stdout.write(s) else: stdout_write = stdout_write_flush -if sys.stderr.line_buffering: +if getattr(sys.stderr, "line_buffering", None): def stderr_write(s): sys.stderr.write(s) else: From 4730de163f44ab59b21a28ad4253cd14d55958bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 29 Oct 2023 15:57:21 +0100 Subject: [PATCH 086/344] [patreon] refactor _extract_bootstrap() --- gallery_dl/extractor/patreon.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 6e05afd5..351c5e3c 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -249,14 +249,23 @@ class PatreonExtractor(Extractor): return [genmap[ft] for ft in filetypes] def _extract_bootstrap(self, page): - if "window.patreon.bootstrap," in page: - return util.json_loads(text.extr( - page, "window.patreon.bootstrap,", "});") + "}") - elif 'window.patreon = {"bootstrap":' in page: - return util.json_loads(text.extr( - page, 'window.patreon = {"bootstrap":', '},"apiServer"') + "}") - else: - raise Exception("unknown HTML and JS structure") + bootstrap = text.extr( + page, 'window.patreon = {"bootstrap":', '},"apiServer"') + if bootstrap: + return util.json_loads(bootstrap + "}") + + bootstrap = text.extr(page, "window.patreon.bootstrap,", "});") + if bootstrap: + return util.json_loads(bootstrap + "}") + + data = text.extr(page, "window.patreon = {", "};\n") + if data: + try: + return util.json_loads("{" + data + "}")["bootstrap"] + except Exception: + pass + + 
raise exception.StopExtraction("Unable to extract bootstrap data") class PatreonCreatorExtractor(PatreonExtractor): From fd8f58ad76f2226196bb3e99c0d16c92e01bb4f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 30 Oct 2023 13:37:09 +0100 Subject: [PATCH 087/344] [behance] unescape embed URLs (#4742) --- gallery_dl/extractor/behance.py | 3 ++- test/results/behance.py | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index fc5f9eff..a92918e9 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -170,7 +170,8 @@ class BehanceGalleryExtractor(BehanceExtractor): elif mtype == "EmbedModule": embed = module.get("originalEmbed") or module.get("fluidEmbed") if embed: - append(("ytdl:" + text.extr(embed, 'src="', '"'), module)) + embed = text.unescape(text.extr(embed, 'src="', '"')) + append(("ytdl:" + embed, module)) return result diff --git a/test/results/behance.py b/test/results/behance.py index 7b39b5b4..2a23b3ed 100644 --- a/test/results/behance.py +++ b/test/results/behance.py @@ -13,8 +13,10 @@ __tests__ = ( "#url" : "https://www.behance.net/gallery/17386197/A-Short-Story", "#category": ("", "behance", "gallery"), "#class" : behance.BehanceGalleryExtractor, - "#count" : 2, - "#sha1_url": "ab79bd3bef8d3ae48e6ac74fd995c1dfaec1b7d2", + "#urls" : ( + "ytdl:https://player.vimeo.com/video/97189640?title=0&byline=0&portrait=0&color=ffffff", + "https://mir-s3-cdn-cf.behance.net/project_modules/source/a5a12417386197.562bc055a107d.jpg", + ), "id" : 17386197, "name" : r"re:\"Hi\". 
A short story about the important things ", From 68e72a836c0cb023d0229354aebde9cbf3b21099 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 30 Oct 2023 13:17:47 +0100 Subject: [PATCH 088/344] [exhentai] fix extraction (#4730) - update to new API response layout - use proper API server URL - fix 'filesize' metadata --- gallery_dl/extractor/exhentai.py | 55 +++++++++++++++++++------------- test/results/exhentai.py | 2 +- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 44bfe7d8..268385ef 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -112,12 +112,15 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): def __init__(self, match): ExhentaiExtractor.__init__(self, match) - self.key = {} - self.count = 0 self.gallery_id = text.parse_int(match.group(2) or match.group(5)) self.gallery_token = match.group(3) self.image_token = match.group(4) self.image_num = text.parse_int(match.group(6), 1) + self.key_start = None + self.key_show = None + self.key_next = None + self.api_url = "" + self.count = 0 def _init(self): source = self.config("source") @@ -145,17 +148,17 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): gpage = self._gallery_page() self.image_token = text.extr(gpage, 'hentai.org/s/', '"') if not self.image_token: - self.log.error("Failed to extract initial image token") self.log.debug("Page content:\n%s", gpage) - return + raise exception.StopExtraction( + "Failed to extract initial image token") ipage = self._image_page() else: ipage = self._image_page() part = text.extr(ipage, 'hentai.org/g/', '"') if not part: - self.log.error("Failed to extract gallery token") self.log.debug("Page content:\n%s", ipage) - return + raise exception.StopExtraction( + "Failed to extract gallery token") self.gallery_token = part.split("/")[1] gpage = self._gallery_page() @@ -208,6 +211,8 @@ class 
ExhentaiGalleryExtractor(ExhentaiExtractor): def metadata_from_page(self, page): extr = text.extract_from(page) + self.api_url = extr('var api_url = "', '"') or (self.root + "/api.php") + data = { "gid" : self.gallery_id, "token" : self.gallery_token, @@ -225,7 +230,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): '>Visible: + + + + + + diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 22e4fe34..efdcde78 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -147,6 +147,7 @@ modules = [ "tapas", "tcbscans", "telegraph", + "tmohentai", "toyhouse", "tsumino", "tumblr", diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py new file mode 100644 index 00000000..462e51dd --- /dev/null +++ b/gallery_dl/extractor/tmohentai.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://tmohentai.com/""" + +from .common import Extractor, Message +from .. 
import text + +BASE_PATTERN = r'(?:https?://)?tmohentai\.com' + + +class TmohentaiExtractor(Extractor): + category = 'tmohentai' + root = 'http://tmohentai.com' + directory_fmt = ('{category}', '{title}') + filename_fmt = '{filename}.{extension}' + archive_fmt = '{title}_{filename}' + pattern = BASE_PATTERN + r'/((contents)|(reader))/(\w+)' + example = 'https://tmohentai.com/contents/12345a67b89c0' + + def __init__(self, match): + Extractor.__init__(self, match) + self.contents = match.group(2) + self.reader = match.group(3) + self.id_string = match.group(4) + + def parse_location(self): + if self.contents: + url = f'{self.root}/reader/{self.id_string}/paginated' + else: + url = self.url + return url + + def items(self): + url = self.parse_location() + page_src = self.request( + text.ensure_http_scheme(url)).text + + data = self.metadata() + yield Message.Directory, data + + page_nums = text.extract_iter(page_src, 'option value="', '"') + pages = [text.extr(page_src, 'data-original="', '"')] + base_page = pages[0].rpartition('/')[0] + for num, page in enumerate(page_nums, start=1): + file = f'{base_page}/{num:>03}.webp' + img = text.nameext_from_url(file, { + 'num': num, + }) + yield Message.Url, file, img + + def metadata(self): + contents = f'{self.root}/contents/{self.id_string}' + contents_src = self.request(text.ensure_http_scheme(contents)).text + + genders_src = text.extr(contents_src, 'Genders', '') + genders_list = text.extract_iter(genders_src, '">', '') + + tags_src = text.extr(contents_src, 'Tags', '') + tags_list = text.extract_iter(tags_src, '">', '') + + upload_src = text.extr(contents_src, 'Uploaded By', '/a>') + data = { + 'title' : text.extr(contents_src, '

', '

'), + 'id_string': self.id_string, + 'artists' : text.remove_html( + text.extr(contents_src, 'tag tag-accepted">', '')), + 'genders' : list(genders_list), + 'tags' : list(tags_list), + 'uploader' : text.extr(upload_src, '">', '<'), + 'language' : text.extr( + contents_src, ' ', ''), + } + return data From dad7ba1d581b0829ccc6c4cd8a44efcb58cb3d51 Mon Sep 17 00:00:00 2001 From: jsouthgb Date: Fri, 17 Nov 2023 21:08:34 -0500 Subject: [PATCH 126/344] [tmohentai] fix edge cases. updated archive_fmt and filename_fmt --- gallery_dl/extractor/tmohentai.py | 34 ++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index 462e51dd..0a56b230 100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -16,8 +16,8 @@ class TmohentaiExtractor(Extractor): category = 'tmohentai' root = 'http://tmohentai.com' directory_fmt = ('{category}', '{title}') - filename_fmt = '{filename}.{extension}' - archive_fmt = '{title}_{filename}' + filename_fmt = '{title}_{filename}.{extension}' + archive_fmt = '{id_string}_{filename}' pattern = BASE_PATTERN + r'/((contents)|(reader))/(\w+)' example = 'https://tmohentai.com/contents/12345a67b89c0' @@ -31,9 +31,20 @@ class TmohentaiExtractor(Extractor): if self.contents: url = f'{self.root}/reader/{self.id_string}/paginated' else: - url = self.url + url_str = self.url.rpartition('/') + if url_str[-1].isdigit(): + url = url_str[0] + else: + url = self.url return url + @staticmethod + def get_file_info(page_src): + file = text.extr(page_src, 'data-original="', '"') + file_loc, _, file_name = file.rpartition('/') + start_num, ext = file_name.split('.') + return file_loc, start_num, ext + def items(self): url = self.parse_location() page_src = self.request( @@ -42,13 +53,16 @@ class TmohentaiExtractor(Extractor): data = self.metadata() yield Message.Directory, data - page_nums = text.extract_iter(page_src, 'option 
value="', '"') - pages = [text.extr(page_src, 'data-original="', '"')] - base_page = pages[0].rpartition('/')[0] - for num, page in enumerate(page_nums, start=1): - file = f'{base_page}/{num:>03}.webp' + file_loc, start_num, ext = self.get_file_info(page_src) + page_nums = text.extract_iter( + page_src, 'option value="', '"') + + for num, page in enumerate(page_nums, start=int(start_num)): + file = f'{file_loc}/{num:>03}.{ext}' img = text.nameext_from_url(file, { - 'num': num, + 'num' : num, + 'title' : data['title'], + 'id_string': self.id_string, }) yield Message.Url, file, img @@ -64,7 +78,7 @@ class TmohentaiExtractor(Extractor): upload_src = text.extr(contents_src, 'Uploaded By', '/a>') data = { - 'title' : text.extr(contents_src, '

', '

'), + 'title' : text.extr(contents_src, '

', '

').strip(), 'id_string': self.id_string, 'artists' : text.remove_html( text.extr(contents_src, 'tag tag-accepted">', '')), From ed965eecbb42adaebe9ddbbd4a8bfe48214b1ae7 Mon Sep 17 00:00:00 2001 From: jsouthgb Date: Sat, 18 Nov 2023 14:39:17 -0500 Subject: [PATCH 127/344] [tmohentai] refactor to str.format for backwards compatibility --- gallery_dl/extractor/tmohentai.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index 0a56b230..a02b8e8a 100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -29,7 +29,7 @@ class TmohentaiExtractor(Extractor): def parse_location(self): if self.contents: - url = f'{self.root}/reader/{self.id_string}/paginated' + url = '{}/reader/{}/paginated'.format(self.root, self.id_string) else: url_str = self.url.rpartition('/') if url_str[-1].isdigit(): @@ -58,7 +58,7 @@ class TmohentaiExtractor(Extractor): page_src, 'option value="', '"') for num, page in enumerate(page_nums, start=int(start_num)): - file = f'{file_loc}/{num:>03}.{ext}' + file = '{}/{:>03}.{}'.format(file_loc, num, ext) img = text.nameext_from_url(file, { 'num' : num, 'title' : data['title'], @@ -67,7 +67,7 @@ class TmohentaiExtractor(Extractor): yield Message.Url, file, img def metadata(self): - contents = f'{self.root}/contents/{self.id_string}' + contents = '{}/contents/{}'.format(self.root, self.id_string) contents_src = self.request(text.ensure_http_scheme(contents)).text genders_src = text.extr(contents_src, 'Genders', '') From e97d7b1c8587080a63a7668f6e8975ece38702a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 18 Nov 2023 20:57:28 +0100 Subject: [PATCH 128/344] [exhentai] fix empty api_url with '"source": "hitomi"' (#4829) --- gallery_dl/extractor/exhentai.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 
db074b0f..5f9ccb21 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -40,6 +40,7 @@ class ExhentaiExtractor(Extractor): if domain == "auto": domain = ("ex" if self.version == "ex" else "e-") + "hentai.org" self.root = "https://" + domain + self.api_url = self.root + "/api.php" self.cookies_domain = "." + domain Extractor.initialize(self) @@ -120,7 +121,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.key_start = None self.key_show = None self.key_next = None - self.api_url = "" self.count = 0 def _init(self): @@ -220,7 +220,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): def metadata_from_page(self, page): extr = text.extract_from(page) - self.api_url = extr('var api_url = "', '"') or (self.root + "/api.php") + + api_url = extr('var api_url = "', '"') + if api_url: + self.api_url = api_url data = { "gid" : self.gallery_id, From 34a387b6e2159fa3692df282fdfe81d1524f2971 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 18 Nov 2023 23:43:40 +0100 Subject: [PATCH 129/344] support 'metadata-*' names for '*-metadata' options For example, instead of 'url-metadata' it is now also possible to use 'metadata-url' as option name. - metadata-url - metadata-path - metadata-http - metadata-version - metadata-parent --- docs/configuration.rst | 18 ++++++++++-------- gallery_dl/extractor/common.py | 6 ++++++ gallery_dl/job.py | 15 +++++++-------- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index c22bd8c2..b0d09b7f 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -166,6 +166,8 @@ Description extractor.*.parent-metadata --------------------------- +extractor.*.metadata-parent +--------------------------- Type * ``bool`` * ``string`` @@ -642,12 +644,12 @@ Description `format strings`_. 
+extractor.*.metadata-url +------------------------ extractor.*.url-metadata ------------------------ Type ``string`` -Default - ``null`` Description Insert a file's download URL into its metadata dictionary as the given name. @@ -658,12 +660,12 @@ Description with a ``metadata`` post processor, etc. +extractor.*.metadata-path +------------------------- extractor.*.path-metadata ------------------------- Type ``string`` -Default - ``null`` Description Insert a reference to the current `PathFormat `__ @@ -673,12 +675,12 @@ Description to access the current file's filename as ``"{gdl_path.filename}"``. +extractor.*.metadata-http +------------------------- extractor.*.http-metadata ------------------------- Type ``string`` -Default - ``null`` Description Insert an ``object`` containing a file's HTTP headers and ``filename``, ``extension``, and ``date`` parsed from them @@ -689,12 +691,12 @@ Description and its parsed form as ``"{gdl_http[date]}"``. +extractor.*.metadata-version +---------------------------- extractor.*.version-metadata ---------------------------- Type ``string`` -Default - ``null`` Description Insert an ``object`` containing gallery-dl's version info into metadata dictionaries as the given name. 
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 3bec4248..f3784272 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -78,6 +78,12 @@ class Extractor(): def config(self, key, default=None): return config.interpolate(self._cfgpath, key, default) + def config2(self, key, key2, default=None, sentinel=util.SENTINEL): + value = self.config(key, sentinel) + if value is not sentinel: + return value + return self.config(key2, default) + def config_deprecated(self, key, deprecated, default=None, sentinel=util.SENTINEL, history=set()): value = self.config(deprecated, sentinel) diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 1e80cbf3..9aad226f 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -87,11 +87,10 @@ class Job(): extr.category = pextr.category extr.subcategory = pextr.subcategory - self.metadata_url = extr.config("url-metadata") - self.metadata_http = extr.config("http-metadata") - - version_info = extr.config("version-metadata") - metadata_path = extr.config("path-metadata") + self.metadata_url = extr.config2("metadata-url", "url-metadata") + self.metadata_http = extr.config2("metadata-http", "http-metadata") + metadata_path = extr.config2("metadata-path", "path-metadata") + metadata_version = extr.config2("metadata-version", "version-metadata") # user-supplied metadata kwdict = extr.config("keywords") @@ -99,8 +98,8 @@ class Job(): self.kwdict.update(kwdict) if metadata_path: self.kwdict[metadata_path] = path_proxy - if version_info: - self.kwdict[version_info] = { + if metadata_version: + self.kwdict[metadata_version] = { "version" : version.__version__, "is_executable" : util.EXECUTABLE, "current_git_head": util.git_head() @@ -375,7 +374,7 @@ class DownloadJob(Job): else: extr._parentdir = pextr._parentdir - pmeta = pextr.config("parent-metadata") + pmeta = pextr.config2("parent-metadata", "metadata-parent") if pmeta: if isinstance(pmeta, str): data = self.kwdict.copy() From 
12818ce9a5ca5b8baaafb6caf903eb417412a067 Mon Sep 17 00:00:00 2001 From: Nitrousoxide Date: Sun, 19 Nov 2023 10:34:43 -0500 Subject: [PATCH 130/344] add simple dockerfile --- Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..ee9d1a94 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,6 @@ +FROM python:alpine +RUN python3 -m pip install -U gallery-dl +RUN python3 -m pip install -U youtube_dl +RUN apk update +RUN apk add ffmpeg +ENTRYPOINT [ "gallery-dl" ] \ No newline at end of file From 43c6b914a2db5800e97cafd875f408d242ca6189 Mon Sep 17 00:00:00 2001 From: Nitrousoxide Date: Sun, 19 Nov 2023 11:14:24 -0500 Subject: [PATCH 131/344] Update Dockerfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit changed youtube_dl to yt-dlp and combined install line Co-authored-by: Jouni Järvinen --- Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index ee9d1a94..110505d3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,5 @@ FROM python:alpine -RUN python3 -m pip install -U gallery-dl -RUN python3 -m pip install -U youtube_dl +RUN python3 -m pip install -U gallery-dl yt-dlp RUN apk update RUN apk add ffmpeg ENTRYPOINT [ "gallery-dl" ] \ No newline at end of file From ec99d24b18d5d5af97709ddd6f81da3c798e80d0 Mon Sep 17 00:00:00 2001 From: Nitrousoxide Date: Mon, 20 Nov 2023 11:53:48 -0500 Subject: [PATCH 132/344] Update Dockerfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mike Fährmann --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 110505d3..d0d88e00 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,4 +2,4 @@ FROM python:alpine RUN python3 -m pip install -U gallery-dl yt-dlp RUN apk update RUN apk add ffmpeg -ENTRYPOINT [ "gallery-dl" ] \ No newline at end of file 
+ENTRYPOINT [ "gallery-dl" ] From aea15f6d173ca8f7dfdbfba5facef0688b27c004 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 20 Nov 2023 22:16:15 +0100 Subject: [PATCH 133/344] add 'metadata-extractor' option (#4549) --- docs/configuration.rst | 12 ++++++++++++ gallery_dl/job.py | 12 ++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index b0d09b7f..ae9edb5c 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -675,6 +675,18 @@ Description to access the current file's filename as ``"{gdl_path.filename}"``. +extractor.*.metadata-extractor +------------------------------ +extractor.*.extractor-metadata +------------------------------ +Type + ``string`` +Description + Insert a reference to the current + `Extractor `__ + object into metadata dictionaries as the given name. + + extractor.*.metadata-http ------------------------- extractor.*.http-metadata diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 9aad226f..ac2ac7ae 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -91,19 +91,23 @@ class Job(): self.metadata_http = extr.config2("metadata-http", "http-metadata") metadata_path = extr.config2("metadata-path", "path-metadata") metadata_version = extr.config2("metadata-version", "version-metadata") + metadata_extractor = extr.config2( + "metadata-extractor", "extractor-metadata") - # user-supplied metadata - kwdict = extr.config("keywords") - if kwdict: - self.kwdict.update(kwdict) if metadata_path: self.kwdict[metadata_path] = path_proxy + if metadata_extractor: + self.kwdict[metadata_extractor] = extr if metadata_version: self.kwdict[metadata_version] = { "version" : version.__version__, "is_executable" : util.EXECUTABLE, "current_git_head": util.git_head() } + # user-supplied metadata + kwdict = extr.config("keywords") + if kwdict: + self.kwdict.update(kwdict) def run(self): """Execute or run the job""" From 
a43cf78bb7829d0cfa360cb0a50f2950305255d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 20 Nov 2023 22:41:12 +0100 Subject: [PATCH 134/344] [erome] tests --- gallery_dl/extractor/erome.py | 6 ++++-- test/results/erome.py | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 6e77d2f2..6a0e069a 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -44,13 +44,15 @@ class EromeExtractor(Extractor): pos = page.index('
', '') + count, pos = text.extract( + page, 'fa-camera">', '', pos) + data = { "album_id" : album_id, "title" : text.unescape(title), "user" : text.unquote(user), "_http_headers": {"Referer": url}, - "count" : count, + "count" : text.parse_int(count), } yield Message.Directory, data diff --git a/test/results/erome.py b/test/results/erome.py index f3269c04..40f34aad 100644 --- a/test/results/erome.py +++ b/test/results/erome.py @@ -16,6 +16,7 @@ __tests__ = ( "#count" : 1, "album_id": "NQgdlWvk", + "count" : 1, "num" : 1, "title" : "porn", "user" : "yYgWBZw8o8qsMzM", @@ -29,6 +30,7 @@ __tests__ = ( "#count" : 6, "album_id": "TdbZ4ogi", + "count" : 6, "num" : int, "title" : "82e78cfbb461ad87198f927fcb1fda9a1efac9ff.", "user" : "yYgWBZw8o8qsMzM", From 31963fa9478cbfeb55b8f98e283266b83e819dc1 Mon Sep 17 00:00:00 2001 From: jsouthgb Date: Mon, 20 Nov 2023 21:35:32 -0500 Subject: [PATCH 135/344] [tmohentai] inherit from GalleryExtractor. refactor metadata. --- gallery_dl/extractor/tmohentai.py | 75 ++++++++++++------------------- 1 file changed, 28 insertions(+), 47 deletions(-) diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index a02b8e8a..ef05f989 100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -6,14 +6,15 @@ """Extractors for https://tmohentai.com/""" -from .common import Extractor, Message +from .common import GalleryExtractor, Message from .. 
import text BASE_PATTERN = r'(?:https?://)?tmohentai\.com' -class TmohentaiExtractor(Extractor): +class TmohentaiGalleryExtractor(GalleryExtractor): category = 'tmohentai' + subcategory = 'gallery' root = 'http://tmohentai.com' directory_fmt = ('{category}', '{title}') filename_fmt = '{title}_{filename}.{extension}' @@ -22,71 +23,51 @@ class TmohentaiExtractor(Extractor): example = 'https://tmohentai.com/contents/12345a67b89c0' def __init__(self, match): - Extractor.__init__(self, match) + GalleryExtractor.__init__(self, match) self.contents = match.group(2) self.reader = match.group(3) self.id_string = match.group(4) def parse_location(self): - if self.contents: - url = '{}/reader/{}/paginated'.format(self.root, self.id_string) - else: - url_str = self.url.rpartition('/') - if url_str[-1].isdigit(): - url = url_str[0] - else: - url = self.url + url = self.url + if self.reader: + url = '{}/contents/{}'.format(self.root, self.id_string) return url - @staticmethod - def get_file_info(page_src): - file = text.extr(page_src, 'data-original="', '"') - file_loc, _, file_name = file.rpartition('/') - start_num, ext = file_name.split('.') - return file_loc, start_num, ext - def items(self): url = self.parse_location() - page_src = self.request( + page = self.request( text.ensure_http_scheme(url)).text + data = self.metadata(page) - data = self.metadata() yield Message.Directory, data + imgs = self.images(page) - file_loc, start_num, ext = self.get_file_info(page_src) - page_nums = text.extract_iter( - page_src, 'option value="', '"') - - for num, page in enumerate(page_nums, start=int(start_num)): - file = '{}/{:>03}.{}'.format(file_loc, num, ext) - img = text.nameext_from_url(file, { - 'num' : num, + cdn = 'https://imgrojo.tmohentai.com/contents' + for num, _ in enumerate(imgs, start=0): + url = ('{}/{}/{:>03}.webp'.format(cdn, self.id_string, num)) + img = text.nameext_from_url(url, { + 'num' : num + 1, 'title' : data['title'], 'id_string': self.id_string, }) - yield 
Message.Url, file, img - - def metadata(self): - contents = '{}/contents/{}'.format(self.root, self.id_string) - contents_src = self.request(text.ensure_http_scheme(contents)).text + yield Message.Url, url, img - genders_src = text.extr(contents_src, 'Genders', '') - genders_list = text.extract_iter(genders_src, '">', '') + def images(self, page): + pages = text.extract_iter( + page, 'class="lanzador', '>') + return pages - tags_src = text.extr(contents_src, 'Tags', '') - tags_list = text.extract_iter(tags_src, '">', '') + def metadata(self, page): + extr = text.extract_from(page, page.index('tag tag-accepted">')) - upload_src = text.extr(contents_src, 'Uploaded By', '/a>') data = { - 'title' : text.extr(contents_src, '

', '

').strip(), + 'title' : text.extr(page, '

', '

').strip(), 'id_string': self.id_string, - 'artists' : text.remove_html( - text.extr(contents_src, 'tag tag-accepted">', '')), - 'genders' : list(genders_list), - 'tags' : list(tags_list), - 'uploader' : text.extr(upload_src, '">', '<'), - 'language' : text.extr( - contents_src, ' ', ''), + 'artists' : text.remove_html(extr('">', '')), + 'genders' : text.split_html(extr('Genders', '', '')), + 'uploader' : text.remove_html(extr('Uploaded By', '')), + 'language' : extr(' ', '\n'), } return data From 714b1a7089aafdfef4eb2e8b74c7faef4564083a Mon Sep 17 00:00:00 2001 From: jsouthgb Date: Tue, 21 Nov 2023 10:46:48 -0500 Subject: [PATCH 136/344] [tmohentai] simplify url matching --- gallery_dl/extractor/tmohentai.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index ef05f989..d4e16086 100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -19,25 +19,17 @@ class TmohentaiGalleryExtractor(GalleryExtractor): directory_fmt = ('{category}', '{title}') filename_fmt = '{title}_{filename}.{extension}' archive_fmt = '{id_string}_{filename}' - pattern = BASE_PATTERN + r'/((contents)|(reader))/(\w+)' + pattern = BASE_PATTERN + r'/(contents)|(reader)/(\w+)' example = 'https://tmohentai.com/contents/12345a67b89c0' def __init__(self, match): - GalleryExtractor.__init__(self, match) - self.contents = match.group(2) - self.reader = match.group(3) - self.id_string = match.group(4) - - def parse_location(self): - url = self.url - if self.reader: - url = '{}/contents/{}'.format(self.root, self.id_string) - return url + self.id_string = match.group(2) + url = '{}/contents/{}'.format(self.root, self.id_string) + GalleryExtractor.__init__(self, match, url) def items(self): - url = self.parse_location() page = self.request( - text.ensure_http_scheme(url)).text + text.ensure_http_scheme(self.url)).text data = self.metadata(page) yield 
Message.Directory, data @@ -61,7 +53,7 @@ class TmohentaiGalleryExtractor(GalleryExtractor): def metadata(self, page): extr = text.extract_from(page, page.index('tag tag-accepted">')) - data = { + return { 'title' : text.extr(page, '

', '

').strip(), 'id_string': self.id_string, 'artists' : text.remove_html(extr('">', '')), @@ -70,4 +62,3 @@ class TmohentaiGalleryExtractor(GalleryExtractor): 'uploader' : text.remove_html(extr('Uploaded By', '')), 'language' : extr(' ', '\n'), } - return data From e17a48fe5612692dde00371e33eaf087113a5991 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 21 Nov 2023 16:52:25 +0100 Subject: [PATCH 137/344] [blogger] inherit from BaseExtractor - support www.micmicidol.club (#4759) --- docs/supportedsites.md | 22 +++++--- gallery_dl/extractor/blogger.py | 36 +++++++------ scripts/supportedsites.py | 18 +++++-- test/results/blogger.py | 82 +++------------------------- test/results/blogspot.py | 95 +++++++++++++++++++++++++++++++++ test/results/micmicidol.py | 84 +++++++++++++++++++++++++++++ 6 files changed, 234 insertions(+), 103 deletions(-) create mode 100644 test/results/blogspot.py create mode 100644 test/results/micmicidol.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a15566df..fc03ef22 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -109,12 +109,6 @@ Consider all sites to be NSFW unless otherwise known.
- - - - - - @@ -998,6 +992,22 @@ Consider all sites to be NSFW unless otherwise known. + + + + + + + + + + + + + + + + diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index d75c3498..58ae59db 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -8,30 +8,22 @@ """Extractors for Blogger blogs""" -from .common import Extractor, Message +from .common import BaseExtractor, Message from .. import text, util import re -BASE_PATTERN = ( - r"(?:blogger:(?:https?://)?([^/]+)|" - r"(?:https?://)?([\w-]+\.blogspot\.com))") - -class BloggerExtractor(Extractor): +class BloggerExtractor(BaseExtractor): """Base class for blogger extractors""" - category = "blogger" - directory_fmt = ("{category}", "{blog[name]}", + basecategory = "blogger" + directory_fmt = ("blogger", "{blog[name]}", "{post[date]:%Y-%m-%d} {post[title]}") filename_fmt = "{num:>03}.{extension}" archive_fmt = "{post[id]}_{num}" - root = "https://www.blogger.com" - - def __init__(self, match): - Extractor.__init__(self, match) - self.blog = match.group(1) or match.group(2) def _init(self): self.api = BloggerAPI(self) + self.blog = self.root.rpartition("/")[2] self.videos = self.config("videos", True) def items(self): @@ -92,6 +84,18 @@ class BloggerExtractor(Extractor): """Return additional metadata""" +BASE_PATTERN = BloggerExtractor.update({ + "blogspot": { + "root": None, + "pattern": r"[\w-]+\.blogspot\.com", + }, + "micmicidol": { + "root": "https://www.micmicidol.club", + "pattern": r"(?:www\.)?micmicidol\.club", + }, +}) + + class BloggerPostExtractor(BloggerExtractor): """Extractor for a single blog post""" subcategory = "post" @@ -100,7 +104,7 @@ class BloggerPostExtractor(BloggerExtractor): def __init__(self, match): BloggerExtractor.__init__(self, match) - self.path = match.group(3) + self.path = match.group(match.lastindex) def posts(self, blog): return (self.api.post_by_path(blog["id"], self.path),) @@ -124,7 +128,7 @@ class 
BloggerSearchExtractor(BloggerExtractor): def __init__(self, match): BloggerExtractor.__init__(self, match) - self.query = text.unquote(match.group(3)) + self.query = text.unquote(match.group(match.lastindex)) def posts(self, blog): return self.api.blog_search(blog["id"], self.query) @@ -141,7 +145,7 @@ class BloggerLabelExtractor(BloggerExtractor): def __init__(self, match): BloggerExtractor.__init__(self, match) - self.label = text.unquote(match.group(3)) + self.label = text.unquote(match.group(match.lastindex)) def posts(self, blog): return self.api.blog_posts(blog["id"], self.label) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 470b629d..cd063f04 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -87,6 +87,7 @@ CATEGORY_MAP = { "mangaread" : "MangaRead", "mangasee" : "MangaSee", "mastodon.social": "mastodon.social", + "micmicidol" : "MIC MIC IDOL", "myhentaigallery": "My Hentai Gallery", "myportfolio" : "Adobe Portfolio", "naverwebtoon" : "NaverWebtoon", @@ -292,6 +293,10 @@ BASE_MAP = { "vichan" : "vichan Imageboards", } +URL_MAP = { + "blogspot": "https://www.blogger.com/", +} + _OAUTH = 'OAuth' _COOKIES = 'Cookies' _APIKEY_DB = \ @@ -362,7 +367,7 @@ IGNORE_LIST = ( def domain(cls): - """Return the web-domain related to an extractor class""" + """Return the domain name associated with an extractor class""" try: url = sys.modules[cls.__module__].__doc__.split()[-1] if url.startswith("http"): @@ -429,10 +434,13 @@ def build_extractor_list(): for category, root in extr.instances: base[category].append(extr.subcategory) if category not in domains: - if not root and results: - # use domain from first matching test - test = results.category(category)[0] - root = test["#class"].from_url(test["#url"]).root + if not root: + if category in URL_MAP: + root = URL_MAP[category].rstrip("/") + elif results: + # use domain from first matching test + test = results.category(category)[0] + root = 
test["#class"].from_url(test["#url"]).root domains[category] = root + "/" # sort subcategory lists diff --git a/test/results/blogger.py b/test/results/blogger.py index 214d450d..aeb82f76 100644 --- a/test/results/blogger.py +++ b/test/results/blogger.py @@ -8,100 +8,30 @@ from gallery_dl.extractor import blogger __tests__ = ( -{ - "#url" : "https://julianbphotography.blogspot.com/2010/12/moon-rise.html", - "#category": ("", "blogger", "post"), - "#class" : blogger.BloggerPostExtractor, - "#pattern" : "https://3.bp.blogspot.com/.*/s0/Icy-Moonrise-.*.jpg", - "#sha1_url": "9928429fb62f712eb4de80f53625eccecc614aae", - - "blog": { - "date" : "dt:2010-11-21 18:19:42", - "description": "", - "id" : "5623928067739466034", - "kind" : "blogger#blog", - "locale" : dict, - "name" : "Julian Bunker Photography", - "pages" : int, - "posts" : int, - "published" : "2010-11-21T10:19:42-08:00", - "updated" : str, - "url" : "http://julianbphotography.blogspot.com/", - }, - "post": { - "author" : "Julian Bunker", - "content" : str, - "date" : "dt:2010-12-26 01:08:00", - "etag" : str, - "id" : "6955139236418998998", - "kind" : "blogger#post", - "published": "2010-12-25T17:08:00-08:00", - "replies" : "0", - "title" : "Moon Rise", - "updated" : "2011-12-06T05:21:24-08:00", - "url" : r"re:.+/2010/12/moon-rise.html$", - }, - "num" : int, - "url" : str, -}, - { "#url" : "blogger:http://www.julianbunker.com/2010/12/moon-rise.html", - "#category": ("", "blogger", "post"), - "#class" : blogger.BloggerPostExtractor, -}, - -{ - "#url" : "http://cfnmscenesinmovies.blogspot.com/2011/11/cfnm-scene-jenna-fischer-in-office.html", - "#comment" : "video (#587)", - "#category": ("", "blogger", "post"), + "#category": ("blogger", "www.julianbunker.com", "post"), "#class" : blogger.BloggerPostExtractor, - "#pattern" : r"https://.+\.googlevideo\.com/videoplayback", -}, - -{ - "#url" : "https://randomthingsthroughmyletterbox.blogspot.com/2022/01/bitter-flowers-by-gunnar-staalesen-blog.html", - "#comment" : 
"new image domain (#2204)", - "#category": ("", "blogger", "post"), - "#class" : blogger.BloggerPostExtractor, - "#pattern" : "https://blogger.googleusercontent.com/img/a/.+=s0$", - "#count" : 8, -}, - -{ - "#url" : "https://julianbphotography.blogspot.com/", - "#category": ("", "blogger", "blog"), - "#class" : blogger.BloggerBlogExtractor, - "#pattern" : r"https://\d\.bp\.blogspot\.com/.*/s0/[^.]+\.jpg", - "#range" : "1-25", - "#count" : 25, }, { "#url" : "blogger:https://www.kefblog.com.ng/", - "#category": ("", "blogger", "blog"), + "#category": ("blogger", "www.kefblog.com.ng", "blog"), "#class" : blogger.BloggerBlogExtractor, "#range" : "1-25", "#count" : 25, }, { - "#url" : "https://julianbphotography.blogspot.com/search?q=400mm", - "#category": ("", "blogger", "search"), + "#url" : "blogger:http://www.julianbunker.com/search?q=400mm", + "#category": ("blogger", "1www.julianbunker.com", "search"), "#class" : blogger.BloggerSearchExtractor, - "#count" : "< 10", - - "query": "400mm", }, { - "#url" : "https://dmmagazine.blogspot.com/search/label/D%26D", - "#category": ("", "blogger", "label"), + "#url" : "blogger:http://www.julianbunker.com/search/label/D%26D", + "#category": ("blogger", "www.julianbunker.com", "label"), "#class" : blogger.BloggerLabelExtractor, - "#range" : "1-25", - "#count" : 25, - - "label": "D&D", }, ) diff --git a/test/results/blogspot.py b/test/results/blogspot.py new file mode 100644 index 00000000..83f4e5f7 --- /dev/null +++ b/test/results/blogspot.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import blogger + + +__tests__ = ( +{ + "#url" : "https://julianbphotography.blogspot.com/2010/12/moon-rise.html", + "#category": ("blogger", "blogspot", "post"), + "#class" : blogger.BloggerPostExtractor, + "#urls" : "https://3.bp.blogspot.com/-zlJddJtJOUo/Tt4WooTPNtI/AAAAAAAABG8/dGT2cGp2E7Y/s0/Icy-Moonrise---For-Web.jpg", + + "blog": { + "date" : "dt:2010-11-21 18:19:42", + "description": "", + "id" : "5623928067739466034", + "kind" : "blogger#blog", + "locale" : dict, + "name" : "Julian Bunker Photography", + "pages" : int, + "posts" : int, + "published" : "2010-11-21T10:19:42-08:00", + "updated" : str, + "url" : "http://julianbphotography.blogspot.com/", + }, + "post": { + "author" : "Julian Bunker", + "content" : str, + "date" : "dt:2010-12-26 01:08:00", + "etag" : str, + "id" : "6955139236418998998", + "kind" : "blogger#post", + "published": "2010-12-25T17:08:00-08:00", + "replies" : "0", + "title" : "Moon Rise", + "updated" : "2011-12-06T05:21:24-08:00", + "url" : "http://julianbphotography.blogspot.com/2010/12/moon-rise.html", + }, + "extension": "jpg", + "filename" : "Icy-Moonrise---For-Web", + "num" : 1, + "num" : int, + "url" : "https://3.bp.blogspot.com/-zlJddJtJOUo/Tt4WooTPNtI/AAAAAAAABG8/dGT2cGp2E7Y/s0/Icy-Moonrise---For-Web.jpg", +}, + +{ + "#url" : "http://cfnmscenesinmovies.blogspot.com/2011/11/cfnm-scene-jenna-fischer-in-office.html", + "#comment" : "video (#587)", + "#category": ("blogger", "blogspot", "post"), + "#class" : blogger.BloggerPostExtractor, + "#pattern" : r"https://.+\.googlevideo\.com/videoplayback", +}, + +{ + "#url" : "https://randomthingsthroughmyletterbox.blogspot.com/2022/01/bitter-flowers-by-gunnar-staalesen-blog.html", + "#comment" : "new image domain (#2204)", + "#category": ("blogger", "blogspot", "post"), + "#class" : blogger.BloggerPostExtractor, + "#pattern" : "https://blogger.googleusercontent.com/img/a/.+=s0$", + "#count" : 8, +}, + +{ + "#url" : "https://julianbphotography.blogspot.com/", + 
"#category": ("blogger", "blogspot", "blog"), + "#class" : blogger.BloggerBlogExtractor, + "#pattern" : r"https://\d\.bp\.blogspot\.com/.*/s0/[^.]+\.jpg", + "#range" : "1-25", + "#count" : 25, +}, + +{ + "#url" : "https://julianbphotography.blogspot.com/search?q=400mm", + "#category": ("blogger", "blogspot", "search"), + "#class" : blogger.BloggerSearchExtractor, + "#count" : "< 10", + + "query": "400mm", +}, + +{ + "#url" : "https://dmmagazine.blogspot.com/search/label/D%26D", + "#category": ("blogger", "blogspot", "label"), + "#class" : blogger.BloggerLabelExtractor, + "#range" : "1-25", + "#count" : 25, + + "label": "D&D", +}, + +) diff --git a/test/results/micmicidol.py b/test/results/micmicidol.py new file mode 100644 index 00000000..f66bbd75 --- /dev/null +++ b/test/results/micmicidol.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import blogger + + +__tests__ = ( +{ + "#url" : "https://www.micmicidol.club/2023/11/weekly-taishu-20231113-cover.html", + "#category": ("blogger", "micmicidol", "post"), + "#class" : blogger.BloggerPostExtractor, + "#urls" : "https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhgtpSSdrol9aKP_ztcc_mp9TUUS0U_t2DYJuGX3XCs6X5CkxIb-pM98QlxbkgJFvQj-0e6RbXNBf047qyMDZLcPJsm9dTqAn2XkTVfLhWRaxxVvIYnHYu0R0d7WsAUSFs0MDe4Sotpuqp5DQnjr45T17CXKbWtq9cR3op9dDQh3yiw2a6_HInIjLRm5io/s0/000-micmicidol.jpg", + + "blog": { + "date" : "dt:2023-09-18 19:48:53", + "description": "", + "id" : "7192714164191173242", + "kind" : "blogger#blog", + "locale" : { + "country" : "TW", + "language": "zh", + "variant" : "", + }, + "name" : "MIC MIC IDOL", + "pages" : int, + "posts" : int, + "published" : "2023-09-18T12:48:53-07:00", + "updated" : str, + "url" : "http://www.micmicidol.club/" + }, + "post": { + "author" : "MIC MIC IDOL", + "content" : " ", + "date" : "dt:2023-11-18 08:01:00", + "etag" : str, + "id" : "5395888649239375388", + "kind" : "blogger#post", + "labels" : [ + "- Cover", + "Weekly Taishu", + "Weekly Taishu Cover", + ], + "published": "2023-11-18T00:01:00-08:00", + "replies" : "0", + "title" : "Weekly Taishu 週刊大衆 2023.11.13 Cover", + "updated" : "2023-11-18T03:00:42-08:00", + "url" : "http://www.micmicidol.club/2023/11/weekly-taishu-20231113-cover.html" + }, + "num" : 1, + "url" : "https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhgtpSSdrol9aKP_ztcc_mp9TUUS0U_t2DYJuGX3XCs6X5CkxIb-pM98QlxbkgJFvQj-0e6RbXNBf047qyMDZLcPJsm9dTqAn2XkTVfLhWRaxxVvIYnHYu0R0d7WsAUSFs0MDe4Sotpuqp5DQnjr45T17CXKbWtq9cR3op9dDQh3yiw2a6_HInIjLRm5io/s0/000-micmicidol.jpg", +}, + +{ + "#url" : "https://www.micmicidol.club/", + "#category": ("blogger", "micmicidol", "blog"), + "#class" : blogger.BloggerBlogExtractor, + "#range" : "1-25", + "#count" : 25, +}, + +{ + "#url" : "https://www.micmicidol.club/search?q=cover", + "#category": ("blogger", "micmicidol", "search"), + 
"#class" : blogger.BloggerSearchExtractor, + "#range" : "1-25", + "#count" : 25, + + "query" : "cover", +}, + +{ + "#url" : "https://www.micmicidol.club/search/label/Weekly%20Taishu%20Cover", + "#category": ("blogger", "micmicidol", "label"), + "#class" : blogger.BloggerLabelExtractor, + "#range" : "1-25", + "#count" : 25, + + "label" : "Weekly Taishu Cover", +}, + +) From c4a201ed42e6679d7edc3ce98d75054f574c00fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 21 Nov 2023 20:24:07 +0100 Subject: [PATCH 138/344] [tmohentai] simplify + tests --- docs/supportedsites.md | 2 +- gallery_dl/extractor/tmohentai.py | 66 ++++++++++++------------------- scripts/supportedsites.py | 1 + test/results/tmohentai.py | 54 +++++++++++++++++++++++++ 4 files changed, 81 insertions(+), 42 deletions(-) create mode 100644 test/results/tmohentai.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 94cef0f7..8aadcde5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -830,7 +830,7 @@ Consider all sites to be NSFW unless otherwise known. - + diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index d4e16086..be45702a 100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -6,59 +6,43 @@ """Extractors for https://tmohentai.com/""" -from .common import GalleryExtractor, Message +from .common import GalleryExtractor from .. 
import text -BASE_PATTERN = r'(?:https?://)?tmohentai\.com' +BASE_PATTERN = r"(?:https?://)?tmohentai\.com" class TmohentaiGalleryExtractor(GalleryExtractor): - category = 'tmohentai' - subcategory = 'gallery' - root = 'http://tmohentai.com' - directory_fmt = ('{category}', '{title}') - filename_fmt = '{title}_{filename}.{extension}' - archive_fmt = '{id_string}_{filename}' - pattern = BASE_PATTERN + r'/(contents)|(reader)/(\w+)' - example = 'https://tmohentai.com/contents/12345a67b89c0' + category = "tmohentai" + root = "http://tmohentai.com" + directory_fmt = ("{category}", "{title} ({gallery_id})") + pattern = BASE_PATTERN + r"/(?:contents|reader)/(\w+)" + example = "https://tmohentai.com/contents/12345a67b89c0" def __init__(self, match): - self.id_string = match.group(2) - url = '{}/contents/{}'.format(self.root, self.id_string) + self.gallery_id = match.group(1) + url = "{}/contents/{}".format(self.root, self.gallery_id) GalleryExtractor.__init__(self, match, url) - def items(self): - page = self.request( - text.ensure_http_scheme(self.url)).text - data = self.metadata(page) - - yield Message.Directory, data - imgs = self.images(page) - - cdn = 'https://imgrojo.tmohentai.com/contents' - for num, _ in enumerate(imgs, start=0): - url = ('{}/{}/{:>03}.webp'.format(cdn, self.id_string, num)) - img = text.nameext_from_url(url, { - 'num' : num + 1, - 'title' : data['title'], - 'id_string': self.id_string, - }) - yield Message.Url, url, img - def images(self, page): - pages = text.extract_iter( - page, 'class="lanzador', '>') - return pages + fmt = "https://imgrojo.tmohentai.com/contents/{}/{{:>03}}.webp".format( + self.gallery_id).format + cnt = page.count('class="lanzador') + return [(fmt(i), None) for i in range(0, cnt)] def metadata(self, page): - extr = text.extract_from(page, page.index('tag tag-accepted">')) + extr = text.extract_from(page) return { - 'title' : text.extr(page, '

', '

').strip(), - 'id_string': self.id_string, - 'artists' : text.remove_html(extr('">', '')), - 'genders' : text.split_html(extr('Genders', '', '')), - 'uploader' : text.remove_html(extr('Uploaded By', '')), - 'language' : extr(' ', '\n'), + "gallery_id": self.gallery_id, + "title" : text.unescape(extr("

", "<").strip()), + "artists" : text.split_html(extr( + "", "")), + "categories": text.split_html(extr( + "", "")), + "tags" : text.split_html(extr( + "", "")), + "uploader" : text.remove_html(extr( + "", "")), + "language" : extr(" ", "\n"), } diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 470b629d..695108e0 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -121,6 +121,7 @@ CATEGORY_MAP = { "tbib" : "The Big ImageBoard", "tcbscans" : "TCB Scans", "tco" : "Twitter t.co", + "tmohentai" : "TMOHentai", "thatpervert" : "ThatPervert", "thebarchive" : "The /b/ Archive", "thecollection" : "The /co/llection", diff --git a/test/results/tmohentai.py b/test/results/tmohentai.py new file mode 100644 index 00000000..2bae050a --- /dev/null +++ b/test/results/tmohentai.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import tmohentai + + +__tests__ = ( +{ + "#url" : "https://tmohentai.com/contents/653c2aeaa693c", + "#category": ("", "tmohentai", "gallery"), + "#class" : tmohentai.TmohentaiGalleryExtractor, + "#pattern" : r"https://imgrojo\.tmohentai\.com/contents/653c2aeaa693c/\d\d\d\.webp", + "#count" : 46, + + "artists" : ["Andoryu"], + "categories": [ + "Big Breasts", + "BlowJob", + "Cheating", + "Mature", + "Milf", + "Student", + ], + "count" : 46, + "extension" : "webp", + "gallery_id": "653c2aeaa693c", + "language" : "Español", + "num" : int, + "tags" : [ + "milf", + "Madre", + "enormes pechos", + "Peluda", + "nakadashi", + "cheating", + "madura", + "sexo a escondidas", + "Ama de casa", + "mamada", + ], + "title" : "La Mama de mi Novia es tan Pervertida que no Pude Soportarlo mas", + "uploader" : "NekoCreme Fansub", +}, + +{ + "#url" : "https://tmohentai.com/reader/653c2aeaa693c/paginated/1", + "#category": ("", "tmohentai", "gallery"), + "#class" : tmohentai.TmohentaiGalleryExtractor, +}, + +) From 725c8dd55a6f23f443f2ea4c61cf51fb3515bcc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 21 Nov 2023 22:11:43 +0100 Subject: [PATCH 139/344] [tmohentai] 'categories' -> 'genres' quite likely that the site meant 'genres' by "Genders" --- gallery_dl/extractor/tmohentai.py | 2 +- test/results/tmohentai.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index be45702a..9c297279 100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -38,7 +38,7 @@ class TmohentaiGalleryExtractor(GalleryExtractor): "title" : text.unescape(extr("

", "<").strip()), "artists" : text.split_html(extr( "", "")), - "categories": text.split_html(extr( + "genres" : text.split_html(extr( "", "")), "tags" : text.split_html(extr( "", "")), diff --git a/test/results/tmohentai.py b/test/results/tmohentai.py index 2bae050a..b565ae5e 100644 --- a/test/results/tmohentai.py +++ b/test/results/tmohentai.py @@ -16,7 +16,7 @@ __tests__ = ( "#count" : 46, "artists" : ["Andoryu"], - "categories": [ + "genres" : [ "Big Breasts", "BlowJob", "Cheating", From 2402162e8a14805d1437de05d90d93461f9e42ba Mon Sep 17 00:00:00 2001 From: enduser420 <91022934+enduser420@users.noreply.github.com> Date: Wed, 22 Nov 2023 19:35:43 +0530 Subject: [PATCH 140/344] [fapello] support '.su' TLD --- gallery_dl/extractor/fapello.py | 31 +++++++++++++++++-------- test/results/fapello.py | 41 +++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/fapello.py b/gallery_dl/extractor/fapello.py index d4524e0e..aff8e616 100644 --- a/gallery_dl/extractor/fapello.py +++ b/gallery_dl/extractor/fapello.py @@ -10,6 +10,9 @@ from .common import Extractor, Message from .. 
import text, exception +BASE_PATTERN = r"(?:https?://)?(?:www\.)?fapello\.(?:com|su)" + + class FapelloPostExtractor(Extractor): """Extractor for individual posts on fapello.com""" category = "fapello" @@ -17,16 +20,16 @@ class FapelloPostExtractor(Extractor): directory_fmt = ("{category}", "{model}") filename_fmt = "{model}_{id}.{extension}" archive_fmt = "{type}_{model}_{id}" - pattern = (r"(?:https?://)?(?:www\.)?fapello\.com" - r"/(?!search/|popular_videos/)([^/?#]+)/(\d+)") + pattern = BASE_PATTERN + r"/(?!search/|popular_videos/)([^/?#]+)/(\d+)" example = "https://fapello.com/MODEL/12345/" def __init__(self, match): Extractor.__init__(self, match) + self.root = text.root_from_url(match.group(0)) self.model, self.id = match.groups() def items(self): - url = "https://fapello.com/{}/{}/".format(self.model, self.id) + url = "{}/{}/{}/".format(self.root, self.model, self.id) page = text.extr( self.request(url, allow_redirects=False).text, 'class="uk-align-center"', "", None) @@ -48,27 +51,29 @@ class FapelloModelExtractor(Extractor): """Extractor for all posts from a fapello model""" category = "fapello" subcategory = "model" - pattern = (r"(?:https?://)?(?:www\.)?fapello\.com" - r"/(?!top-(?:likes|followers)|popular_videos" + pattern = (BASE_PATTERN + r"/(?!top-(?:likes|followers)|popular_videos" r"|videos|trending|search/?$)" r"([^/?#]+)/?$") example = "https://fapello.com/model/" def __init__(self, match): Extractor.__init__(self, match) + self.root = text.root_from_url(match.group(0)) self.model = match.group(1) def items(self): num = 1 data = {"_extractor": FapelloPostExtractor} while True: - url = "https://fapello.com/ajax/model/{}/page-{}/".format( - self.model, num) + url = "{}/ajax/model/{}/page-{}/".format( + self.root, self.model, num) page = self.request(url).text if not page: return for url in text.extract_iter(page, ' Date: Wed, 22 Nov 2023 19:01:19 +0100 Subject: [PATCH 141/344] [pixeldrain] add 'file' and 'album' extractors (#4839) --- 
docs/supportedsites.md | 6 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/pixeldrain.py | 83 +++++++++++++++++++++++++++ scripts/supportedsites.py | 1 + test/results/pixeldrain.py | 92 ++++++++++++++++++++++++++++++ 5 files changed, 183 insertions(+) create mode 100644 gallery_dl/extractor/pixeldrain.py create mode 100644 test/results/pixeldrain.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2370449f..da499f82 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -637,6 +637,12 @@ Consider all sites to be NSFW unless otherwise known.

+ + + + + + diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index efdcde78..72239d5c 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -117,6 +117,7 @@ modules = [ "piczel", "pillowfort", "pinterest", + "pixeldrain", "pixiv", "pixnet", "plurk", diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py new file mode 100644 index 00000000..acfd7d4b --- /dev/null +++ b/gallery_dl/extractor/pixeldrain.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://pixeldrain.com/""" + +from .common import Extractor, Message +from .. import text + +BASE_PATTERN = r"(?:https?://)?pixeldrain\.com" + + +class PixeldrainExtractor(Extractor): + """Base class for pixeldrain extractors""" + category = "pixeldrain" + root = "https://pixeldrain.com" + archive_fmt = "{id}" + + def parse_datetime(self, date_string): + return text.parse_datetime( + date_string, "%Y-%m-%dT%H:%M:%S.%fZ") + + +class PixeldrainFileExtractor(PixeldrainExtractor): + """Extractor for pixeldrain files""" + subcategory = "file" + filename_fmt = "{filename[:230]} ({id}).{extension}" + pattern = BASE_PATTERN + r"/(?:u|api/file)/(\w+)" + example = "https://pixeldrain.com/u/abcdefgh" + + def __init__(self, match): + Extractor.__init__(self, match) + self.file_id = match.group(1) + + def items(self): + url = "{}/api/file/{}".format(self.root, self.file_id) + file = self.request(url + "/info").json() + + file["url"] = url + "?download" + file["date"] = self.parse_datetime(file["date_upload"]) + + text.nameext_from_url(file["name"], file) + yield Message.Directory, file + yield Message.Url, file["url"], file + + +class PixeldrainAlbumExtractor(PixeldrainExtractor): + """Extractor for 
pixeldrain albums""" + subcategory = "album" + directory_fmt = ("{category}", + "{album[date]:%Y-%m-%d} {album[title]} ({album[id]})") + filename_fmt = "{num:>03} {filename[:230]} ({id}).{extension}" + pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)" + example = "https://pixeldrain.com/l/abcdefgh" + + def __init__(self, match): + Extractor.__init__(self, match) + self.album_id = match.group(1) + + def items(self): + url = "{}/api/list/{}".format(self.root, self.album_id) + album = self.request(url).json() + + files = album["files"] + album["count"] = album["file_count"] + album["date"] = self.parse_datetime(album["date_created"]) + + del album["files"] + del album["file_count"] + + yield Message.Directory, {"album": album} + for num, file in enumerate(files, 1): + file["album"] = album + file["num"] = num + file["url"] = url = "{}/api/file/{}?download".format( + self.root, file["id"]) + file["date"] = self.parse_datetime(file["date_upload"]) + text.nameext_from_url(file["name"], file) + yield Message.Url, url, file diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 2922c06e..c7748bb0 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -97,6 +97,7 @@ CATEGORY_MAP = { "nsfwalbum" : "NSFWalbum.com", "paheal" : "rule #34", "photovogue" : "PhotoVogue", + "pixeldrain" : "pixeldrain", "pornimagesxxx" : "Porn Image", "pornpics" : "PornPics.com", "pornreactor" : "PornReactor", diff --git a/test/results/pixeldrain.py b/test/results/pixeldrain.py new file mode 100644 index 00000000..a0c41a93 --- /dev/null +++ b/test/results/pixeldrain.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import pixeldrain +import datetime + +__tests__ = ( +{ + "#url" : "https://pixeldrain.com/u/jW9E6s4h", + "#category": ("", "pixeldrain", "file"), + "#class" : pixeldrain.PixeldrainFileExtractor, + "#urls" : "https://pixeldrain.com/api/file/jW9E6s4h?download", + "#sha1_content": "0c8768055e4e20e7c7259608b67799171b691140", + + "abuse_reporter_name" : "", + "abuse_type" : "", + "allow_video_player" : True, + "availability" : "", + "availability_message": "", + "bandwidth_used" : int, + "bandwidth_used_paid" : 0, + "can_download" : True, + "can_edit" : False, + "date" : "dt:2023-11-22 16:33:27", + "date_last_view" : r"re:\d+-\d+-\d+T\d+:\d+:\d+\.\d+Z", + "date_upload" : "2023-11-22T16:33:27.744Z", + "delete_after_date" : "0001-01-01T00:00:00Z", + "delete_after_downloads": 0, + "download_speed_limit": 0, + "downloads" : int, + "extension" : "png", + "filename" : "test-テスト-\"&>", + "hash_sha256" : "eb359cd8f02a7d6762f9863798297ff6a22569c5c87a9d38c55bdb3a3e26003f", + "id" : "jW9E6s4h", + "mime_type" : "image/png", + "name" : "test-テスト-\"&>.png", + "show_ads" : True, + "size" : 182, + "success" : True, + "thumbnail_href" : "/file/jW9E6s4h/thumbnail", + "url" : "https://pixeldrain.com/api/file/jW9E6s4h?download", + "views" : int, +}, + +{ + "#url" : "https://pixeldrain.com/u/yEK1n2Qc", + "#category": ("", "pixeldrain", "file"), + "#class" : pixeldrain.PixeldrainFileExtractor, + "#urls" : "https://pixeldrain.com/api/file/yEK1n2Qc?download", + "#sha1_content": "08463261191d403de2133d829060050d8b04609f", + + "date" : "dt:2023-11-22 16:38:04", + "date_upload": "2023-11-22T16:38:04.928Z", + "extension" : "txt", + "filename" : '"&>', + "hash_sha256": "4c1e2bbcbe1dea8b6f895f5cdd8461c37c561bce4f1b3556ba58392d95964294", + "id" : "yEK1n2Qc", + "mime_type" : "text/plain; charset=utf-8", + "name" : '"&>.txt', + "size" : 14, +}, + +{ + "#url" : "https://pixeldrain.com/l/zQ7XpWfM", + "#category": ("", "pixeldrain", "album"), + "#class" : 
pixeldrain.PixeldrainAlbumExtractor, + "#urls" : ( + "https://pixeldrain.com/api/file/yEK1n2Qc?download", + "https://pixeldrain.com/api/file/jW9E6s4h?download", + ), + + "album" : { + "can_edit" : False, + "count" : 2, + "date" : "dt:2023-11-22 16:40:39", + "date_created": "2023-11-22T16:40:39.218Z", + "id" : "zQ7XpWfM", + "success" : True, + "title" : "アルバム", + }, + "date" : datetime.datetime, + "description": "", + "detail_href": r"re:/file/(yEK1n2Qc|jW9E6s4h)/info", + "hash_sha256": r"re:\w{64}", + "id" : r"re:yEK1n2Qc|jW9E6s4h", + "mime_type" : str, +}, + +) From cb9a1176e67ed3bf41ca2aed83419621bf7ece77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 22 Nov 2023 19:13:51 +0100 Subject: [PATCH 142/344] [pixeldrain] add 'api-key' option (#4839) --- docs/configuration.rst | 8 ++++++++ gallery_dl/extractor/pixeldrain.py | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index ae9edb5c..403d2ca0 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2553,6 +2553,14 @@ Description Download from video pins. 
+extractor.pixeldrain.api-key +---------------------------- +Type + ``string`` +Description + Your account's `API key `__ + + extractor.pixiv.include ----------------------- Type diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py index acfd7d4b..34b4ebff 100644 --- a/gallery_dl/extractor/pixeldrain.py +++ b/gallery_dl/extractor/pixeldrain.py @@ -20,6 +20,11 @@ class PixeldrainExtractor(Extractor): root = "https://pixeldrain.com" archive_fmt = "{id}" + def _init(self): + api_key = self.config("api-key") + if api_key: + self.session.auth = ("", api_key) + def parse_datetime(self, date_string): return text.parse_datetime( date_string, "%Y-%m-%dT%H:%M:%S.%fZ") From b43be67206f4b648350f1bea67c07ab4a0a8644d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 22 Nov 2023 20:15:00 +0100 Subject: [PATCH 143/344] [exhentai] add 'gp' option (#4576) --- docs/configuration.rst | 14 ++++++++++++++ gallery_dl/extractor/exhentai.py | 7 +++++++ 2 files changed, 21 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index 403d2ca0..a76ce458 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1568,6 +1568,20 @@ Description to already favorited galleries. +extractor.exhentai.gp +--------------------- +Type + ``string`` +Example + ``"resized"`` +Description + Selects how to handle "you do not have enough GP" errors. + + * `"resized"`: Continue downloading `non-original `__ images. + * `"stop"`: Stop the current extractor run. + * `"wait"`: Wait for user input before retrying the current image. 
+ + extractor.exhentai.limits ------------------------- Type diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 5f9ccb21..5dc498f1 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -175,6 +175,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.log.warning("'%s'", page) if " requires GP" in page: + gp = self.config("gp") + if gp == "stop": + raise exception.StopExtraction("Not enough GP") + elif gp == "wait": + input("Press ENTER to continue.") + return response.url + self.log.info("Falling back to non-original downloads") self.original = False return data["_url_1280"] From 4e155134153a688f909ba993eb06511af43f6b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 22 Nov 2023 22:14:30 +0100 Subject: [PATCH 144/344] [docs] fix 'Example' -> 'Default' from b43be672 copy-paste mistake --- docs/configuration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index a76ce458..eb263c17 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1572,7 +1572,7 @@ extractor.exhentai.gp --------------------- Type ``string`` -Example +Default ``"resized"`` Description Selects how to handle "you do not have enough GP" errors. 
From 119755a5a34bd59b3a6d3d90c991fa1a637a05d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 24 Nov 2023 00:43:07 +0100 Subject: [PATCH 145/344] [tests] implement skipping/failing tests when pressing ctrl+c --- test/test_results.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/test_results.py b/test/test_results.py index f275bbfc..182509f5 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -405,7 +405,17 @@ def generate_tests(): def _generate_method(result): def test(self): print("\n" + result["#url"]) - self._run_test(result) + try: + self._run_test(result) + except KeyboardInterrupt as exc: + v = input("\n[e]xit | [f]ail | [S]kip ? ").strip().lower() + if v in ("e", "exit"): + raise + if v in ("f", "fail"): + self.fail("manual test failure") + else: + self._skipped.append((result["#url"], exc)) + self.skipTest(exc) return test # enable selective testing for direct calls From 645b4627ef4e31b79219154c5db5356c26a16cb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 24 Nov 2023 02:41:52 +0100 Subject: [PATCH 146/344] [sankaku] update URL patterns --- gallery_dl/extractor/sankaku.py | 4 ++-- test/results/sankaku.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index bebea2a2..89412584 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -117,7 +117,7 @@ class SankakuPoolExtractor(SankakuExtractor): subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name_en]}") archive_fmt = "p_{pool}_{id}" - pattern = BASE_PATTERN + r"/(?:books|pool/show)/(\d+)" + pattern = BASE_PATTERN + r"/(?:books|pools?/show)/(\d+)" example = "https://sankaku.app/books/12345" def __init__(self, match): @@ -143,7 +143,7 @@ class SankakuPostExtractor(SankakuExtractor): """Extractor for single posts from sankaku.app""" subcategory = 
"post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/post(?:s|/show)/([0-9a-f]+)" + pattern = BASE_PATTERN + r"/posts?(?:/show)?/([0-9a-f]+)" example = "https://sankaku.app/post/show/12345" def __init__(self, match): diff --git a/test/results/sankaku.py b/test/results/sankaku.py index 37330f26..89396daa 100644 --- a/test/results/sankaku.py +++ b/test/results/sankaku.py @@ -111,6 +111,12 @@ __tests__ = ( "#class" : sankaku.SankakuPoolExtractor, }, +{ + "#url" : "https://chan.sankakucomplex.com/pools/show/90", + "#category": ("booru", "sankaku", "pool"), + "#class" : sankaku.SankakuPoolExtractor, +}, + { "#url" : "https://sankaku.app/post/show/360451", "#category": ("booru", "sankaku", "post"), @@ -173,6 +179,16 @@ __tests__ = ( "md5": "f8ba89043078f0e4be2d9c46550b840a", }, +{ + "#url" : "https://chan.sankakucomplex.com/en/posts/show/ac8e3b92ea328ce9cf7211e69c905bf9", + "#comment" : "/en/posts/show/HEX", + "#category": ("booru", "sankaku", "post"), + "#class" : sankaku.SankakuPostExtractor, + + "id" : 360451, + "md5": "ac8e3b92ea328ce9cf7211e69c905bf9", +}, + { "#url" : "https://chan.sankakucomplex.com/post/show/360451", "#category": ("booru", "sankaku", "post"), From f9dac43be9bbdb52e0223ffb6efb9c0be97b968e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 24 Nov 2023 02:44:55 +0100 Subject: [PATCH 147/344] [warosu] fix file URLs --- gallery_dl/extractor/warosu.py | 5 ++++- test/results/warosu.py | 30 +++++++++++++++--------------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index 8e6b842a..3bb635d6 100644 --- a/gallery_dl/extractor/warosu.py +++ b/gallery_dl/extractor/warosu.py @@ -90,4 +90,7 @@ class WarosuThreadExtractor(Extractor): data["filename"] = text.unquote(extr( "", "<").rstrip().rpartition(".")[0]) extr("
", "") - data["image"] = self.root + extr("
") + + data["image"] = url = extr("") + if url[0] == "/": + data["image"] = self.root + url diff --git a/test/results/warosu.py b/test/results/warosu.py index e476b508..efc7f832 100644 --- a/test/results/warosu.py +++ b/test/results/warosu.py @@ -13,17 +13,17 @@ __tests__ = ( "#category": ("", "warosu", "thread"), "#class" : warosu.WarosuThreadExtractor, "#urls" : ( - "https://warosu.org/data/jp/img/0166/56/1488487280004.png", - "https://warosu.org/data/jp/img/0166/56/1488493239417.png", - "https://warosu.org/data/jp/img/0166/56/1488493636725.jpg", - "https://warosu.org/data/jp/img/0166/56/1488493700040.jpg", - "https://warosu.org/data/jp/img/0166/56/1488499585168.jpg", - "https://warosu.org/data/jp/img/0166/56/1488530851199.jpg", - "https://warosu.org/data/jp/img/0166/56/1488536072155.jpg", - "https://warosu.org/data/jp/img/0166/56/1488603426484.png", - "https://warosu.org/data/jp/img/0166/56/1488647021253.jpg", - "https://warosu.org/data/jp/img/0166/56/1488866825031.jpg", - "https://warosu.org/data/jp/img/0166/56/1489094956868.jpg", + "https://i.warosu.org/data/jp/img/0166/56/1488487280004.png", + "https://i.warosu.org/data/jp/img/0166/56/1488493239417.png", + "https://i.warosu.org/data/jp/img/0166/56/1488493636725.jpg", + "https://i.warosu.org/data/jp/img/0166/56/1488493700040.jpg", + "https://i.warosu.org/data/jp/img/0166/56/1488499585168.jpg", + "https://i.warosu.org/data/jp/img/0166/56/1488530851199.jpg", + "https://i.warosu.org/data/jp/img/0166/56/1488536072155.jpg", + "https://i.warosu.org/data/jp/img/0166/56/1488603426484.png", + "https://i.warosu.org/data/jp/img/0166/56/1488647021253.jpg", + "https://i.warosu.org/data/jp/img/0166/56/1488866825031.jpg", + "https://i.warosu.org/data/jp/img/0166/56/1489094956868.jpg", ), }, @@ -32,7 +32,7 @@ __tests__ = ( "#category": ("", "warosu", "thread"), "#class" : warosu.WarosuThreadExtractor, "#sha1_content" : "d48df0a701e6599312bfff8674f4aa5d4fb8db1c", - "#urls" : 
"https://warosu.org/data/jp/img/0166/58/1488521824388.jpg", + "#urls" : "https://i.warosu.org/data/jp/img/0166/58/1488521824388.jpg", "#count" : 1, "board" : "jp", @@ -43,7 +43,7 @@ __tests__ = ( "filename" : "sadako-vs-kayako-movie-review", "fsize" : "55 KB", "h" : 675, - "image" : "https://warosu.org/data/jp/img/0166/58/1488521824388.jpg", + "image" : "https://i.warosu.org/data/jp/img/0166/58/1488521824388.jpg", "name" : "Anonymous", "no" : 16658073, "now" : "Fri Mar 3 01:17:04 2017", @@ -58,7 +58,7 @@ __tests__ = ( "#url" : "https://warosu.org/ic/thread/4604652", "#category": ("", "warosu", "thread"), "#class" : warosu.WarosuThreadExtractor, - "#pattern" : r"https://warosu\.org/data/ic/img/0046/04/1590\d{9}\.jpg", + "#pattern" : r"https://i.warosu\.org/data/ic/img/0046/04/1590\d{9}\.jpg", "#count" : 133, "board" : "ic", @@ -68,7 +68,7 @@ __tests__ = ( "filename" : str, "fsize" : str, "h" : range(200, 3507), - "image" : r"re:https://warosu\.org/data/ic/img/0046/04/1590\d+\.jpg", + "image" : r"re:https://i.warosu\.org/data/ic/img/0046/04/1590\d+\.jpg", "name" : "re:Anonymous|Dhe Specky Spider-Man", "no" : range(4604652, 4620000), "now" : r"re:\w\w\w \w\w\w \d\d \d\d:\d\d:\d\d 2020", From adc3aa0b77836e570b4f247c76ad601f849c742c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 24 Nov 2023 21:21:14 +0100 Subject: [PATCH 148/344] [zerochan] fix metadata extraction author, path, tags --- gallery_dl/extractor/zerochan.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 5fe1943c..13073998 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -63,14 +63,14 @@ class ZerochanExtractor(BooruExtractor): data = { "id" : text.parse_int(entry_id), - "author" : extr('"author": "', '"'), + "author" : text.parse_unicode_escapes(extr(' "name": "', '"')), "file_url": extr('"contentUrl": "', '"'), "date" : 
text.parse_datetime(extr('"datePublished": "', '"')), "width" : text.parse_int(extr('"width": "', ' ')), "height" : text.parse_int(extr('"height": "', ' ')), "size" : text.parse_bytes(extr('"contentSize": "', 'B')), "path" : text.split_html(extr( - 'class="breadcrumbs', '

'))[2:], + 'class="breadcrumbs', ''))[2:], "uploader": extr('href="/user/', '"'), "tags" : extr('
    '), "source" : extr('

    Source

    ', '

    ').rpartition( @@ -80,9 +80,9 @@ class ZerochanExtractor(BooruExtractor): html = data["tags"] tags = data["tags"] = [] for tag in html.split("
  • -->", "") - tags.append(category + ":" + name.strip()) + category = text.extr(tag, 'data-type="', '"') + name = text.extr(tag, 'data-tag="', '"') + tags.append(category.capitalize() + ":" + name) return data From 5b979b57066b26ca7adf7d3c335602b67c15a5b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 24 Nov 2023 21:27:19 +0100 Subject: [PATCH 149/344] [xvideos] fix metadata extraction --- gallery_dl/extractor/xvideos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py index a28d8f5a..46e574e3 100644 --- a/gallery_dl/extractor/xvideos.py +++ b/gallery_dl/extractor/xvideos.py @@ -38,13 +38,13 @@ class XvideosGalleryExtractor(XvideosBase, GalleryExtractor): def metadata(self, page): extr = text.extract_from(page) - title = extr('"title":"', '"') user = { "id" : text.parse_int(extr('"id_user":', ',')), "display": extr('"display":"', '"'), "sex" : extr('"sex":"', '"'), "name" : self.user, } + title = extr('"title":"', '"') user["description"] = extr( '', '').strip() tags = extr('Tagged:', '<').strip() From 23cd17997d8e2b142977529cfd3b7a4a005c81a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 24 Nov 2023 21:54:21 +0100 Subject: [PATCH 150/344] [wallpapercave] fix extraction --- gallery_dl/extractor/wallpapercave.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gallery_dl/extractor/wallpapercave.py b/gallery_dl/extractor/wallpapercave.py index bce1026d..faf3b0de 100644 --- a/gallery_dl/extractor/wallpapercave.py +++ b/gallery_dl/extractor/wallpapercave.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # Copyright 2021 David Hoppenbrouwers +# Copyright 2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,7 +23,20 @@ class WallpapercaveImageExtractor(Extractor): def items(self): page = 
self.request(text.ensure_http_scheme(self.url)).text + + path = None for path in text.extract_iter(page, 'class="download" href="', '"'): image = text.nameext_from_url(path) yield Message.Directory, image yield Message.Url, self.root + path, image + + if path is None: + try: + path = text.rextract( + page, 'href="', '"', page.index('id="tdownload"'))[0] + except Exception: + pass + else: + image = text.nameext_from_url(path) + yield Message.Directory, image + yield Message.Url, self.root + path, image From c8c744a7c0d4ce7663966c181ba099b7cf3a96ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 24 Nov 2023 22:17:34 +0100 Subject: [PATCH 151/344] [webtoons] fix pagination when receiving an HTTP redirect --- gallery_dl/extractor/webtoons.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index dc9a4f19..3f2f410d 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -146,7 +146,12 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor): if page and path not in page: return - page = self.request(self.root + path).text + response = self.request(self.root + path) + if response.history: + parts = response.url.split("/") + self.path = "/".join(parts[3:-1]) + + page = response.text data["page"] = self.page_no for url in self.get_episode_urls(page): From 7608201a44f328fe7ad058fb10539ce773321841 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 25 Nov 2023 00:51:14 +0100 Subject: [PATCH 152/344] [tumblr] fix 'day' extractor another bug caused by a383eca7 --- gallery_dl/extractor/tumblr.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 3dab16e8..f50ddb79 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -322,12 +322,15 @@ class TumblrDayExtractor(TumblrExtractor): def 
__init__(self, match): TumblrExtractor.__init__(self, match) year, month, day = match.group(4).split("/") - self.date_min = ( - # 719163 == date(1970, 1, 1).toordinal() - date(int(year), int(month), int(day)).toordinal() - 719163) * 86400 + self.ordinal = date(int(year), int(month), int(day)).toordinal() def _init(self): TumblrExtractor._init(self) + + self.date_min = ( + # 719163 == date(1970, 1, 1).toordinal() + (self.ordinal - 719163) * 86400) + self.api.before = self.date_min + 86400 def posts(self): From 311ec1d9ef8fe92078873c5b94515c1a0e046947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 25 Nov 2023 23:53:27 +0100 Subject: [PATCH 153/344] [mangaread] fix extraction --- gallery_dl/extractor/mangaread.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py index 8f193741..4b017dca 100644 --- a/gallery_dl/extractor/mangaread.py +++ b/gallery_dl/extractor/mangaread.py @@ -50,8 +50,8 @@ class MangareadChapterExtractor(MangareadBase, ChapterExtractor): page = text.extr( page, '
    ', '
    ', '') + page, '', pos) image_id, pos = text.extract( page, 'id="imageid_input" value="', '"', pos) gallery_id, pos = text.extract( page, 'id="galleryid_input" value="', '"', pos) info = util.json_loads(info) - url = info["contentUrl"] return url, text.nameext_from_url(url, { "title": text.unescape(info["name"]), From fc1101779c43341ac21ce90bc604347889237112 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 26 Nov 2023 01:24:42 +0100 Subject: [PATCH 155/344] [hiperdex] fix 'manga' metadata --- gallery_dl/extractor/hiperdex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index 32ca1519..20491b56 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -30,10 +30,10 @@ class HiperdexBase(): extr = text.extract_from(page) return { - "manga" : text.unescape(extr( - "", "<").rpartition(" Manga - ")[0].strip()), "url" : text.unescape(extr( 'property="og:url" content="', '"')), + "manga" : text.unescape(extr( + '"headline": "', '"')), "score" : text.parse_float(extr( 'id="averagerate">', '<')), "author" : text.remove_html(extr( From 8ac68ffba24de24dde7c94511c109b7861d4caf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 26 Nov 2023 02:08:12 +0100 Subject: [PATCH 156/344] [hentaicosplays] force 'https://' for download URLs --- gallery_dl/extractor/hentaicosplays.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py index 62df1925..d5ff8c83 100644 --- a/gallery_dl/extractor/hentaicosplays.py +++ b/gallery_dl/extractor/hentaicosplays.py @@ -42,7 +42,7 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor): def images(self, page): return [ - (url, None) + (url.replace("http:", "https:", 1), None) for url in text.extract_iter( page, '<amp-img class="auto-style" src="', '"') ] From 
159e623e65eaef473c2f0a098d1d4705091d4c8e Mon Sep 17 00:00:00 2001 From: Nitrousoxide <nitro2985@gmail.com> Date: Sun, 26 Nov 2023 11:08:53 -0500 Subject: [PATCH 157/344] add section to readme for using docker --- README.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.rst b/README.rst index 9c1b3388..c4e40a9e 100644 --- a/README.rst +++ b/README.rst @@ -132,6 +132,27 @@ For macOS users with MacPorts: sudo port install gallery-dl +Docker +-------- +Using the Dockerfile in the repository: + +.. code:: bash + git clone https://github.com/mikf/gallery-dl.git + cd gallery-dl/ + docker build -t gallery-dl:latest . + +To run the container you will probably want to attach some directories on the host so that the config file and downloads can persist across runs. + +Make sure to either download the example config file reference in the repo and place it in the mounted volume location or touch an empty file there. + +If you gave the container a different tag or are using podman then make sure you adjust. Run `docker image ls`` to check the name if you are not sure. + +This will remove the container after every use so you will always have a fresh environment for it to run. If you setup a ci-cd pipeline to autobuild the container you can also add a --pull=newer flag so that when you run it docker will check to see if there is a newer container and download it before running. + +.. code:: bash + docker run --rm -v $HOME/Downloads/:/gallery-dl/ -v $HOME/.config/gallery-dl/gallery-dl.conf:/etc/gallery-dl.conf -it gallery-dl:latest + +You can also add an alias to your shell for "gallery-dl" or create a simple bash script and drop it somewhere in your $PATH to act as a shim for this command. 
Usage ===== From b932d4fed8e1213fe5c1de73e26e4e16fcf26be2 Mon Sep 17 00:00:00 2001 From: Nitrousoxide <nitro2985@gmail.com> Date: Sun, 26 Nov 2023 11:11:05 -0500 Subject: [PATCH 158/344] add docker section to readme --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index c4e40a9e..e3f4846e 100644 --- a/README.rst +++ b/README.rst @@ -137,6 +137,7 @@ Docker Using the Dockerfile in the repository: .. code:: bash + git clone https://github.com/mikf/gallery-dl.git cd gallery-dl/ docker build -t gallery-dl:latest . @@ -150,6 +151,7 @@ If you gave the container a different tag or are using podman then make sure you This will remove the container after every use so you will always have a fresh environment for it to run. If you setup a ci-cd pipeline to autobuild the container you can also add a --pull=newer flag so that when you run it docker will check to see if there is a newer container and download it before running. .. code:: bash + docker run --rm -v $HOME/Downloads/:/gallery-dl/ -v $HOME/.config/gallery-dl/gallery-dl.conf:/etc/gallery-dl.conf -it gallery-dl:latest You can also add an alias to your shell for "gallery-dl" or create a simple bash script and drop it somewhere in your $PATH to act as a shim for this command. From d949582934df5c7bcca43527c4053b577ca8ff4a Mon Sep 17 00:00:00 2001 From: Nitrousoxide <nitro2985@gmail.com> Date: Sun, 26 Nov 2023 11:13:07 -0500 Subject: [PATCH 159/344] add docker how-to to readme --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index e3f4846e..fc43900c 100644 --- a/README.rst +++ b/README.rst @@ -146,12 +146,12 @@ To run the container you will probably want to attach some directories on the ho Make sure to either download the example config file reference in the repo and place it in the mounted volume location or touch an empty file there. 
-If you gave the container a different tag or are using podman then make sure you adjust. Run `docker image ls`` to check the name if you are not sure. +If you gave the container a different tag or are using podman then make sure you adjust. Run ``docker image ls`` to check the name if you are not sure. -This will remove the container after every use so you will always have a fresh environment for it to run. If you setup a ci-cd pipeline to autobuild the container you can also add a --pull=newer flag so that when you run it docker will check to see if there is a newer container and download it before running. +This will remove the container after every use so you will always have a fresh environment for it to run. If you setup a ci-cd pipeline to autobuild the container you can also add a ``--pull=newer`` flag so that when you run it docker will check to see if there is a newer container and download it before running. .. code:: bash - + docker run --rm -v $HOME/Downloads/:/gallery-dl/ -v $HOME/.config/gallery-dl/gallery-dl.conf:/etc/gallery-dl.conf -it gallery-dl:latest You can also add an alias to your shell for "gallery-dl" or create a simple bash script and drop it somewhere in your $PATH to act as a shim for this command. From d9734ce008483a1f702dac483b60920e1b8ebc08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 26 Nov 2023 18:03:13 +0100 Subject: [PATCH 160/344] [cyberdrop] update to site layout changes --- gallery_dl/extractor/cyberdrop.py | 56 +++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py index 59fd1e58..d8649605 100644 --- a/gallery_dl/extractor/cyberdrop.py +++ b/gallery_dl/extractor/cyberdrop.py @@ -7,6 +7,7 @@ """Extractors for https://cyberdrop.me/""" from . import lolisafe +from .common import Message from .. 
import text @@ -16,24 +17,43 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor): pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)/a/([^/?#]+)" example = "https://cyberdrop.me/a/ID" + def items(self): + files, data = self.fetch_album(self.album_id) + + yield Message.Directory, data + for data["num"], file in enumerate(files, 1): + file.update(data) + text.nameext_from_url(file["name"], file) + file["name"], sep, file["id"] = file["filename"].rpartition("-") + yield Message.Url, file["url"], file + def fetch_album(self, album_id): - url = self.root + "/a/" + self.album_id - extr = text.extract_from(self.request(url).text) - - files = [] - append = files.append - while True: - url = text.unescape(extr('id="file" href="', '"')) - if not url: - break - append({"file": url, - "_fallback": (self.root + url[url.find("/", 8):],)}) - - return files, { + url = "{}/a/{}".format(self.root, album_id) + page = self.request(url).text + extr = text.extract_from(page) + + desc = extr('property="og:description" content="', '"') + if desc.startswith("A privacy-focused censorship-resistant file " + "sharing platform free for everyone."): + desc = "" + extr('id="title"', "") + + album = { "album_id" : self.album_id, - "album_name" : extr("name: '", "'"), - "date" : text.parse_timestamp(extr("timestamp: ", ",")), - "album_size" : text.parse_int(extr("totalSize: ", ",")), - "description": extr("description: `", "`"), - "count" : len(files), + "album_name" : text.unescape(extr('title="', '"')), + "album_size" : text.parse_bytes(extr( + '<p class="title">', "B")), + "date" : text.parse_datetime(extr( + '<p class="title">', '<'), "%d.%m.%Y"), + "description": text.unescape(text.unescape( # double + desc.rpartition(" [R")[0])), } + + file_ids = list(text.extract_iter(page, 'id="file" href="/f/', '"')) + album["count"] = len(file_ids) + return self._extract_files(file_ids), album + + def _extract_files(self, file_ids): + for file_id in file_ids: + url = 
"{}/api/f/{}".format(self.root, file_id) + yield self.request(url).json() From bdb3ce721793fdf20681627ca360531421dc69e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 26 Nov 2023 23:19:05 +0100 Subject: [PATCH 161/344] [foolslide] remove 'powermanga.org' --- docs/supportedsites.md | 10 --------- gallery_dl/extractor/foolslide.py | 4 ---- scripts/supportedsites.py | 1 - test/results/powermanga.py | 36 ------------------------------- 4 files changed, 51 deletions(-) delete mode 100644 test/results/powermanga.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index da499f82..8f54b157 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1506,16 +1506,6 @@ Consider all sites to be NSFW unless otherwise known. <td></td> </tr> -<tr> - <td colspan="4"><strong>FoOlSlide Instances</strong></td> -</tr> -<tr> - <td>PowerManga</td> - <td>https://read.powermanga.org/</td> - <td>Chapters, Manga</td> - <td></td> -</tr> - <tr> <td colspan="4"><strong>Mastodon Instances</strong></td> </tr> diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index b0699b03..bb684c2d 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -38,10 +38,6 @@ class FoolslideExtractor(BaseExtractor): BASE_PATTERN = FoolslideExtractor.update({ - "powermanga": { - "root": "https://read.powermanga.org", - "pattern": r"read(?:er)?\.powermanga\.org", - }, }) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index c7748bb0..3afac13f 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -101,7 +101,6 @@ CATEGORY_MAP = { "pornimagesxxx" : "Porn Image", "pornpics" : "PornPics.com", "pornreactor" : "PornReactor", - "powermanga" : "PowerManga", "readcomiconline": "Read Comic Online", "rbt" : "RebeccaBlackTech", "redgifs" : "RedGIFs", diff --git a/test/results/powermanga.py b/test/results/powermanga.py deleted file mode 100644 index 
fb02b0cc..00000000 --- a/test/results/powermanga.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -from gallery_dl.extractor import foolslide - - -__tests__ = ( -{ - "#url" : "https://read.powermanga.org/read/one_piece_digital_colour_comics/en/0/75/", - "#category": ("foolslide", "powermanga", "chapter"), - "#class" : foolslide.FoolslideChapterExtractor, - "#sha1_url" : "854c5817f8f767e1bccd05fa9d58ffb5a4b09384", - "#sha1_metadata": "a60c42f2634b7387899299d411ff494ed0ad6dbe", -}, - -{ - "#url" : "https://read.powermanga.org/series/one_piece_digital_colour_comics/", - "#category": ("foolslide", "powermanga", "manga"), - "#class" : foolslide.FoolslideMangaExtractor, - "#count" : ">= 1", - - "chapter" : int, - "chapter_minor" : str, - "chapter_string": str, - "group" : "PowerManga", - "lang" : "en", - "language" : "English", - "manga" : "One Piece Digital Colour Comics", - "title" : str, - "volume" : int, -}, - -) From 9f3368c46f756a792bdbceb34a5f7f639810e0bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 26 Nov 2023 23:52:24 +0100 Subject: [PATCH 162/344] [pornhub] fix 'user' metadata for gifs --- gallery_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py index c5ce8327..7ff40a37 100644 --- a/gallery_dl/extractor/pornhub.py +++ b/gallery_dl/extractor/pornhub.py @@ -143,7 +143,7 @@ class PornhubGifExtractor(PornhubExtractor): "url" : extr('"contentUrl": "', '"'), "date" : text.parse_datetime( extr('"uploadDate": "', '"'), "%Y-%m-%d"), - "user" : extr('data-mxptext="', '"'), + "user" : text.remove_html(extr("Created by:", "</div>")), } yield Message.Directory, gif From 95c1dfb0897a4073c3759fa51b0191d9670dc97f Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 27 Nov 2023 01:02:39 +0100 Subject: [PATCH 163/344] [tests] swap assertEqual argument order before this, it would show test failures as + test value - extracted value when it should be the other way round --- test/test_results.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_results.py b/test/test_results.py index 182509f5..c7a50019 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -192,22 +192,22 @@ class TestExtractorResults(unittest.TestCase): subtest = True self._test_kwdict(value[idx], item) if not subtest: - self.assertEqual(value, test, msg=key) + self.assertEqual(test, value, msg=key) elif isinstance(test, str): if test.startswith("re:"): self.assertRegex(value, test[3:], msg=key) elif test.startswith("dt:"): self.assertIsInstance(value, datetime.datetime, msg=key) - self.assertEqual(str(value), test[3:], msg=key) + self.assertEqual(test[3:], str(value), msg=key) elif test.startswith("type:"): - self.assertEqual(type(value).__name__, test[5:], msg=key) + self.assertEqual(test[5:], type(value).__name__, msg=key) elif test.startswith("len:"): self.assertIsInstance(value, (list, tuple), msg=key) - self.assertEqual(len(value), int(test[4:]), msg=key) + self.assertEqual(int(test[4:]), len(value), msg=key) else: - self.assertEqual(value, test, msg=key) + self.assertEqual(test, value, msg=key) else: - self.assertEqual(value, test, msg=key) + self.assertEqual(test, value, msg=key) class ResultJob(job.DownloadJob): From 1e9bacd1698066512e860d4747e1c23e6711d2c6 Mon Sep 17 00:00:00 2001 From: enduser420 <91022934+enduser420@users.noreply.github.com> Date: Mon, 27 Nov 2023 21:58:06 +0530 Subject: [PATCH 164/344] [nitter] fix video extraction --- gallery_dl/extractor/nitter.py | 2 ++ test/results/nitternet.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/nitter.py 
b/gallery_dl/extractor/nitter.py index 9f5cc9da..bc7b308b 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -96,6 +96,8 @@ class NitterExtractor(BaseExtractor): for url in text.extract_iter( attachments, '<source src="', '"'): + if url[0] == "/": + url = self.root + url append(text.nameext_from_url(url, {"url": url})) else: diff --git a/test/results/nitternet.py b/test/results/nitternet.py index 2a1cccf6..31786a37 100644 --- a/test/results/nitternet.py +++ b/test/results/nitternet.py @@ -76,7 +76,7 @@ __tests__ = ( "#sha1_url" : "3f2b64e175bf284aa672c3bb53ed275e470b919a", "#sha1_content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab", - "comments" : 19, + "comments" : int, "content" : "Big Wedeene River, Canada", "count" : 1, "date" : "dt:2015-05-29 17:40:00", @@ -84,7 +84,7 @@ __tests__ = ( "filename" : "CGMNYZvW0AIVoom", "likes" : int, "num" : 1, - "quotes" : 10, + "quotes" : int, "retweets" : int, "tweet_id" : "604341487988576256", "url" : "https://nitter.net/pic/orig/media%2FCGMNYZvW0AIVoom.jpg", From 013ca215437fc2f80d0141e577d8550ba0c5ad5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 27 Nov 2023 18:27:08 +0100 Subject: [PATCH 165/344] [idolcomplex] update to site layout changes --- gallery_dl/extractor/idolcomplex.py | 129 ++++++++++++++++------------ 1 file changed, 74 insertions(+), 55 deletions(-) diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index b7b6ef10..5c7a1b3a 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -15,15 +15,17 @@ from .. import text, util, exception import collections import re +BASE_PATTERN = r"(?:https?://)?idol\.sankakucomplex\.com(?:/[a-z]{2})?" 
+ class IdolcomplexExtractor(SankakuExtractor): """Base class for idolcomplex extractors""" category = "idolcomplex" + root = "https://idol.sankakucomplex.com" cookies_domain = "idol.sankakucomplex.com" - cookies_names = ("login", "pass_hash") - root = "https://" + cookies_domain + cookies_names = ("_idolcomplex_session",) referer = False - request_interval = 5.0 + request_interval = (4.0, 6.0) def __init__(self, match): SankakuExtractor.__init__(self, match) @@ -32,14 +34,16 @@ class IdolcomplexExtractor(SankakuExtractor): self.start_post = 0 def _init(self): - self.extags = self.config("tags", False) + self.find_tags = re.compile( + r'tag-type-([^"]+)">\s*<div [^>]+>\s*<a href="/\?tags=([^"]+)' + ).findall def items(self): self.login() data = self.metadata() for post_id in util.advance(self.post_ids(), self.start_post): - post = self._parse_post(post_id) + post = self._extract_post(post_id) url = post["file_url"] post.update(data) text.nameext_from_url(url, post) @@ -67,63 +71,75 @@ class IdolcomplexExtractor(SankakuExtractor): def _login_impl(self, username, password): self.log.info("Logging in as %s", username) - url = self.root + "/user/authenticate" + url = self.root + "/users/login" + page = self.request(url).text + + headers = { + "Referer": url, + } + url = self.root + (text.extr(page, '<form action="', '"') or + "/en/user/authenticate") data = { + "authenticity_token": text.unescape(text.extr( + page, 'name="authenticity_token" value="', '"')), "url" : "", "user[name]" : username, "user[password]": password, "commit" : "Login", } - response = self.request(url, method="POST", data=data) + response = self.request(url, method="POST", headers=headers, data=data) - if not response.history or response.url != self.root + "/user/home": + if not response.history or response.url.endswith("/user/home"): raise exception.AuthenticationError() - cookies = response.history[0].cookies - return {c: cookies[c] for c in self.cookies_names} + return {c.name: c.value for c in 
response.history[0].cookies} - def _parse_post(self, post_id): - """Extract metadata of a single post""" - url = self.root + "/post/show/" + post_id + def _extract_post(self, post_id): + url = self.root + "/posts/" + post_id page = self.request(url, retries=10).text - extr = text.extract + extr = text.extract_from(page) - tags , pos = extr(page, "<title>", " | ") - vavg , pos = extr(page, "itemprop=ratingValue>", "<", pos) - vcnt , pos = extr(page, "itemprop=reviewCount>", "<", pos) - _ , pos = extr(page, "Posted: <", "", pos) - created, pos = extr(page, ' title="', '"', pos) - rating = extr(page, "<li>Rating: ", "<", pos)[0] + tags = extr("<title>", " | ") + vavg = extr('itemprop="ratingValue">', "<") + vcnt = extr('itemprop="reviewCount">', "<") + pid = extr(">Post ID:", "<") + created = extr(' title="', '"') - file_url, pos = extr(page, '<li>Original: <a href="', '"', pos) + file_url = extr('>Original:', 'id=') if file_url: - width , pos = extr(page, '>', 'x', pos) - height, pos = extr(page, '', ' ', pos) + file_url = extr(' href="', '"') + width = extr(">", "x") + height = extr("", " ") else: - width , pos = extr(page, '<object width=', ' ', pos) - height, pos = extr(page, 'height=', '>', pos) - file_url = extr(page, '<embed src="', '"', pos)[0] + width = extr('<object width=', ' ') + height = extr('height=', '>') + file_url = extr('<embed src="', '"') + + rating = extr(">Rating:", "<br") data = { - "id": text.parse_int(post_id), - "md5": file_url.rpartition("/")[2].partition(".")[0], - "tags": text.unescape(tags), + "id" : text.parse_int(pid), + "md5" : file_url.rpartition("/")[2].partition(".")[0], + "tags" : text.unescape(tags), "vote_average": text.parse_float(vavg), - "vote_count": text.parse_int(vcnt), - "created_at": created, - "rating": (rating or "?")[0].lower(), - "file_url": "https:" + text.unescape(file_url), - "width": text.parse_int(width), - "height": text.parse_int(height), + "vote_count" : text.parse_int(vcnt), + "created_at" : created, + 
"date" : text.parse_datetime( + created, "%Y-%m-%d %H:%M:%S.%f"), + "rating" : text.remove_html(rating).lower(), + "file_url" : "https:" + text.unescape(file_url), + "width" : text.parse_int(width), + "height" : text.parse_int(height), } - if self.extags: - tags = collections.defaultdict(list) - tags_html = text.extr(page, '<ul id=tag-sidebar>', '</ul>') - pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)') - for tag_type, tag_name in pattern.findall(tags_html or ""): - tags[tag_type].append(text.unquote(tag_name)) - for key, value in tags.items(): - data["tags_" + key] = " ".join(value) + tags = collections.defaultdict(list) + tags_list = [] + tags_html = text.extr(page, '<ul id="tag-sidebar"', '</ul>') + for tag_type, tag_name in self.find_tags(tags_html or ""): + tags[tag_type].append(text.unquote(tag_name)) + for key, value in tags.items(): + data["tags_" + key] = " ".join(value) + tags_list += value + data["tags"] = " ".join(tags_list) return data @@ -178,15 +194,16 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor): while True: page = self.request(self.root, params=params, retries=10).text - pos = page.find("<div id=more-popular-posts-link>") + 1 - yield from text.extract_iter(page, '" id=p', '>', pos) + pos = ((page.find('id="more-popular-posts-link"') + 1) or + (page.find('<span class="thumb') + 1)) + yield from text.extract_iter(page, ' href="/posts/', '"', pos) next_url = text.extract(page, 'next-page-url="', '"', pos)[0] if not next_url: return - next_params = text.parse_query(text.unescape( - next_url).lstrip("?/")) + next_params = text.parse_query(text.unescape(text.unescape( + next_url).lstrip("?/"))) if "next" in next_params: # stop if the same "next" value occurs twice in a row (#265) @@ -201,8 +218,8 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor): subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool}") archive_fmt = "p_{pool}_{id}" - pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pool/show/(\d+)" - 
example = "https://idol.sankakucomplex.com/pool/show/12345" + pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pools?/show/(\d+)" + example = "https://idol.sankakucomplex.com/pools/show/12345" per_page = 24 def __init__(self, match): @@ -219,15 +236,17 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor): return {"pool": self.pool_id} def post_ids(self): - url = self.root + "/pool/show/" + self.pool_id + url = self.root + "/pools/show/" + self.pool_id params = {"page": self.start_page} while True: page = self.request(url, params=params, retries=10).text - ids = list(text.extract_iter(page, '" id=p', '>')) + pos = page.find('id="pool-show"') + 1 + post_ids = list(text.extract_iter( + page, ' href="/posts/', '"', pos)) - yield from ids - if len(ids) < self.per_page: + yield from post_ids + if len(post_ids) < self.per_page: return params["page"] += 1 @@ -236,8 +255,8 @@ class IdolcomplexPostExtractor(IdolcomplexExtractor): """Extractor for single images from idol.sankakucomplex.com""" subcategory = "post" archive_fmt = "{id}" - pattern = r"(?:https?://)?idol\.sankakucomplex\.com/post/show/(\d+)" - example = "https://idol.sankakucomplex.com/post/show/12345" + pattern = BASE_PATTERN + r"/posts?/(?:show/)?([0-9a-f]+)" + example = "https://idol.sankakucomplex.com/posts/0123456789abcdef" def __init__(self, match): IdolcomplexExtractor.__init__(self, match) From 625e94fa7dac9d8ccd274d83be0661675e3f6bba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 27 Nov 2023 18:30:53 +0100 Subject: [PATCH 166/344] update extractor test results still not everything, but good enough for now --- test/results/2chan.py | 12 +-- test/results/4chan.py | 2 +- test/results/4plebs.py | 3 +- test/results/8chan.py | 2 +- test/results/aibooru.py | 4 +- test/results/bbc.py | 2 +- test/results/bcbnsfw.py | 3 +- test/results/cavemanon.py | 2 +- test/results/comicvine.py | 2 +- test/results/cyberdrop.py | 60 ++++++++---- test/results/derpibooru.py 
| 2 +- test/results/e621.py | 66 +++++++------ test/results/horne.py | 2 +- test/results/idolcomplex.py | 47 +++++++-- test/results/imagefap.py | 9 +- test/results/issuu.py | 1 - test/results/itaku.py | 21 ++-- test/results/itchio.py | 6 +- test/results/khinsider.py | 2 +- test/results/lesbianenergy.py | 2 +- test/results/luscious.py | 8 +- test/results/mangadex.py | 2 +- test/results/misskeyio.py | 2 +- test/results/myhentaigallery.py | 2 +- test/results/nsfwalbum.py | 20 +++- test/results/paheal.py | 6 +- test/results/pillowfort.py | 2 +- test/results/pornhub.py | 30 ++++-- test/results/pornpics.py | 5 +- test/results/pururin.py | 2 +- test/results/reddit.py | 2 +- test/results/sexcom.py | 3 +- test/results/skeb.py | 2 +- test/results/smugloli.py | 8 +- test/results/tapas.py | 4 +- test/results/toyhouse.py | 2 +- test/results/tumblr.py | 6 +- test/results/turboimagehost.py | 5 +- test/results/twibooru.py | 2 +- test/results/unsplash.py | 163 +++++++++++++++++++++----------- test/results/weibo.py | 1 + test/results/wikiart.py | 62 +++++++++++- test/results/wikifeet.py | 2 +- 43 files changed, 396 insertions(+), 195 deletions(-) diff --git a/test/results/2chan.py b/test/results/2chan.py index 0a1cdd69..78f48e58 100644 --- a/test/results/2chan.py +++ b/test/results/2chan.py @@ -10,25 +10,25 @@ _2chan = getattr(gallery_dl.extractor, "2chan") __tests__ = ( { - "#url" : "https://dec.2chan.net/70/res/14565.htm", + "#url" : "https://dec.2chan.net/70/res/17222.htm", "#category": ("", "2chan", "thread"), "#class" : _2chan._2chanThreadExtractor, "#pattern" : r"https://dec\.2chan\.net/70/src/\d{13}\.jpg", - "#count" : ">= 3", + "#count" : ">= 2", "board" : "70", "board_name": "新板提案", "com" : str, "fsize" : r"re:\d+", "name" : "名無し", - "no" : r"re:1[45]\d\d\d", - "now" : r"re:22/../..\(.\)..:..:..", + "no" : r"re:17\d\d\d", + "now" : r"re:23/../..\(.\)..:..:..", "post" : "無題", "server" : "dec", - "thread" : "14565", + "thread" : "17222", "tim" : r"re:^\d{13}$", "time" : 
r"re:^\d{10}$", - "title" : "ヒロアカ板", + "title" : "画像会話板", }, ) diff --git a/test/results/4chan.py b/test/results/4chan.py index c90e4107..6219e165 100644 --- a/test/results/4chan.py +++ b/test/results/4chan.py @@ -15,7 +15,7 @@ __tests__ = ( "#class" : _4chan._4chanThreadExtractor, "#sha1_url" : "39082ad166161966d7ba8e37f2173a824eb540f0", "#sha1_metadata": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a", - "#sha1_content" : "20b7b51afa51c9c31a0020a0737b889532c8d7ec", + "#sha1_content" : "551e432d52700ff3711f14752124e9af86ecbbdf", }, { diff --git a/test/results/4plebs.py b/test/results/4plebs.py index 4f00027d..bae62608 100644 --- a/test/results/4plebs.py +++ b/test/results/4plebs.py @@ -12,7 +12,8 @@ __tests__ = ( "#url" : "https://archive.4plebs.org/tg/thread/54059290", "#category": ("foolfuuka", "4plebs", "thread"), "#class" : foolfuuka.FoolfuukaThreadExtractor, - "#sha1_url": "fd823f17b5001442b941fddcd9ec91bafedfbc79", + "#pattern" : "https://i\.4pcdn\.org/tg/1[34]\d{11}\.(jpg|png|gif)", + "#count" : 30, }, { diff --git a/test/results/8chan.py b/test/results/8chan.py index 43c08349..f7be8148 100644 --- a/test/results/8chan.py +++ b/test/results/8chan.py @@ -73,7 +73,7 @@ __tests__ = ( "#category": ("", "8chan", "board"), "#class" : _8chan._8chanBoardExtractor, "#pattern" : _8chan._8chanThreadExtractor.pattern, - "#count" : 23, + "#count" : 27, }, { diff --git a/test/results/aibooru.py b/test/results/aibooru.py index 78bd7273..41408423 100644 --- a/test/results/aibooru.py +++ b/test/results/aibooru.py @@ -12,8 +12,8 @@ __tests__ = ( "#url" : "https://aibooru.online/posts?tags=center_frills&z=1", "#category": ("Danbooru", "aibooru", "tag"), "#class" : danbooru.DanbooruTagExtractor, - "#pattern" : r"https://cdn\.aibooru\.online/original/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.\w+", - "#count" : ">= 3", + "#pattern" : r"https://cdn\.aibooru\.download/original/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.\w+", + "#count" : ">= 50", }, { diff --git a/test/results/bbc.py 
b/test/results/bbc.py index 83b4923c..e458a06a 100644 --- a/test/results/bbc.py +++ b/test/results/bbc.py @@ -39,7 +39,7 @@ __tests__ = ( }, { - "#url" : "https://www.bbc.co.uk/programmes/b006q2x0/galleries?page=40", + "#url" : "https://www.bbc.co.uk/programmes/b006q2x0/galleries?page=25", "#category": ("", "bbc", "programme"), "#class" : bbc.BbcProgrammeExtractor, "#pattern" : bbc.BbcGalleryExtractor.pattern, diff --git a/test/results/bcbnsfw.py b/test/results/bcbnsfw.py index 8d35e36e..e9fcf8b2 100644 --- a/test/results/bcbnsfw.py +++ b/test/results/bcbnsfw.py @@ -16,10 +16,11 @@ __tests__ = ( { "#url" : "https://booru.bcbnsfw.space/post/1599", + "#comment" : "now only available as WebP", "#category": ("szurubooru", "bcbnsfw", "post"), "#class" : szurubooru.SzurubooruPostExtractor, "#pattern" : r"https://booru\.bcbnsfw\.space/data/posts/1599_53784518e92086bd\.png", - "#sha1_content": "0c38fc612ba1f03950fad31c4f80a1fccdab1096", + "#sha1_content": "55f8b8d187adc82f2dcaf2aa89db0ae21b08c0b0", }, ) diff --git a/test/results/cavemanon.py b/test/results/cavemanon.py index 4fc0af3c..3d065f43 100644 --- a/test/results/cavemanon.py +++ b/test/results/cavemanon.py @@ -37,7 +37,7 @@ __tests__ = ( "id" : 8335, "md5" : "", "size" : 0, - "tags" : "Color Fang Food Pterodactyl discord_emote transparent", + "tags" : "Color discord_emote Fang Food Pterodactyl transparent", "width" : 459, }, diff --git a/test/results/comicvine.py b/test/results/comicvine.py index fa77c491..b41c24e8 100644 --- a/test/results/comicvine.py +++ b/test/results/comicvine.py @@ -21,7 +21,7 @@ __tests__ = ( "#category": ("", "comicvine", "tag"), "#class" : comicvine.ComicvineTagExtractor, "#pattern" : r"https://comicvine\.gamespot\.com/a/uploads/original/\d+/\d+/\d+-.+", - "#count" : ">= 450", + "#count" : ">= 400", }, ) diff --git a/test/results/cyberdrop.py b/test/results/cyberdrop.py index b163ca47..a47d244d 100644 --- a/test/results/cyberdrop.py +++ b/test/results/cyberdrop.py @@ -9,33 +9,53 @@ from 
gallery_dl.extractor import cyberdrop __tests__ = ( { - "#url" : "https://cyberdrop.me/a/keKRjm4t", - "#comment" : "images", + "#url" : "https://cyberdrop.me/a/8uE0wQiK", "#category": ("lolisafe", "cyberdrop", "album"), "#class" : cyberdrop.CyberdropAlbumExtractor, - "#pattern" : r"https://fs-\d+\.cyberdrop\.to/.*\.(jpg|png|webp)$", - - "album_id" : "keKRjm4t", - "album_name" : "Fate (SFW)", - "album_size" : 150069254, - "count" : 62, - "date" : "dt:2020-06-18 13:14:20", - "description": "", - "id" : r"re:\w{8}", + "#pattern" : r"https://sun\.cyberdrop\.ch/api/fc/yyK9y8xpQK5dP\?.+", + "#sha1_content": "0c8768055e4e20e7c7259608b67799171b691140", + + "album_id" : "8uE0wQiK", + "album_name" : "test テスト \"&>", + "album_size" : 182, + "count" : 1, + "date" : "dt:2023-11-26 00:00:00", + "description" : "test テスト \"&>", + "extension" : "png", + "filename" : "test-テスト--22->-rwU3x9LU", + "id" : "rwU3x9LU", + "name" : "test-テスト--22->", + "num" : 1, + "size" : 182, + "slug" : "yyK9y8xpQK5dP", + "thumbnailUrl": str, + "type" : "image/png", + "url" : str, }, { - "#url" : "https://cyberdrop.to/a/l8gIAXVD", - "#comment" : "videos", + "#url" : "https://cyberdrop.me/a/HriMgbuf", "#category": ("lolisafe", "cyberdrop", "album"), "#class" : cyberdrop.CyberdropAlbumExtractor, - "#pattern" : r"https://fs-\d+\.cyberdrop\.to/.*\.mp4$", - "#count" : 31, - - "album_id" : "l8gIAXVD", - "album_name": "Achelois17 videos", - "album_size": 652037121, - "date" : "dt:2020-06-16 15:40:44", + "#pattern" : r"https://sun\.cyberdrop\.ch/api/fc/\w+\?.+", + "#count" : 3, + + "album_id" : "HriMgbuf", + "album_name" : "animations", + "album_size" : 1090519, + "count" : 3, + "date" : "dt:2023-11-26 00:00:00", + "description" : "animated stuff", + "extension" : r"re:gif|webm", + "filename" : r"re:danbooru_\d+_\w+-\w+", + "id" : str, + "name" : r"re:danbooru_\d+_\w+", + "num" : range(1, 3), + "size" : int, + "slug" : str, + "thumbnailUrl": str, + "type" : r"re:image/gif|video/webm", + "url" : str, }, ) diff 
--git a/test/results/derpibooru.py b/test/results/derpibooru.py index 7ca6a0de..278fcaa0 100644 --- a/test/results/derpibooru.py +++ b/test/results/derpibooru.py @@ -40,7 +40,7 @@ __tests__ = ( "score" : int, "sha512_hash" : "f16c98e2848c2f1bfff3985e8f1a54375cc49f78125391aeb80534ce011ead14e3e452a5c4bc98a66f56bdfcd07ef7800663b994f3f343c572da5ecc22a9660f", "size" : 860914, - "source_url" : "https://www.deviantart.com/speccysy/art/Afternoon-Flight-215193985", + "source_url" : "https://web.archive.org/web/20110702164313/http://speccysy.deviantart.com:80/art/Afternoon-Flight-215193985", "spoilered" : False, "tag_count" : int, "tag_ids" : list, diff --git a/test/results/e621.py b/test/results/e621.py index dd9787cd..5cd5c74d 100644 --- a/test/results/e621.py +++ b/test/results/e621.py @@ -59,36 +59,42 @@ __tests__ = ( "#options" : {"metadata": "notes,pools"}, "#pattern" : r"https://static\d\.e621\.net/data/c6/8c/c68cca0643890b615f75fb2719589bff\.png", - "notes": [{ - "body" : "Little Legends 2", - "created_at" : "2022-05-16T13:58:38.877-04:00", - "creator_id" : 517450, - "creator_name": "EeveeCuddler69", - "height" : 475, - "id" : 321296, - "is_active" : True, - "post_id" : 3181052, - "updated_at" : "2022-05-16T13:59:02.050-04:00", - "version" : 3, - "width" : 809, - "x" : 83, - "y" : 117, -}], - "pools": [{ - "category" : "series", - "created_at" : "2022-02-17T00:29:22.669-05:00", - "creator_id" : 1077440, - "creator_name": "Yeetus90", - "description" : """* "Little Legends":/pools/27971 -* Little Legends 2 -* "Little Legends 3":/pools/27481""", - "id" : 27492, - "is_active" : False, - "name" : "Little Legends 2", - "post_count" : 39, - "post_ids" : list, - "updated_at" : "2022-03-27T06:30:03.382-04:00", -}], + "notes": [ + { + "body" : "Little Legends 2", + "created_at" : "2022-05-16T13:58:38.877-04:00", + "creator_id" : 517450, + "creator_name": "EeveeCuddler69", + "height" : 475, + "id" : 321296, + "is_active" : True, + "post_id" : 3181052, + "updated_at" : 
"2022-05-16T13:59:02.050-04:00", + "version" : 3, + "width" : 809, + "x" : 83, + "y" : 117, + }, + ], + "pools": [ + { + "category" : "series", + "created_at" : "2022-02-17T00:29:22.669-05:00", + "creator_id" : 1077440, + "creator_name": "Yeetus90", + "description" : """\ +* "Little Legends":/pools/27971\r +* Little Legends 2\r +* "Little Legends 3":/pools/27481\ +""", + "id" : 27492, + "is_active" : False, + "name" : "Little Legends 2", + "post_count" : 39, + "post_ids" : list, + "updated_at" : "2022-03-27T06:30:03.382-04:00", + }, + ], }, { diff --git a/test/results/horne.py b/test/results/horne.py index 1cea3a0e..8cbce012 100644 --- a/test/results/horne.py +++ b/test/results/horne.py @@ -19,7 +19,7 @@ __tests__ = ( "#url" : "https://horne.red/members_illust.php?id=58000", "#category": ("Nijie", "horne", "illustration"), "#class" : nijie.NijieIllustrationExtractor, - "#pattern" : r"https://pic\.nijie\.net/\d+/horne/\d+/\d+/\d+/illust/\d+_\d+_[0-9a-f]+_[0-9a-f]+\.png", + "#pattern" : r"https://pic\.nijie\.net/\d+/horne/\w+/\d+/\d+/illust/\d+_\d+_[0-9a-f]+_[0-9a-f]+\.png", "#range" : "1-20", "#count" : 20, diff --git a/test/results/idolcomplex.py b/test/results/idolcomplex.py index 6aced68c..5932dc72 100644 --- a/test/results/idolcomplex.py +++ b/test/results/idolcomplex.py @@ -12,37 +12,66 @@ __tests__ = ( "#url" : "https://idol.sankakucomplex.com/?tags=lyumos", "#category": ("booru", "idolcomplex", "tag"), "#class" : idolcomplex.IdolcomplexTagExtractor, - "#pattern" : r"https://is\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}/[^/]{32}\.\w+\?e=\d+&m=[^&#]+", + "#pattern" : r"https://i[sv]\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}/[^/]{32}\.\w+\?e=\d+&m=[^&#]+", "#range" : "18-22", "#count" : 5, }, { - "#url" : "https://idol.sankakucomplex.com/?tags=order:favcount", + "#url" : "https://idol.sankakucomplex.com/?tags=lyumos+wreath&page=3&next=694215", "#category": ("booru", "idolcomplex", "tag"), "#class" : idolcomplex.IdolcomplexTagExtractor, - "#range" : "18-22", - 
"#count" : 5, }, { - "#url" : "https://idol.sankakucomplex.com/?tags=lyumos+wreath&page=3&next=694215", - "#category": ("booru", "idolcomplex", "tag"), - "#class" : idolcomplex.IdolcomplexTagExtractor, + "#url" : "https://idol.sankakucomplex.com/pools/show/145", + "#category": ("booru", "idolcomplex", "pool"), + "#class" : idolcomplex.IdolcomplexPoolExtractor, + "#count" : 3, }, { "#url" : "https://idol.sankakucomplex.com/pool/show/145", "#category": ("booru", "idolcomplex", "pool"), "#class" : idolcomplex.IdolcomplexPoolExtractor, - "#count" : 3, +}, + +{ + "#url" : "https://idol.sankakucomplex.com/en/posts/show/509eccbba54a43cea6b275a65b93c51d", + "#category": ("booru", "idolcomplex", "post"), + "#class" : idolcomplex.IdolcomplexPostExtractor, + "#sha1_content": "694ec2491240787d75bf5d0c75d0082b53a85afd", + + "created_at" : "2017-11-24 17:01:27.696", + "date" : "dt:2017-11-24 17:01:27", + "extension" : "jpg", + "file_url" : r"re:https://is\.sankakucomplex\.com/data/50/9e/509eccbba54a43cea6b275a65b93c51d\.jpg\?", + "filename" : "509eccbba54a43cea6b275a65b93c51d", + "height" : 683, + "id" : 694215, + "md5" : "509eccbba54a43cea6b275a65b93c51d", + "rating" : "g", + "tags" : "lyumos the_witcher shani_(the_witcher) 1girl cosplay green_eyes non-asian redhead waistcoat wreath 3:2_aspect_ratio", + "tags_character": "shani_(the_witcher)", + "tags_copyright": "the_witcher", + "tags_general" : "1girl cosplay green_eyes non-asian redhead waistcoat wreath", + "tags_idol" : "lyumos", + "tags_medium" : "3:2_aspect_ratio", + "vote_average" : range(4, 5), + "vote_count" : range(25, 40), + "width" : 1024, +}, + +{ + "#url" : "https://idol.sankakucomplex.com/posts/509eccbba54a43cea6b275a65b93c51d", + "#category": ("booru", "idolcomplex", "post"), + "#class" : idolcomplex.IdolcomplexPostExtractor, }, { "#url" : "https://idol.sankakucomplex.com/post/show/694215", "#category": ("booru", "idolcomplex", "post"), "#class" : idolcomplex.IdolcomplexPostExtractor, - "#options" : {"tags": 
True}, "#sha1_content": "694ec2491240787d75bf5d0c75d0082b53a85afd", "tags_character": "shani_(the_witcher)", diff --git a/test/results/imagefap.py b/test/results/imagefap.py index 99a22166..b4f3ab81 100644 --- a/test/results/imagefap.py +++ b/test/results/imagefap.py @@ -5,6 +5,7 @@ # published by the Free Software Foundation. from gallery_dl.extractor import imagefap +from gallery_dl import exception __tests__ = ( @@ -12,16 +13,14 @@ __tests__ = ( "#url" : "https://www.imagefap.com/gallery/7102714", "#category": ("", "imagefap", "gallery"), "#class" : imagefap.ImagefapGalleryExtractor, - "#pattern" : r"https://cdnh?\.imagefap\.com/images/full/\d+/\d+/\d+\.jpg", - "#sha1_metadata": "bdcb75b1e4b9dddc718f3d66e1a58afa9d81a38b", - "#sha1_content" : "694a0a57385980a6f90fbc296cadcd6c11ba2dab", + "#exception": exception.HttpError, }, { "#url" : "https://www.imagefap.com/gallery/7876223", "#category": ("", "imagefap", "gallery"), "#class" : imagefap.ImagefapGalleryExtractor, - "#pattern" : r"https://cdnh?\.imagefap\.com/images/full/\d+/\d+/\d+\.jpg", + "#pattern" : r"https://cdn[ch]?\.imagefap\.com/images/full/\d+/\d+/\d+\.jpg", "#count" : 44, "categories" : [ @@ -90,7 +89,7 @@ __tests__ = ( "#url" : "https://www.imagefap.com/photo/1962981893", "#category": ("", "imagefap", "image"), "#class" : imagefap.ImagefapImageExtractor, - "#pattern" : r"https://cdnh?\.imagefap\.com/images/full/65/196/1962981893\.jpg", + "#pattern" : r"https://cdn[ch]?\.imagefap\.com/images/full/65/196/1962981893\.jpg", "date" : "21/08/2014", "gallery_id": 7876223, diff --git a/test/results/issuu.py b/test/results/issuu.py index 5e086d1e..4a90be13 100644 --- a/test/results/issuu.py +++ b/test/results/issuu.py @@ -25,7 +25,6 @@ __tests__ = ( "date" : "dt:2019-09-16 00:00:00", "description" : r"re:Motions, the brand new publication by I", "documentName" : "motions-1-2019", - "downloadable" : False, "pageCount" : 36, "publicationId": "d99ec95935f15091b040cb8060f05510", "title" : "Motions by Issuu - 
Issue 1", diff --git a/test/results/itaku.py b/test/results/itaku.py index ef3d3679..8a5b5066 100644 --- a/test/results/itaku.py +++ b/test/results/itaku.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://itaku.ee/profile/piku/gallery", "#category": ("", "itaku", "gallery"), "#class" : itaku.ItakuGalleryExtractor, - "#pattern" : r"https://d1wmr8tlk3viaj\.cloudfront\.net/gallery_imgs/[^/?#]+\.(jpg|png|gif)", + "#pattern" : r"https://itaku\.ee/api/media/gallery_imgs/[^/?#]+\.(jpg|png|gif)", "#range" : "1-10", "#count" : 10, }, @@ -21,8 +21,7 @@ __tests__ = ( "#url" : "https://itaku.ee/images/100471", "#category": ("", "itaku", "image"), "#class" : itaku.ItakuImageExtractor, - "#pattern" : r"https://d1wmr8tlk3viaj\.cloudfront\.net/gallery_imgs/220504_oUNIAFT\.png", - "#count" : 1, + "#urls" : "https://itaku.ee/api/media/gallery_imgs/220504_oUNIAFT.png", "already_pinned" : None, "blacklisted" : { @@ -38,8 +37,8 @@ __tests__ = ( "filename" : "220504_oUNIAFT", "hotness_score" : float, "id" : 100471, - "image" : "https://d1wmr8tlk3viaj.cloudfront.net/gallery_imgs/220504_oUNIAFT.png", - "image_xl" : "https://d1wmr8tlk3viaj.cloudfront.net/gallery_imgs/220504_oUNIAFT/lg.jpg", + "image" : "https://itaku.ee/api/media/gallery_imgs/220504_oUNIAFT.png", + "image_xl" : "https://itaku.ee/api/media/gallery_imgs/220504_oUNIAFT/lg.jpg", "liked_by_you" : False, "maturity_rating" : "SFW", "num_comments" : int, @@ -47,7 +46,7 @@ __tests__ = ( "num_reshares" : int, "obj_tags" : 136446, "owner" : 16775, - "owner_avatar" : "https://d1wmr8tlk3viaj.cloudfront.net/profile_pics/av2022r_vKYVywc/md.jpg", + "owner_avatar" : "https://itaku.ee/api/media/profile_pics/av2022r_vKYVywc/md.jpg", "owner_displayname": "Piku", "owner_username" : "piku", "reshared_by_you" : False, @@ -55,13 +54,13 @@ __tests__ = ( "tags" : list, "tags_character" : ["hatsune_miku"], "tags_copyright" : ["vocaloid"], - "tags_general" : [ + "tags_general": [ + "female", + "green_eyes", "twintails", "green_hair", - "flag", 
"gloves", - "green_eyes", - "female", + "flag", "racing_miku", ], "title" : "Racing Miku 2022 Ver.", @@ -76,7 +75,7 @@ __tests__ = ( "#comment" : "video", "#category": ("", "itaku", "image"), "#class" : itaku.ItakuImageExtractor, - "#pattern" : r"https://d1wmr8tlk3viaj\.cloudfront\.net/gallery_vids/sleepy_af_OY5GHWw\.mp4", + "#urls" : "https://itaku.ee/api/media/gallery_vids/sleepy_af_OY5GHWw.mp4", }, ) diff --git a/test/results/itchio.py b/test/results/itchio.py index d2acfa61..f49bd69f 100644 --- a/test/results/itchio.py +++ b/test/results/itchio.py @@ -12,11 +12,11 @@ __tests__ = ( "#url" : "https://sirtartarus.itch.io/a-craft-of-mine", "#category": ("", "itchio", "game"), "#class" : itchio.ItchioGameExtractor, - "#pattern" : r"https://\w+\.ssl\.hwcdn\.net/upload2/game/1983311/7723751\?", - "#count" : 1, + "#pattern" : r"https://(dl.itch.zone|itchio-mirror.\w+.r2.cloudflarestorage.com)/upload2/game/1983311/\d+\?", + "#count" : 3, "extension": "", - "filename" : "7723751", + "filename" : r"re:\d+", "game" : { "id" : 1983311, "noun" : "game", diff --git a/test/results/khinsider.py b/test/results/khinsider.py index b680cab2..7013069f 100644 --- a/test/results/khinsider.py +++ b/test/results/khinsider.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://downloads.khinsider.com/game-soundtracks/album/horizon-riders-wii", "#category": ("", "khinsider", "soundtrack"), "#class" : khinsider.KhinsiderSoundtrackExtractor, - "#pattern" : r"https?://vgm(site|downloads)\.com/soundtracks/horizon-riders-wii/[^/]+/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack\.mp3", + "#pattern" : r"https?://(dl\.)?vgm(site|downloads)\.com/soundtracks/horizon-riders-wii/[^/]+/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack\.mp3", "#count" : 1, "album" : { diff --git a/test/results/lesbianenergy.py b/test/results/lesbianenergy.py index fffe8a0c..650671f9 100644 --- a/test/results/lesbianenergy.py +++ b/test/results/lesbianenergy.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : 
"https://lesbian.energy/@rerorero", "#category": ("misskey", "lesbian.energy", "user"), "#class" : misskey.MisskeyUserExtractor, - "#pattern" : r"https://lesbian.energy/files/\w+", + "#pattern" : r"https://(lesbian.energy/files/\w+|.+/media_attachments/files/.+)", "#range" : "1-50", "#count" : 50, }, diff --git a/test/results/luscious.py b/test/results/luscious.py index d5429612..5e7a1460 100644 --- a/test/results/luscious.py +++ b/test/results/luscious.py @@ -20,12 +20,12 @@ __tests__ = ( "__typename" : "Album", "audiences" : list, "content" : "Hentai", - "cover" : r"re:https://\w+.luscious.net/.+/277031/", + "cover" : r"re:https://storage\.bhs\.cloud\.ovh\.net/v1/.+/277031/", "created" : 1479625853, - "created_by" : "NTRshouldbeillegal", + "created_by" : "Hive Mind", "date" : "dt:2016-11-20 07:10:53", "description" : "Enjoy.", - "download_url" : r"re:/download/(r/)?824778/277031/", + "download_url" : "/download/r/25/277031/", "genres" : list, "id" : 277031, "is_manga" : True, @@ -34,7 +34,7 @@ __tests__ = ( "like_status" : "none", "modified" : int, "permissions" : list, - "rating" : float, + "rating" : None, "slug" : "okinami-no-koigokoro", "status" : None, "tags" : list, diff --git a/test/results/mangadex.py b/test/results/mangadex.py index c6a9e53a..17b2157c 100644 --- a/test/results/mangadex.py +++ b/test/results/mangadex.py @@ -97,7 +97,7 @@ __tests__ = ( "#url" : "https://mangadex.org/title/7c1e2742-a086-4fd3-a3be-701fd6cf0be9", "#category": ("", "mangadex", "manga"), "#class" : mangadex.MangadexMangaExtractor, - "#count" : 1, + "#count" : ">= 25", }, { diff --git a/test/results/misskeyio.py b/test/results/misskeyio.py index 3a28ae87..9d005483 100644 --- a/test/results/misskeyio.py +++ b/test/results/misskeyio.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://misskey.io/@lithla", "#category": ("misskey", "misskey.io", "user"), "#class" : misskey.MisskeyUserExtractor, - "#pattern" : r"https://s\d+\.arkjp\.net/misskey/[\w-]+\.\w+", + "#pattern" : 
r"https://(media.misskeyusercontent.com/io|s\d+\.arkjp\.net/misskey)/[\w-]+\.\w+", "#range" : "1-50", "#count" : 50, }, diff --git a/test/results/myhentaigallery.py b/test/results/myhentaigallery.py index 6ddc2c2f..b7b5ac99 100644 --- a/test/results/myhentaigallery.py +++ b/test/results/myhentaigallery.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://myhentaigallery.com/gallery/thumbnails/16247", "#category": ("", "myhentaigallery", "gallery"), "#class" : myhentaigallery.MyhentaigalleryGalleryExtractor, - "#pattern" : r"https://images.myhentaicomics\.com/imagesgallery/images/[^/]+/original/\d+\.jpg", + "#pattern" : r"https://images\.myhentaicomics\.com/mhg/images/[^/]+/original/\d+\.jpg", "artist" : list, "count" : 11, diff --git a/test/results/nsfwalbum.py b/test/results/nsfwalbum.py index e89b8fb4..cabd3e10 100644 --- a/test/results/nsfwalbum.py +++ b/test/results/nsfwalbum.py @@ -13,8 +13,24 @@ __tests__ = ( "#category": ("", "nsfwalbum", "album"), "#class" : nsfwalbum.NsfwalbumAlbumExtractor, "#range" : "1-5", - "#sha1_url" : "b0481fc7fad5982da397b6359fbed8421b8ba284", - "#sha1_metadata": "e98f9b0d473c00000831618d0235863b1dd78294", + "#urls" : ( + "https://img70.imgspice.com/i/05457/mio2bu5xbrxe.jpg", + "https://img70.imgspice.com/i/05457/zgpxa8kr4h1d.jpg", + "https://img70.imgspice.com/i/05457/3379nxsm9lx8.jpg", + "https://img70.imgspice.com/i/05457/pncrkhspuoa3.jpg", + "https://img70.imgspice.com/i/05457/128b2odt216a.jpg", + ), + + "album_id" : 401611, + "extension": "jpg", + "filename" : str, + "height" : range(1365, 2048), + "id" : int, + "models" : [], + "num" : range(1, 5), + "studio" : "Met-Art", + "title" : "Met-Art - Katherine A - Difuza 25.05.2014 (134 photos)(4368 X 2912)", + "width" : range(1365, 2048), }, ) diff --git a/test/results/paheal.py b/test/results/paheal.py index 3ef0ec58..833f3f84 100644 --- a/test/results/paheal.py +++ b/test/results/paheal.py @@ -42,19 +42,19 @@ __tests__ = ( "#url" : "https://rule34.paheal.net/post/view/481609", 
"#category": ("shimmie2", "paheal", "post"), "#class" : paheal.PahealPostExtractor, - "#pattern" : r"https://tulip\.paheal\.net/_images/bbdc1c33410c2cdce7556c7990be26b7/481609%20-%20Azumanga_Daioh%20inanimate%20Osaka%20Vuvuzela\.jpg", + "#pattern" : r"https://tulip\.paheal\.net/_images/bbdc1c33410c2cdce7556c7990be26b7/481609%20-.+\.jpg", "#sha1_content": "7b924bcf150b352ac75c9d281d061e174c851a11", "date" : "dt:2010-06-17 15:40:23", "extension": "jpg", "file_url" : r"re:https://tulip.paheal.net/_images/bbdc1c33410c", - "filename" : "481609 - Azumanga_Daioh inanimate Osaka Vuvuzela", + "filename" : "481609 - Ayumu_Kasuga Azumanga_Daioh inanimate Vuvuzela", "height" : 660, "id" : 481609, "md5" : "bbdc1c33410c2cdce7556c7990be26b7", "size" : 157389, "source" : "", - "tags" : "Azumanga_Daioh inanimate Osaka Vuvuzela", + "tags" : "Ayumu_Kasuga Azumanga_Daioh inanimate Vuvuzela", "uploader" : "CaptainButtface", "width" : 614, }, diff --git a/test/results/pillowfort.py b/test/results/pillowfort.py index fea09746..b04be6f3 100644 --- a/test/results/pillowfort.py +++ b/test/results/pillowfort.py @@ -177,7 +177,7 @@ __tests__ = ( "#category": ("", "pillowfort", "user"), "#class" : pillowfort.PillowfortUserExtractor, "#pattern" : r"https://img\d+\.pillowfort\.social/posts/", - "#count" : 6, + "#count" : range(10, 20), }, ) diff --git a/test/results/pornhub.py b/test/results/pornhub.py index e7aaf8da..e2aa9818 100644 --- a/test/results/pornhub.py +++ b/test/results/pornhub.py @@ -39,22 +39,32 @@ __tests__ = ( }, { - "#url" : "https://www.pornhub.com/gif/33643461", + "#url" : "https://www.pornhub.com/gif/43726891", "#category": ("", "pornhub", "gif"), "#class" : pornhub.PornhubGifExtractor, - "#pattern" : r"https://\w+\.phncdn\.com/pics/gifs/033/643/461/33643461a\.webm", + "#pattern" : r"https://\w+\.phncdn\.com/pics/gifs/043/726/891/43726891a\.webm", - "date" : "dt:2020-10-31 00:00:00", + "date" : "dt:2023-04-20 00:00:00", "extension": "webm", - "filename" : "33643461a", - "id" 
: "33643461", + "filename" : "43726891a", + "id" : "43726891", "tags" : [ - "big boobs", - "lana rhoades", + "sloppy deepthroat", + "perfect body", + "petite brunette", + "mouth fuck", + "big dick", + "natural big tits", + "deepthroat swallow", + "amateur couple", + "homemade", + "girls wanking boys", + "hardcore sex", + "babes 18 year", ], - "title" : "Big boobs", - "url" : str, - "user" : "Lana Rhoades", + "title" : "Intense sloppy blowjob of Danika Mori", + "url" : "https://el.phncdn.com/pics/gifs/043/726/891/43726891a.webm", + "user" : "Danika Mori", }, { diff --git a/test/results/pornpics.py b/test/results/pornpics.py index 91a10ca0..2bcdcfec 100644 --- a/test/results/pornpics.py +++ b/test/results/pornpics.py @@ -15,10 +15,10 @@ __tests__ = ( "#pattern" : r"https://cdni\.pornpics\.com/1280/7/160/62610699/62610699_\d+_[0-9a-f]{4}\.jpg", "categories": [ + "Outdoor", "MILF", "Amateur", "Sexy", - "Outdoor", ], "channel" : "FTV MILFs", "count" : 17, @@ -28,6 +28,9 @@ __tests__ = ( "slug" : "british-beauty-danielle-flashes-hot-breasts-ass-and-snatch-in-the-forest", "tags" : [ "Amateur MILF", + "Nature", + "Amateur Outdoor", + "First Time", "Sexy MILF", ], "title" : "British beauty Danielle flashes hot breasts, ass and snatch in the forest", diff --git a/test/results/pururin.py b/test/results/pururin.py index e57c7fe7..971eb1d3 100644 --- a/test/results/pururin.py +++ b/test/results/pururin.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://pururin.to/gallery/38661/iowant-2", "#category": ("", "pururin", "gallery"), "#class" : pururin.PururinGalleryExtractor, - "#pattern" : r"https://i\.pururin\.to/38661/\d+\.jpg", + "#pattern" : r"https://i\.pururin\.[ct]o/38661/\d+\.jpg", "title" : r"re:I ?owant 2!!", "title_en" : r"re:I ?owant 2!!", diff --git a/test/results/reddit.py b/test/results/reddit.py index 8a4359cf..e5cd1c5e 100644 --- a/test/results/reddit.py +++ b/test/results/reddit.py @@ -189,7 +189,7 @@ __tests__ = ( "#comment" : "preview.redd.it (#4470)", 
"#category": ("", "reddit", "submission"), "#class" : reddit.RedditSubmissionExtractor, - "#pattern" : "https://preview.redd.it/u9ud4k6xaf271.jpg?auto=webp&s=19b1334cb4409111cda136c01f7b44c2c42bf9fb", + "#urls" : "https://preview.redd.it/u9ud4k6xaf271.jpg?auto=webp&s=19b1334cb4409111cda136c01f7b44c2c42bf9fb", }, { diff --git a/test/results/sexcom.py b/test/results/sexcom.py index 079bb5fa..2f212904 100644 --- a/test/results/sexcom.py +++ b/test/results/sexcom.py @@ -51,10 +51,9 @@ __tests__ = ( { "#url" : "https://www.sex.com/pin/55847384-very-nicely-animated/", - "#comment" : "pornhub embed", + "#comment" : "pornhub embed (404 gone)", "#category": ("", "sexcom", "pin"), "#class" : sexcom.SexcomPinExtractor, - "#pattern" : "ytdl:https://www.pornhub.com/embed/ph56ef24b6750f2", }, { diff --git a/test/results/skeb.py b/test/results/skeb.py index f956986b..a8b546ad 100644 --- a/test/results/skeb.py +++ b/test/results/skeb.py @@ -58,7 +58,7 @@ __tests__ = ( "#url" : "https://skeb.jp/@kanade_cocotte", "#category": ("", "skeb", "user"), "#class" : skeb.SkebUserExtractor, - "#pattern" : r"https://skeb\.imgix\.net/uploads/origins/[\w-]+\?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&fm=webp&w=800&s=\w+", + "#pattern" : r"https://si\.imgix\.net/\w+/uploads/origins/[\w-]+", "#range" : "1-5", }, diff --git a/test/results/smugloli.py b/test/results/smugloli.py index 5d044123..f0176067 100644 --- a/test/results/smugloli.py +++ b/test/results/smugloli.py @@ -9,15 +9,15 @@ from gallery_dl.extractor import vichan __tests__ = ( { - "#url" : "https://smuglo.li/a/res/1154380.html", + "#url" : "https://smuglo.li/a/res/1187531.html", "#category": ("vichan", "smugloli", "thread"), "#class" : vichan.VichanThreadExtractor, "#pattern" : r"https://smug.+/a/src/\d+(-\d)?\.\w+", - "#count" : ">= 18", + "#count" : ">= 50", "board" : "a", - "thread": "1154380", - "title" : "Mob Psycho 100 Season 3", + "thread": "1187531", + "title" 
: "Buta no Liver wa Kanetsu Shiro", }, { diff --git a/test/results/tapas.py b/test/results/tapas.py index a3d79842..1278d9f8 100644 --- a/test/results/tapas.py +++ b/test/results/tapas.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://tapas.io/series/just-leave-me-be", "#category": ("", "tapas", "series"), "#class" : tapas.TapasSeriesExtractor, - "#pattern" : r"https://\w+\.cloudfront\.net/pc/\w\w/[0-9a-f-]+\.jpg", + "#pattern" : r"https://us-a\.tapas\.io/pc/\w\w/[0-9a-f-]+\.jpg", "#count" : 132, }, @@ -64,7 +64,7 @@ __tests__ = ( "has_top_banner": True, "id" : 199931, "premium" : True, - "sale_type" : "PAID", + "sale_type" : "WAIT_OR_MUST_PAY", "subscribed" : bool, "thumbsup_cnt" : int, "title" : "Tomb Raider King", diff --git a/test/results/toyhouse.py b/test/results/toyhouse.py index 22bc1412..21d13ee1 100644 --- a/test/results/toyhouse.py +++ b/test/results/toyhouse.py @@ -60,7 +60,7 @@ __tests__ = ( "http://aminoapps.com/p/92sf3z", "kroksoc (Color)", ], - "characters": ["❀Reiichi❀"], + "characters": ["Reiichi❀"], "date" : "dt:2021-07-03 20:02:02", "hash" : "bqhGcwcnU", "id" : "36817425", diff --git a/test/results/tumblr.py b/test/results/tumblr.py index 01d8de73..70e334b5 100644 --- a/test/results/tumblr.py +++ b/test/results/tumblr.py @@ -26,8 +26,8 @@ __tests__ = ( "posts" : "all", "external": True, }, - "#pattern" : r"https?://(?:$|\d+\.media\.tumblr\.com/.+_1280\.jpg|a\.tumblr\.com/tumblr_\w+)", - "#count" : 3, + "#pattern" : r"https?://(?:$|\d+\.media\.tumblr\.com/.+\.(jpg|png|gif|mp3|mp4)|v?a\.(media\.)?tumblr\.com/tumblr_\w+)", + "#count" : 27, }, { @@ -103,7 +103,7 @@ __tests__ = ( "date-max" : "2015-04-25T00:00:00", "date-min" : "2015-04-01T00:00:00", }, - "#count" : 316, + "#count" : 197, }, { diff --git a/test/results/turboimagehost.py b/test/results/turboimagehost.py index 3e2069fa..642d9321 100644 --- a/test/results/turboimagehost.py +++ b/test/results/turboimagehost.py @@ -14,7 +14,10 @@ __tests__ = ( "#class" : 
imagehosts.TurboimagehostImageExtractor, "#sha1_url" : "b94de43612318771ced924cb5085976f13b3b90e", "#sha1_metadata": "704757ca8825f51cec516ec44c1e627c1f2058ca", - "#sha1_content" : "f38b54b17cd7462e687b58d83f00fca88b1b105a", + "#sha1_content" : ( + "f38b54b17cd7462e687b58d83f00fca88b1b105a", + "0c8768055e4e20e7c7259608b67799171b691140", + ), }, ) diff --git a/test/results/twibooru.py b/test/results/twibooru.py index 5dd0191d..ff87deec 100644 --- a/test/results/twibooru.py +++ b/test/results/twibooru.py @@ -44,7 +44,7 @@ __tests__ = ( "tag_ids" : list, "tags" : list, "thumbnails_generated": True, - "updated_at" : "2022-11-27T00:34:50.483Z", + "updated_at" : "2023-07-24T03:18:48.153Z", "upvotes" : int, "view_url" : "https://cdn.twibooru.org/img/2020/7/8/1/full.png", "width" : 576, diff --git a/test/results/unsplash.py b/test/results/unsplash.py index 1568ed3a..e3413aff 100644 --- a/test/results/unsplash.py +++ b/test/results/unsplash.py @@ -9,69 +9,127 @@ from gallery_dl.extractor import unsplash __tests__ = ( { - "#url" : "https://unsplash.com/photos/lsoogGC_5dg", + "#url" : "https://unsplash.com/photos/red-wooden-cross-on-gray-concrete-pathway-between-green-trees-during-daytime-kaoHI0iHJPM", "#category": ("", "unsplash", "image"), "#class" : unsplash.UnsplashImageExtractor, - "#pattern" : r"https://images\.unsplash\.com/photo-1586348943529-beaae6c28db9\?ixid=\w+&ixlib=rb-4.0.3", + "#urls" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3", - "alt_description": r"re:silhouette of trees near body of water ", - "blur_hash" : "LZP4uQS4jboe%#o0WCa}2doJNaaz", - "? 
categories" : list, - "color" : "#f3c08c", - "created_at" : "2020-04-08T12:29:42Z", - "date" : "dt:2020-04-08 12:29:42", - "description" : "The Island", - "downloads" : int, - "exif" : { - "aperture" : "11", - "exposure_time": "30", - "focal_length" : "70.0", - "iso" : 200, - "make" : "Canon", - "model" : "Canon EOS 5D Mark IV", + "alt_description": "red wooden cross on gray concrete pathway between green trees during daytime", + "blur_hash" : "LIAwhq%e4TRjXAIBMyt89GRj%fj[", + "breadcrumbs": list, + "color" : "#0c2626", + "created_at" : "2020-10-04T15:13:59Z", + "date" : "dt:2020-10-04 15:13:59", + "description": None, + "downloads" : range(50000, 300000), + "exif" : { + "aperture" : "9", + "exposure_time": "1/125", + "focal_length" : "35.0", + "iso" : 800, + "make" : "SONY", + "model" : "ILCE-7M3", + "name" : "SONY, ILCE-7M3", }, - "extension" : "jpg", - "filename" : "photo-1586348943529-beaae6c28db9", - "height" : 6272, - "id" : "lsoogGC_5dg", - "liked_by_user" : False, - "likes" : int, - "location" : { - "city" : "Beaver Dam", - "country" : "United States", - "name" : "Beaver Dam, WI 53916, USA", + "extension" : "jpg", + "filename" : "photo-1601823984263-b87b59798b70", + "height" : 5371, + "id" : "kaoHI0iHJPM", + "liked_by_user": False, + "likes" : range(1000, 10000), + "links" : dict, + "location" : { + "city" : "箱根町", + "country" : "日本", + "name" : "Hakone, 神奈川県 日本", "position": { - "latitude" : 43.457769, - "longitude": -88.837329, + "latitude" : 35.232383, + "longitude": 139.106936, }, }, - "promoted_at" : "2020-04-08T15:12:03Z", - "sponsorship" : None, - "tags" : list, - "updated_at" : str, - "user" : { + "meta" : { + "index": True, + }, + "plus" : False, + "premium" : False, + "promoted_at": "2020-10-05T13:04:43Z", + "public_domain": False, + "slug" : "red-wooden-cross-on-gray-concrete-pathway-between-green-trees-during-daytime-kaoHI0iHJPM", + "sponsorship": None, + "subcategory": "image", + "tags" : [ + "japan", + "hakone", + "神奈川県 日本", + "torii", + 
"hakone shrine", + "sunrise", + "traditional", + "shrine", + "grey", + "wallpaper", + "arbour", + "garden", + "outdoors", + "gate", + ], + "tags_preview": list, + "topic_submissions": {}, + "topics" : [], + "updated_at" : "2023-11-24T08:17:36Z", + "urls": { + "full" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=srgb&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=85", + "raw" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3", + "regular" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=1080", + "small" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=400", + "small_s3": "https://s3.us-west-2.amazonaws.com/images.unsplash.com/small/photo-1601823984263-b87b59798b70", + "thumb" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=200", + }, + "user": { "accepted_tos" : True, - "bio" : str, - "first_name" : "Dave", - "id" : "uMJXuywXLiU", - "instagram_username": "just_midwest_rock", - "last_name" : "Hoefler", - "location" : None, - "name" : "Dave Hoefler", - "portfolio_url" : None, - "total_collections" : int, - "total_likes" : int, - "total_photos" : int, + "bio" : "Professional photographer.\r\nBased in Japan.", + "first_name" : "Syuhei", + "for_hire" : True, + "id" : "F4HO358YSeo", + "instagram_username": "_______life_", + "last_name" : "Inoue", + "links": { + "followers": "https://api.unsplash.com/users/_______life_/followers", + "following": "https://api.unsplash.com/users/_______life_/following", + "html" : "https://unsplash.com/@_______life_", + 
"likes" : "https://api.unsplash.com/users/_______life_/likes", + "photos" : "https://api.unsplash.com/users/_______life_/photos", + "portfolio": "https://api.unsplash.com/users/_______life_/portfolio", + "self" : "https://api.unsplash.com/users/_______life_", + }, + "location" : "Yokohama, Japan", + "name" : "Syuhei Inoue", + "portfolio_url" : "https://syuheiinoue.life/", + "profile_image" : { + "large" : "https://images.unsplash.com/profile-1601689368522-8855bbd61be6image?ixlib=rb-4.0.3&crop=faces&fit=crop&w=128&h=128", + "medium": "https://images.unsplash.com/profile-1601689368522-8855bbd61be6image?ixlib=rb-4.0.3&crop=faces&fit=crop&w=64&h=64", + "small" : "https://images.unsplash.com/profile-1601689368522-8855bbd61be6image?ixlib=rb-4.0.3&crop=faces&fit=crop&w=32&h=32", + }, + "social" : { + "instagram_username": "_______life_", + "paypal_email" : None, + "portfolio_url" : "https://syuheiinoue.life/", + "twitter_username" : None, + }, + "total_collections" : 2, + "total_likes" : 32, + "total_photos" : 86, + "total_promoted_photos": 24, "twitter_username" : None, - "updated_at" : str, - "username" : "davehoefler", + "updated_at" : "2023-11-24T19:15:32Z", + "username" : "_______life_" }, - "views" : int, - "width" : 4480, + "views": range(2000000, 10000000), + "width": 3581, }, { - "#url" : "https://unsplash.com/@davehoefler", + "#url" : "https://unsplash.com/@_______life_", "#category": ("", "unsplash", "user"), "#class" : unsplash.UnsplashUserExtractor, "#pattern" : r"https://images\.unsplash\.com/(photo-\d+-\w+|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$", @@ -80,12 +138,11 @@ __tests__ = ( }, { - "#url" : "https://unsplash.com/@davehoefler/likes", + "#url" : "https://unsplash.com/@_______life_/likes", "#category": ("", "unsplash", "favorite"), "#class" : unsplash.UnsplashFavoriteExtractor, "#pattern" : r"https://images\.unsplash\.com/(photo-\d+-\w+|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$", - "#range" : "1-30", - "#count" : 30, + "#count" : 31, }, { diff 
--git a/test/results/weibo.py b/test/results/weibo.py index f98b7fd9..639994c0 100644 --- a/test/results/weibo.py +++ b/test/results/weibo.py @@ -48,6 +48,7 @@ __tests__ = ( { "#url" : "https://weibo.com/1758989602?tabtype=home", + "#comment" : "'tabtype=home' is broken on website itself", "#category": ("", "weibo", "home"), "#class" : weibo.WeiboHomeExtractor, "#range" : "1-30", diff --git a/test/results/wikiart.py b/test/results/wikiart.py index fef4bad5..47eb3ec7 100644 --- a/test/results/wikiart.py +++ b/test/results/wikiart.py @@ -12,8 +12,50 @@ __tests__ = ( "#url" : "https://www.wikiart.org/en/thomas-cole", "#category": ("", "wikiart", "artist"), "#class" : wikiart.WikiartArtistExtractor, - "#sha1_url" : "6844f207a5063c499fc1d5651b03127bc4fe2f73", - "#sha1_metadata": "09230b5f504697119e267349bf92487e657a7384", + "#pattern" : "https://uploads\d+\.wikiart\.org/(\d+/)?images/thomas-cole/[\w()-]+\.(jpg|png)", + "#count" : "> 100", + + "albums" : None, + "artist" : { + "OriginalArtistName": "Thomas Cole", + "activeYearsCompletion": None, + "activeYearsStart" : None, + "artistName" : "Thomas Cole", + "biography" : "Thomas Cole inspired the generation of American [url href=https://www.wikiart.org/en/paintings-by-genre/landscape]landscape[/url] painters that came to be known as the [url href=https://www.wikiart.org/en/artists-by-painting-school/hudson-river-school]Hudson River School[/url]. Born in Bolton-le-Moors, Lancashire, England, in 1801, at the age of seventeen he emigrated with his family to the United States, first working as a wood engraver in Philadelphia before going to Steubenville, Ohio, where his father had established a wallpaper manufacturing business. \n\nCole received rudimentary instruction from an itinerant artist, began painting portraits, genre scenes, and a few landscapes, and set out to seek his fortune through Ohio and Pennsylvania. 
He soon moved on to Philadelphia to pursue his art, inspired by paintings he saw at the Pennsylvania Academy of the Fine Arts. Moving to New York City in spring 1825, Cole made a trip up the Hudson River to the eastern Catskill Mountains. Based on his sketches there, he executed three landscapes that a city bookseller agreed to display in his window. Colonel [url href=https://www.wikiart.org/en/john-trumbull]John Trumbull[/url], already renowned as the painter of the American Revolution, saw Cole’s pictures and instantly purchased one, recommending the other two to his colleagues William Dunlap and [url href=https://www.wikiart.org/en/asher-brown-durand]Asher B. Durand[/url]. \n\nWhat Trumbull recognized in the work of the young painter was the perception of wildness inherent in American scenery that landscape artists had theretofore ignored. Trumbull brought Cole to the attention of various patrons, who began eagerly buying his work. Dunlap publicized the discovery of the new talent, and Cole was welcomed into New York’s cultural community, which included the poet and editor William Cullen Bryant and the author James Fenimore Cooper. Cole became one of the founding members of the National Academy of Design in 1825. Even as Cole expanded his travels and subjects to include scenes in the White Mountains of New Hampshire, he aspired to what he termed a “higher style of a landscape” that included narrative—some of the paintings in paired series—including biblical and literary subjects, such as Cooper’s popular [url href=https://www.wikiart.org/en/thomas-cole/scene-from-the-last-of-the-mohicans-by-james-fenimore-cooper-1827][i]Last of the Mohicans[/i][/url]. \n\nBy 1829, his success enabled him to take the Grand Tour of Europe and especially Italy, where he remained in 1831–32, visiting Florence, Rome, and Naples. Thereafter he painted many Italian subjects, like [url href=https://www.wikiart.org/en/thomas-cole/a-view-near-tivoli-morning-1832][i]View near Tivoli. 
Morning[/i][/url] (1832). The region around Rome, along with the classical myth, also inspired [url href=https://www.wikiart.org/en/thomas-cole/the-titan-s-goblet-1833][i]The Titan’s Goblet[/i][/url] (1833). Cole’s travels and the encouragement and patronage of the New York merchant Luman Reed culminated in his most ambitious historical landscape series, [url href=https://www.wikiart.org/en/thomas-cole/all-works#!#filterName:Series_the-course-of-empire,resultType:masonry][i]The Course of Empire[/i][/url] (1833–1836), five pictures dramatizing the rise and fall of an ancient classical state. \n\nCole also continued to paint, with ever-rising technical assurance, sublime American scenes such as the [url href=https://www.wikiart.org/en/thomas-cole/view-from-mount-holyoke-1836][i]View from Mount Holyoke[/i][/url] (1836), [url href=https://www.wikiart.org/en/thomas-cole/the-oxbow-the-connecticut-river-near-northampton-1836][i]The Oxbow[/i][/url] (1836), in which he included a portrait of himself painting the vista and [url href=https://www.wikiart.org/en/thomas-cole/view-on-the-catskill-early-autunm-1837][i]View on the Catskill—Early Autumn[/i][/url] (1836-1837), in which he pastorally interpreted the prospect of his beloved Catskill Mountains from the village of Catskill, where he had moved the year before and met his wife-to-be, Maria Bartow. \n\nThe artist’s marriage brought with it increasing religious piety manifested in the four-part series [url href=https://www.wikiart.org/en/thomas-cole/all-works#!#filterName:Series_the-voyage-of-life,resultType:masonry][i]The Voyage of Life[/i][/url] (1840). In it, a river journey represents the human passage through life to eternal reward. Cole painted and exhibited a replica of the series in Rome, where he returned in 1841–42, traveling south to Sicily. After his return, he lived and worked chiefly in Catskill, keeping up with art activity in New York primarily through Durand. 
He continued to produce American and foreign landscape subjects of incredible beauty, including the [url href=https://www.wikiart.org/en/thomas-cole/the-mountain-ford-1846][i]Mountain Ford[/i][/url] (1846). \n\nIn 1844, Cole welcomed into his Catskill studio the young [url href=https://www.wikiart.org/en/frederic-edwin-church]Frederic Church[/url], who studied with him until 1846 and went on to become the most renowned exponent of the generation that followed Cole. By 1846, Cole was at work on his largest and most ambitious series, [url href=https://www.wikiart.org/en/thomas-cole/all-works#!#filterName:Series_the-cross-and-the-world,resultType:masonry][i]The Cross and the World[/i][/url], but in February 1848 contracted pleurisy and died before completing it. \n\nThe paintings of Thomas Cole, like the writings of his contemporary Ralph Waldo Emerson, stand as monuments to the dreams and anxieties of the fledgling American nation during the mid-19th century; and they are also euphoric celebrations of its natural landscapes. Cole is considered the first artist to bring the eye of a European [url href=https://www.wikiart.org/en/artists-by-art-movement/romanticism]Romantic[/url] landscape painter to those environments, but also a figure whose idealism and religious sensibilities expressed a uniquely American spirit. In his works, we find the dramatic splendor of [url href=https://www.wikiart.org/en/caspar-david-friedrich]Caspar David Freidrich[/url] or [url href=https://www.wikiart.org/en/william-turner]J.M.W Turner[/url] transposed onto the Catskill and Adirondack Mountains. 
But whereas younger American painters such as [url href=https://www.wikiart.org/en/albert-bierstadt]Albert Bierstadt[/url] had come into direct contact with [url href=https://www.wikiart.org/en/artists-by-art-institution/kunstakademie-dusseldorf-dusseldorf-germany#!#resultType:masonry]The Düsseldorf School of painting[/url], and thus with the tradition in which they placed themselves, Cole was largely self-tutored, representing something of the archetypal American figure of the auto-didact.\n\nIn many ways, Cole's art epitomizes all contradictions of European settler culture in America. He was in love with the sublime wildness of the American landscape and sought to preserve it with his art, but his very presence in that landscape, and the development of his career, depended on the processes of urbanization and civilization which threatened it. From a modern perspective, Cole's Eurocentric gaze on seemingly empty wildernesses which had, in fact, been populated for centuries, also seems troubling; where Native Americans do appear in his work, as in [url href=https://www.wikiart.org/en/thomas-cole/falls-of-the-kaaterskill-1826][i]Falls of the Kaaterskill[/i][/url] (1826), it is as picturesque flecks rather than characterized participants in the scene.\n\nCole's legacy is evident in the work of future American artists who advanced the Hudson River style, including his student Frederic Edwin Church, Albert Bierstadt, Jasper Cropsey, Asher B. Durand, [url href=https://www.wikiart.org/en/george-inness]George Inness[/url], [url href=https://www.wikiart.org/en/john-frederick-kensett]John Kensett[/url], and [url href=https://www.wikiart.org/en/thomas-moran]Thomas Moran[/url]. 
Speaking more broadly, a whole sweep of 20th-century North-American art, from [url href=https://www.wikiart.org/en/artists-by-art-movement/precisionism]Precisionism[/url] to [url href=https://www.wikiart.org/en/artists-by-art-movement/environmental-art]Land Art[/url], might be seen to have inherited something of the grand scale and ambition of Cole's work. In this sense, his paintings capture not only the character of American culture during the mid-19th century but perhaps something more enduring about the open and expansive quality of that culture.", + "birthDay" : "/Date(-5330448000000)/", + "birthDayAsString" : "February 1, 1801", + "contentId" : 254330, + "deathDay" : "/Date(-3846441600000)/", + "deathDayAsString" : "February 11, 1848", + "dictonaries" : [ + 1368, + 11415, + 310, + ], + "gender" : "male", + "image" : "https://uploads8.wikiart.org/temp/19f6a140-59d2-4959-8d11-fd4ca582b7f2.jpg!Portrait.jpg", + "lastNameFirst" : "Cole Thomas", + "periodsOfWork" : "", + "relatedArtistsIds" : [], + "series" : "The Cross and the World\r\nThe Course of Empire\r\nThe Voyage of Life", + "story" : "http://en.wikipedia.org/wiki/Thomas_Cole", + "themes" : "", + "url" : "thomas-cole", + "wikipediaUrl" : "http://en.wikipedia.org/wiki/Thomas_Cole" + }, + "artistName" : "Thomas Cole", + "artistUrl" : "/en/thomas-cole", + "extension" : str, + "filename" : str, + "flags" : int, + "height" : int, + "id" : r"re:[0-9a-f]+", + "image" : str, + "map" : str, + "paintingUrl": r"re:/en/thomas-cole/.+", + "title" : str, + "width" : int, + "year" : str, }, { @@ -30,6 +72,9 @@ __tests__ = ( "#category": ("", "wikiart", "image"), "#class" : wikiart.WikiartImageExtractor, "#sha1_url": "d7f60118c34067b2b37d9577e412dc1477b94207", + "#urls" : ( + "https://uploads5.wikiart.org/images/huang-shen/summer.jpg", + ), }, { @@ -37,6 +82,19 @@ __tests__ = ( "#category": ("", "wikiart", "artworks"), "#class" : wikiart.WikiartArtworksExtractor, "#sha1_url": "36e054fcb3363b7f085c81f4778e6db3994e56a3", + 
"#urls" : ( + "https://uploads4.wikiart.org/images/hieronymus-bosch/triptych-of-last-judgement.jpg", + "https://uploads6.wikiart.org/images/hieronymus-bosch/triptych-of-last-judgement-1.jpg", + "https://uploads0.wikiart.org/images/hieronymus-bosch/tiptych-of-temptation-of-st-anthony-1506.jpg", + "https://uploads7.wikiart.org/images/matthias-grünewald/st-elizabeth-and-a-saint-woman-with-palm-1511.jpg", + "https://uploads2.wikiart.org/images/matthias-grünewald/st-lawrence-and-st-cyricus-1511.jpg", + "https://uploads0.wikiart.org/images/pieter-bruegel-the-elder/the-death-of-the-virgin.jpg", + "https://uploads4.wikiart.org/images/pieter-bruegel-the-elder/christ-and-the-woman-taken-in-adultery-1565-1.jpg", + "https://uploads6.wikiart.org/images/giovanni-battista-tiepolo/not_detected_241014.jpg", + "https://uploads4.wikiart.org/images/edgar-degas/interior-the-rape-1869.jpg", + "https://uploads3.wikiart.org/00265/images/john-singer-sargent/1396294310-dame-alice-ellen-terry-by-john-singer-sargent.jpg", + "https://uploads0.wikiart.org/00293/images/hryhorii-havrylenko/1954-18-5-32-5.jpg", + ), }, { diff --git a/test/results/wikifeet.py b/test/results/wikifeet.py index dade42e7..56e391c5 100644 --- a/test/results/wikifeet.py +++ b/test/results/wikifeet.py @@ -23,7 +23,7 @@ __tests__ = ( "pid" : int, "width" : int, "height" : int, - "shoesize" : "9 US", + "shoesize" : "10 US", "type" : "women", "tags" : list, }, From 1137d72d4852fde5067f53f4fc364cce7f2d8355 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 27 Nov 2023 18:36:15 +0100 Subject: [PATCH 167/344] [tests] skip test_init for BaseExtractor classes without instances --- test/test_extractor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_extractor.py b/test/test_extractor.py index 29ccf97f..d2dd643c 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -142,6 +142,8 @@ class TestExtractorModule(unittest.TestCase): if cls.category == "ytdl": 
continue extr = cls.from_url(cls.example) + if not extr and cls.basecategory and not cls.instances: + continue extr.initialize() extr.finalize() From 4dde36889ca24a1ae39bf99eb865968254cf533c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 27 Nov 2023 21:49:20 +0100 Subject: [PATCH 168/344] release version 1.26.3 --- CHANGELOG.md | 59 +++++++++++++++++++++++++++++++++++++++++++ README.rst | 4 +-- gallery_dl/version.py | 2 +- 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad34930f..39f58849 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,64 @@ # Changelog +## 1.26.3 - 2023-11-27 +### Extractors +#### Additions +- [behance] support `text` modules ([#4799](https://github.com/mikf/gallery-dl/issues/4799)) +- [behance] add `modules` option ([#4799](https://github.com/mikf/gallery-dl/issues/4799)) +- [blogger] support `www.micmicidol.club` ([#4759](https://github.com/mikf/gallery-dl/issues/4759)) +- [erome] add `count` metadata ([#4812](https://github.com/mikf/gallery-dl/issues/4812)) +- [exhentai] add `gp` option ([#4576](https://github.com/mikf/gallery-dl/issues/4576)) +- [fapello] support `.su` TLD ([#4840](https://github.com/mikf/gallery-dl/issues/4840), [#4841](https://github.com/mikf/gallery-dl/issues/4841)) +- [pixeldrain] add `file` and `album` extractors ([#4839](https://github.com/mikf/gallery-dl/issues/4839)) +- [pixeldrain] add `api-key` option ([#4839](https://github.com/mikf/gallery-dl/issues/4839)) +- [tmohentai] add `gallery` extractor ([#4808](https://github.com/mikf/gallery-dl/issues/4808), [#4832](https://github.com/mikf/gallery-dl/issues/4832)) +#### Fixes +- [cyberdrop] update to site layout changes +- [exhentai] handle `Downloading … requires GP` errors ([#4576](https://github.com/mikf/gallery-dl/issues/4576), [#4763](https://github.com/mikf/gallery-dl/issues/4763)) +- [exhentai] fix empty API URL with `"source": "hitomi"` 
([#4829](https://github.com/mikf/gallery-dl/issues/4829)) +- [hentaifoundry] check for and update expired sessions ([#4694](https://github.com/mikf/gallery-dl/issues/4694)) +- [hiperdex] fix `manga` metadata +- [idolcomplex] update to site layout changes +- [imagefap] fix resolution of single images +- [instagram] fix exception on empty `video_versions` ([#4795](https://github.com/mikf/gallery-dl/issues/4795)) +- [mangaread] fix extraction +- [mastodon] fix reblogs ([#4580](https://github.com/mikf/gallery-dl/issues/4580)) +- [nitter] fix video extraction ([#4853](https://github.com/mikf/gallery-dl/issues/4853), [#4855](https://github.com/mikf/gallery-dl/issues/4855)) +- [pornhub] fix `user` metadata for gifs +- [tumblr] fix `day` extractor +- [wallpapercave] fix extraction +- [warosu] fix file URLs +- [webtoons] fix pagination when receiving an HTTP redirect +- [xvideos] fix metadata extraction +- [zerochan] fix metadata extraction +#### Improvements +- [hentaicosplays] force `https://` for download URLs +- [oauth] warn when cache is enabled but not writeable ([#4771](https://github.com/mikf/gallery-dl/issues/4771)) +- [sankaku] update URL patterns +- [twitter] ignore promoted Tweets ([#3894](https://github.com/mikf/gallery-dl/issues/3894), [#4790](https://github.com/mikf/gallery-dl/issues/4790)) +- [weibo] detect redirects to login page ([#4773](https://github.com/mikf/gallery-dl/issues/4773)) +#### Removals +- [foolslide] remove `powermanga.org` +### Downloaders +#### Changes +- [http] treat files not passing `filesize-min`/`-max` as skipped ([#4821](https://github.com/mikf/gallery-dl/issues/4821)) +### Options +#### Additions +- add `metadata-extractor` option ([#4549](https://github.com/mikf/gallery-dl/issues/4549)) +- support `metadata-*` names for `*-metadata` options + (for example `url-metadata` is now also recognized as `metadata-url`) +### CLI +#### Additions +- implement `-I/--input-file-comment` and `-x/--input-file-delete` options 
([#4732](https://github.com/mikf/gallery-dl/issues/4732)) +- add `--ugoira` as a general version of `--ugoira-conv` and co. +- add `--mtime` as a general version of `--mtime-from-date` +- add `--cbz` +#### Fixes +- allow `--mtime-from-date` to work with Weibo's metadata structure +### Miscellaneous +#### Additions +- add a simple Dockerfile ([#4831](https://github.com/mikf/gallery-dl/issues/4831)) + ## 1.26.2 - 2023-11-04 ### Extractors #### Additions diff --git a/README.rst b/README.rst index 9c1b3388..776ba434 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.2/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.3/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.2/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.3/gallery-dl.bin>`__ Nightly Builds diff --git a/gallery_dl/version.py b/gallery_dl/version.py index dd816f06..5034fb23 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.26.3-dev" +__version__ = "1.26.3" From 43ca49c1b4ce6fd7deecb8af274f6678adf43a64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 2 Dec 2023 13:56:21 +0100 Subject: [PATCH 169/344] [github] add workflow to build and push docker images heavily inspired by and adapted from https://github.com/danbooru/danbooru/blob/master/.github/workflows/docker-build.yaml --- .github/workflows/docker.yml | 61 +++++++++++++++++++++++++++++++ .github/workflows/executables.yml | 2 +- .github/workflows/tests.yml | 2 +- gallery_dl/version.py | 2 +- scripts/release.sh | 2 +- 5 files changed, 65 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/docker.yml diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 00000000..57c1d95b --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,61 @@ +name: docker + +on: + push: + branches: + - master + tags: + - v[0-9]+.[0-9]+.[0-9]+ + +permissions: + packages: write + +jobs: + docker: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + # https://github.com/docker/setup-buildx-action + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + # https://github.com/docker/login-action + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ github.token }} + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + # https://github.com/docker/metadata-action + - name: Generate Docker tags + uses: docker/metadata-action@v5 + id: metadata + with: + images: | + mikf123/gallery-dl + ghcr.io/mikf/gallery-dl + tags: | + type=sha,format=long,prefix= + type=ref,event=tag + # https://github.com/docker/metadata-action/issues/112 + flavor: | + latest=${{ github.ref == 'refs/heads/master' }} + + 
# https://github.com/docker/build-push-action + - name: Build image + uses: docker/build-push-action@v5 + with: + push: true + tags: ${{ steps.metadata.outputs.tags }} + labels: ${{ steps.metadata.outputs.labels }} + platforms: linux/amd64 diff --git a/.github/workflows/executables.yml b/.github/workflows/executables.yml index 03251c86..f5b45672 100644 --- a/.github/workflows/executables.yml +++ b/.github/workflows/executables.yml @@ -24,7 +24,7 @@ jobs: python-packages: "toml" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} ${{ matrix.architecture }} uses: actions/setup-python@v4 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2c6dfd93..67520018 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -18,7 +18,7 @@ jobs: python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check file permissions run: | diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 5034fb23..fdf1e3ca 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.26.3" +__version__ = "1.26.4-dev" diff --git a/scripts/release.sh b/scripts/release.sh index f32c796d..8b84b980 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -161,6 +161,6 @@ build-python build-linux build-windows sign -upload-git upload-pypi +upload-git update-dev From ad0134daf7dad84b2075f5173c9d6988c4751d53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 2 Dec 2023 17:57:24 +0100 Subject: [PATCH 170/344] add Python 3.12 to classifiers list --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index c91549a0..ee66f5f5 100644 --- a/setup.py +++ b/setup.py @@ -135,6 +135,7 @@ def build_setuptools(): "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "Topic :: Internet :: WWW/HTTP", From da0da0faaab4a787173f122bf4a5b00ca7b61465 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 4 Dec 2023 22:49:28 +0100 Subject: [PATCH 171/344] [exhentai] store more cookies when logging in (#4881) include 'igneous', 'hath_perks', etc and not just 'ipb_member_id' and 'ipb_pass_hash' like before --- gallery_dl/extractor/exhentai.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 5dc498f1..4facd75c 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -85,6 +85,7 @@ class ExhentaiExtractor(Extractor): @cache(maxage=90*24*3600, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) + url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01" headers = { "Referer": 
"https://e-hentai.org/bounce_login.php?b=d&bt=1-1", @@ -98,10 +99,19 @@ class ExhentaiExtractor(Extractor): "ipb_login_submit": "Login!", } + self.cookies.clear() + response = self.request(url, method="POST", headers=headers, data=data) if b"You are now logged in as:" not in response.content: raise exception.AuthenticationError() - return {c: response.cookies[c] for c in self.cookies_names} + + # collect more cookies + url = self.root + "/favorites.php" + response = self.request(url) + if response.history: + self.request(url) + + return self.cookies class ExhentaiGalleryExtractor(ExhentaiExtractor): From 1770c31e633d24bb4a6da23f599d900f70d77f80 Mon Sep 17 00:00:00 2001 From: jsouthgb <justin.southern@groundblock.dev> Date: Tue, 5 Dec 2023 07:07:06 -0500 Subject: [PATCH 172/344] [urlgalleries] add support --- docs/supportedsites.md | 6 ++++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/urlgalleries.py | 43 ++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+) create mode 100644 gallery_dl/extractor/urlgalleries.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8f54b157..003dcaa9 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -889,6 +889,12 @@ Consider all sites to be NSFW unless otherwise known. 
<td>Files</td> <td></td> </tr> +<tr> + <td>Urlgalleries</td> + <td>https://urlgalleries.net/</td> + <td>Galleries</td> + <td></td> +</tr> <tr> <td>Vipergirls</td> <td>https://vipergirls.to/</td> diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 72239d5c..d074de22 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -155,6 +155,7 @@ modules = [ "tumblrgallery", "twibooru", "twitter", + "urlgalleries", "unsplash", "uploadir", "urlshortener", diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py new file mode 100644 index 00000000..ae2b7205 --- /dev/null +++ b/gallery_dl/extractor/urlgalleries.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://urlgalleries.net/""" + +from .common import GalleryExtractor +from .. 
import text + + +class UrlgalleriesExtractor(GalleryExtractor): + """Base class for Urlgalleries extractors""" + category = "urlgalleries" + root = "urlgalleries.net" + directory_fmt = ("{category}", "{title}") + pattern = r"(?:https?://)([^/?#]+)?\.urlgalleries\.net/([^/?#]+)/([^/?#]+)" + example = "https://blog.urlgalleries.net/gallery-1234567/a-title--1234" + + def __init__(self, match): + self.blog = match.group(1) + self.gallery_id = match.group(2) + self.title = match.group(3) + url = "{}.urlgalleries.net/{}/{}&a=10000".format( + self.blog, self.gallery_id, self.title) + GalleryExtractor.__init__(self, match, text.ensure_http_scheme(url)) + + def images(self, page): + extr = text.extr(page, 'id="wtf"', "</div>") + url = "{}{{}}".format(self.root).format + return [ + (text.ensure_http_scheme(url(i)), None) + for i in text.extract_iter(extr, "href='", "'") + ] + + def metadata(self, page): + date = text.extr( + page, "float:left;'> ", '</div>').split(" | ")[-1] + return { + 'title': self.title, + 'date': text.parse_datetime(date, format='%B %d, %Y T%H:%M') + } From ecaa0feb5d9fc39d4b26aefa211d250e817f90fd Mon Sep 17 00:00:00 2001 From: jsouthgb <justin.southern@groundblock.dev> Date: Tue, 5 Dec 2023 07:08:11 -0500 Subject: [PATCH 173/344] [urlgalleries] add support --- gallery_dl/extractor/urlgalleries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py index ae2b7205..aa6e7db5 100644 --- a/gallery_dl/extractor/urlgalleries.py +++ b/gallery_dl/extractor/urlgalleries.py @@ -10,7 +10,7 @@ from .common import GalleryExtractor from .. 
import text -class UrlgalleriesExtractor(GalleryExtractor): +class UrlgalleriesGalleryExtractor(GalleryExtractor): """Base class for Urlgalleries extractors""" category = "urlgalleries" root = "urlgalleries.net" From cf5702c84333fc375a7f984dccb9deb1d29f1903 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 5 Dec 2023 15:13:58 +0100 Subject: [PATCH 174/344] [twitter] generalize "Login Required" error (#4734, #4324) --- gallery_dl/extractor/twitter.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index ca1e9067..56877ede 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1289,11 +1289,9 @@ class TwitterAPI(): self.extractor.wait(until=until, seconds=seconds) continue - if response.status_code == 403 and \ - not self.headers["x-twitter-auth-type"] and \ - endpoint == "/2/search/adaptive.json": - raise exception.AuthorizationError( - "Login required to access search results") + if response.status_code in (403, 404) and \ + not self.headers["x-twitter-auth-type"]: + raise exception.AuthorizationError("Login required") # error try: From a4e6ea667bf2860c72b90afb36ba4750cff98600 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 5 Dec 2023 15:57:26 +0100 Subject: [PATCH 175/344] [twitter] retry API calls when their response contains errors (#4811) --- gallery_dl/extractor/twitter.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 56877ede..92c4ce4a 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1276,8 +1276,18 @@ class TwitterAPI(): self.headers["x-csrf-token"] = csrf_token if response.status_code < 400: - # success - return response.json() + data = response.json() + if not data.get("errors"): + return data 
# success + + msg = data["errors"][0].get("message") or "Unspecified" + self.extractor.log.debug("internal error: '%s'", msg) + + if self.headers["x-twitter-auth-type"]: + continue # retry + + # fall through to "Login Required" + response.status_code = 404 if response.status_code == 429: # rate limit exceeded From 4eb3590103bc3a19db7f7cf5b79593f6d2645bb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 5 Dec 2023 17:48:50 +0100 Subject: [PATCH 176/344] [nijie] fix image URLs of multi-image posts (#4876) --- gallery_dl/extractor/nijie.py | 34 ++++++++++---------- test/results/horne.py | 36 +++++++++++++++++++++- test/results/nijie.py | 58 ++++++++++++++++++++++++++++++++++- 3 files changed, 108 insertions(+), 20 deletions(-) diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 76c5404e..54f29429 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -57,7 +57,11 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): data["user_name"] = data["artist_name"] yield Message.Directory, data - for image in self._extract_images(page): + for num, url in enumerate(self._extract_images(image_id, page)): + image = text.nameext_from_url(url, { + "num": num, + "url": "https:" + url, + }) image.update(data) if not image["extension"]: image["extension"] = "jpg" @@ -72,7 +76,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): extr = text.extract_from(page) keywords = text.unescape(extr( 'name="keywords" content="', '" />')).split(",") - data = { + return { "title" : keywords[0].strip(), "description": text.unescape(extr( '"description": "', '"').replace("&", "&")), @@ -82,7 +86,6 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): "artist_name": keywords[1], "tags" : keywords[2:-1], } - return data @staticmethod def _extract_data_horne(page): @@ -90,7 +93,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): extr = text.extract_from(page) keywords = 
text.unescape(extr( 'name="keywords" content="', '" />')).split(",") - data = { + return { "title" : keywords[0].strip(), "description": text.unescape(extr( 'property="og:description" content="', '"')), @@ -101,21 +104,16 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): "itemprop='datePublished' content=", "<").rpartition(">")[2], "%Y-%m-%d %H:%M:%S", 9), } - return data - @staticmethod - def _extract_images(page): - """Extract image URLs from 'page'""" - images = text.extract_iter(page, "/view_popup.php", "</a>") - for num, image in enumerate(images): - src = text.extr(image, 'src="', '"') - if not src: - continue - url = ("https:" + src).replace("/__rs_l120x120/", "/") - yield text.nameext_from_url(url, { - "num": num, - "url": url, - }) + def _extract_images(self, image_id, page): + if '&#diff_1" ' in page: + # multiple images + url = "{}/view_popup.php?id={}".format(self.root, image_id) + page = self.request(url).text + yield from text.extract_iter( + page, 'href="javascript:void(0);"><img src="', '"') + else: + yield text.extr(page, 'itemprop="image" src="', '"') @staticmethod def _extract_user_name(page): diff --git a/test/results/horne.py b/test/results/horne.py index 8cbce012..9058a481 100644 --- a/test/results/horne.py +++ b/test/results/horne.py @@ -13,6 +13,10 @@ __tests__ = ( "#url" : "https://horne.red/members.php?id=58000", "#category": ("Nijie", "horne", "user"), "#class" : nijie.NijieUserExtractor, + "#urls" : ( + "https://horne.red/members_illust.php?id=58000", + "https://horne.red/members_dojin.php?id=58000", + ), }, { @@ -71,11 +75,41 @@ __tests__ = ( "#class" : nijie.NijieFollowedExtractor, }, +{ + "#url" : "https://horne.red/view.php?id=8708", + "#category": ("Nijie", "horne", "image"), + "#class" : nijie.NijieImageExtractor, + "#urls" : "https://pic.nijie.net/07/horne/18/00/58000/illust/0_0_c8f715a8f3d53943_db6231.png", + + "artist_id" : 58000, + "artist_name": "のえるわ", + "date" : "dt:2018-01-29 14:25:39", + "description": 
"前回とシチュがまるかぶり \r\n竿野郎は塗るのだるかった", + "extension" : "png", + "filename" : "0_0_c8f715a8f3d53943_db6231", + "image_id" : 8708, + "num" : 0, + "tags" : [ + "男の娘", + "オリキャラ", + "うちのこ", + ], + "title" : "うちのこえっち", + "url" : "https://pic.nijie.net/07/horne/18/00/58000/illust/0_0_c8f715a8f3d53943_db6231.png", + "user_id" : 58000, + "user_name" : "のえるわ", +}, + { "#url" : "https://horne.red/view.php?id=8716", "#category": ("Nijie", "horne", "image"), "#class" : nijie.NijieImageExtractor, - "#count" : 4, + "#urls" : ( + "https://pic.nijie.net/07/horne/18/00/58000/illust/0_0_b4ffb4b6f7ec6d51_1a32c0.png", + "https://pic.nijie.net/07/horne/18/00/58000/illust/8716_0_2690972a4f6270bb_85ed8f.png", + "https://pic.nijie.net/07/horne/18/00/58000/illust/8716_1_09348508d8b76f36_f6cf47.png", + "https://pic.nijie.net/08/horne/18/00/58000/illust/8716_2_5151d956d3789277_d76e75.png", + ), "artist_id" : 58000, "artist_name": "のえるわ", diff --git a/test/results/nijie.py b/test/results/nijie.py index 2b5785af..01ac8fac 100644 --- a/test/results/nijie.py +++ b/test/results/nijie.py @@ -14,13 +14,20 @@ __tests__ = ( "#url" : "https://nijie.info/members.php?id=44", "#category": ("Nijie", "nijie", "user"), "#class" : nijie.NijieUserExtractor, + "#urls" : ( + "https://nijie.info/members_illust.php?id=44", + "https://nijie.info/members_dojin.php?id=44", + ), }, { "#url" : "https://nijie.info/members_illust.php?id=44", "#category": ("Nijie", "nijie", "illustration"), "#class" : nijie.NijieIllustrationExtractor, - "#sha1_url": "1553e5144df50a676f5947d02469299b401ad6c0", + "#urls": ( + "https://pic.nijie.net/04/nijie/14/44/44/illust/0_0_f46c08462568c2f1_be95d7.jpg", + "https://pic.nijie.net/06/nijie/14/44/44/illust/0_0_28e8c02d921bee33_9222d3.jpg", + ), "artist_id" : 44, "artist_name": "ED", @@ -93,9 +100,58 @@ __tests__ = ( "#url" : "https://nijie.info/view.php?id=70720", "#category": ("Nijie", "nijie", "image"), "#class" : nijie.NijieImageExtractor, + "#urls" : 
"https://pic.nijie.net/06/nijie/14/44/44/illust/0_0_28e8c02d921bee33_9222d3.jpg", "#sha1_url" : "3d654e890212ba823c9647754767336aebc0a743", "#sha1_metadata": "41da5d0e178b04f01fe72460185df52fadc3c91b", "#sha1_content" : "d85e3ea896ed5e4da0bca2390ad310a4df716ca6", + + "artist_id" : 44, + "artist_name": "ED", + "date" : "dt:2014-01-18 19:58:21", + "description": "租絵にてお邪魔いたし候\r\n是非ともこの”おっぱい”をご高覧賜りたく馳せ参じた次第\r\n長文にて失礼仕る\r\n\r\nまず全景でありますが、首を右に傾けてみて頂きたい\r\nこの絵図は茶碗を眺めていた私が思わぬ美しさにて昇天したときのものを、筆をとり、したためたものである(トレースではない)\r\n筆は疾風の如く走り、半刻過ぎには私好みの”おっぱい”になっていたのである!\r\n次に細部をみて頂きたい\r\n絵図を正面から見直して頂くと、なんとはんなりと美しいお椀型をしたおっぱいであろうか  右手から緩やかに生まれる曲線は左手に進むにつれて、穏やかな歪みを含み流れる  これは所謂轆轤目であるが三重の紐でおっぱいをぐるぐると巻きつけた情景そのままであり、この歪みから茶碗の均整は崩れ、たぷんたぷんのおっぱいの重量感を醸し出している!\r\nさらに左手に進めば梅花皮(カイラギ)を孕んだ高大が現れる 今回は点線にて表現するが、その姿は乳首から母乳が噴出するが如く 或は精子をぶっかけられたが如く 白くとろっとした釉薬の凝固が素晴しい景色をつくりだしているのである!\r\n最後には極めつけ、すくっと螺旋を帯びながらそそり立つ兜巾(ときん)!この情景はまさしく乳首である!  全体をふんわりと盛り上げさせる乳輪にちょこっと存在する乳頭はぺろりと舌で確かめ勃起させたくなる風情がある!\r\n\r\nこれを”おっぱい”と呼ばずなんと呼ぼうや!?\r\n\r\n興奮のあまり失礼致した\r\n御免", + "extension" : "jpg", + "filename" : "0_0_28e8c02d921bee33_9222d3", + "image_id" : 70720, + "num" : 0, + "tags" : ["おっぱい"], + "title" : "俺好高麗井戸茶碗 銘おっぱい", + "url" : "https://pic.nijie.net/06/nijie/14/44/44/illust/0_0_28e8c02d921bee33_9222d3.jpg", + "user_id" : 44, + "user_name" : "ED", +}, + +{ + "#url" : "https://nijie.info/view.php?id=594044", + "#category": ("Nijie", "nijie", "image"), + "#class" : nijie.NijieImageExtractor, + "#urls": ( + "https://pic.nijie.net/02/nijie/23m12/09/49509/illust/0_0_63568cc428259d50_45ca51.jpg", + "https://pic.nijie.net/01/nijie/23m12/09/49509/illust/594044_0_1c94b7cc4503589f_79c66c.jpg", + "https://pic.nijie.net/02/nijie/23m12/09/49509/illust/594044_1_9f4737ad48bf43c7_8f1e8e.jpg", + "https://pic.nijie.net/01/nijie/23m12/09/49509/illust/594044_2_a162861fac970a45_38c5f8.jpg", + ), + + "artist_id" : 49509, + "artist_name": "黒川 竜", + "date" : "dt:2023-12-02 04:19:29", + "description": "【DLサイトコム】ウィンターセール 
30%OFF\r\n期間:2024年2月14日まで\r\n【toloveるドリンク】\r\nhttps://www.dlsite.com/maniax/work/=/product_id/RJ042727.html\r\n【toloveるドリンク2】\r\nhttps://www.dlsite.com/maniax/work/=/product_id/RJ043289.html\r\n【クランクランBIG】\r\nhttps://www.dlsite.com/maniax/work/=/product_id/RJ043564.html", + "image_id" : 594044, + "num" : range(0, 3), + "tags" : [ + "オリジナル", + "漫画", + "中出し", + "爆乳", + "巨乳", + "ToLOVEる", + "宣伝", + "クラン・クラン", + "マクロスF", + ], + "title" : "【DLサイトコム】ウィンターセール", + "url" : str, + "user_id" : 49509, + "user_name" : "黒川 竜", }, { From 99b76628f7ba66fd19a11c454e23572164b55315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 5 Dec 2023 20:49:51 +0100 Subject: [PATCH 177/344] implement '-e/--error-file' command-line option (#4732) copying per-URL options from regular, read-only input files does currently not work --- docs/options.md | 4 ++- gallery_dl/__init__.py | 68 ++++++++++++++++++++++++++++++++---------- gallery_dl/option.py | 5 ++++ 3 files changed, 61 insertions(+), 16 deletions(-) diff --git a/docs/options.md b/docs/options.md index 6d22062b..45ce7eca 100644 --- a/docs/options.md +++ b/docs/options.md @@ -39,6 +39,7 @@ -E, --extractor-info Print extractor defaults and settings -K, --list-keywords Print a list of available keywords and example values for the given URLs + -e, --error-file FILE Add input URLs which returned an error to FILE --list-modules Print a list of available extractor modules --list-extractors Print a list of extractor classes with description, (sub)category and example URL @@ -51,7 +52,8 @@ ## Downloader Options: -r, --limit-rate RATE Maximum download rate (e.g. 500k or 2.5M) -R, --retries N Maximum number of retries for failed HTTP - requests or -1 for infinite retries (default: 4) + requests or -1 for infinite retries (default: + 4) --http-timeout SECONDS Timeout for HTTP connections (default: 30.0) --sleep SECONDS Number of seconds to wait before each download. 
This can be either a constant value or a range diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 287faf18..0f9d1cae 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -249,6 +249,9 @@ def main(): input_log.error(exc) return getattr(exc, "code", 128) + if args.error_file: + input_manager.error_file(args.error_file) + pformat = config.get(("output",), "progress", True) if pformat and len(input_manager.urls) > 1 and \ args.loglevel < logging.ERROR: @@ -270,6 +273,7 @@ def main(): if status: retval |= status + input_manager.error() else: input_manager.success() @@ -281,6 +285,7 @@ def main(): except exception.NoExtractorError: log.error("Unsupported URL '%s'", url) retval |= 64 + input_manager.error() input_manager.next() return retval @@ -301,9 +306,12 @@ class InputManager(): def __init__(self): self.urls = [] self.files = () + + self._url = "" + self._item = None self._index = 0 - self._current = None self._pformat = None + self._error_fp = None def add_url(self, url): self.urls.append(url) @@ -428,6 +436,15 @@ class InputManager(): else: append(url) + def error_file(self, path): + try: + path = util.expand_path(path) + self._error_fp = open(path, "a", encoding="utf-8") + except Exception as exc: + self.log.warning( + "Unable to open error file (%s: %s)", + exc.__class__.__name__, exc) + def progress(self, pformat=True): if pformat is True: pformat = "[{current}/{total}] {url}\n" @@ -439,17 +456,37 @@ class InputManager(): self._index += 1 def success(self): - if self._current: - url, path, action, indicies = self._current - lines = self.files[path] - action(lines, indicies) + if self._item: + self._rewrite() + + def error(self): + if self._error_fp: + if self._item: + url, path, action, indicies = self._item + lines = self.files[path] + out = "".join(lines[i] for i in indicies) + self._rewrite() + else: + out = str(self._url) + "\n" + try: - with open(path, "w", encoding="utf-8") as fp: - fp.writelines(lines) + 
self._error_fp.write(out) except Exception as exc: self.log.warning( "Unable to update '%s' (%s: %s)", - path, exc.__class__.__name__, exc) + self._error_fp.name, exc.__class__.__name__, exc) + + def _rewrite(self): + url, path, action, indicies = self._item + lines = self.files[path] + action(lines, indicies) + try: + with open(path, "w", encoding="utf-8") as fp: + fp.writelines(lines) + except Exception as exc: + self.log.warning( + "Unable to update '%s' (%s: %s)", + path, exc.__class__.__name__, exc) @staticmethod def _action_comment(lines, indicies): @@ -467,23 +504,24 @@ class InputManager(): def __next__(self): try: - item = self.urls[self._index] + url = self.urls[self._index] except IndexError: raise StopIteration - if isinstance(item, tuple): - self._current = item - item = item[0] + if isinstance(url, tuple): + self._item = url + url = url[0] else: - self._current = None + self._item = None + self._url = url if self._pformat: output.stderr_write(self._pformat({ "total" : len(self.urls), "current": self._index + 1, - "url" : item, + "url" : url, })) - return item + return url class ExtendedUrl(): diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 255d9f29..5966f253 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -286,6 +286,11 @@ def build_parser(): help=("Print a list of available keywords and example values " "for the given URLs"), ) + output.add_argument( + "-e", "--error-file", + dest="error_file", metavar="FILE", + help="Add input URLs which returned an error to FILE", + ) output.add_argument( "--list-modules", dest="list_modules", action="store_true", From d1ea60c057a7d64c916bf874e15f20eae8a013f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 5 Dec 2023 22:55:07 +0100 Subject: [PATCH 178/344] [github] fix docker workflow (#4831) - run only on tagged commits and not on every commit to master - use the correct github token --- .github/workflows/docker.yml | 4 +--- 1 file 
changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 57c1d95b..b0be8efa 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -2,8 +2,6 @@ name: docker on: push: - branches: - - master tags: - v[0-9]+.[0-9]+.[0-9]+ @@ -28,7 +26,7 @@ jobs: with: registry: ghcr.io username: ${{ github.repository_owner }} - password: ${{ github.token }} + password: ${{ secrets.github_token }} - name: Login to DockerHub uses: docker/login-action@v3 From db978b34f19fc215ad6156c93ed86d6dc1fbd0a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 6 Dec 2023 12:28:52 +0100 Subject: [PATCH 179/344] [docker] use PAK for GHCR login (#4831) --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index b0be8efa..6d69e886 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -26,7 +26,7 @@ jobs: with: registry: ghcr.io username: ${{ github.repository_owner }} - password: ${{ secrets.github_token }} + password: ${{ secrets.GHCR_TOKEN }} - name: Login to DockerHub uses: docker/login-action@v3 From 9dd5cb8c8a1b1a49403b88b17a7b03fc1f027bfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 6 Dec 2023 21:31:31 +0100 Subject: [PATCH 180/344] interactively prompt for passwords on login when none is provided --- docs/configuration.rst | 6 +++++- gallery_dl/extractor/common.py | 2 +- gallery_dl/util.py | 8 ++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index eb263c17..546ab189 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -379,7 +379,7 @@ Description The username and password to use when attempting to log in to another site. 
- Specifying a username and password is required for + Specifying username and password is required for * ``nijie`` @@ -415,6 +415,10 @@ Description (*) The password value for these sites should be the API key found in your user profile, not the actual account password. + Note: Leave the ``password`` value empty or undefined + to get prompted for a passeword when performing a login + (see `getpass() <https://docs.python.org/3/library/getpass.html#getpass.getpass>`__). + extractor.*.netrc ----------------- diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index f3784272..bf4de4d4 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -233,7 +233,7 @@ class Extractor(): password = None if username: - password = self.config("password") + password = self.config("password") or util.LazyPrompt() elif self.config("netrc", False): try: info = netrc.netrc().authenticators(self.category) diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 62aa12da..2fbba3ce 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -14,6 +14,7 @@ import sys import json import time import random +import getpass import hashlib import sqlite3 import binascii @@ -487,6 +488,13 @@ CODES = { } +class LazyPrompt(): + __slots__ = () + + def __str__(self): + return getpass.getpass() + + class CustomNone(): """None-style type that supports more operations than regular None""" __slots__ = () From 6a4218aa238d1074904dd39ec553a4df5127ebed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 6 Dec 2023 21:33:40 +0100 Subject: [PATCH 181/344] handle 'json' parameter in Extractor.request() manually Mainly to allow passing custom classes like util.LazyPrompt, but also to simplify and streamline how requests handles it. 
--- gallery_dl/extractor/common.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index bf4de4d4..9b010c59 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -136,6 +136,18 @@ class Extractor(): kwargs["timeout"] = self._timeout if "verify" not in kwargs: kwargs["verify"] = self._verify + + if "json" in kwargs: + json = kwargs["json"] + if json is not None: + kwargs["data"] = util.json_dumps(json).encode() + del kwargs["json"] + headers = kwargs.get("headers") + if headers: + headers["Content-Type"] = "application/json" + else: + kwargs["headers"] = {"Content-Type": "application/json"} + response = None tries = 1 From bdebe4597a5024ccb2436bf7e9eabbe9aff89b28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 6 Dec 2023 23:03:34 +0100 Subject: [PATCH 182/344] fix util.dump_response to work with bytes as header values --- gallery_dl/util.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 2fbba3ce..59be4d9b 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -275,7 +275,7 @@ Response Headers if hide_auth: authorization = req_headers.get("Authorization") if authorization: - atype, sep, _ = authorization.partition(" ") + atype, sep, _ = str(authorization).partition(" ") req_headers["Authorization"] = atype + " ***" if sep else "***" cookie = req_headers.get("Cookie") @@ -291,15 +291,17 @@ Response Headers r"(^|, )([^ =]+)=[^,;]*", r"\1\2=***", set_cookie, ) + fmt_nv = "{}: {}".format + fp.write(outfmt.format( request=request, response=response, request_headers="\n".join( - name + ": " + value + fmt_nv(name, value) for name, value in req_headers.items() ), response_headers="\n".join( - name + ": " + value + fmt_nv(name, value) for name, value in res_headers.items() ), ).encode()) From e256434c9e8e4f92470d6fc4ed76afcb2d6b14b5 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 6 Dec 2023 23:39:23 +0100 Subject: [PATCH 183/344] use custom HTTPBasicAuth class to support LazyPrompt as password and to generate the Authorization header only once instead of for every request --- gallery_dl/extractor/danbooru.py | 2 +- gallery_dl/extractor/deviantart.py | 2 +- gallery_dl/extractor/oauth.py | 2 +- gallery_dl/extractor/pixeldrain.py | 4 ++-- gallery_dl/extractor/reddit.py | 3 ++- gallery_dl/util.py | 13 +++++++++++++ 6 files changed, 20 insertions(+), 6 deletions(-) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 56d81e5b..9e6516e0 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -36,7 +36,7 @@ class DanbooruExtractor(BaseExtractor): username, api_key = self._get_auth_info() if username: self.log.debug("Using HTTP Basic Auth for user '%s'", username) - self.session.auth = (username, api_key) + self.session.auth = util.HTTPBasicAuth(username, api_key) def skip(self, num): pages = num // self.per_page diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 2c37ef12..1852dc1b 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1239,7 +1239,7 @@ class DeviantartOAuthAPI(): self.log.info("Requesting public access token") data = {"grant_type": "client_credentials"} - auth = (self.client_id, self.client_secret) + auth = util.HTTPBasicAuth(self.client_id, self.client_secret) response = self.extractor.request( url, method="POST", data=data, auth=auth, fatal=False) data = response.json() diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index d1f135d8..65db94d0 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -183,7 +183,7 @@ class OAuthBase(Extractor): } if auth: - auth = (client_id, client_secret) + auth = util.HTTPBasicAuth(client_id, client_secret) else: 
auth = None data["client_id"] = client_id diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py index 34b4ebff..5cfdc43f 100644 --- a/gallery_dl/extractor/pixeldrain.py +++ b/gallery_dl/extractor/pixeldrain.py @@ -9,7 +9,7 @@ """Extractors for https://pixeldrain.com/""" from .common import Extractor, Message -from .. import text +from .. import text, util BASE_PATTERN = r"(?:https?://)?pixeldrain\.com" @@ -23,7 +23,7 @@ class PixeldrainExtractor(Extractor): def _init(self): api_key = self.config("api-key") if api_key: - self.session.auth = ("", api_key) + self.session.auth = util.HTTPBasicAuth("", api_key) def parse_datetime(self, date_string): return text.parse_datetime( diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index c0bf5b3e..feb6d1fe 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -423,9 +423,10 @@ class RedditAPI(): "grants/installed_client"), "device_id": "DO_NOT_TRACK_THIS_DEVICE"} + auth = util.HTTPBasicAuth(self.client_id, "") response = self.extractor.request( url, method="POST", headers=self.headers, - data=data, auth=(self.client_id, ""), fatal=False) + data=data, auth=auth, fatal=False) data = response.json() if response.status_code != 200: diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 59be4d9b..53502ef0 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -490,6 +490,19 @@ CODES = { } +class HTTPBasicAuth(): + __slots__ = ("authorization",) + + def __init__(self, username, password): + self.authorization = b"Basic " + binascii.b2a_base64( + username.encode("latin1") + b":" + str(password).encode("latin1") + )[:-1] + + def __call__(self, request): + request.headers["Authorization"] = self.authorization + return request + + class LazyPrompt(): __slots__ = () From 042a9da4515f15ae41fdfc9b0dab83f10edb4190 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 8 Dec 2023 17:29:54 +0100 
Subject: [PATCH 184/344] add 'output.errorfile' config option --- docs/configuration.rst | 15 +++++++++++++++ gallery_dl/__init__.py | 6 ++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 546ab189..6e5820c1 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -4408,6 +4408,21 @@ Description The default format string here is ``"{message}"``. +output.errorfile +---------------- +Type + |Path|_ +Description + File to write input URLs which returned an error to. + + When combined with + ``-I``/``--input-file-comment`` or + ``-x``/``--input-file-delete``, + this option will cause all input URLs from these files + to be commented/deleted after processing them + and not just successful ones. + + output.num-to-str ----------------- Type diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 0f9d1cae..1ec0de6a 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -249,8 +249,10 @@ def main(): input_log.error(exc) return getattr(exc, "code", 128) - if args.error_file: - input_manager.error_file(args.error_file) + error_file = (args.error_file or + config.get(("output",), "errorfile")) + if error_file: + input_manager.error_file(error_file) pformat = config.get(("output",), "progress", True) if pformat and len(input_manager.urls) > 1 and \ From c29ae9af0830367d9b1f9063018d00654c634c04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 8 Dec 2023 22:43:56 +0100 Subject: [PATCH 185/344] [urlgalleries] simplify + resolve redirects --- gallery_dl/extractor/urlgalleries.py | 56 +++++++++++++++++----------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py index aa6e7db5..b21709a9 100644 --- a/gallery_dl/extractor/urlgalleries.py +++ b/gallery_dl/extractor/urlgalleries.py @@ -6,7 +6,7 @@ """Extractors for https://urlgalleries.net/""" 
-from .common import GalleryExtractor +from .common import GalleryExtractor, Message from .. import text @@ -14,30 +14,42 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor): """Base class for Urlgalleries extractors""" category = "urlgalleries" root = "urlgalleries.net" - directory_fmt = ("{category}", "{title}") - pattern = r"(?:https?://)([^/?#]+)?\.urlgalleries\.net/([^/?#]+)/([^/?#]+)" - example = "https://blog.urlgalleries.net/gallery-1234567/a-title--1234" + request_interval = (0.5, 1.0) + pattern = r"(?:https?://)(?:(\w+)\.)?urlgalleries\.net/(?:[\w-]+-)?(\d+)" + example = "https://blog.urlgalleries.net/gallery-12345/TITLE" def __init__(self, match): - self.blog = match.group(1) - self.gallery_id = match.group(2) - self.title = match.group(3) - url = "{}.urlgalleries.net/{}/{}&a=10000".format( - self.blog, self.gallery_id, self.title) - GalleryExtractor.__init__(self, match, text.ensure_http_scheme(url)) - - def images(self, page): - extr = text.extr(page, 'id="wtf"', "</div>") - url = "{}{{}}".format(self.root).format - return [ - (text.ensure_http_scheme(url(i)), None) - for i in text.extract_iter(extr, "href='", "'") - ] + self.blog, self.gallery_id = match.groups() + url = "https://{}.urlgalleries.net/porn-gallery-{}/?a=10000".format( + self.blog, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + def items(self): + page = self.request(self.gallery_url).text + imgs = self.images(page) + data = self.metadata(page) + data["count"] = len(imgs) + del page + + root = "https://{}.urlgalleries.net".format(self.blog) + yield Message.Directory, data + for data["num"], img in enumerate(imgs, 1): + response = self.request( + root + img, method="HEAD", allow_redirects=False) + yield Message.Queue, response.headers["Location"], data def metadata(self, page): - date = text.extr( - page, "float:left;'> ", '</div>').split(" | ")[-1] + extr = text.extract_from(page) return { - 'title': self.title, - 'date': text.parse_datetime(date, format='%B %d, %Y 
T%H:%M') + "gallery_id": self.gallery_id, + "_site": extr(' title="', '"'), # site name + "blog" : text.unescape(extr(' title="', '"')), + "_rprt": extr(' title="', '"'), # report button + "title": text.unescape(extr(' title="', '"').strip()), + "date" : text.parse_datetime( + extr(" images in gallery | ", "<"), "%B %d, %Y %H:%M"), } + + def images(self, page): + imgs = text.extr(page, 'id="wtf"', "</div>") + return list(text.extract_iter(imgs, " href='", "'")) From ade93c539776eb0b550ca6ddf6b0f28ad6685e82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 8 Dec 2023 22:55:16 +0100 Subject: [PATCH 186/344] [urlgalleries] add tests --- test/results/urlgalleries.py | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 test/results/urlgalleries.py diff --git a/test/results/urlgalleries.py b/test/results/urlgalleries.py new file mode 100644 index 00000000..88a321e7 --- /dev/null +++ b/test/results/urlgalleries.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import urlgalleries + + +__tests__ = ( +{ + "#url" : "https://photos2q.urlgalleries.net/porn-gallery-7851311/clarice-window-8", + "#category": ("", "urlgalleries", "gallery"), + "#class" : urlgalleries.UrlgalleriesGalleryExtractor, + "#range" : "1-3", + "#urls" : ( + "https://fappic.com/x207mqkn2463/4gq1yv.jpg", + "https://fappic.com/q684ua2rp0j9/4gq1xv.jpg", + "https://fappic.com/8vf3n8fgz9po/4gq1ya.jpg", + ), + + "blog" : "photos2q", + "count" : 39, + "date" : "dt:2023-12-08 13:59:00", + "gallery_id": "7851311", + "num" : range(1, 3), + "title" : "Clarice window 8", +}, + +{ + "#url" : "https://dreamer.urlgalleries.net/7645840", + "#category": ("", "urlgalleries", "gallery"), + "#class" : urlgalleries.UrlgalleriesGalleryExtractor, + "#range" : "1-3", + "#urls" : ( + "https://www.fappic.com/vj7up04ny487/AmourAngels-0001.jpg", + "https://www.fappic.com/zfgsmpm36iyv/AmourAngels-0002.jpg", + "https://www.fappic.com/rqpt37rdbwa5/AmourAngels-0003.jpg", + ), + + "blog" : "Dreamer", + "count" : 105, + "date" : "dt:2020-03-10 21:17:00", + "gallery_id": "7645840", + "num" : range(1, 3), + "title" : "Angelika - Rustic Charm - AmourAngels 2016-09-27", +}, + +) From 9a8dc6b02b2073472c24eb01a11122a1b189f02d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 9 Dec 2023 01:58:08 +0100 Subject: [PATCH 187/344] [exhentai] add 'fallback-retries' option (#4792) --- docs/configuration.rst | 11 ++++++ gallery_dl/extractor/exhentai.py | 61 +++++++++++++++++++++----------- 2 files changed, 51 insertions(+), 21 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 6e5820c1..06307d3f 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1556,6 +1556,17 @@ Description * ``"exhentai.org"``: Use ``exhentai.org`` for all URLs +extractor.exhentai.fallback-retries +----------------------------------- +Type + ``integer`` +Default + ``2`` +Description + Number of times a failed 
image gets retried. + Use ``-1`` for infinite retries + + extractor.exhentai.fav ---------------------- Type diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 4facd75c..a479d002 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -47,14 +47,6 @@ class ExhentaiExtractor(Extractor): if self.version != "ex": self.cookies.set("nw", "1", domain=self.cookies_domain) - self.original = self.config("original", True) - - limits = self.config("limits", False) - if limits and limits.__class__ is int: - self.limits = limits - self._remaining = 0 - else: - self.limits = False def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) @@ -138,6 +130,19 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): if source == "hitomi": self.items = self._items_hitomi + limits = self.config("limits", False) + if limits and limits.__class__ is int: + self.limits = limits + self._remaining = 0 + else: + self.limits = False + + self.fallback_retries = self.config("fallback-retries", 2) + if self.fallback_retries < 0: + self.fallback_retries = float("inf") + + self.original = self.config("original", True) + def favorite(self, slot="0"): url = self.root + "/gallerypopups.php" params = { @@ -311,12 +316,11 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): if self.original and orig: url = self.root + "/fullimg" + text.unescape(orig) data = self._parse_original_info(extr('ownload original', '<')) - data["_fallback"] = ("{}?nl={}".format(url, nl),) + data["_fallback"] = self._fallback_original(nl, url) else: url = iurl data = self._parse_image_info(url) - data["_fallback"] = self._fallback( - None, self.image_num, nl) + data["_fallback"] = self._fallback_1280(nl, self.image_num) except IndexError: self.log.debug("Page content:\n%s", page) raise exception.StopExtraction( @@ -325,6 +329,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["num"] = self.image_num data["image_token"] = 
self.key_start = extr('var startkey="', '";') data["_url_1280"] = iurl + data["_nl"] = nl self.key_show = extr('var showkey="', '";') self._check_509(iurl, data) @@ -361,12 +366,12 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): url = text.unescape(origurl) data = self._parse_original_info(text.extract( i6, "ownload original", "<", pos)[0]) - data["_fallback"] = ("{}?nl={}".format(url, nl),) + data["_fallback"] = self._fallback_original(nl, url) else: url = imgurl data = self._parse_image_info(url) - data["_fallback"] = self._fallback( - imgkey, request["page"], nl) + data["_fallback"] = self._fallback_1280( + nl, request["page"], imgkey) except IndexError: self.log.debug("Page content:\n%s", page) raise exception.StopExtraction( @@ -375,6 +380,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["num"] = request["page"] data["image_token"] = imgkey data["_url_1280"] = imgurl + data["_nl"] = nl self._check_509(imgurl, data) yield url, text.nameext_from_url(url, data) @@ -441,13 +447,26 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): raise exception.NotFoundError("image page") return page - def _fallback(self, imgkey, num, nl): - url = "{}/s/{}/{}-{}?nl={}".format( - self.root, imgkey or self.key_start, self.gallery_id, num, nl) - page = self.request(url, fatal=False).text - if page.startswith(("Invalid page", "Keep trying")): - return - yield self.image_from_page(page)[0] + def _fallback_original(self, nl, fullimg): + url = "{}?nl={}".format(fullimg, nl) + for _ in range(self.fallback_retries): + yield url + + def _fallback_1280(self, nl, num, token=None): + if not token: + token = self.key_start + + for _ in range(self.fallback_retries): + url = "{}/s/{}/{}-{}?nl={}".format( + self.root, token, self.gallery_id, num, nl) + + page = self.request(url, fatal=False).text + if page.startswith(("Invalid page", "Keep trying")): + return + url, data = self.image_from_page(page) + yield url + + nl = data["_nl"] @staticmethod def _parse_image_info(url): 
From c55955db031c058fcfdf2d031f9300e70261888b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 9 Dec 2023 15:38:42 +0100 Subject: [PATCH 188/344] [twitter] quick and dirty fix for /media changes (#4898) --- gallery_dl/extractor/twitter.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 92c4ce4a..6bad99e7 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1439,7 +1439,12 @@ class TwitterAPI(): for instr in instructions: instr_type = instr.get("type") if instr_type == "TimelineAddEntries": - entries = instr["entries"] + if entries: + entries.extend(instr["entries"]) + else: + entries = instr["entries"] + elif instr_type == "TimelineAddToModule": + entries = instr["moduleItems"] elif instr_type == "TimelineReplaceEntry": entry = instr["entry"] if entry["entryId"].startswith("cursor-bottom-"): @@ -1487,6 +1492,11 @@ class TwitterAPI(): if esw("tweet-"): tweets.append(entry) + elif esw("profile-grid-"): + if "content" in entry: + tweets.extend(entry["content"]["items"]) + else: + tweets.append(entry) elif esw(("homeConversation-", "profile-conversation-", "conversationthread-")): From ac22bbe80c17c20ece8e5bb6f7d59417d7807cab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 9 Dec 2023 22:30:48 +0100 Subject: [PATCH 189/344] [twitter] retry API requests only for Timeout errors (#4811) --- gallery_dl/extractor/twitter.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 6bad99e7..f874f127 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1277,13 +1277,16 @@ class TwitterAPI(): if response.status_code < 400: data = response.json() - if not data.get("errors"): - return data # success + if not 
data.get("errors") or not any( + (e.get("message") or "").lower().startswith("timeout") + for e in data["errors"]): + return data # success or non-timeout errors msg = data["errors"][0].get("message") or "Unspecified" - self.extractor.log.debug("internal error: '%s'", msg) + self.extractor.log.debug("Internal Twitter error: '%s'", msg) if self.headers["x-twitter-auth-type"]: + self.extractor.log.debug("Retrying API request") continue # retry # fall through to "Login Required" From 75697dfb2687247fbf443ab2a957a7d99d4ee96d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 10 Dec 2023 00:00:57 +0100 Subject: [PATCH 190/344] implement -e/--error-file as a logging handler similar to --write-unsupported --- docs/configuration.rst | 7 +++++-- gallery_dl/__init__.py | 46 +++++++++++++++++------------------------- gallery_dl/option.py | 2 +- gallery_dl/output.py | 4 ++-- 4 files changed, 26 insertions(+), 33 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 06307d3f..a749743c 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -4422,14 +4422,17 @@ Description output.errorfile ---------------- Type - |Path|_ + * |Path|_ + * |Logging Configuration|_ Description File to write input URLs which returned an error to. + The default format string here is also ``"{message}"``. + When combined with ``-I``/``--input-file-comment`` or ``-x``/``--input-file-delete``, - this option will cause all input URLs from these files + this option will cause *all* input URLs from these files to be commented/deleted after processing them and not just successful ones. 
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 1ec0de6a..fff53eb5 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -226,18 +226,26 @@ def main(): else: jobtype = args.jobtype or job.DownloadJob + input_manager = InputManager() + input_manager.log = input_log = logging.getLogger("inputfile") + # unsupported file logging handler handler = output.setup_logging_handler( "unsupportedfile", fmt="{message}") if handler: - ulog = logging.getLogger("unsupported") + ulog = job.Job.ulog = logging.getLogger("unsupported") ulog.addHandler(handler) ulog.propagate = False - job.Job.ulog = ulog + + # error file logging handler + handler = output.setup_logging_handler( + "errorfile", fmt="{message}", mode="a") + if handler: + elog = input_manager.err = logging.getLogger("errorfile") + elog.addHandler(handler) + elog.propagate = False # collect input URLs - input_manager = InputManager() - input_manager.log = input_log = logging.getLogger("inputfile") input_manager.add_list(args.urls) if args.input_files: @@ -249,11 +257,6 @@ def main(): input_log.error(exc) return getattr(exc, "code", 128) - error_file = (args.error_file or - config.get(("output",), "errorfile")) - if error_file: - input_manager.error_file(error_file) - pformat = config.get(("output",), "progress", True) if pformat and len(input_manager.urls) > 1 and \ args.loglevel < logging.ERROR: @@ -308,12 +311,12 @@ class InputManager(): def __init__(self): self.urls = [] self.files = () + self.log = self.err = None self._url = "" self._item = None self._index = 0 self._pformat = None - self._error_fp = None def add_url(self, url): self.urls.append(url) @@ -438,15 +441,6 @@ class InputManager(): else: append(url) - def error_file(self, path): - try: - path = util.expand_path(path) - self._error_fp = open(path, "a", encoding="utf-8") - except Exception as exc: - self.log.warning( - "Unable to open error file (%s: %s)", - exc.__class__.__name__, exc) - def progress(self, pformat=True): if 
pformat is True: pformat = "[{current}/{total}] {url}\n" @@ -462,21 +456,17 @@ class InputManager(): self._rewrite() def error(self): - if self._error_fp: + if self.err: if self._item: url, path, action, indicies = self._item lines = self.files[path] out = "".join(lines[i] for i in indicies) + if out and out[-1] == "\n": + out = out[:-1] self._rewrite() else: - out = str(self._url) + "\n" - - try: - self._error_fp.write(out) - except Exception as exc: - self.log.warning( - "Unable to update '%s' (%s: %s)", - self._error_fp.name, exc.__class__.__name__, exc) + out = str(self._url) + self.err.info(out) def _rewrite(self): url, path, action, indicies = self._item diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 5966f253..72a602f2 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -288,7 +288,7 @@ def build_parser(): ) output.add_argument( "-e", "--error-file", - dest="error_file", metavar="FILE", + dest="errorfile", metavar="FILE", action=ConfigAction, help="Add input URLs which returned an error to FILE", ) output.add_argument( diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 9508ff33..c0971f03 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -210,7 +210,7 @@ def configure_logging(loglevel): root.setLevel(minlevel) -def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL): +def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL, mode="w"): """Setup a new logging handler""" opts = config.interpolate(("output",), key) if not opts: @@ -219,7 +219,7 @@ def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL): opts = {"path": opts} path = opts.get("path") - mode = opts.get("mode", "w") + mode = opts.get("mode", mode) encoding = opts.get("encoding", "utf-8") try: path = util.expand_path(path) From 5ff7106d4f907775f8902c5aded0a68b0e4cf528 Mon Sep 17 00:00:00 2001 From: Tobi823 <Tobi823@users.noreply.github.com> Date: Sun, 10 Dec 2023 16:10:46 +0100 Subject: [PATCH 191/344] - add code for the situation 
when Patreon is using window.patreon = wrapInProxy({"bootstrap":' to store metadata - refactor code to make it more readable - output page content when the HTML structure is unknown (to make debugging easier) --- gallery_dl/extractor/patreon.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 6aef9cbe..b89fddcf 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -249,23 +249,18 @@ class PatreonExtractor(Extractor): return [genmap[ft] for ft in filetypes] def _extract_bootstrap(self, page): - bootstrap = text.extr( - page, 'window.patreon = {"bootstrap":', '},"apiServer"') - if bootstrap: - return util.json_loads(bootstrap + "}") - - bootstrap = text.extr(page, "window.patreon.bootstrap,", "});") - if bootstrap: - return util.json_loads(bootstrap + "}") - - data = text.extr(page, "window.patreon = {", "};\n") - if data: - try: - return util.json_loads("{" + data + "}")["bootstrap"] - except Exception: - pass - - raise exception.StopExtraction("Unable to extract bootstrap data") + if "window.patreon.bootstrap," in page: + page_content = text.extr(page, "window.patreon.bootstrap,", "});") + json_string = page_content + "}" + elif 'window.patreon = {"bootstrap":' in page: + page_content = text.extr(page, 'window.patreon = {"bootstrap":', '},"apiServer"') + json_string = page_content + "}" + elif 'window.patreon = wrapInProxy({"bootstrap":' in page: + page_content = text.extr(page, 'window.patreon = wrapInProxy({"bootstrap":', '},"apiServer"') + json_string = page_content + "}" + else: + raise Exception(f"Unknown HTML and JS structure. 
Page content is: {page}") + return util.json_loads(json_string) class PatreonCreatorExtractor(PatreonExtractor): From fd06255f93895e3d58ea5d62c5eb5666f1a29f35 Mon Sep 17 00:00:00 2001 From: Tobi823 <Tobi823@users.noreply.github.com> Date: Sun, 10 Dec 2023 16:17:34 +0100 Subject: [PATCH 192/344] - reformat and refactor to pass tests --- gallery_dl/extractor/patreon.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index b89fddcf..2ff1e9e4 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -250,16 +250,19 @@ class PatreonExtractor(Extractor): def _extract_bootstrap(self, page): if "window.patreon.bootstrap," in page: - page_content = text.extr(page, "window.patreon.bootstrap,", "});") - json_string = page_content + "}" + content_begin = "window.patreon.bootstrap," + content_end = "});" + json_string = text.extr(page, content_begin, content_end) + "}" elif 'window.patreon = {"bootstrap":' in page: - page_content = text.extr(page, 'window.patreon = {"bootstrap":', '},"apiServer"') - json_string = page_content + "}" + content_begin = 'window.patreon = {"bootstrap":' + content_end = '},"apiServer"' + json_string = text.extr(page, content_begin, content_end) + "}" elif 'window.patreon = wrapInProxy({"bootstrap":' in page: - page_content = text.extr(page, 'window.patreon = wrapInProxy({"bootstrap":', '},"apiServer"') - json_string = page_content + "}" + content_begin = 'window.patreon = wrapInProxy({"bootstrap":' + content_end = '},"apiServer"' + json_string = text.extr(page, content_begin, content_end) + "}" else: - raise Exception(f"Unknown HTML and JS structure. Page content is: {page}") + raise Exception("Unknown HTML and JS structure. 
Page:" + page) return util.json_loads(json_string) From 244444b194ef7176530394fdff4e571ff8822528 Mon Sep 17 00:00:00 2001 From: Tobi823 <Tobi823@users.noreply.github.com> Date: Sun, 10 Dec 2023 16:22:32 +0100 Subject: [PATCH 193/344] - adapt code to current code style --- gallery_dl/extractor/patreon.py | 37 ++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 2ff1e9e4..0b0e9ebd 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -249,21 +249,28 @@ class PatreonExtractor(Extractor): return [genmap[ft] for ft in filetypes] def _extract_bootstrap(self, page): - if "window.patreon.bootstrap," in page: - content_begin = "window.patreon.bootstrap," - content_end = "});" - json_string = text.extr(page, content_begin, content_end) + "}" - elif 'window.patreon = {"bootstrap":' in page: - content_begin = 'window.patreon = {"bootstrap":' - content_end = '},"apiServer"' - json_string = text.extr(page, content_begin, content_end) + "}" - elif 'window.patreon = wrapInProxy({"bootstrap":' in page: - content_begin = 'window.patreon = wrapInProxy({"bootstrap":' - content_end = '},"apiServer"' - json_string = text.extr(page, content_begin, content_end) + "}" - else: - raise Exception("Unknown HTML and JS structure. 
Page:" + page) - return util.json_loads(json_string) + bootstrap = text.extr( + page, 'window.patreon = {"bootstrap":', '},"apiServer"') + if bootstrap: + return util.json_loads(bootstrap + "}") + + bootstrap = text.extr( + page, 'window.patreon = wrapInProxy({"bootstrap":', '},"apiServer"') + if bootstrap: + return util.json_loads(bootstrap + "}") + + bootstrap = text.extr(page, "window.patreon.bootstrap,", "});") + if bootstrap: + return util.json_loads(bootstrap + "}") + + data = text.extr(page, "window.patreon = {", "};\n") + if data: + try: + return util.json_loads("{" + data + "}")["bootstrap"] + except Exception: + pass + + raise exception.StopExtraction("Unable to extract bootstrap data") class PatreonCreatorExtractor(PatreonExtractor): From 66cbe9da410ba08465d051fbfb7a06c093d6eeb0 Mon Sep 17 00:00:00 2001 From: Tobi823 <Tobi823@users.noreply.github.com> Date: Sun, 10 Dec 2023 16:24:00 +0100 Subject: [PATCH 194/344] - fix style check failure "line too long" --- gallery_dl/extractor/patreon.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 0b0e9ebd..fb560e96 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -255,7 +255,9 @@ class PatreonExtractor(Extractor): return util.json_loads(bootstrap + "}") bootstrap = text.extr( - page, 'window.patreon = wrapInProxy({"bootstrap":', '},"apiServer"') + page, + 'window.patreon = wrapInProxy({"bootstrap":', + '},"apiServer"') if bootstrap: return util.json_loads(bootstrap + "}") From 28d60e35467d1ae2ebfc607ca782f7f57bb9c897 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 10 Dec 2023 17:43:32 +0100 Subject: [PATCH 195/344] release version 1.26.4 --- CHANGELOG.md | 20 ++++++++++++++++++++ README.rst | 4 ++-- gallery_dl/version.py | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
39f58849..88dbc44a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,25 @@ # Changelog +## 1.26.4 - 2023-12-10 +### Extractors +#### Additions +- [exhentai] add `fallback-retries` option ([#4792](https://github.com/mikf/gallery-dl/issues/4792)) +- [urlgalleries] add `gallery` extractor ([#919](https://github.com/mikf/gallery-dl/issues/919), [#1184](https://github.com/mikf/gallery-dl/issues/1184), [#2905](https://github.com/mikf/gallery-dl/issues/2905), [#4886](https://github.com/mikf/gallery-dl/issues/4886)) +#### Fixes +- [nijie] fix image URLs of multi-image posts ([#4876](https://github.com/mikf/gallery-dl/issues/4876)) +- [patreon] fix bootstrap data extraction ([#4904](https://github.com/mikf/gallery-dl/issues/4904), [#4906](https://github.com/mikf/gallery-dl/issues/4906)) +- [twitter] fix `/media` timelines ([#4898](https://github.com/mikf/gallery-dl/issues/4898), [#4899](https://github.com/mikf/gallery-dl/issues/4899)) +- [twitter] retry API requests when response contains incomplete results ([#4811](https://github.com/mikf/gallery-dl/issues/4811)) +#### Improvements +- [exhentai] store more cookies when logging in with username & password ([#4881](https://github.com/mikf/gallery-dl/issues/4881)) +- [twitter] generalize "Login Required" errors ([#4734](https://github.com/mikf/gallery-dl/issues/4734), [#4324](https://github.com/mikf/gallery-dl/issues/4324)) +### Options +- add `-e/--error-file` command-line and `output.errorfile` config option ([#4732](https://github.com/mikf/gallery-dl/issues/4732)) +### Miscellaneous +- automatically build and push Docker images +- prompt for passwords on login when necessary +- fix `util.dump_response()` to work with `bytes` header values + ## 1.26.3 - 2023-11-27 ### Extractors #### Additions diff --git a/README.rst b/README.rst index 776ba434..5603929c 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages 
included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.3/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.4/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.3/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.4/gallery-dl.bin>`__ Nightly Builds diff --git a/gallery_dl/version.py b/gallery_dl/version.py index fdf1e3ca..f0d55f6b 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.26.4-dev" +__version__ = "1.26.4" From 1d5ee4239d8ba0cbd35e9beab4154c020e78ad93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 10 Dec 2023 22:11:39 +0100 Subject: [PATCH 196/344] [docker] let metadata-action automatically generate 'latest' tags --- .github/workflows/docker.yml | 3 --- gallery_dl/version.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 6d69e886..c91ac0d5 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -45,9 +45,6 @@ jobs: tags: | type=sha,format=long,prefix= type=ref,event=tag - # https://github.com/docker/metadata-action/issues/112 - flavor: | - latest=${{ github.ref == 'refs/heads/master' }} # https://github.com/docker/build-push-action - name: Build image diff --git a/gallery_dl/version.py b/gallery_dl/version.py index f0d55f6b..8cd2351f 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.26.4" +__version__ = "1.26.5-dev" From 92fbf09643393bad2c4b45ffc39d0518c17510b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 11 Dec 2023 19:13:45 +0100 Subject: [PATCH 197/344] remove single quotes in some logging messages (#4908) ('FileNotFoundError: [Errno 2] No such file or directory: ''') -> (FileNotFoundError: [Errno 2] No such file or directory: '') --- gallery_dl/extractor/reddit.py | 2 +- gallery_dl/job.py | 2 +- gallery_dl/postprocessor/common.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index feb6d1fe..4c6dd2c7 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -159,7 +159,7 @@ class RedditExtractor(Extractor): data = meta[item["media_id"]] if data["status"] != "valid" or "s" not in data: self.log.warning( - "gallery %s: skipping item %s ('status: %s')", + "gallery %s: skipping item %s (status: %s)", submission["id"], item["media_id"], data.get("status")) continue src = data["s"] diff --git a/gallery_dl/job.py b/gallery_dl/job.py index ac2ac7ae..eb10a0ce 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -520,7 +520,7 @@ class DownloadJob(Job): archive, archive_format, archive_pragma) except Exception as exc: extr.log.warning( - "Failed to open download archive at '%s' ('%s: %s')", + "Failed to open download archive at '%s' (%s: %s)", archive, exc.__class__.__name__, exc) else: extr.log.debug("Using download archive '%s'", archive) diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py index 10d9fbab..1d2fba87 100644 --- a/gallery_dl/postprocessor/common.py +++ b/gallery_dl/postprocessor/common.py @@ -41,7 +41,7 @@ class PostProcessor(): "_archive_" + self.name) except Exception as exc: self.log.warning( - "Failed to open %s archive at '%s' ('%s: %s')", + "Failed to open %s archive at '%s' (%s: %s)", self.name, archive, 
exc.__class__.__name__, exc) else: self.log.debug("Using %s archive '%s'", self.name, archive) From a24b82e67daf190c8c0cdc04985591522d413e4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 11 Dec 2023 23:32:28 +0100 Subject: [PATCH 198/344] add 'util.repeat()' --- gallery_dl/util.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 53502ef0..751c3986 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -55,6 +55,13 @@ def advance(iterable, num): return iterator +def repeat(times): + """Return an iterator that returns None""" + if times < 0: + return itertools.repeat(None) + return itertools.repeat(None, times) + + def unique(iterable): """Yield unique elements from 'iterable' while preserving order""" seen = set() From 2d5cda2b922412a4a37b20e2d8ec3d585dd5e4f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 11 Dec 2023 23:33:09 +0100 Subject: [PATCH 199/344] [exhentai] fix TypeError for infinite 'fallback-retries' (#4911) --- gallery_dl/extractor/exhentai.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index a479d002..16398d7a 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -138,9 +138,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.limits = False self.fallback_retries = self.config("fallback-retries", 2) - if self.fallback_retries < 0: - self.fallback_retries = float("inf") - self.original = self.config("original", True) def favorite(self, slot="0"): @@ -449,14 +446,14 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): def _fallback_original(self, nl, fullimg): url = "{}?nl={}".format(fullimg, nl) - for _ in range(self.fallback_retries): + for _ in util.repeat(self.fallback_retries): yield url def _fallback_1280(self, nl, num, token=None): if not token: 
token = self.key_start - for _ in range(self.fallback_retries): + for _ in util.repeat(self.fallback_retries): url = "{}/s/{}/{}-{}?nl={}".format( self.root, token, self.gallery_id, num, nl) From d59d4ebff42c5f06455296ba14b84123aef9cf13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 11 Dec 2023 23:38:39 +0100 Subject: [PATCH 200/344] [tumblr] support infinite 'fallback-retries' --- docs/configuration.rst | 7 ++++--- gallery_dl/extractor/tumblr.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index a749743c..e7f4fc03 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1563,8 +1563,8 @@ Type Default ``2`` Description - Number of times a failed image gets retried. - Use ``-1`` for infinite retries + Number of times a failed image gets retried + or ``-1`` for infinite retries. extractor.exhentai.fav @@ -3144,7 +3144,8 @@ Type Default ``2`` Description - Number of retries for fetching full-resolution images. + Number of retries for fetching full-resolution images + or ``-1`` for infinite retries. extractor.twibooru.api-key diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index f50ddb79..10bd1533 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -9,7 +9,7 @@ """Extractors for https://www.tumblr.com/""" from .common import Extractor, Message -from .. import text, oauth, exception +from .. 
import text, util, oauth, exception from datetime import datetime, date, timedelta import re @@ -262,7 +262,7 @@ class TumblrExtractor(Extractor): return updated, (resized == updated) def _original_image_fallback(self, url, post_id): - for _ in range(self.fallback_retries): + for _ in util.repeat(self.fallback_retries): self.sleep(self.fallback_delay, "image token") yield self._update_image_token(url)[0] self.log.warning("Unable to fetch higher-resolution " From fbe14a2745b8af4a6c9a87d27f9c7cb6698e4370 Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Tue, 12 Dec 2023 20:54:34 +1100 Subject: [PATCH 201/344] [postmill] add support --- docs/configuration.rst | 10 ++ docs/supportedsites.md | 10 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/postmill.py | 204 +++++++++++++++++++++++++++++++ scripts/supportedsites.py | 7 ++ test/results/raddle.py | 112 +++++++++++++++++ 6 files changed, 344 insertions(+) create mode 100644 gallery_dl/extractor/postmill.py create mode 100644 test/results/raddle.py diff --git a/docs/configuration.rst b/docs/configuration.rst index a749743c..c49dc2c9 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2734,6 +2734,16 @@ Description Also search Plurk comments for URLs. +extractor.[postmill].save-link-post-body +---------------------------------------- +Type + ``bool`` +Default + ``false`` +Description + Whether or not to save the body for link/image posts. + + extractor.reactor.gif --------------------- Type diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 003dcaa9..ce490aa2 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1316,6 +1316,16 @@ Consider all sites to be NSFW unless otherwise known. 
<td></td> </tr> +<tr> + <td colspan="4"><strong>Postmill Instances</strong></td> +</tr> +<tr> + <td>Raddle</td> + <td>https://raddle.me/</td> + <td>Forums, Home Feed, Individual Posts, Search Results, Tag Searches, User Profiles</td> + <td></td> +</tr> + <tr> <td colspan="4"><strong>Reactor Instances</strong></td> </tr> diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index d074de22..695b8b2a 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -124,6 +124,7 @@ modules = [ "poipiku", "pornhub", "pornpics", + "postmill", "pururin", "reactor", "readcomiconline", diff --git a/gallery_dl/extractor/postmill.py b/gallery_dl/extractor/postmill.py new file mode 100644 index 00000000..4d4b38a2 --- /dev/null +++ b/gallery_dl/extractor/postmill.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Postmill instances""" + +import re +import urllib.parse +from .common import BaseExtractor, Message +from .. 
import text, exception + + +class PostmillExtractor(BaseExtractor): + """Base class for Postmill extractors""" + basecategory = "postmill" + directory_fmt = ("{category}", "{instance}", "{forum}") + filename_fmt = "{id}_{title[:220]}.{extension}" + archive_fmt = "{filename}" + + def _init(self): + self.instance = self.root.partition("://")[2] + self.save_link_post_body = self.config("save-link-post-body", False) + self._search_canonical_url = re.compile(r"/f/([\w\d_]+)/(\d+)/").search + self._search_image_tag = re.compile( + r'<a href="[^"]+"\n +class="submission__image-link"').search + + def items(self): + for post_url in self.post_urls(): + response = self.request(post_url) + extr = text.extract_from(response.text) + + title = text.unescape(extr( + '<meta property="og:title" content="', '">')) + date = text.parse_datetime(extr( + '<meta property="og:article:published_time" content="', '">')) + username = extr( + '<meta property="og:article:author" content="', '">') + post_canonical_url = text.unescape(extr( + '<link rel="canonical" href="', '">')) + + url = text.unescape(extr( + '<h1 class="submission__title unheaderize inline"><a href="', + '"')) + body = extr( + '<div class="submission__body break-text text-flow">', + '</div>') + + match = self._search_canonical_url(post_canonical_url) + forum = match.group(1) + id = int(match.group(2)) + + is_text_post = url.startswith("/") + is_image_post = self._search_image_tag(response.text) is not None + data = { + "title": title, + "date": date, + "username": username, + "forum": forum, + "id": id, + "flair": [text.unescape(i) for i in text.extract_iter( + response.text, '<span class="flair__label">', '</span>')], + "instance": self.instance, + } + + urls = [] + if is_text_post or self.save_link_post_body: + urls.append((Message.Url, "text:" + body)) + + if is_image_post: + urls.append((Message.Url, url)) + elif not is_text_post: + urls.append((Message.Queue, url)) + + data["count"] = len(urls) + yield Message.Directory, 
data + for data["num"], (msg, url) in enumerate(urls, 1): + if url.startswith("text:"): + data["filename"], data["extension"] = "", "htm" + else: + data = text.nameext_from_url(url, data) + + yield msg, url, data + + +class PostmillSubmissionsExtractor(PostmillExtractor): + """Base class for Postmill submissions extractors""" + whitelisted_parameters = () + + def __init__(self, match): + PostmillExtractor.__init__(self, match) + self.base = match.group(3) + self.sorting_path = match.group(4) or "" + self.query = {key: value for key, value in text.parse_query( + match.group(5) or "").items() if self.acceptable_query(key)} + + def items(self): + url = self.root + self.base + self.sorting_path + if self.query: + url += "?" + urllib.parse.urlencode(self.query) + + while url: + response = self.request(url) + if response.history: + redirect_url = response.url + if redirect_url == self.root + "/login": + raise exception.StopExtraction( + "HTTP redirect to login page (%s)", redirect_url) + + for nav in text.extract_iter(response.text, + '<nav class="submission__nav">', + '</nav>'): + post_url = text.unescape(text.extr(nav, '<a href="', '"')) + yield Message.Queue, text.urljoin(url, post_url), \ + {"_extractor": PostmillPostExtractor} + + url = text.unescape(text.extr(response.text, + '<link rel="next" href="', '">')) + + def acceptable_query(self, key): + return key in self.whitelisted_parameters or key == "t" or \ + (key.startswith("next[") and key.endswith("]")) + + +BASE_PATTERN = PostmillExtractor.update({ + "raddle": { + "root" : None, + "pattern": (r"(?:raddle\.me|" + r"c32zjeghcp5tj3kb72pltz56piei66drc63vkhn5yixiyk4cmerrjtid" + r"\.onion)"), + } +}) +SORTING_RE = r"(/(?:hot|new|active|top|controversial|most_commented))?" +QUERY_RE = r"(?:\?([^#]+))?" 
+ + +class PostmillPostExtractor(PostmillExtractor): + """Extractor for a single submission URL""" + subcategory = "post" + pattern = BASE_PATTERN + r"/f/([\w\d_]+)/(\d+)(?:/.+)?$" + example = "https://raddle.me/f/FORUM/123/TITLE" + + def __init__(self, match): + PostmillExtractor.__init__(self, match) + self.forum = match.group(3) + self.post_id = match.group(4) + + def post_urls(self): + return (self.root + "/f/" + self.forum + "/" + self.post_id,) + + +class PostmillShortURLExtractor(PostmillExtractor): + """Extractor for short submission URLs""" + subcategory = "shorturl" + pattern = BASE_PATTERN + r"/(\d+)$" + example = "https://raddle.me/123" + + def __init__(self, match): + PostmillExtractor.__init__(self, match) + self.post_id = match.group(3) + + def items(self): + url = self.root + "/" + self.post_id + response = self.request(url, method="HEAD", allow_redirects=False) + full_url = text.urljoin(url, response.headers["Location"]) + yield Message.Queue, full_url, {"_extractor": PostmillPostExtractor} + + +class PostmillHomeExtractor(PostmillSubmissionsExtractor): + """Extractor for the home page""" + subcategory = "home" + pattern = BASE_PATTERN + r"(/(?:featured|subscribed|all)?)" + SORTING_RE \ + + QUERY_RE + "$" + example = "https://raddle.me/" + + +class PostmillForumExtractor(PostmillSubmissionsExtractor): + """Extractor for submissions on a forum""" + subcategory = "forum" + pattern = BASE_PATTERN + r"(/f/[\w\d_]+)" + SORTING_RE + QUERY_RE + "$" + example = "https://raddle.me/f/FORUM" + + +class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor): + """Extractor for submissions made by a user""" + subcategory = "usersubmissions" + pattern = BASE_PATTERN + r"(/user/[\w\d_]+/submissions)()" + QUERY_RE + "$" + example = "https://raddle.me/user/USER/submissions" + + +class PostmillTagExtractor(PostmillSubmissionsExtractor): + """Extractor for submissions on a forum with a specific tag""" + subcategory = "tag" + pattern = BASE_PATTERN + 
r"(/tag/[\w\d_]+)" + SORTING_RE + QUERY_RE + "$" + example = "https://raddle.me/tag/TAG" + + +class PostmillSearchExtractor(PostmillSubmissionsExtractor): + """Extractor for search results""" + subcategory = "search" + pattern = BASE_PATTERN + r"(/search)()\?(q=[^#]+)$" + example = "https://raddle.me/search?q=QUERY" + whitelisted_parameters = ("q",) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 3afac13f..f8276657 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -101,6 +101,7 @@ CATEGORY_MAP = { "pornimagesxxx" : "Porn Image", "pornpics" : "PornPics.com", "pornreactor" : "PornReactor", + "postmill" : "Postmill", "readcomiconline": "Read Comic Online", "rbt" : "RebeccaBlackTech", "redgifs" : "RedGIFs", @@ -232,6 +233,12 @@ SUBCATEGORY_MAP = { "pornhub": { "gifs": "", }, + "raddle": { + "home" : "Home Feed", + "usersubmissions": "User Profiles", + "post" : "Individual Posts", + "shorturl" : "", + }, "reddit": { "home": "Home Feed", }, diff --git a/test/results/raddle.py b/test/results/raddle.py new file mode 100644 index 00000000..4e60abb7 --- /dev/null +++ b/test/results/raddle.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import postmill + + +__tests__ = ( +{ + "#url" : "https://raddle.me/", + "#category": ("postmill", "raddle.me", "home"), + "#class" : postmill.PostmillHomeExtractor, + "#range" : "1-25", + "#count" : 25, +}, + +{ + "#url" : "https://raddle.me/f/traa", + "#category": ("postmill", "raddle.me", "forum"), + "#class" : postmill.PostmillForumExtractor, + "#count" : 1, + "#pattern" : "^https://raddle\.me/f/traa/156646/click-here-to-go-to-f-traaaaaaannnnnnnnnns$", +}, + +{ + "#url" : "https://raddle.me/user/Sam_the_enby/submissions", + "#category": ("postmill", "raddle.me", "usersubmissions"), + "#class" : postmill.PostmillUserSubmissionsExtractor, + "#range" : "1-25", + "#count" : 25, +}, + +{ + "#url" : "https://raddle.me/tag/Trans", + "#category": ("postmill", "raddle.me", "tag"), + "#class" : postmill.PostmillTagExtractor, +}, + +{ + "#url" : "https://raddle.me/search?q=tw", + "#category": ("postmill", "raddle.me", "search"), + "#class" : postmill.PostmillSearchExtractor, + "#range" : "1-50", + "#count" : 50, +}, + +{ + "#url" : "https://raddle.me/160845", + "#category": ("postmill", "raddle.me", "shorturl"), + "#class" : postmill.PostmillShortURLExtractor, + "#pattern" : r"^https://raddle\.me/f/egg_irl/160845/egg_irl$", +}, + +{ + "#url" : "https://raddle.me/f/NonBinary/179017/scattered-thoughts-would-appreciate-advice-immensely-tw", + "#comment" : "Text post", + "#category": ("postmill", "raddle.me", "post"), + "#class" : postmill.PostmillPostExtractor, + "#sha1_url" : "99277f815820810d9d7e219d455f818601858378", + "#sha1_content": "7a1159e1e45f2ce8e2c8b5959f6d66b042776f3b", + "#count" : 1, +}, + +{ + "#url" : "https://raddle.me/f/egg_irl/160845", + "#comment" : "Image post", + "#category": ("postmill", "raddle.me", "post"), + "#class" : postmill.PostmillPostExtractor, + "#sha1_url" : "48663f767ea258fcd545ab5aa0e734f98f434388", + "#sha1_content": "431e938082c2b59c44888a83cfc711cd1f0e910a", + "#count" : 1, +}, + +{ + "#url" : 
"https://raddle.me/f/trans/177042/tw-vent-nsfw-suicide-i-lost-no-nut-november-tw-trauma", + "#comment" : "Image + text post (with text enabled)", + "#category": ("postmill", "raddle.me", "post"), + "#class" : postmill.PostmillPostExtractor, + "#options" : {"save-link-post-body": True}, + "#pattern" : r"^(text:[\s\S]+|https://raddle\.me/submission_images/[0-9a-f]+\.png)$", + "#count" : 2, +}, + +{ + "#url" : "https://raddle.me/f/videos/179541/raisins-and-sprite", + "#comment" : "Link post", + "#category": ("postmill", "raddle.me", "post"), + "#class" : postmill.PostmillPostExtractor, + "#urls" : "https://m.youtube.com/watch?v=RFJCA5zcZxI", + "#count" : 1, +}, + +{ + "#url" : "https://raddle.me/f/Anime/150698/neo-tokyo-1987-link-to-the-english-dub-version-last-link", + "#comment" : "Link + text post (with text disabled)", + "#category": ("postmill", "raddle.me", "post"), + "#class" : postmill.PostmillPostExtractor, + "#pattern" : "^https://fantasyanime\.com/anime/neo-tokyo-dub$", + "#count" : 1, +}, + +{ + "#url" : "https://raddle.me/f/egg_irl/166855/4th-wall-breaking-please-let-this-be-a-flair-egg-irl", + "#comment" : "Post with multiple flairs", + "#category": ("postmill", "raddle.me", "post"), + "#class" : postmill.PostmillPostExtractor, + "flair" : ["Gender non-specific", "4th wall breaking"], +}, + +) From da76e13e3b37311428e8b7004fbf1b6aa807022d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 12 Dec 2023 19:12:13 +0100 Subject: [PATCH 202/344] [tumblr] fix exception after waiting for rate limit (#4916) use a loop instead of recursive function calls --- gallery_dl/extractor/tumblr.py | 114 +++++++++++++++++---------------- 1 file changed, 59 insertions(+), 55 deletions(-) diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 10bd1533..fee0145d 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -404,66 +404,70 @@ class TumblrAPI(oauth.OAuth1API): def 
_call(self, endpoint, params, **kwargs): url = self.ROOT + endpoint kwargs["params"] = params - response = self.request(url, **kwargs) - try: - data = response.json() - except ValueError: - data = response.text - status = response.status_code - else: - status = data["meta"]["status"] - if 200 <= status < 400: - return data["response"] - - self.log.debug(data) - if status == 403: - raise exception.AuthorizationError() + while True: + response = self.request(url, **kwargs) - elif status == 404: try: - error = data["errors"][0]["detail"] - board = ("only viewable within the Tumblr dashboard" in error) - except Exception: - board = False - - if board: - self.log.info("Run 'gallery-dl oauth:tumblr' " - "to access dashboard-only blogs") - raise exception.AuthorizationError(error) - raise exception.NotFoundError("user or post") - - elif status == 429: - # daily rate limit - if response.headers.get("x-ratelimit-perday-remaining") == "0": - self.log.info("Daily API rate limit exceeded") - reset = response.headers.get("x-ratelimit-perday-reset") - - api_key = self.api_key or self.session.auth.consumer_key - if api_key == self.API_KEY: - self.log.info("Register your own OAuth application and " - "use its credentials to prevent this error: " - "https://github.com/mikf/gallery-dl/blob/mas" - "ter/docs/configuration.rst#extractortumblra" - "pi-key--api-secret") - - if self.extractor.config("ratelimit") == "wait": + data = response.json() + except ValueError: + data = response.text + status = response.status_code + else: + status = data["meta"]["status"] + if 200 <= status < 400: + return data["response"] + + self.log.debug(data) + + if status == 403: + raise exception.AuthorizationError() + + elif status == 404: + try: + error = data["errors"][0]["detail"] + board = ("only viewable within the Tumblr dashboard" + in error) + except Exception: + board = False + + if board: + self.log.info("Run 'gallery-dl oauth:tumblr' " + "to access dashboard-only blogs") + raise 
exception.AuthorizationError(error) + raise exception.NotFoundError("user or post") + + elif status == 429: + # daily rate limit + if response.headers.get("x-ratelimit-perday-remaining") == "0": + self.log.info("Daily API rate limit exceeded") + reset = response.headers.get("x-ratelimit-perday-reset") + + api_key = self.api_key or self.session.auth.consumer_key + if api_key == self.API_KEY: + self.log.info( + "Register your own OAuth application and use its " + "credentials to prevent this error: https://githu" + "b.com/mikf/gallery-dl/blob/master/docs/configurat" + "ion.rst#extractortumblrapi-key--api-secret") + + if self.extractor.config("ratelimit") == "wait": + self.extractor.wait(seconds=reset) + continue + + t = (datetime.now() + timedelta(0, float(reset))).time() + raise exception.StopExtraction( + "Aborting - Rate limit will reset at %s", + "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second)) + + # hourly rate limit + reset = response.headers.get("x-ratelimit-perhour-reset") + if reset: + self.log.info("Hourly API rate limit exceeded") self.extractor.wait(seconds=reset) - return self._call(endpoint, params, **kwargs) - - t = (datetime.now() + timedelta(seconds=float(reset))).time() - raise exception.StopExtraction( - "Aborting - Rate limit will reset at %s", - "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second)) - - # hourly rate limit - reset = response.headers.get("x-ratelimit-perhour-reset") - if reset: - self.log.info("Hourly API rate limit exceeded") - self.extractor.wait(seconds=reset) - return self._call(endpoint, params, **kwargs) + continue - raise exception.StopExtraction(data) + raise exception.StopExtraction(data) def _pagination(self, blog, endpoint, params, key="posts", cache=False): endpoint = "/v2/blog/{}{}".format(blog, endpoint) From a37b7759bccdba8798a76afb0cb44535296cba25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 12 Dec 2023 20:02:28 +0100 Subject: [PATCH 203/344] 
[myhentaigallery] recognize '/g/' URLs (#4920) --- gallery_dl/extractor/myhentaigallery.py | 6 +++--- test/results/myhentaigallery.py | 8 +++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/myhentaigallery.py b/gallery_dl/extractor/myhentaigallery.py index 33a22849..5e8179ec 100644 --- a/gallery_dl/extractor/myhentaigallery.py +++ b/gallery_dl/extractor/myhentaigallery.py @@ -16,12 +16,12 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor): root = "https://myhentaigallery.com" directory_fmt = ("{category}", "{gallery_id} {artist:?[/] /J, }{title}") pattern = (r"(?:https?://)?myhentaigallery\.com" - r"/gallery/(?:thumbnails|show)/(\d+)") - example = "https://myhentaigallery.com/gallery/thumbnails/12345" + r"/g(?:allery/(?:thumbnails|show))?/(\d+)") + example = "https://myhentaigallery.com/g/12345" def __init__(self, match): self.gallery_id = match.group(1) - url = "{}/gallery/thumbnails/{}".format(self.root, self.gallery_id) + url = "{}/g/{}".format(self.root, self.gallery_id) GalleryExtractor.__init__(self, match, url) def _init(self): diff --git a/test/results/myhentaigallery.py b/test/results/myhentaigallery.py index b7b5ac99..283d6d01 100644 --- a/test/results/myhentaigallery.py +++ b/test/results/myhentaigallery.py @@ -9,7 +9,7 @@ from gallery_dl.extractor import myhentaigallery __tests__ = ( { - "#url" : "https://myhentaigallery.com/gallery/thumbnails/16247", + "#url" : "https://myhentaigallery.com/g/16247", "#category": ("", "myhentaigallery", "gallery"), "#class" : myhentaigallery.MyhentaigalleryGalleryExtractor, "#pattern" : r"https://images\.myhentaicomics\.com/mhg/images/[^/]+/original/\d+\.jpg", @@ -23,6 +23,12 @@ __tests__ = ( "title" : "Attack Of The 50ft Woman 1", }, +{ + "#url" : "https://myhentaigallery.com/gallery/thumbnails/16247", + "#category": ("", "myhentaigallery", "gallery"), + "#class" : myhentaigallery.MyhentaigalleryGalleryExtractor, +}, + { "#url" : 
"https://myhentaigallery.com/gallery/show/16247/1", "#category": ("", "myhentaigallery", "gallery"), From 9951c112f823c48a207ee7c05a45ffee11919bb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 14 Dec 2023 16:25:06 +0100 Subject: [PATCH 204/344] [deviantart] workaround for integer client_id values (#4924) --- gallery_dl/extractor/deviantart.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 1852dc1b..5363ea4b 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1005,6 +1005,8 @@ class DeviantartOAuthAPI(): self.client_id = extractor.config("client-id") if self.client_id: + if not isinstance(self.client_id, str): + self.client_id = str(self.client_id) self.client_secret = extractor.config("client-secret") else: self.client_id = self.CLIENT_ID From aac8bb4eaece6a53ae8bea3a43d05c7e938a2c02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 15 Dec 2023 17:39:42 +0100 Subject: [PATCH 205/344] [deviantart] simplify 9951c112 --- gallery_dl/extractor/deviantart.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 5363ea4b..d34f312d 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1003,10 +1003,9 @@ class DeviantartOAuthAPI(): self.strategy = extractor.config("pagination") self.public = extractor.config("public", True) - self.client_id = extractor.config("client-id") - if self.client_id: - if not isinstance(self.client_id, str): - self.client_id = str(self.client_id) + client_id = extractor.config("client-id") + if client_id: + self.client_id = str(client_id) self.client_secret = extractor.config("client-secret") else: self.client_id = self.CLIENT_ID @@ -1014,7 +1013,7 @@ class DeviantartOAuthAPI(): token = 
extractor.config("refresh-token") if token is None or token == "cache": - token = "#" + str(self.client_id) + token = "#" + self.client_id if not _refresh_token_cache(token): token = None self.refresh_token_key = token From 39fb96a84539151c844e24e5f192b74348c9c88b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 15 Dec 2023 17:51:21 +0100 Subject: [PATCH 206/344] [docs] reword NSFW warning --- docs/supportedsites.md | 2 +- scripts/supportedsites.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 003dcaa9..9ef1fd09 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1,7 +1,7 @@ # Supported Sites <!-- auto-generated by scripts/supportedsites.py --> -Consider all sites to be NSFW unless otherwise known. +Consider all listed sites to potentially be NSFW. <table> <thead valign="bottom"> diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 3afac13f..a0856f00 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -522,7 +522,7 @@ def generate_output(columns, categories, domains): TEMPLATE = """# Supported Sites <!-- auto-generated by {} --> -Consider all sites to be NSFW unless otherwise known. +Consider all listed sites to potentially be NSFW. 
<table> <thead valign="bottom"> From d95be2537a0ba440ec07732a91dc1ead3bd86da9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 15 Dec 2023 18:05:44 +0100 Subject: [PATCH 207/344] move files from gh-pages branch to master --- docs/index.html | 9 +++++++++ docs/oauth-redirect.html | 12 ++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 docs/index.html create mode 100644 docs/oauth-redirect.html diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 00000000..43d89820 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,9 @@ +<!DOCTYPE html> +<html> +<head> +<meta charset="utf-8"> +<title>gallery-dl + + + + diff --git a/docs/oauth-redirect.html b/docs/oauth-redirect.html new file mode 100644 index 00000000..e22053d2 --- /dev/null +++ b/docs/oauth-redirect.html @@ -0,0 +1,12 @@ + + + + + gallery-dl - OAuth Redirect + + + + + From 6cd5e6adadeedb9e49fb773c98466e58eccd1bbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 15 Dec 2023 18:24:52 +0100 Subject: [PATCH 208/344] [patreon] fix bootstrap data extraction (#4904) --- gallery_dl/extractor/patreon.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index fb560e96..6c2f39dc 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -249,6 +249,15 @@ class PatreonExtractor(Extractor): return [genmap[ft] for ft in filetypes] def _extract_bootstrap(self, page): + data = text.extr( + page, 'id="__NEXT_DATA__" type="application/json">', ' Date: Fri, 15 Dec 2023 21:03:22 +0100 Subject: [PATCH 209/344] [inkbunny] stop pagination on empty results --- gallery_dl/extractor/inkbunny.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index 4ad37fc1..7076dfec 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -324,6 
+324,9 @@ class InkbunnyAPI(): while True: data = self._call("search", params) + if not data["submissions"]: + return + yield from self.detail(data["submissions"]) if data["page"] >= data["pages_count"]: From 2852404e4949d8c54a401c720f7b4f216692b4be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 15 Dec 2023 21:20:12 +0100 Subject: [PATCH 210/344] [inkbunny] add 'unread' extractor (#4934) --- docs/supportedsites.md | 2 +- gallery_dl/extractor/inkbunny.py | 20 ++++++++++++++++++++ scripts/supportedsites.py | 3 +++ test/results/inkbunny.py | 6 ++++++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9ef1fd09..e3b4ea7e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -400,7 +400,7 @@ Consider all listed sites to potentially be NSFW.
- + diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index 7076dfec..c52e3570 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -161,6 +161,26 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor): return self.api.search(params) +class InkbunnyUnreadExtractor(InkbunnyExtractor): + """Extractor for unread inkbunny submissions""" + subcategory = "unread" + pattern = (BASE_PATTERN + + r"/submissionsviewall\.php\?([^#]+&mode=unreadsubs&[^#]+)") + example = ("https://inkbunny.net/submissionsviewall.php" + "?text=&mode=unreadsubs&type=") + + def __init__(self, match): + InkbunnyExtractor.__init__(self, match) + self.params = text.parse_query(match.group(1)) + + def posts(self): + params = self.params.copy() + params.pop("rid", None) + params.pop("mode", None) + params["unread_submissions"] = "yes" + return self.api.search(params) + + class InkbunnySearchExtractor(InkbunnyExtractor): """Extractor for inkbunny search results""" subcategory = "search" diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index a0856f00..2a625e76 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -192,6 +192,9 @@ SUBCATEGORY_MAP = { "imgur": { "favorite-folder": "Favorites Folders", }, + "inkbunny": { + "unread": "Unread Submissions", + }, "instagram": { "posts": "", "saved": "Saved Posts", diff --git a/test/results/inkbunny.py b/test/results/inkbunny.py index 7129877e..58cecb68 100644 --- a/test/results/inkbunny.py +++ b/test/results/inkbunny.py @@ -94,6 +94,12 @@ __tests__ = ( "#class" : inkbunny.InkbunnyFavoriteExtractor, }, +{ + "#url" : "https://inkbunny.net/submissionsviewall.php?rid=ffffffffff&mode=unreadsubs&page=1&orderby=unread_datetime", + "#category": ("", "inkbunny", "unread"), + "#class" : inkbunny.InkbunnyUnreadExtractor, +}, + { "#url" : 
"https://inkbunny.net/submissionsviewall.php?rid=ffffffffff&mode=search&page=1&orderby=create_datetime&text=cute&stringtype=and&keywords=yes&title=yes&description=no&artist=&favsby=&type=&days=&keyword_id=&user_id=&random=&md5=", "#category": ("", "inkbunny", "search"), From 3f9c113d7860b8687af993e98089caf362479e8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 16 Dec 2023 01:52:31 +0100 Subject: [PATCH 211/344] [mastodon] Support non-numeric status IDs (#4936) --- gallery_dl/extractor/mastodon.py | 2 +- test/results/mastodon.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 test/results/mastodon.py diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index c5fe8407..e277eef0 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -152,7 +152,7 @@ class MastodonFollowingExtractor(MastodonExtractor): class MastodonStatusExtractor(MastodonExtractor): """Extractor for images from a status""" subcategory = "status" - pattern = BASE_PATTERN + r"/@[^/?#]+/(\d+)" + pattern = BASE_PATTERN + r"/@[^/?#]+/(?!following)([^/?#]+)" example = "https://mastodon.social/@USER/12345" def statuses(self): diff --git a/test/results/mastodon.py b/test/results/mastodon.py new file mode 100644 index 00000000..cf881968 --- /dev/null +++ b/test/results/mastodon.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import mastodon + + +__tests__ = ( +{ + "#url" : "mastodon:https://donotsta.re/@elly/AcoUaA7EH1igiYKmFU", + "#category": ("mastodon", "donotsta.re", "status"), + "#class" : mastodon.MastodonStatusExtractor, + "#urls" : "https://asdf.donotsta.re/media/917e7722dd30d510686ce9f3717a1f722dac96fd974b5af5ec2ccbc8cbd740c6.png", + + "instance": "donotsta.re", + "instance_remote": None, +}, + +) From 99aa9233223d94a2bffab8ce1fb61deb12f7cad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 16 Dec 2023 19:21:20 +0100 Subject: [PATCH 212/344] [inkbunny] improve '/submissionsviewall.php' patterns (#4934) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit allow 'mode=…' to be in any position don't require it to be somewhere in the middle --- gallery_dl/extractor/inkbunny.py | 17 ++++++++++------- test/results/inkbunny.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index c52e3570..bdc45c38 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -103,7 +103,8 @@ class InkbunnyPoolExtractor(InkbunnyExtractor): subcategory = "pool" pattern = (BASE_PATTERN + r"/(?:" r"poolview_process\.php\?pool_id=(\d+)|" - r"submissionsviewall\.php\?([^#]+&mode=pool&[^#]+))") + r"submissionsviewall\.php" + r"\?((?:[^#]+&)?mode=pool(?:&[^#]+)?))") example = "https://inkbunny.net/poolview_process.php?pool_id=12345" def __init__(self, match): @@ -133,7 +134,8 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor): subcategory = "favorite" pattern = (BASE_PATTERN + r"/(?:" r"userfavorites_process\.php\?favs_user_id=(\d+)|" - r"submissionsviewall\.php\?([^#]+&mode=userfavs&[^#]+))") + r"submissionsviewall\.php" + r"\?((?:[^#]+&)?mode=userfavs(?:&[^#]+)?))") example = ("https://inkbunny.net/userfavorites_process.php" "?favs_user_id=12345") @@ -164,8 +166,8 @@ 
class InkbunnyFavoriteExtractor(InkbunnyExtractor): class InkbunnyUnreadExtractor(InkbunnyExtractor): """Extractor for unread inkbunny submissions""" subcategory = "unread" - pattern = (BASE_PATTERN + - r"/submissionsviewall\.php\?([^#]+&mode=unreadsubs&[^#]+)") + pattern = (BASE_PATTERN + r"/submissionsviewall\.php" + r"\?((?:[^#]+&)?mode=unreadsubs(?:&[^#]+)?)") example = ("https://inkbunny.net/submissionsviewall.php" "?text=&mode=unreadsubs&type=") @@ -184,8 +186,8 @@ class InkbunnyUnreadExtractor(InkbunnyExtractor): class InkbunnySearchExtractor(InkbunnyExtractor): """Extractor for inkbunny search results""" subcategory = "search" - pattern = (BASE_PATTERN + - r"/submissionsviewall\.php\?([^#]+&mode=search&[^#]+)") + pattern = (BASE_PATTERN + r"/submissionsviewall\.php" + r"\?((?:[^#]+&)?mode=search(?:&[^#]+)?)") example = ("https://inkbunny.net/submissionsviewall.php" "?text=TAG&mode=search&type=") @@ -221,7 +223,8 @@ class InkbunnyFollowingExtractor(InkbunnyExtractor): subcategory = "following" pattern = (BASE_PATTERN + r"/(?:" r"watchlist_process\.php\?mode=watching&user_id=(\d+)|" - r"usersviewall\.php\?([^#]+&mode=watching&[^#]+))") + r"usersviewall\.php" + r"\?((?:[^#]+&)?mode=watching(?:&[^#]+)?))") example = ("https://inkbunny.net/watchlist_process.php" "?mode=watching&user_id=12345") diff --git a/test/results/inkbunny.py b/test/results/inkbunny.py index 58cecb68..3e2edaf6 100644 --- a/test/results/inkbunny.py +++ b/test/results/inkbunny.py @@ -78,6 +78,12 @@ __tests__ = ( "#class" : inkbunny.InkbunnyPoolExtractor, }, +{ + "#url" : "https://inkbunny.net/submissionsviewall.php?mode=pool&pool_id=28985", + "#category": ("", "inkbunny", "pool"), + "#class" : inkbunny.InkbunnyPoolExtractor, +}, + { "#url" : "https://inkbunny.net/userfavorites_process.php?favs_user_id=20969", "#category": ("", "inkbunny", "favorite"), @@ -94,12 +100,24 @@ __tests__ = ( "#class" : inkbunny.InkbunnyFavoriteExtractor, }, +{ + "#url" : 
"https://inkbunny.net/submissionsviewall.php?mode=userfavs&user_id=20969", + "#category": ("", "inkbunny", "favorite"), + "#class" : inkbunny.InkbunnyFavoriteExtractor, +}, + { "#url" : "https://inkbunny.net/submissionsviewall.php?rid=ffffffffff&mode=unreadsubs&page=1&orderby=unread_datetime", "#category": ("", "inkbunny", "unread"), "#class" : inkbunny.InkbunnyUnreadExtractor, }, +{ + "#url" : "https://inkbunny.net/submissionsviewall.php?mode=unreadsubs", + "#category": ("", "inkbunny", "unread"), + "#class" : inkbunny.InkbunnyUnreadExtractor, +}, + { "#url" : "https://inkbunny.net/submissionsviewall.php?rid=ffffffffff&mode=search&page=1&orderby=create_datetime&text=cute&stringtype=and&keywords=yes&title=yes&description=no&artist=&favsby=&type=&days=&keyword_id=&user_id=&random=&md5=", "#category": ("", "inkbunny", "search"), @@ -120,6 +138,12 @@ __tests__ = ( }, }, +{ + "#url" : "https://inkbunny.net/submissionsviewall.php?mode=search", + "#category": ("", "inkbunny", "search"), + "#class" : inkbunny.InkbunnySearchExtractor, +}, + { "#url" : "https://inkbunny.net/watchlist_process.php?mode=watching&user_id=20969", "#category": ("", "inkbunny", "following"), @@ -134,6 +158,12 @@ __tests__ = ( "#class" : inkbunny.InkbunnyFollowingExtractor, }, +{ + "#url" : "https://inkbunny.net/usersviewall.php?mode=watching&user_id=20969", + "#category": ("", "inkbunny", "following"), + "#class" : inkbunny.InkbunnyFollowingExtractor, +}, + { "#url" : "https://inkbunny.net/s/1829715", "#category": ("", "inkbunny", "post"), From e097aaf64a6f15ad5823aab9a08ac6b5cdc8f0b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 17 Dec 2023 23:25:47 +0100 Subject: [PATCH 213/344] [exhentai] output continuation URL when interrupted (#4782) --- gallery_dl/extractor/exhentai.py | 73 +++++++++++++++++--------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 16398d7a..2ddca7f7 
100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -124,6 +124,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.key_show = None self.key_next = None self.count = 0 + self.data = None def _init(self): source = self.config("source") @@ -140,6 +141,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.fallback_retries = self.config("fallback-retries", 2) self.original = self.config("original", True) + def finalize(self): + if self.data: + self.log.info("Use '%s/s/%s/%s-%s' as input URL " + "to continue downloading from the current position", + self.root, self.data["image_token"], + self.gallery_id, self.data["num"]) + def favorite(self, slot="0"): url = self.root + "/gallerypopups.php" params = { @@ -175,32 +183,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.gallery_token = part.split("/")[1] gpage = self._gallery_page() - data = self.get_metadata(gpage) + self.data = data = self.get_metadata(gpage) self.count = text.parse_int(data["filecount"]) yield Message.Directory, data - def _validate_response(response): - # declared inside 'items()' to be able to access 'data' - if not response.history and response.headers.get( - "content-type", "").startswith("text/html"): - page = response.text - self.log.warning("'%s'", page) - - if " requires GP" in page: - gp = self.config("gp") - if gp == "stop": - raise exception.StopExtraction("Not enough GP") - elif gp == "wait": - input("Press ENTER to continue.") - return response.url - - self.log.info("Falling back to non-original downloads") - self.original = False - return data["_url_1280"] - - self._report_limits(data) - return True - images = itertools.chain( (self.image_from_page(ipage),), self.images_from_api()) for url, image in images: @@ -208,7 +194,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): if self.limits: self._check_limits(data) if "/fullimg" in url: - data["_http_validate"] = _validate_response + data["_http_validate"] = 
self._validate_response else: data["_http_validate"] = None yield Message.Url, url, data @@ -216,6 +202,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): fav = self.config("fav") if fav is not None: self.favorite(fav) + self.data = None def _items_hitomi(self): if self.config("metadata", False): @@ -329,7 +316,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["_nl"] = nl self.key_show = extr('var showkey="', '";') - self._check_509(iurl, data) + self._check_509(iurl) return url, text.nameext_from_url(url, data) def images_from_api(self): @@ -379,33 +366,51 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["_url_1280"] = imgurl data["_nl"] = nl - self._check_509(imgurl, data) + self._check_509(imgurl) yield url, text.nameext_from_url(url, data) request["imgkey"] = nextkey - def _report_limits(self, data): + def _validate_response(self, response): + if not response.history and response.headers.get( + "content-type", "").startswith("text/html"): + page = response.text + self.log.warning("'%s'", page) + + if " requires GP" in page: + gp = self.config("gp") + if gp == "stop": + raise exception.StopExtraction("Not enough GP") + elif gp == "wait": + input("Press ENTER to continue.") + return response.url + + self.log.info("Falling back to non-original downloads") + self.original = False + return self.data["_url_1280"] + + self._report_limits() + return True + + def _report_limits(self): ExhentaiExtractor.LIMIT = True - raise exception.StopExtraction( - "Image limit reached! 
" - "Continue with '%s/s/%s/%s-%s' as URL after resetting it.", - self.root, data["image_token"], self.gallery_id, data["num"]) + raise exception.StopExtraction("Image limit reached!") def _check_limits(self, data): if not self._remaining or data["num"] % 25 == 0: self._update_limits() self._remaining -= data["cost"] if self._remaining <= 0: - self._report_limits(data) + self._report_limits() - def _check_509(self, url, data): + def _check_509(self, url): # full 509.gif URLs # - https://exhentai.org/img/509.gif # - https://ehgt.org/g/509.gif if url.endswith(("hentai.org/img/509.gif", "ehgt.org/g/509.gif")): self.log.debug(url) - self._report_limits(data) + self._report_limits() def _update_limits(self): url = "https://e-hentai.org/home.php" From b127321b5c5a5752571c9fa13c5971567d1e09f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 18 Dec 2023 15:27:03 +0100 Subject: [PATCH 214/344] [exhentai] only show 'using e-hentai.org' warning for exh domains --- gallery_dl/extractor/exhentai.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 2ddca7f7..b294bdb6 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -67,10 +67,11 @@ class ExhentaiExtractor(Extractor): if username: return self.cookies_update(self._login_impl(username, password)) - self.log.info("no username given; using e-hentai.org") - self.root = "https://e-hentai.org" - self.cookies_domain = ".e-hentai.org" - self.cookies.set("nw", "1", domain=self.cookies_domain) + if self.version == "ex": + self.log.info("No username or cookies given; using e-hentai.org") + self.root = "https://e-hentai.org" + self.cookies_domain = ".e-hentai.org" + self.cookies.set("nw", "1", domain=self.cookies_domain) self.original = False self.limits = False From 1f9b16a70bb6105e19c11d75a98a97558306fd9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 
18 Dec 2023 22:06:26 +0100 Subject: [PATCH 215/344] replace static 'sleep-request' defaults with dynamic ones --- gallery_dl/extractor/danbooru.py | 2 +- gallery_dl/extractor/exhentai.py | 2 +- gallery_dl/extractor/foolfuuka.py | 2 +- gallery_dl/extractor/idolcomplex.py | 2 +- gallery_dl/extractor/newgrounds.py | 2 +- gallery_dl/extractor/philomena.py | 2 +- gallery_dl/extractor/pixiv.py | 2 +- gallery_dl/extractor/plurk.py | 2 +- gallery_dl/extractor/reactor.py | 2 +- gallery_dl/extractor/readcomiconline.py | 2 +- gallery_dl/extractor/twibooru.py | 4 ++-- gallery_dl/extractor/vk.py | 2 +- 12 files changed, 13 insertions(+), 13 deletions(-) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 9e6516e0..881bd91a 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -20,7 +20,7 @@ class DanbooruExtractor(BaseExtractor): page_limit = 1000 page_start = None per_page = 200 - request_interval = 1.0 + request_interval = (0.5, 1.5) def _init(self): self.ugoira = self.config("ugoira", False) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index b294bdb6..e6c38893 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -26,7 +26,7 @@ class ExhentaiExtractor(Extractor): cookies_domain = ".exhentai.org" cookies_names = ("ipb_member_id", "ipb_pass_hash") root = "https://exhentai.org" - request_interval = 5.0 + request_interval = (3.0, 6.0) ciphers = "DEFAULT:!DH" LIMIT = False diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 93ac5416..cedac0c3 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -169,7 +169,7 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): directory_fmt = ("{category}", "search", "{search}") pattern = BASE_PATTERN + r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)" example = "https://archived.moe/_/search/text/QUERY/" - request_interval = 1.0 + 
request_interval = (0.5, 1.5) def __init__(self, match): FoolfuukaExtractor.__init__(self, match) diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index 5c7a1b3a..af07ebd9 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -25,7 +25,7 @@ class IdolcomplexExtractor(SankakuExtractor): cookies_domain = "idol.sankakucomplex.com" cookies_names = ("_idolcomplex_session",) referer = False - request_interval = (4.0, 6.0) + request_interval = (3.0, 6.0) def __init__(self, match): SankakuExtractor.__init__(self, match) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index a6971e84..5f43690e 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -23,7 +23,7 @@ class NewgroundsExtractor(Extractor): root = "https://www.newgrounds.com" cookies_domain = ".newgrounds.com" cookies_names = ("NG_GG_username", "vmk1du5I8m") - request_interval = 1.0 + request_interval = (0.5, 1.5) def __init__(self, match): Extractor.__init__(self, match) diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index 3a0f5b02..ac6a391e 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -18,7 +18,7 @@ class PhilomenaExtractor(BooruExtractor): basecategory = "philomena" filename_fmt = "{filename}.{extension}" archive_fmt = "{id}" - request_interval = 1.0 + request_interval = (0.5, 1.5) page_start = 1 per_page = 50 diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 411d1912..5c80c24e 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -594,7 +594,7 @@ class PixivSeriesExtractor(PixivExtractor): class PixivNovelExtractor(PixivExtractor): """Extractor for pixiv novels""" subcategory = "novel" - request_interval = 1.0 + request_interval = (0.5, 1.5) pattern = BASE_PATTERN + r"/n(?:ovel/show\.php\?id=|/)(\d+)" example = 
"https://www.pixiv.net/novel/show.php?id=12345" diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py index 5a3bf5a9..be0dbde2 100644 --- a/gallery_dl/extractor/plurk.py +++ b/gallery_dl/extractor/plurk.py @@ -18,7 +18,7 @@ class PlurkExtractor(Extractor): """Base class for plurk extractors""" category = "plurk" root = "https://www.plurk.com" - request_interval = 1.0 + request_interval = (0.5, 1.5) def items(self): urls = self._urls_ex if self.config("comments", False) else self._urls diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index 9a6c8a5a..ab555d8d 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -18,7 +18,7 @@ class ReactorExtractor(BaseExtractor): basecategory = "reactor" filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}" archive_fmt = "{post_id}_{num}" - request_interval = 5.0 + request_interval = (3.0, 6.0) def __init__(self, match): BaseExtractor.__init__(self, match) diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 93e41be1..35698605 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -23,7 +23,7 @@ class ReadcomiconlineBase(): filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}" archive_fmt = "{issue_id}_{page}" root = "https://readcomiconline.li" - request_interval = (3.0, 7.0) + request_interval = (3.0, 6.0) def request(self, url, **kwargs): """Detect and handle redirects to CAPTCHA pages""" diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index 49c84195..f57f4798 100644 --- a/gallery_dl/extractor/twibooru.py +++ b/gallery_dl/extractor/twibooru.py @@ -22,7 +22,7 @@ class TwibooruExtractor(BooruExtractor): root = "https://twibooru.org" filename_fmt = "{id}_{filename}.{extension}" archive_fmt = "{id}" - request_interval = 6.05 + request_interval = (6.0, 6.1) page_start = 1 per_page = 50 @@ -44,7 +44,7 @@ 
class TwibooruExtractor(BooruExtractor): class TwibooruPostExtractor(TwibooruExtractor): """Extractor for single twibooru posts""" subcategory = "post" - request_interval = 1.0 + request_interval = (0.5, 1.5) pattern = BASE_PATTERN + r"/(\d+)" example = "https://twibooru.org/12345" diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index c9cd02f3..c22e67e6 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -21,7 +21,7 @@ class VkExtractor(Extractor): filename_fmt = "{id}.{extension}" archive_fmt = "{id}" root = "https://vk.com" - request_interval = 1.0 + request_interval = (0.5, 1.5) def items(self): sizes = "wzyxrqpo" From 57fc6fcf832a0a1706686be5168201c630b12513 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 18 Dec 2023 23:19:44 +0100 Subject: [PATCH 216/344] replace '24*3600' with '86400' and generalize cache maxage values --- gallery_dl/extractor/aryion.py | 2 +- gallery_dl/extractor/deviantart.py | 2 +- gallery_dl/extractor/exhentai.py | 2 +- gallery_dl/extractor/idolcomplex.py | 2 +- gallery_dl/extractor/imgbb.py | 2 +- gallery_dl/extractor/inkbunny.py | 2 +- gallery_dl/extractor/instagram.py | 2 +- gallery_dl/extractor/kemonoparty.py | 2 +- gallery_dl/extractor/mangadex.py | 2 +- gallery_dl/extractor/mastodon.py | 2 +- gallery_dl/extractor/newgrounds.py | 2 +- gallery_dl/extractor/nijie.py | 2 +- gallery_dl/extractor/oauth.py | 2 +- gallery_dl/extractor/pillowfort.py | 2 +- gallery_dl/extractor/pinterest.py | 2 +- gallery_dl/extractor/pixiv.py | 2 +- gallery_dl/extractor/reddit.py | 2 +- gallery_dl/extractor/sankaku.py | 2 +- gallery_dl/extractor/subscribestar.py | 2 +- gallery_dl/extractor/tapas.py | 2 +- gallery_dl/extractor/tsumino.py | 2 +- gallery_dl/extractor/twitter.py | 2 +- gallery_dl/extractor/vipergirls.py | 2 +- gallery_dl/oauth.py | 2 +- 24 files changed, 24 insertions(+), 24 deletions(-) diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index 
576bc832..ec862632 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -40,7 +40,7 @@ class AryionExtractor(Extractor): if username: self.cookies_update(self._login_impl(username, password)) - @cache(maxage=14*24*3600, keyarg=1) + @cache(maxage=14*86400, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index d34f312d..89cd6d1f 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1579,7 +1579,7 @@ class DeviantartEclipseAPI(): return token -@cache(maxage=100*365*86400, keyarg=0) +@cache(maxage=36500*86400, keyarg=0) def _refresh_token_cache(token): if token and token[0] == "#": return None diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index e6c38893..acad95ce 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -75,7 +75,7 @@ class ExhentaiExtractor(Extractor): self.original = False self.limits = False - @cache(maxage=90*24*3600, keyarg=1) + @cache(maxage=90*86400, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index af07ebd9..b9e2c3dd 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -67,7 +67,7 @@ class IdolcomplexExtractor(SankakuExtractor): self.logged_in = False - @cache(maxage=90*24*3600, keyarg=1) + @cache(maxage=90*86400, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index 6c0684ed..b926cb21 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -64,7 +64,7 @@ class ImgbbExtractor(Extractor): if username: self.cookies_update(self._login_impl(username, 
password)) - @cache(maxage=360*24*3600, keyarg=1) + @cache(maxage=365*86400, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index bdc45c38..62586af5 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -360,7 +360,7 @@ class InkbunnyAPI(): params["page"] += 1 -@cache(maxage=360*24*3600, keyarg=1) +@cache(maxage=365*86400, keyarg=1) def _authenticate_impl(api, username, password): api.extractor.log.info("Logging in as %s", username) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 8ec6741d..6eae7db3 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -977,7 +977,7 @@ class InstagramGraphqlAPI(): variables["after"] = extr._update_cursor(info["end_cursor"]) -@cache(maxage=90*24*3600, keyarg=1) +@cache(maxage=90*86400, keyarg=1) def _login_impl(extr, username, password): extr.log.error("Login with username & password is no longer supported. 
" "Use browser cookies instead.") diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index cba62110..c24e57d1 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -129,7 +129,7 @@ class KemonopartyExtractor(Extractor): self.cookies_update(self._login_impl( (username, self.cookies_domain), password)) - @cache(maxage=28*24*3600, keyarg=1) + @cache(maxage=28*86400, keyarg=1) def _login_impl(self, username, password): username = username[0] self.log.info("Logging in as %s", username) diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index dbaf4cb8..94bea570 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -266,6 +266,6 @@ class MangadexAPI(): return -@cache(maxage=28*24*3600, keyarg=0) +@cache(maxage=28*86400, keyarg=0) def _refresh_token_cache(username): return None diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index e277eef0..0b63d6c1 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -277,6 +277,6 @@ class MastodonAPI(): params = None -@cache(maxage=100*365*24*3600, keyarg=0) +@cache(maxage=36500*86400, keyarg=0) def _access_token_cache(instance): return None diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 5f43690e..4cdcf875 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -98,7 +98,7 @@ class NewgroundsExtractor(Extractor): if username: self.cookies_update(self._login_impl(username, password)) - @cache(maxage=360*24*3600, keyarg=1) + @cache(maxage=365*86400, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 54f29429..814d0843 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -126,7 +126,7 @@ class 
NijieExtractor(AsynchronousMixin, BaseExtractor): username, password = self._get_auth_info() self.cookies_update(self._login_impl(username, password)) - @cache(maxage=90*24*3600, keyarg=1) + @cache(maxage=90*86400, keyarg=1) def _login_impl(self, username, password): if not username or not password: raise exception.AuthenticationError( diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 65db94d0..16901607 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -376,7 +376,7 @@ class OAuthMastodon(OAuthBase): cache=mastodon._access_token_cache, ) - @cache(maxage=10*365*24*3600, keyarg=1) + @cache(maxage=36500*86400, keyarg=1) def _register(self, instance): self.log.info("Registering application for '%s'", instance) diff --git a/gallery_dl/extractor/pillowfort.py b/gallery_dl/extractor/pillowfort.py index ff591fb1..5362f13e 100644 --- a/gallery_dl/extractor/pillowfort.py +++ b/gallery_dl/extractor/pillowfort.py @@ -91,7 +91,7 @@ class PillowfortExtractor(Extractor): if username: self.cookies_update(self._login_impl(username, password)) - @cache(maxage=14*24*3600, keyarg=1) + @cache(maxage=14*86400, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index e9f124f1..93ef7ff6 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -422,7 +422,7 @@ class PinterestAPI(): if username: self.cookies.update(self._login_impl(username, password)) - @cache(maxage=180*24*3600, keyarg=1) + @cache(maxage=180*86400, keyarg=1) def _login_impl(self, username, password): self.extractor.log.info("Logging in as %s", username) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 5c80c24e..4414c71c 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -996,6 +996,6 @@ class PixivAppAPI(): params = text.parse_query(query) 
-@cache(maxage=10*365*24*3600, keyarg=0) +@cache(maxage=36500*86400, keyarg=0) def _refresh_token_cache(username): return None diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 4c6dd2c7..2ef0f9fb 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -531,7 +531,7 @@ class RedditAPI(): return util.bdecode(sid, "0123456789abcdefghijklmnopqrstuvwxyz") -@cache(maxage=100*365*24*3600, keyarg=0) +@cache(maxage=36500*86400, keyarg=0) def _refresh_token_cache(token): if token and token[0] == "#": return None diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 89412584..602895c4 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -285,7 +285,7 @@ class SankakuAPI(): return -@cache(maxage=365*24*3600, keyarg=1) +@cache(maxage=365*86400, keyarg=1) def _authenticate_impl(extr, username, password): extr.log.info("Logging in as %s", username) diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 6b4cba21..31fb891a 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -56,7 +56,7 @@ class SubscribestarExtractor(Extractor): if username: self.cookies_update(self._login_impl(username, password)) - @cache(maxage=28*24*3600, keyarg=1) + @cache(maxage=28*86400, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py index bfca7a62..0a9df20c 100644 --- a/gallery_dl/extractor/tapas.py +++ b/gallery_dl/extractor/tapas.py @@ -81,7 +81,7 @@ class TapasExtractor(Extractor): self.cookies.set( "adjustedBirthDate", "1981-02-03", domain=self.cookies_domain) - @cache(maxage=14*24*3600, keyarg=1) + @cache(maxage=14*86400, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) diff --git a/gallery_dl/extractor/tsumino.py 
b/gallery_dl/extractor/tsumino.py index de7cdfc0..bce661a5 100644 --- a/gallery_dl/extractor/tsumino.py +++ b/gallery_dl/extractor/tsumino.py @@ -27,7 +27,7 @@ class TsuminoBase(): self.cookies.setdefault( "ASP.NET_SessionId", "x1drgggilez4cpkttneukrc5") - @cache(maxage=14*24*3600, keyarg=1) + @cache(maxage=14*86400, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) url = "{}/Account/Login".format(self.root) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index f874f127..726b8a19 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1713,7 +1713,7 @@ class TwitterAPI(): } -@cache(maxage=360*86400, keyarg=1) +@cache(maxage=365*86400, keyarg=1) def _login_impl(extr, username, password): import re diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py index 4ee252ef..5374f1ce 100644 --- a/gallery_dl/extractor/vipergirls.py +++ b/gallery_dl/extractor/vipergirls.py @@ -45,7 +45,7 @@ class VipergirlsExtractor(Extractor): if username: self.cookies_update(self._login_impl(username, password)) - @cache(maxage=90*24*3600, keyarg=1) + @cache(maxage=90*86400, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) diff --git a/gallery_dl/oauth.py b/gallery_dl/oauth.py index ac38c4dc..8508ee1e 100644 --- a/gallery_dl/oauth.py +++ b/gallery_dl/oauth.py @@ -138,6 +138,6 @@ class OAuth1API(): return self.extractor.request(url, **kwargs) -@cache(maxage=100*365*24*3600, keyarg=0) +@cache(maxage=36500*86400, keyarg=0) def _token_cache(key): return None, None From a30a3e44d5e9ed4caf9cd602e88ad67ab49ef6ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 18 Dec 2023 23:45:58 +0100 Subject: [PATCH 217/344] [nijie] move 'username required' out of _login_impl --- gallery_dl/extractor/nijie.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 814d0843..57c31184 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -124,15 +124,15 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): return username, password = self._get_auth_info() - self.cookies_update(self._login_impl(username, password)) + if username: + return self.cookies_update(self._login_impl(username, password)) + + raise exception.AuthenticationError("Username and password required") @cache(maxage=90*86400, keyarg=1) def _login_impl(self, username, password): - if not username or not password: - raise exception.AuthenticationError( - "Username and password required") - self.log.info("Logging in as %s", username) + url = "{}/login_int.php".format(self.root) data = {"email": username, "password": password, "save": "on"} From 2a36937e45148fef8bc2b2ccf8b8a429d57ce879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 19 Dec 2023 01:53:17 +0100 Subject: [PATCH 218/344] add instructions for pulling dockerhub & ghcr images --- README.rst | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index fc43900c..9ecf0aed 100644 --- a/README.rst +++ b/README.rst @@ -142,9 +142,23 @@ Using the Dockerfile in the repository: cd gallery-dl/ docker build -t gallery-dl:latest . +Pulling image from `Docker Hub `__: + +.. code:: bash + + docker pull mikf123/gallery-dl + docker tag mikf123/gallery-dl gallery-dl + +Pulling image from `GitHub Container Registry `__: + +.. code:: bash + + docker pull ghcr.io/mikf/gallery-dl + docker tag ghcr.io/mikf/gallery-dl gallery-dl + To run the container you will probably want to attach some directories on the host so that the config file and downloads can persist across runs. -Make sure to either download the example config file reference in the repo and place it in the mounted volume location or touch an empty file there. 
+Make sure to either download the example config file reference in the repo and place it in the mounted volume location or touch an empty file there. If you gave the container a different tag or are using podman then make sure you adjust. Run ``docker image ls`` to check the name if you are not sure. From a94f9441487573ea84700936117f4535e78d32c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 20 Dec 2023 01:57:18 +0100 Subject: [PATCH 219/344] [twitter] default to 'tweets' timeline when 'replies' are enabled (#4953) --- docs/configuration.rst | 2 +- gallery_dl/extractor/twitter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index e7f4fc03..7d9f11f1 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3447,7 +3447,7 @@ Description * ``"tweets"``: `/tweets `__ timeline + search * ``"media"``: `/media `__ timeline + search * ``"with_replies"``: `/with_replies `__ timeline + search - * ``"auto"``: ``"tweets"`` or ``"media"``, depending on `retweets `__ and `text-tweets `__ settings + * ``"auto"``: ``"tweets"`` or ``"media"``, depending on `retweets `__, `replies `__, and `text-tweets `__ settings extractor.twitter.text-tweets diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 726b8a19..a607d67b 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -550,7 +550,7 @@ class TwitterTimelineExtractor(TwitterExtractor): def _select_tweet_source(self): strategy = self.config("strategy") if strategy is None or strategy == "auto": - if self.retweets or self.textonly: + if self.retweets or self.replies or self.textonly: return self.api.user_tweets else: return self.api.user_media From a75f85a2c20c4052610659dda434ac646264bd2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 20 Dec 2023 14:12:49 +0100 Subject: [PATCH 220/344] [twitter] remove 'date_liked' (#3850, #4108, #4657) 
Twitter's 'sortIndex' can't be used to calculate the timestamp of when a Tweet was liked anymore. --- gallery_dl/extractor/twitter.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index a607d67b..e0effb1c 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -603,12 +603,6 @@ class TwitterLikesExtractor(TwitterExtractor): def tweets(self): return self.api.user_likes(self.user) - def _transform_tweet(self, tweet): - tdata = TwitterExtractor._transform_tweet(self, tweet) - tdata["date_liked"] = text.parse_timestamp( - (int(tweet["sortIndex"] or 0) >> 20) // 1000) - return tdata - class TwitterBookmarkExtractor(TwitterExtractor): """Extractor for bookmarked tweets""" From 92ff99c8e55910ecb0c91d7cac67c76a336324dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 20 Dec 2023 14:38:36 +0100 Subject: [PATCH 221/344] [twitter] remove 'syndication' option (#3889) --- docs/configuration.rst | 26 -------------- gallery_dl/extractor/twitter.py | 62 +-------------------------------- 2 files changed, 1 insertion(+), 87 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 7d9f11f1..c19480e3 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3262,8 +3262,6 @@ Description for each Tweet in said timeline. Note: This requires at least 1 additional API call per initial Tweet. - Age-restricted replies cannot be expanded when using the - `syndication `__ API. extractor.twitter.include @@ -3331,30 +3329,6 @@ Description ``4096x4096``, ``orig``, ``large``, ``medium``, and ``small``. -extractor.twitter.syndication ------------------------------ -Type - * ``bool`` - * ``string`` -Default - ``false`` -Description - Controls how to retrieve age-restricted content when not logged in. - - * ``false``: Skip age-restricted Tweets. - * ``true``: Download using Twitter's syndication API. 
- * ``"extended"``: Try to fetch Tweet metadata using the normal API - in addition to the syndication API. This requires additional HTTP - requests in some cases (e.g. when `retweets `_ - are enabled). - - Note: This does not apply to search results (including - `timeline strategies `__). - To retrieve such content from search results, you must log in and - disable "Hide sensitive content" in your `search settings - `__. - - extractor.twitter.logout ------------------------ Type diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index e0effb1c..fdcefddc 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -45,7 +45,6 @@ class TwitterExtractor(Extractor): self.cards = self.config("cards", False) self.ads = self.config("ads", False) self.cards_blacklist = self.config("cards-blacklist") - self.syndication = self.config("syndication") if not self.config("transform", True): self._transform_user = util.identity @@ -367,9 +366,6 @@ class TwitterExtractor(Extractor): if "legacy" in user: user = user["legacy"] - elif "statuses_count" not in user and self.syndication == "extended": - # try to fetch extended user data - user = self.api.user_by_screen_name(user["screen_name"])["legacy"] uget = user.get if uget("withheld_scope"): @@ -865,7 +861,6 @@ class TwitterAPI(): self.root = "https://twitter.com/i/api" self._nsfw_warning = True - self._syndication = self.extractor.syndication self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode cookies = extractor.cookies @@ -1645,67 +1640,12 @@ class TwitterAPI(): tweet_id = entry["entryId"].rpartition("-")[2] if text.startswith("Age-restricted"): - if self._syndication: - return self._syndication_tweet(tweet_id) - elif self._nsfw_warning: + if self._nsfw_warning: self._nsfw_warning = False self.extractor.log.warning('"%s"', text) self.extractor.log.debug("Skipping %s (\"%s\")", tweet_id, text) - def _syndication_tweet(self, tweet_id): - base_url = 
"https://cdn.syndication.twimg.com/tweet-result?id=" - tweet = self.extractor.request(base_url + tweet_id).json() - - tweet["user"]["description"] = "" - tweet["user"]["entities"] = {"description": {}} - tweet["user_id_str"] = tweet["user"]["id_str"] - - if tweet["id_str"] != tweet_id: - tweet["retweeted_status_id_str"] = tweet["id_str"] - tweet["id_str"] = retweet_id = tweet_id - else: - retweet_id = None - - # assume 'conversation_id' is the same as 'id' when the tweet - # is not a reply - if "conversation_id_str" not in tweet and \ - "in_reply_to_status_id_str" not in tweet: - tweet["conversation_id_str"] = tweet["id_str"] - - if int(tweet_id) < 300000000000000: - tweet["created_at"] = text.parse_datetime( - tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ").strftime( - "%a %b %d %H:%M:%S +0000 %Y") - - if "video" in tweet: - video = tweet["video"] - video["variants"] = (max( - (v for v in video["variants"] if v["type"] == "video/mp4"), - key=lambda v: text.parse_int( - v["src"].split("/")[-2].partition("x")[0]) - ),) - video["variants"][0]["url"] = video["variants"][0]["src"] - tweet["extended_entities"] = {"media": [{ - "video_info" : video, - "original_info": {"width" : 0, "height": 0}, - }]} - elif "photos" in tweet: - for p in tweet["photos"]: - p["media_url_https"] = p["url"] - p["original_info"] = { - "width" : p["width"], - "height": p["height"], - } - tweet["extended_entities"] = {"media": tweet["photos"]} - - return { - "rest_id": tweet["id_str"], - "legacy" : tweet, - "core" : {"user_results": {"result": tweet["user"]}}, - "_retweet_id_str": retweet_id, - } - @cache(maxage=365*86400, keyarg=1) def _login_impl(extr, username, password): From 75fa1a5553101d05b6d453c82a15112fb318f572 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 20 Dec 2023 20:59:18 +0100 Subject: [PATCH 222/344] [pinterest] remove login code this has been broken since forever and is still "protected" by an invisible recaptcha check --- 
gallery_dl/extractor/pinterest.py | 37 ------------------------------- 1 file changed, 37 deletions(-) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 93ef7ff6..4b263934 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -10,7 +10,6 @@ from .common import Extractor, Message from .. import text, util, exception -from ..cache import cache import itertools BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+" @@ -33,7 +32,6 @@ class PinterestExtractor(Extractor): self.api = PinterestAPI(self) def items(self): - self.api.login() data = self.metadata() videos = self.config("videos", True) @@ -416,41 +414,6 @@ class PinterestAPI(): options = {"query": query, "scope": "pins", "rs": "typed"} return self._pagination("BaseSearch", options) - def login(self): - """Login and obtain session cookies""" - username, password = self.extractor._get_auth_info() - if username: - self.cookies.update(self._login_impl(username, password)) - - @cache(maxage=180*86400, keyarg=1) - def _login_impl(self, username, password): - self.extractor.log.info("Logging in as %s", username) - - url = self.root + "/resource/UserSessionResource/create/" - options = { - "username_or_email": username, - "password" : password, - } - data = { - "data" : util.json_dumps({"options": options}), - "source_url": "", - } - - try: - response = self.extractor.request( - url, method="POST", headers=self.headers, - cookies=self.cookies, data=data) - resource = response.json()["resource_response"] - except (exception.HttpError, ValueError, KeyError): - raise exception.AuthenticationError() - - if resource["status"] != "success": - raise exception.AuthenticationError() - return { - cookie.name: cookie.value - for cookie in response.cookies - } - def _call(self, resource, options): url = "{}/resource/{}Resource/get/".format(self.root, resource) params = { From fbebc58189623416aaa3b56fae8b9d49fb103ad5 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 21 Dec 2023 02:23:22 +0100 Subject: [PATCH 223/344] [deviantart] add 'intermediary' option (#4955) --- docs/configuration.rst | 11 +++++++++++ gallery_dl/extractor/deviantart.py | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index c19480e3..12232dd8 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1373,6 +1373,17 @@ Description It is possible to use ``"all"`` instead of listing all values separately. +extractor.deviantart.intermediary +--------------------------------- +Type + ``bool`` +Default + ``true`` +Description + For older non-downloadable images, + download a higher-quality ``/intermediary/`` version. + + extractor.deviantart.journals ----------------------------- Type diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 89cd6d1f..9be5b0df 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -48,6 +48,7 @@ class DeviantartExtractor(Extractor): self.quality = self.config("quality", "100") self.original = self.config("original", True) self.comments = self.config("comments", False) + self.intermediary = self.config("intermediary", True) self.api = DeviantartOAuthAPI(self) self.group = False @@ -136,7 +137,7 @@ class DeviantartExtractor(Extractor): elif self.jwt: self._update_token(deviation, content) elif content["src"].startswith("https://images-wixmp-"): - if deviation["index"] <= 790677560: + if self.intermediary and deviation["index"] <= 790677560: # https://github.com/r888888888/danbooru/issues/4069 intermediary, count = re.subn( r"(/f/[^/]+/[^/]+)/v\d+/.*", From 627ed794a20ce486c547affa34df3a303855dfb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 21 Dec 2023 14:39:38 +0100 Subject: [PATCH 224/344] [danbooru] provide 'tags' as list (#4942) keep the old 'tag_string' values around, similar to sankaku a lot of repeat 
code ... would be a lot less bad if "".split(" ") returned an empty list --- gallery_dl/extractor/danbooru.py | 19 +++++ test/results/danbooru.py | 125 +++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 881bd91a..09beb5f1 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -72,6 +72,25 @@ class DanbooruExtractor(BaseExtractor): post["date"] = text.parse_datetime( post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + post["tags"] = ( + post["tag_string"].split(" ") + if post["tag_string"] else ()) + post["tags_artist"] = ( + post["tag_string_artist"].split(" ") + if post["tag_string_artist"] else ()) + post["tags_character"] = ( + post["tag_string_character"].split(" ") + if post["tag_string_character"] else ()) + post["tags_copyright"] = ( + post["tag_string_copyright"].split(" ") + if post["tag_string_copyright"] else ()) + post["tags_general"] = ( + post["tag_string_general"].split(" ") + if post["tag_string_general"] else ()) + post["tags_meta"] = ( + post["tag_string_meta"].split(" ") + if post["tag_string_meta"] else ()) + if post["extension"] == "zip": if self.ugoira: post["frames"] = self._ugoira_frames(post) diff --git a/test/results/danbooru.py b/test/results/danbooru.py index 1bebf1ca..bad66413 100644 --- a/test/results/danbooru.py +++ b/test/results/danbooru.py @@ -75,7 +75,132 @@ __tests__ = ( "#class" : danbooru.DanbooruPostExtractor, "#sha1_content": "5e255713cbf0a8e0801dc423563c34d896bb9229", + "approver_id": None, + "bit_flags": 0, + "created_at": "2008-08-12T00:46:05.385-04:00", "date": "dt:2008-08-12 04:46:05", + "down_score": 0, + "extension": "jpg", + "fav_count": 9, + "file_ext": "jpg", + "file_size": 358232, + "file_url": "https://cdn.donmai.us/original/ac/8e/ac8e3b92ea328ce9cf7211e69c905bf9.jpg", + "filename": "ac8e3b92ea328ce9cf7211e69c905bf9", + "has_active_children": False, + "has_children": False, + 
"has_large": True, + "has_visible_children": False, + "id": 294929, + "image_height": 687, + "image_width": 895, + "is_banned": False, + "is_deleted": False, + "is_flagged": False, + "is_pending": False, + "large_file_url": "https://cdn.donmai.us/sample/ac/8e/sample-ac8e3b92ea328ce9cf7211e69c905bf9.jpg", + "last_comment_bumped_at": None, + "last_commented_at": None, + "last_noted_at": None, + "md5": "ac8e3b92ea328ce9cf7211e69c905bf9", + "media_asset": dict, + "parent_id": None, + "pixiv_id": 1129835, + "preview_file_url": "https://cdn.donmai.us/180x180/ac/8e/ac8e3b92ea328ce9cf7211e69c905bf9.jpg", + "rating": "s", + "score": 1, + "source": "https://i.pximg.net/img-original/img/2008/07/09/16/10/23/1129835_p0.jpg", + "subcategory": "post", + "tag_count": 32, + "tag_count_artist": 1, + "tag_count_character": 3, + "tag_count_copyright": 3, + "tag_count_general": 23, + "tag_count_meta": 2, + "tag_string": "2boys bat_(animal) batman batman_(series) black_bodysuit bodysuit bonocho brown_eyes closed_mouth collared_shirt commentary_request copyright_name dc_comics expressionless facepaint glasgow_smile heath_ledger joker_(dc) male_focus multiple_boys outline outstretched_arm parted_lips photoshop_(medium) pink_shirt shirt sketch smile the_dark_knight upper_body white_outline wing_collar", + "tag_string_artist": "bonocho", + "tag_string_character": "batman heath_ledger joker_(dc)", + "tag_string_copyright": "batman_(series) dc_comics the_dark_knight", + "tag_string_general": "2boys bat_(animal) black_bodysuit bodysuit brown_eyes closed_mouth collared_shirt copyright_name expressionless facepaint glasgow_smile male_focus multiple_boys outline outstretched_arm parted_lips pink_shirt shirt sketch smile upper_body white_outline wing_collar", + "tag_string_meta": "commentary_request photoshop_(medium)", + "tags": [ + "2boys", + "bat_(animal)", + "batman", + "batman_(series)", + "black_bodysuit", + "bodysuit", + "bonocho", + "brown_eyes", + "closed_mouth", + "collared_shirt", + 
"commentary_request", + "copyright_name", + "dc_comics", + "expressionless", + "facepaint", + "glasgow_smile", + "heath_ledger", + "joker_(dc)", + "male_focus", + "multiple_boys", + "outline", + "outstretched_arm", + "parted_lips", + "photoshop_(medium)", + "pink_shirt", + "shirt", + "sketch", + "smile", + "the_dark_knight", + "upper_body", + "white_outline", + "wing_collar", + ], + "tags_artist": [ + "bonocho", + ], + "tags_character": [ + "batman", + "heath_ledger", + "joker_(dc)", + ], + "tags_copyright": [ + "batman_(series)", + "dc_comics", + "the_dark_knight", + ], + "tags_general": [ + "2boys", + "bat_(animal)", + "black_bodysuit", + "bodysuit", + "brown_eyes", + "closed_mouth", + "collared_shirt", + "copyright_name", + "expressionless", + "facepaint", + "glasgow_smile", + "male_focus", + "multiple_boys", + "outline", + "outstretched_arm", + "parted_lips", + "pink_shirt", + "shirt", + "sketch", + "smile", + "upper_body", + "white_outline", + "wing_collar", + ], + "tags_meta": [ + "commentary_request", + "photoshop_(medium)", + ], + "up_score": range(1, 5), + "updated_at": "2022-07-11T23:42:31.881-04:00", + "uploader_id": 67005, }, { From ef370df41d4ace08db698a550488a5c6d248f11d Mon Sep 17 00:00:00 2001 From: bun-dev <34141271+bun-dev@users.noreply.github.com> Date: Sun, 17 Dec 2023 15:01:26 -0500 Subject: [PATCH 225/344] [shimmie2] support 'rule34hentai.net' - Add files via upload - Update shimmie2.py - Update shimme2.py - Delete gallery_dl/extractor/shimme2.py - spacefix - Update shimmie2.py - Update shimmie2.py - flask warnings1 - Update shimmie2.py - Update shimmie2.py --- gallery_dl/extractor/shimmie2.py | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py index 912e6013..299b2394 100644 --- a/gallery_dl/extractor/shimmie2.py +++ b/gallery_dl/extractor/shimmie2.py @@ -36,6 +36,9 @@ class Shimmie2Extractor(BaseExtractor): if self.category == "giantessbooru": 
self.posts = self._posts_giantessbooru + if self.category == "rule34hentai": + self.posts = self._posts_giantessbooru + def items(self): data = self.metadata() @@ -85,6 +88,10 @@ INSTANCES = { "pattern": r"booru\.cavemanon\.xyz", "file_url": "{0}/index.php?q=image/{2}.{4}", }, + "rule34hentai": { + "root": "https://rule34hentai.net", + "pattern": r"rule34hentai\.net", + }, } BASE_PATTERN = Shimmie2Extractor.update(INSTANCES) + r"/(?:index\.php\?q=/?)?" @@ -187,6 +194,56 @@ class Shimmie2TagExtractor(Shimmie2Extractor): if not extr('/{}">{}<'.format(pnum, pnum), ">"): return + def _posts_rule34hentai(self): + pnum = text.parse_int(self.page, 1) + file_url_fmt = self.file_url_fmt.format + + init = True + mime = "" + + while True: + url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) + page = self.request(url).text + extr = text.extract_from(page) + + if init: + init = False + has_mime = ("data-mime=\"" in page) + has_pid = ("data-post-id=\"" in page) + + while True: + if has_mime: + mime = extr("data-mime=\"", "\"") + if has_pid: + pid = extr("data-post-id=\"", "\"") + else: + pid = extr("href='/post/view/", "?") + + if not pid: + break + + tags, dimensions, size, ext = extr( + "title=\"", "\"").split(" // ") + width, _, height = dimensions.partition("x") + md5 = extr("/_thumbs/", "/") + + yield { + "file_url": file_url_fmt( + self.root, md5, pid, text.quote(tags), + mime.rpartition("/")[2] if mime else "jpg"), + "id": pid, + "md5": md5, + "tags": tags, + "width": width, + "height": height, + "size": text.parse_bytes(size[:-1]), + } + + pnum += 1 + if not extr(">Next<", ">"): + if not extr("/{}'>{}<".format(pnum, pnum), ">"): + return + class Shimmie2PostExtractor(Shimmie2Extractor): """Extractor for single shimmie2 posts""" @@ -234,3 +291,25 @@ class Shimmie2PostExtractor(Shimmie2Extractor): "height" : 0, "size" : 0, },) + + def _posts_rule34hentai(self): + url = "{}/post/view/{}".format(self.root, self.post_id) + extr = 
text.extract_from(self.request(url).text) + + post = { + "id" : self.post_id, + "tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"), + "md5" : extr("/_thumbs/", "/"), + "file_url": self.root + ( + extr("id='main_image' src=\"", "\"") or + extr("").partition( + " ")[0].strip("\"'"), + "size" : 0, + } + + if not post["md5"]: + post["md5"] = text.extr(post["file_url"], "/_images/", "/") + + return (post,) From 79e4606893fe03bf0aa59c2afd5cec541c6cd93d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Dec 2023 00:01:36 +0100 Subject: [PATCH 226/344] [rule34hentai] cleanup - fix using 'self._posts_rule34hentai' - fix 'file_url' for posts - update docs/supportedsites - add tests --- docs/supportedsites.md | 6 ++++ gallery_dl/extractor/shimmie2.py | 9 +++--- scripts/supportedsites.py | 1 + test/results/rule34hentai.py | 51 ++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 5 deletions(-) create mode 100644 test/results/rule34hentai.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e3b4ea7e..df61f122 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1371,6 +1371,12 @@ Consider all listed sites to potentially be NSFW. 
+ + + + + + diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py index 299b2394..725d05d2 100644 --- a/gallery_dl/extractor/shimmie2.py +++ b/gallery_dl/extractor/shimmie2.py @@ -35,9 +35,8 @@ class Shimmie2Extractor(BaseExtractor): if self.category == "giantessbooru": self.posts = self._posts_giantessbooru - - if self.category == "rule34hentai": - self.posts = self._posts_giantessbooru + elif self.category == "rule34hentai": + self.posts = self._posts_rule34hentai def items(self): data = self.metadata() @@ -301,8 +300,8 @@ class Shimmie2PostExtractor(Shimmie2Extractor): "tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"), "md5" : extr("/_thumbs/", "/"), "file_url": self.root + ( - extr("id='main_image' src=\"", "\"") or - extr("").partition( " ")[0].strip("\"'"), diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 2a625e76..11473b53 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -106,6 +106,7 @@ CATEGORY_MAP = { "redgifs" : "RedGIFs", "rozenarcana" : "Rozen Arcana", "rule34" : "Rule 34", + "rule34hentai" : "Rule34Hentai", "rule34us" : "Rule 34", "sankaku" : "Sankaku Channel", "sankakucomplex" : "Sankaku Complex", diff --git a/test/results/rule34hentai.py b/test/results/rule34hentai.py new file mode 100644 index 00000000..1d3cb291 --- /dev/null +++ b/test/results/rule34hentai.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import shimmie2 + + +__tests__ = ( +{ + "#url" : "https://rule34hentai.net/post/list/mizuki_kotora/1", + "#category": ("shimmie2", "rule34hentai", "tag"), + "#class" : shimmie2.Shimmie2TagExtractor, + "#urls" : ( + "https://rule34hentai.net/_images/7f3a411263d0f6de936e47ae8f9d35fb/332%20-%20Darkstalkers%20Felicia%20mizuki_kotora.jpeg", + "https://rule34hentai.net/_images/1a8eca7c04f8bf325bc993c5751a91c4/264%20-%20Darkstalkers%20Felicia%20mizuki_kotora.jpeg", + "https://rule34hentai.net/_images/09511511c4c9e9e1f9b795e059a60832/259%20-%20Darkstalkers%20Felicia%20mizuki_kotora.jpeg", + ), + + "extension" : "jpeg", + "file_url" : r"re:https://rule34hentai.net/_images/.+\.jpeg", + "filename" : r"re:\d+ - \w+", + "height" : range(496, 875), + "id" : range(259, 332), + "md5" : r"re:^[0-9a-f]{32}$", + "search_tags": "mizuki_kotora", + "size" : int, + "tags" : str, + "width" : range(500, 850), +}, + +{ + "#url" : "https://rule34hentai.net/post/view/264", + "#category": ("shimmie2", "rule34hentai", "post"), + "#class" : shimmie2.Shimmie2PostExtractor, + "#urls" : "https://rule34hentai.net/_images/1a8eca7c04f8bf325bc993c5751a91c4/264%20-%20Darkstalkers%20Felicia%20mizuki_kotora.jpg", + "#sha1_content": "6c23780bb78673cbff1bca9accb77ea11ec734f3", + + "extension": "jpg", + "file_url" : "https://rule34hentai.net/_images/1a8eca7c04f8bf325bc993c5751a91c4/264%20-%20Darkstalkers%20Felicia%20mizuki_kotora.jpg", + "filename" : "264 - Darkstalkers Felicia mizuki_kotora", + "height" : 875, + "id" : 264, + "md5" : "1a8eca7c04f8bf325bc993c5751a91c4", + "size" : 0, + "tags" : "Darkstalkers Felicia mizuki_kotora", + "width" : 657, +}, + +) From 2a60645095756bcfba4309f5409a7874bc0b3725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Dec 2023 14:49:10 +0100 Subject: [PATCH 227/344] [deviantart] set 'is_original' for intermediary URLs to 'false' --- gallery_dl/extractor/deviantart.py | 1 + test/results/deviantart.py | 5 ++++- 2 files 
changed, 5 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 9be5b0df..2ba47e1e 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -143,6 +143,7 @@ class DeviantartExtractor(Extractor): r"(/f/[^/]+/[^/]+)/v\d+/.*", r"/intermediary\1", content["src"], 1) if count: + deviation["is_original"] = False deviation["_fallback"] = (content["src"],) content["src"] = intermediary if self.quality: diff --git a/test/results/deviantart.py b/test/results/deviantart.py index ea8773d2..4196f32c 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -568,7 +568,10 @@ __tests__ = ( "#comment" : "wixmp URL rewrite /intermediary/", "#category": ("", "deviantart", "deviation"), "#class" : deviantart.DeviantartDeviationExtractor, - "#pattern" : r"https://images-wixmp-\w+\.wixmp\.com/intermediary/f/[^/]+/[^.]+\.jpg", + "#urls" : "https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/intermediary/f/4deb0f1a-cdef-444e-b194-c8d6b3f7e933/dd1xca2-7f835e62-6fd3-4b99-92c7-2bfd4e1b296f.jpg", + + "is_downloadable": False, + "is_original" : False, }, { From 7cd0211cc981ba99f59f5e6ac12413f8d4854442 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Dec 2023 15:25:28 +0100 Subject: [PATCH 228/344] [shimmie2] autodetect single or double quotes --- gallery_dl/extractor/shimmie2.py | 103 ++++++------------------------- 1 file changed, 20 insertions(+), 83 deletions(-) diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py index 725d05d2..3ba95eec 100644 --- a/gallery_dl/extractor/shimmie2.py +++ b/gallery_dl/extractor/shimmie2.py @@ -35,8 +35,6 @@ class Shimmie2Extractor(BaseExtractor): if self.category == "giantessbooru": self.posts = self._posts_giantessbooru - elif self.category == "rule34hentai": - self.posts = self._posts_rule34hentai def items(self): data = self.metadata() @@ -66,6 +64,13 @@ class 
Shimmie2Extractor(BaseExtractor): """Return an iterable containing data of all relevant posts""" return () + def _quote_type(self, page): + """Return quoting character used in 'page' (' or ")""" + try: + return page[page.index("{}<'.format(pnum, pnum), ">"): return - def _posts_rule34hentai(self): - pnum = text.parse_int(self.page, 1) - file_url_fmt = self.file_url_fmt.format - - init = True - mime = "" - - while True: - url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) - page = self.request(url).text - extr = text.extract_from(page) - - if init: - init = False - has_mime = ("data-mime=\"" in page) - has_pid = ("data-post-id=\"" in page) - - while True: - if has_mime: - mime = extr("data-mime=\"", "\"") - if has_pid: - pid = extr("data-post-id=\"", "\"") - else: - pid = extr("href='/post/view/", "?") - - if not pid: - break - - tags, dimensions, size, ext = extr( - "title=\"", "\"").split(" // ") - width, _, height = dimensions.partition("x") - md5 = extr("/_thumbs/", "/") - - yield { - "file_url": file_url_fmt( - self.root, md5, pid, text.quote(tags), - mime.rpartition("/")[2] if mime else "jpg"), - "id": pid, - "md5": md5, - "tags": tags, - "width": width, - "height": height, - "size": text.parse_bytes(size[:-1]), - } - - pnum += 1 - if not extr(">Next<", ">"): - if not extr("/{}'>{}<".format(pnum, pnum), ">"): - return - class Shimmie2PostExtractor(Shimmie2Extractor): """Extractor for single shimmie2 posts""" @@ -256,15 +213,17 @@ class Shimmie2PostExtractor(Shimmie2Extractor): def posts(self): url = "{}/post/view/{}".format(self.root, self.post_id) - extr = text.extract_from(self.request(url).text) + page = self.request(url).text + extr = text.extract_from(page) + quote = self._quote_type(page) post = { "id" : self.post_id, "tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"), "md5" : extr("/_thumbs/", "/"), "file_url": self.root + ( - extr("id='main_image' src='", "'") or - extr("").partition( " ")[0].strip("\"'"), @@ -290,25 +249,3 @@ class 
Shimmie2PostExtractor(Shimmie2Extractor): "height" : 0, "size" : 0, },) - - def _posts_rule34hentai(self): - url = "{}/post/view/{}".format(self.root, self.post_id) - extr = text.extract_from(self.request(url).text) - - post = { - "id" : self.post_id, - "tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"), - "md5" : extr("/_thumbs/", "/"), - "file_url": self.root + ( - extr('id="main_image" src="', '"') or - extr('").partition( - " ")[0].strip("\"'"), - "size" : 0, - } - - if not post["md5"]: - post["md5"] = text.extr(post["file_url"], "/_images/", "/") - - return (post,) From c184454efb48196346dfb7bbe23140369dead5ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Dec 2023 20:29:22 +0100 Subject: [PATCH 229/344] [shimmie2] small optimizations - unroll/remove loop - avoid copy --- gallery_dl/extractor/shimmie2.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py index 3ba95eec..8a08fabb 100644 --- a/gallery_dl/extractor/shimmie2.py +++ b/gallery_dl/extractor/shimmie2.py @@ -41,8 +41,9 @@ class Shimmie2Extractor(BaseExtractor): for post in self.posts(): - for key in ("id", "width", "height"): - post[key] = text.parse_int(post[key]) + post["id"] = text.parse_int(post["id"]) + post["width"] = text.parse_int(post["width"]) + post["height"] = text.parse_int(post["height"]) post["tags"] = text.unquote(post["tags"]) post.update(data) @@ -147,8 +148,11 @@ class Shimmie2TagExtractor(Shimmie2Extractor): if not pid: break - tags, dimensions, size = extr( - "title="+quote, quote).split(" // ")[:3] + data = extr("title="+quote, quote).split(" // ") + tags = data[0] + dimensions = data[1] + size = data[2] + width, _, height = dimensions.partition("x") md5 = extr("/_thumbs/", "/") From 8a42ea736ad86c294e945f2eb92dd773f8d7bb00 Mon Sep 17 00:00:00 2001 From: blankie Date: Sat, 23 Dec 2023 13:28:36 +1100 Subject: [PATCH 230/344] [postmill] implement 
suggestions --- docs/configuration.rst | 2 +- gallery_dl/extractor/postmill.py | 41 ++++++++++++++++---------------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index c49dc2c9..fb4b93c6 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2735,7 +2735,7 @@ Description extractor.[postmill].save-link-post-body ------------------------- +---------------------------------------- Type ``bool`` Default diff --git a/gallery_dl/extractor/postmill.py b/gallery_dl/extractor/postmill.py index 4d4b38a2..29b351ba 100644 --- a/gallery_dl/extractor/postmill.py +++ b/gallery_dl/extractor/postmill.py @@ -7,7 +7,6 @@ """Extractors for Postmill instances""" import re -import urllib.parse from .common import BaseExtractor, Message from .. import text, exception @@ -28,8 +27,8 @@ class PostmillExtractor(BaseExtractor): def items(self): for post_url in self.post_urls(): - response = self.request(post_url) - extr = text.extract_from(response.text) + page = self.request(post_url).text + extr = text.extract_from(page) title = text.unescape(extr( '')) @@ -52,7 +51,7 @@ class PostmillExtractor(BaseExtractor): id = int(match.group(2)) is_text_post = url.startswith("/") - is_image_post = self._search_image_tag(response.text) is not None + is_image_post = self._search_image_tag(page) is not None data = { "title": title, "date": date, @@ -60,7 +59,7 @@ class PostmillExtractor(BaseExtractor): "forum": forum, "id": id, "flair": [text.unescape(i) for i in text.extract_iter( - response.text, '', '')], + page, '', '')], "instance": self.instance, } @@ -90,32 +89,32 @@ class PostmillSubmissionsExtractor(PostmillExtractor): def __init__(self, match): PostmillExtractor.__init__(self, match) - self.base = match.group(3) - self.sorting_path = match.group(4) or "" + groups = match.groups() + self.base = groups[-3] + self.sorting_path = groups[-2] or "" self.query = {key: value for key, value in text.parse_query( - 
match.group(5) or "").items() if self.acceptable_query(key)} + groups[-1]).items() if self.acceptable_query(key)} def items(self): url = self.root + self.base + self.sorting_path - if self.query: - url += "?" + urllib.parse.urlencode(self.query) while url: - response = self.request(url) + response = self.request(url, params=self.query) if response.history: redirect_url = response.url if redirect_url == self.root + "/login": raise exception.StopExtraction( "HTTP redirect to login page (%s)", redirect_url) + page = response.text - for nav in text.extract_iter(response.text, + for nav in text.extract_iter(page, ''): post_url = text.unescape(text.extr(nav, '") + title = text.extr(title_container, "", "") + + return { + "manga" : text.unescape(manga), + "title" : text.unescape(title), + "author" : "", + "volume" : text.parse_int(volume), + "chapter" : text.parse_int(chapter), + "chapter_minor": sep + minor, + } + + def images(self, page): + images_container = text.extr(page, 'pageOpts', ':[0,0]}"') + images_container = text.unescape(images_container) + + return [(url, None) for url in text.extract_iter(images_container, r'\"', r'\"')] + + +class BatoMangaExtractor(BatoBase, MangaExtractor): + """Extractor for manga from bato.to""" + reverse = False + chapterclass = BatoChapterExtractor + pattern = BASE_PATTERN + "(" + MANGA_PATTERN + "$" + ")" + # There are two possible patterns for a manga + example = "https://bato.to/title/12345-manga-name-with-spaces/" + example2 = "https://bato.to/title/12345/" + # v2x, not supported + example3 = "https://bato.to/series/12345/manga-name-with-space" + + def chapters(self, page): + data = {} + num_chapters, _ = text.extract(page, ">Chapters<", "") + num_chapters, _ = text.extract(num_chapters, r"", r"") + num_chapters = text.parse_int(num_chapters) + if num_chapters == 0: + raise exception.NotFoundError("chapter") + + manga, _ = text.extract(page, '', r' - Read Free Manga Online at Bato.To') + manga = 
manga.encode('latin-1').decode('utf-8').replace("\n", "") + data["manga"] = manga + + results = [] + for chapter_num in range(num_chapters): + chapter, _ = text.extract(page, f'
") + chapter += r"" # Add this back in so we can match the date + url, pos = text.extract(chapter, '') + title, _ = text.extract(title, r"", r"") + if title is None or title == "" or title == "": + title, _ = text.extract(chapter, ">", "", pos) + + date, _ = text.extract(chapter, "") + date, _ = text.extract(date, 'time="', '"') + + data["date"] = date + data["title"] = title + data["chapter"] = text.parse_int(chapter_major) + data["chapter_minor"] = sep + chapter_minor + + if url.startswith("/"): + url = self.root + url + results.append((url, data.copy())) + return results \ No newline at end of file diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 4839660d..e3738b8b 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -32,6 +32,7 @@ CATEGORY_MAP = { "atfbooru" : "ATFBooru", "b4k" : "arch.b4k.co", "baraag" : "baraag", + "bato" : "Bato", "bbc" : "BBC", "comicvine" : "Comic Vine", "coomerparty" : "Coomer", diff --git a/test/results/bato.py b/test/results/bato.py new file mode 100644 index 00000000..18479f9a --- /dev/null +++ b/test/results/bato.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import bato +from gallery_dl import exception + +__tests__ = ( +{ + "#url" : "https://bato.to/title/86408-i-shall-master-this-family-official/1681030-ch_8", + "#category": ("", "bato", "chapter"), + "#class" : bato.BatoChapterExtractor, + "#count" : 66, + + "manga" : "I Shall Master this Family! 
[Official]", + "title" : "Observing", + "chapter" : 8, +}, +{ + "#url" : "https://bato.to/title/104929-86-eighty-six-official/1943513-vol_1-ch_5", + "#comment" : "volume (vol) in url", + "#category": ("", "bato", "chapter"), + "#class" : bato.BatoChapterExtractor, + "#count" : 7, + + "manga" : "86--EIGHTY-SIX (Official)", + "title" : "The Spearhead Squadron's Power", + "volume" : 1, + "chapter" : 5, +}, +{ + "#url" : "https://bato.to/title/113742-futsutsuka-na-akujo-de-wa-gozaimasu-ga-suuguu-chouso-torikae-den-official", + "#category": ("", "bato", "manga"), + "#class" : bato.BatoMangaExtractor, + "#count" : ">= 21", + + "manga" : "Futsutsuka na Akujo de wa Gozaimasu ga - Suuguu Chouso Torikae Den (Official)", +}, +{ + "#url" : "https://bato.to/title/104929-86-eighty-six-official", + "#comment" : "Manga with number in name", + "#category": ("", "bato", "manga"), + "#class" : bato.BatoMangaExtractor, + "#count" : ">= 18", + + "manga" : "86--EIGHTY-SIX (Official)", +}, +{ + "#url" : "https://bato.to/title/140046-the-grand-duke-s-fox-princess-mgchan", + "#comment" : "Non-English translation (Indonesian)", + "#category": ("", "bato", "manga"), + "#class" : bato.BatoMangaExtractor, + "#count" : ">= 29", + + "manga" : "The Grand Duke’s Fox Princess ⎝⎝MGCHAN⎠⎠", +}, +{ + "#url" : "https://bato.to/title/134270-removed", + "#category": ("", "bato", "manga"), + "#class" : bato.BatoMangaExtractor, + "#exception": exception.NotFoundError +} +) From 663b8d789a183d6465a45530eb511511b2d3faf7 Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Tue, 26 Dec 2023 23:41:37 -0500 Subject: [PATCH 237/344] Fix linting --- gallery_dl/extractor/bato.py | 42 +++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index c34b74fc..320f6999 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -14,27 +14,32 @@ 
BASE_PATTERN = r"(?:https?://)?(?:bato\.to|dto\.to|batotoo\.com|wto\.to)" MANGA_PATTERN = r"/title/\d+(?:-[0-9a-z]+)*/?" CHAPTER_PATTERN = r"/\d+(?:-vol_\d+)?-ch_\d+\.?\d*/?" + class BatoBase(): """Base class for bato v3x extractors""" category = "bato" root = "https://bato.to" + class BatoChapterExtractor(BatoBase, ChapterExtractor): """Extractor for manga chapters from bato.to""" pattern = BASE_PATTERN + "(" + MANGA_PATTERN + CHAPTER_PATTERN + ")" # There are three possible patterns for a chapter example = "https://bato.to/title/12345-manga-name-with-spaces/54212-ch_1.5" - example1 = "https://bato.to/title/12345-manga-name-with-spaces/54212-vol1-ch_1.5" - example2 = "https://bato.to/title/12345/54212" + example2 = \ + "https://bato.to/title/12345-manga-name-with-spaces/54212-vol1-ch_1.5" + example3 = "https://bato.to/title/12345/54212" # v2x, not supported - example3 = "https://bato.to/chapter/54212" + example4 = "https://bato.to/chapter/54212" def __init__(self, match): self.path = match.group(1) ChapterExtractor.__init__(self, match, self.root + self.path) def metadata(self, page): - info, _ = text.extract(page, '', r' - Read Free Manga Online at Bato.To') + info, _ = text.extract( + page, "", r" - Read Free Manga Online at Bato.To" + ) info = info.encode('latin-1').decode('utf-8').replace("\n", "") match = re.match( @@ -58,8 +63,10 @@ class BatoChapterExtractor(BatoBase, ChapterExtractor): def images(self, page): images_container = text.extr(page, 'pageOpts', ':[0,0]}"') images_container = text.unescape(images_container) - - return [(url, None) for url in text.extract_iter(images_container, r'\"', r'\"')] + return [ + (url, None) + for url in text.extract_iter(images_container, r"\"", r"\"") + ] class BatoMangaExtractor(BatoBase, MangaExtractor): @@ -80,28 +87,33 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): num_chapters = text.parse_int(num_chapters) if num_chapters == 0: raise exception.NotFoundError("chapter") - - manga, _ = text.extract(page, '', 
r' - Read Free Manga Online at Bato.To') + + manga, _ = text.extract( + page, "", r" - Read Free Manga Online at Bato.To" + ) manga = manga.encode('latin-1').decode('utf-8').replace("\n", "") data["manga"] = manga - + results = [] for chapter_num in range(num_chapters): - chapter, _ = text.extract(page, f'
") - chapter += r"" # Add this back in so we can match the date + chapter, _ = text.extract( + page, f'
" + ) + chapter += r"" # so we can match the date url, pos = text.extract(chapter, '') + title, _ = text.extract( + chapter, f'" + ) title, _ = text.extract(title, r"", r"") if title is None or title == "" or title == "": title, _ = text.extract(chapter, ">", "", pos) date, _ = text.extract(chapter, "") date, _ = text.extract(date, 'time="', '"') - + data["date"] = date data["title"] = title data["chapter"] = text.parse_int(chapter_major) @@ -110,4 +122,4 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): if url.startswith("/"): url = self.root + url results.append((url, data.copy())) - return results \ No newline at end of file + return results From 9c1ce28f688b1173508b347a8d975bb7ae6b0743 Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Tue, 26 Dec 2023 23:44:27 -0500 Subject: [PATCH 238/344] [bato] Added mangatoto alias --- gallery_dl/extractor/bato.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index 320f6999..b82416d5 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -10,7 +10,8 @@ from .common import ChapterExtractor, MangaExtractor from .. import text, exception import re -BASE_PATTERN = r"(?:https?://)?(?:bato\.to|dto\.to|batotoo\.com|wto\.to)" +BASE_PATTERN = r"(?:https?://)?" \ + r"(?:bato\.to|dto\.to|batotoo\.com|wto\.to|mangatoto\.com)" MANGA_PATTERN = r"/title/\d+(?:-[0-9a-z]+)*/?" CHAPTER_PATTERN = r"/\d+(?:-vol_\d+)?-ch_\d+\.?\d*/?" 
From 06ff1d3a3cfc0d9b1d1e84b8faf66e74f3d3aadc Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Tue, 26 Dec 2023 23:47:30 -0500 Subject: [PATCH 239/344] Replace text.extract with extr --- gallery_dl/extractor/bato.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index b82416d5..c885f27b 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -38,7 +38,7 @@ class BatoChapterExtractor(BatoBase, ChapterExtractor): ChapterExtractor.__init__(self, match, self.root + self.path) def metadata(self, page): - info, _ = text.extract( + info = text.extr( page, "", r" - Read Free Manga Online at Bato.To" ) info = info.encode('latin-1').decode('utf-8').replace("\n", "") @@ -83,13 +83,13 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): def chapters(self, page): data = {} - num_chapters, _ = text.extract(page, ">Chapters<", "
") - num_chapters, _ = text.extract(num_chapters, r"", r"") + num_chapters = text.extr(page, ">Chapters<", "
") + num_chapters = text.extr(num_chapters, r"", r"") num_chapters = text.parse_int(num_chapters) if num_chapters == 0: raise exception.NotFoundError("chapter") - manga, _ = text.extract( + manga = text.extr( page, "", r" - Read Free Manga Online at Bato.To" ) manga = manga.encode('latin-1').decode('utf-8').replace("\n", "") @@ -97,7 +97,7 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): results = [] for chapter_num in range(num_chapters): - chapter, _ = text.extract( + chapter = text.extr( page, f'
" ) chapter += r"" # so we can match the date @@ -105,15 +105,15 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): chapter_no = re.search(r"-ch_([\d\.]+)", url).group(1) chapter_major, sep, chapter_minor = chapter_no.partition(".") - title, _ = text.extract( + title = text.extr( chapter, f'" ) - title, _ = text.extract(title, r"", r"") + title = text.extr(title, r"", r"") if title is None or title == "" or title == "": title, _ = text.extract(chapter, ">", "", pos) - date, _ = text.extract(chapter, "") - date, _ = text.extract(date, 'time="', '"') + date = text.extr(chapter, "") + date = text.extr(date, 'time="', '"') data["date"] = date data["title"] = title From 2c3f171d653b91e2536a9829866a932f66f4f32c Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Tue, 26 Dec 2023 23:52:06 -0500 Subject: [PATCH 240/344] Fix python 3.5 linting issue --- gallery_dl/extractor/bato.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index c885f27b..87d6c3c6 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -49,8 +49,8 @@ class BatoChapterExtractor(BatoBase, ChapterExtractor): r"Chapter *([\d\.]+)", info) manga, volume, chapter = match.groups() if match else ("", "", info) chapter, sep, minor = chapter.partition(".") - title_container = text.extr(page, f'") - title = text.extr(title_container, "", "") + title_section = text.extr(page, '") + title = text.extr(title_section, "", "") return { "manga" : text.unescape(manga), From e348da7a06da77689320fcb565f5aa4dfb6c8bd1 Mon Sep 17 00:00:00 2001 From: Antonio Date: Thu, 21 Dec 2023 12:50:54 -0600 Subject: [PATCH 241/344] [poringa] add support --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/poringa.py | 129 +++++++++++++++++++++++++++++++ scripts/supportedsites.py | 3 + test/results/poringa.py | 47 +++++++++++ 5 files changed, 186 
insertions(+) create mode 100644 gallery_dl/extractor/poringa.py create mode 100644 test/results/poringa.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8e4c59a1..b538749b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -679,6 +679,12 @@ Consider all listed sites to potentially be NSFW.
+ + + + + + diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 695b8b2a..9c684bc0 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -122,6 +122,7 @@ modules = [ "pixnet", "plurk", "poipiku", + "poringa", "pornhub", "pornpics", "postmill", diff --git a/gallery_dl/extractor/poringa.py b/gallery_dl/extractor/poringa.py new file mode 100644 index 00000000..e5e80d57 --- /dev/null +++ b/gallery_dl/extractor/poringa.py @@ -0,0 +1,129 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for http://www.poringa.net/""" + +from .common import Extractor, Message +from .. import text, exception +from ..cache import cache +import itertools + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?poringa\.net" + + +class PoringaExtractor(Extractor): + category = "poringa" + directory_fmt = ("{category}", "{user}", "{post_id}") + filename_fmt = "{post_id}_{title}_{filename}.{extension}" + archive_fmt = "{post_id}" + root = "http://www.poringa.net" + + def __init__(self, match): + Extractor.__init__(self, match) + self.item = match.group(1) + self.__cookies = True + + def items(self): + for post_id in self.posts(): + url = "{}/posts/imagenes/{}".format(self.root, post_id) + + try: + page = self.request(url).text + except exception.HttpError as exc: + self.log.warning( + "Unable to fetch posts for '%s' (%s)", post_id, exc) + continue + + title, pos = text.extract( + page, 'property="og:title" content="', '"') + pos = page.index('
', '
') + for url in text.extract_iter( + main_post, + 'Please wait a few moments", 0, 600) < 0: + return response + self.sleep(5.0, "check") + + def _pagination(self, url, params): + for params["p"] in itertools.count(1): + page = self.request(url, params=params).text + + posts_ids = PoringaPostExtractor.pattern.findall(page) + posts_ids = list(dict.fromkeys(posts_ids)) + yield from posts_ids + + if len(posts_ids) < 19: + return + + +class PoringaPostExtractor(PoringaExtractor): + """Extractor for posts on poringa.net""" + subcategory = "post" + pattern = BASE_PATTERN + r"/posts/imagenes/(\d+)/[a-zA-Z0-9_-]+\.html" + example = "http://www.poringa.net/posts/imagenes/12/TITLE.html" + + def posts(self): + return (self.item,) + + +class PoringaUserExtractor(PoringaExtractor): + subcategory = "user" + pattern = BASE_PATTERN + r"/([a-zA-Z0-9_-]+)$" + example = "http://www.poringa.net/USER" + + def posts(self): + url = "{}/buscar/".format(self.root) + params = {"q": text.unquote(self.item)} + return self._pagination(url, params) + + +class PoringaSearchExtractor(PoringaExtractor): + subcategory = "search" + pattern = BASE_PATTERN + r"/buscar/\?&?q=([^&#]+)" + example = "http://www.poringa.net/buscar/?q=QUERY" + + def posts(self): + url = self.root + "/buscar/" + params = {"q": text.unquote(self.item)} + return self._pagination(url, params) + + +@cache() +def _cookie_cache(): + return () diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 4839660d..2995a46f 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -234,6 +234,9 @@ SUBCATEGORY_MAP = { "sketch": "Sketch", "work": "individual Images", }, + "poringa": { + "post": "Posts Images", + }, "pornhub": { "gifs": "", }, diff --git a/test/results/poringa.py b/test/results/poringa.py new file mode 100644 index 00000000..b6c4e95d --- /dev/null +++ b/test/results/poringa.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# 
it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import poringa + + +__tests__ = ( +{ + "#url" : "http://www.poringa.net/posts/imagenes/3051081/Turrita-alto-ojete.html", + "#category": ("", "poringa", "post"), + "#class" : poringa.PoringaPostExtractor, + "#pattern" : r"http://www\.poringa\.net/posts/imagenes/3051081/[a-zA-Z0-9_-]+\.html", + + "post_id" : "3051081", + "title" : "turrita alto ojete...", + "user" : "vipower1top", +}, + +{ + "#url" : "http://www.poringa.net/posts/imagenes/3095554/Otra-culona-de-instagram.html", + "#category": ("", "poringa", "post"), + "#class" : poringa.PoringaPostExtractor, + "#pattern" : r"http://www\.poringa\.net/posts/imagenes/3095554/[a-zA-Z0-9_-]+\.html", + + "post_id" : "3095554", + "title" : "Otra culona de instagram", + "user" : "Expectro007", +}, + +{ + "#url" : "http://www.poringa.net/Expectro007", + "#category": ("", "poringa", "user"), + "#class" : poringa.PoringaUserExtractor, + "#pattern" : r"https?://img-[0-9]\.poringa\.net/poringa/img/[a-zA-Z0-9/{2}]{12}[a-zA-Z0-9-_]+/[a-zA-Z0-9-_]+\.jpg", +}, + +{ + "#url" : "http://www.poringa.net/buscar/?&q=yuslopez", + "#category": ("", "poringa", "search"), + "#class" : poringa.PoringaSearchExtractor, + "#pattern" : r"https?://img-[0-9]\.poringa\.net/poringa/img/[a-zA-Z0-9/{2}]{12}[a-zA-Z0-9-_]+/[a-zA-Z0-9-_]+\.jpg", +}, + +) From 375f2db4c28477ba71acd05b03ebae55502d0fe9 Mon Sep 17 00:00:00 2001 From: blankie Date: Thu, 28 Dec 2023 01:06:48 +1100 Subject: [PATCH 242/344] [pinterest] add count metadata field --- gallery_dl/extractor/pinterest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 4b263934..c46a5879 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -47,6 +47,7 @@ class PinterestExtractor(Extractor): carousel_data = pin.get("carousel_data") if 
carousel_data: + pin["count"] = len(carousel_data["carousel_slots"]) for num, slot in enumerate(carousel_data["carousel_slots"], 1): slot["media_id"] = slot.pop("id") pin.update(slot) @@ -65,7 +66,7 @@ class PinterestExtractor(Extractor): if videos or media.get("duration") is None: pin.update(media) - pin["num"] = 0 + pin["num"] = pin["count"] = 1 pin["media_id"] = "" url = media["url"] From f36dafad063c43dd0b86da9621eac8df9c53e0b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 28 Dec 2023 19:07:04 +0100 Subject: [PATCH 243/344] improve 'include' handling (#4982) - remove spaces when given as string - warn about invalid vales --- gallery_dl/extractor/common.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 9b010c59..0dd05ef2 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -526,12 +526,15 @@ class Extractor(): if include == "all": include = extractors elif isinstance(include, str): - include = include.split(",") + include = include.replace(" ", "").split(",") result = [(Message.Version, 1)] for category in include: - if category in extractors: + try: extr, url = extractors[category] + except KeyError: + self.log.warning("Invalid include '%s'", category) + else: result.append((Message.Queue, url, {"_extractor": extr})) return iter(result) From 35530255847a30fb0eb70da6bb1937ffbd33ef81 Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Thu, 28 Dec 2023 17:07:41 -0500 Subject: [PATCH 244/344] Removed f-strings --- gallery_dl/extractor/bato.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index 87d6c3c6..082c5e0a 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -98,7 +98,9 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): results = [] for 
chapter_num in range(num_chapters): chapter = text.extr( - page, f'
" + page, + '
" ) chapter += r"" # so we can match the date url, pos = text.extract(chapter, '" + chapter, + '" ) title = text.extr(title, r"", r"") if title is None or title == "" or title == "": From f6ce870885a1df8dfed788c0c9c2cadee1c21f8f Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Thu, 28 Dec 2023 17:25:15 -0500 Subject: [PATCH 245/344] Better variable names --- gallery_dl/extractor/bato.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index 082c5e0a..d29a58bf 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -97,32 +97,33 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): results = [] for chapter_num in range(num_chapters): - chapter = text.extr( + chapter_info = text.extr( page, '
" ) - chapter += r"" # so we can match the date - url, pos = text.extract(chapter, '" # so we can match the date + url, pos = text.extract(chapter_info, '" ) title = text.extr(title, r"", r"") if title is None or title == "" or title == "": - title, _ = text.extract(chapter, ">", "", pos) + title, _ = text.extract(chapter_info, ">", "", pos) - date = text.extr(chapter, "") + date = text.extr(chapter_info, "") date = text.extr(date, 'time="', '"') data["date"] = date data["title"] = title - data["chapter"] = text.parse_int(chapter_major) - data["chapter_minor"] = sep + chapter_minor + data["chapter"] = text.parse_int(chapt_major) + data["chapter_minor"] = sep + chapt_minor if url.startswith("/"): url = self.root + url From 085411f3f13d691f283f1d3fcfb99d80bbb19b29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 30 Dec 2023 16:07:56 +0100 Subject: [PATCH 246/344] [rule34] recognize URLs with 'www' subdomain (#4984) --- gallery_dl/extractor/gelbooru_v02.py | 2 +- test/results/rule34.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 0864b9f6..0c8af3d5 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -168,7 +168,7 @@ INSTANCES = { }, "rule34": { "root": "https://rule34.xxx", - "pattern": r"rule34\.xxx", + "pattern": r"(?:www\.)?rule34\.xxx", "api_root": "https://api.rule34.xxx", }, "safebooru": { diff --git a/test/results/rule34.py b/test/results/rule34.py index ca90e511..f8fefa32 100644 --- a/test/results/rule34.py +++ b/test/results/rule34.py @@ -34,6 +34,13 @@ __tests__ = ( "#count" : 3, }, +{ + "#url" : "https://www.rule34.xxx/index.php?page=post&s=view&id=863", + "#comment" : "www subdomain", + "#category": ("gelbooru_v02", "rule34", "post"), + "#class" : gelbooru_v02.GelbooruV02PostExtractor, +}, + { "#url" : "https://rule34.xxx/index.php?page=post&s=view&id=863", "#category": 
("gelbooru_v02", "rule34", "post"), From caceb14fc2802237f67eb2b70b31d2c34ec055a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 30 Dec 2023 17:26:57 +0100 Subject: [PATCH 247/344] [tests] fail when a results file contains syntax errors or is otherwise not importable --- test/results/__init__.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test/results/__init__.py b/test/results/__init__.py index 0fe87462..c54bea56 100644 --- a/test/results/__init__.py +++ b/test/results/__init__.py @@ -13,12 +13,8 @@ __directory__ = os.path.dirname(__file__) @functools.lru_cache(maxsize=None) def tests(name): - try: - module = __import__(name, globals(), None, (), 1) - return module.__tests__ - except Exception as exc: - print(exc) - return () + module = __import__(name, globals(), None, (), 1) + return module.__tests__ def all(): From 00d83d9588c3fa9ed6b753d0c6baa2dc90ce4a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 30 Dec 2023 18:33:46 +0100 Subject: [PATCH 248/344] [rule34us] add fallback for 'video-cdn1' videos (#4985) --- gallery_dl/extractor/rule34us.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py index 6439a225..cf70cccb 100644 --- a/gallery_dl/extractor/rule34us.py +++ b/gallery_dl/extractor/rule34us.py @@ -38,7 +38,11 @@ class Rule34usExtractor(BooruExtractor): "height" : extr(' x ', 'h'), "file_url": extr(' src="', '"'), } - post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0] + + url = post["file_url"] + if "//video-cdn1." 
in url: + post["_fallback"] = (url.replace("//video-cdn1.", "//video."),) + post["md5"] = url.rpartition("/")[2].partition(".")[0] tags = collections.defaultdict(list) for tag_type, tag_name in self._find_tags(page): From 9f21c839ad6f0312fea3868568cdcb3313d09a94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 30 Dec 2023 20:37:09 +0100 Subject: [PATCH 249/344] [poringa] improvements and fixes - add 'num' and 'count' metadata fields - prevent crash for "private" posts - prevent crash when there's no 'main-info' - update tests --- gallery_dl/extractor/poringa.py | 47 ++++++++++++++++++++------------- test/results/poringa.py | 27 ++++++++++++------- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/gallery_dl/extractor/poringa.py b/gallery_dl/extractor/poringa.py index e5e80d57..0149d060 100644 --- a/gallery_dl/extractor/poringa.py +++ b/gallery_dl/extractor/poringa.py @@ -17,8 +17,8 @@ BASE_PATTERN = r"(?:https?://)?(?:www\.)?poringa\.net" class PoringaExtractor(Extractor): category = "poringa" directory_fmt = ("{category}", "{user}", "{post_id}") - filename_fmt = "{post_id}_{title}_{filename}.{extension}" - archive_fmt = "{post_id}" + filename_fmt = "{post_id}_{title}_{num:>03}_{filename}.{extension}" + archive_fmt = "{post_id}_{num}" root = "http://www.poringa.net" def __init__(self, match): @@ -31,36 +31,45 @@ class PoringaExtractor(Extractor): url = "{}/posts/imagenes/{}".format(self.root, post_id) try: - page = self.request(url).text + response = self.request(url) except exception.HttpError as exc: self.log.warning( "Unable to fetch posts for '%s' (%s)", post_id, exc) continue + if "/registro-login?" in response.url: + self.log.warning("Private post '%s'", post_id) + continue + + page = response.text title, pos = text.extract( page, 'property="og:title" content="', '"') - pos = page.index('
', '
') - for url in text.extract_iter( - main_post, - ' Date: Sat, 30 Dec 2023 22:25:59 +0100 Subject: [PATCH 250/344] [nijie] add 'count' metadata field https://github.com/mikf/gallery-dl/issues/146#issuecomment-1812849102 --- gallery_dl/extractor/nijie.py | 7 +++++-- test/results/horne.py | 3 +++ test/results/nijie.py | 7 +++++-- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 57c31184..b9917057 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -55,9 +55,12 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): else: data["user_id"] = data["artist_id"] data["user_name"] = data["artist_name"] - yield Message.Directory, data - for num, url in enumerate(self._extract_images(image_id, page)): + urls = list(self._extract_images(image_id, page)) + data["count"] = len(urls) + + yield Message.Directory, data + for num, url in enumerate(urls): image = text.nameext_from_url(url, { "num": num, "url": "https:" + url, diff --git a/test/results/horne.py b/test/results/horne.py index 9058a481..f6bddba8 100644 --- a/test/results/horne.py +++ b/test/results/horne.py @@ -83,6 +83,7 @@ __tests__ = ( "artist_id" : 58000, "artist_name": "のえるわ", + "count" : 1, "date" : "dt:2018-01-29 14:25:39", "description": "前回とシチュがまるかぶり \r\n竿野郎は塗るのだるかった", "extension" : "png", @@ -113,9 +114,11 @@ __tests__ = ( "artist_id" : 58000, "artist_name": "のえるわ", + "count" : 4, "date" : "dt:2018-02-04 14:47:24", "description": "ノエル「そんなことしなくても、言ってくれたら咥えるのに・・・♡」", "image_id" : 8716, + "num" : range(0, 3), "tags" : [ "男の娘", "フェラ", diff --git a/test/results/nijie.py b/test/results/nijie.py index 01ac8fac..a2c05c81 100644 --- a/test/results/nijie.py +++ b/test/results/nijie.py @@ -31,12 +31,13 @@ __tests__ = ( "artist_id" : 44, "artist_name": "ED", + "count" : 1, "date" : datetime.datetime, "description": str, "extension" : "jpg", "filename" : str, "image_id" : int, - "num" : int, + "num" : 0, "tags" 
: list, "title" : str, "url" : r"re:https://pic.nijie.net/\d+/nijie/.*jpg$", @@ -102,11 +103,12 @@ __tests__ = ( "#class" : nijie.NijieImageExtractor, "#urls" : "https://pic.nijie.net/06/nijie/14/44/44/illust/0_0_28e8c02d921bee33_9222d3.jpg", "#sha1_url" : "3d654e890212ba823c9647754767336aebc0a743", - "#sha1_metadata": "41da5d0e178b04f01fe72460185df52fadc3c91b", + "#sha1_metadata": "58e716bcb03b431cae901178c198c787908e1c0c", "#sha1_content" : "d85e3ea896ed5e4da0bca2390ad310a4df716ca6", "artist_id" : 44, "artist_name": "ED", + "count" : 1, "date" : "dt:2014-01-18 19:58:21", "description": "租絵にてお邪魔いたし候\r\n是非ともこの”おっぱい”をご高覧賜りたく馳せ参じた次第\r\n長文にて失礼仕る\r\n\r\nまず全景でありますが、首を右に傾けてみて頂きたい\r\nこの絵図は茶碗を眺めていた私が思わぬ美しさにて昇天したときのものを、筆をとり、したためたものである(トレースではない)\r\n筆は疾風の如く走り、半刻過ぎには私好みの”おっぱい”になっていたのである!\r\n次に細部をみて頂きたい\r\n絵図を正面から見直して頂くと、なんとはんなりと美しいお椀型をしたおっぱいであろうか  右手から緩やかに生まれる曲線は左手に進むにつれて、穏やかな歪みを含み流れる  これは所謂轆轤目であるが三重の紐でおっぱいをぐるぐると巻きつけた情景そのままであり、この歪みから茶碗の均整は崩れ、たぷんたぷんのおっぱいの重量感を醸し出している!\r\nさらに左手に進めば梅花皮(カイラギ)を孕んだ高大が現れる 今回は点線にて表現するが、その姿は乳首から母乳が噴出するが如く 或は精子をぶっかけられたが如く 白くとろっとした釉薬の凝固が素晴しい景色をつくりだしているのである!\r\n最後には極めつけ、すくっと螺旋を帯びながらそそり立つ兜巾(ときん)!この情景はまさしく乳首である!  
全体をふんわりと盛り上げさせる乳輪にちょこっと存在する乳頭はぺろりと舌で確かめ勃起させたくなる風情がある!\r\n\r\nこれを”おっぱい”と呼ばずなんと呼ぼうや!?\r\n\r\n興奮のあまり失礼致した\r\n御免", "extension" : "jpg", @@ -133,6 +135,7 @@ __tests__ = ( "artist_id" : 49509, "artist_name": "黒川 竜", + "count" : 4, "date" : "dt:2023-12-02 04:19:29", "description": "【DLサイトコム】ウィンターセール 30%OFF\r\n期間:2024年2月14日まで\r\n【toloveるドリンク】\r\nhttps://www.dlsite.com/maniax/work/=/product_id/RJ042727.html\r\n【toloveるドリンク2】\r\nhttps://www.dlsite.com/maniax/work/=/product_id/RJ043289.html\r\n【クランクランBIG】\r\nhttps://www.dlsite.com/maniax/work/=/product_id/RJ043564.html", "image_id" : 594044, From fe2147b3efe580f5b44bbab30f4d2ef30be7ea92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 31 Dec 2023 01:24:12 +0100 Subject: [PATCH 251/344] [docs] document 'write-pages' (#4543) --- docs/configuration.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index 2a9029ed..de180973 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1066,6 +1066,25 @@ Description after a colon ``:``, for example ``{date:%Y%m%d}``. +extractor.*.write-pages +----------------------- +Type + * ``bool`` + * ``string`` +Default + ``false`` +Description + During data extraction, + write received HTTP request data + to enumerated files in the current working directory. + + Special values: + + * ``"all"``: Include HTTP request and response headers. Hide ``Authorization``, ``Cookie``, and ``Set-Cookie`` values. + * ``"ALL"``: Include all HTTP request and response headers. 
+ + + Extractor-specific Options ========================== From 27d5fc3697f31f50a1a78df5224d42e8b43e9c53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 1 Jan 2024 16:08:10 +0100 Subject: [PATCH 252/344] [docs] document 'tls12' (#4543) https://github.com/mikf/gallery-dl/issues/4760#issuecomment-1793345940 --- docs/configuration.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index de180973..e922f813 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -627,6 +627,20 @@ Description `ssl.SSLContext.set_ciphers() `__ +extractor.*.tls12 +----------------- +Type + ``bool`` +Default + * ``true`` + * ``false`` for ``patreon``, ``pixiv:series`` +Description + Allow selecting TLS 1.2 cipher suites. + + Can be disabled to alter TLS fingerprints + and potentially bypass Cloudflare blocks. + + extractor.*.keywords -------------------- Type From 7aa1c9671baf6193cadc878c6a44477e8575dbc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 1 Jan 2024 02:51:34 +0100 Subject: [PATCH 253/344] [tests] fix 'invalid escape sequence' warnings --- test/results/4plebs.py | 2 +- test/results/imgbb.py | 2 +- test/results/paheal.py | 2 +- test/results/raddle.py | 4 ++-- test/results/wikiart.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/results/4plebs.py b/test/results/4plebs.py index bae62608..affe14d8 100644 --- a/test/results/4plebs.py +++ b/test/results/4plebs.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://archive.4plebs.org/tg/thread/54059290", "#category": ("foolfuuka", "4plebs", "thread"), "#class" : foolfuuka.FoolfuukaThreadExtractor, - "#pattern" : "https://i\.4pcdn\.org/tg/1[34]\d{11}\.(jpg|png|gif)", + "#pattern" : r"https://i\.4pcdn\.org/tg/1[34]\d{11}\.(jpg|png|gif)", "#count" : 30, }, diff --git a/test/results/imgbb.py b/test/results/imgbb.py index b2351d0f..e2d1bc33 100644 --- a/test/results/imgbb.py +++ 
b/test/results/imgbb.py @@ -21,7 +21,7 @@ __tests__ = ( "album_id" : "i5PggF", "album_name" : "British Scrap Book", "extension" : "jpg", - "id" : "re:^\w{7}$", + "id" : r"re:^\w{7}$", "title" : str, "url" : r"re:https://i\.ibb\.co/\w{7}/[\w-]+\.jpg", "user" : "folkie", diff --git a/test/results/paheal.py b/test/results/paheal.py index 1772593b..46b210f6 100644 --- a/test/results/paheal.py +++ b/test/results/paheal.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://rule34.paheal.net/post/list/Ayane_Suzuki/1", "#category": ("shimmie2", "paheal", "tag"), "#class" : paheal.PahealTagExtractor, - "#pattern" : "https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20|https://r34i\.paheal-cdn\.net/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}$", + "#pattern" : r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20|https://r34i\.paheal-cdn\.net/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}$", "#count" : range(70, 200), "date" : "type:datetime", diff --git a/test/results/raddle.py b/test/results/raddle.py index 4e60abb7..0c9de429 100644 --- a/test/results/raddle.py +++ b/test/results/raddle.py @@ -21,7 +21,7 @@ __tests__ = ( "#category": ("postmill", "raddle.me", "forum"), "#class" : postmill.PostmillForumExtractor, "#count" : 1, - "#pattern" : "^https://raddle\.me/f/traa/156646/click-here-to-go-to-f-traaaaaaannnnnnnnnns$", + "#pattern" : r"^https://raddle\.me/f/traa/156646/click-here-to-go-to-f-traaaaaaannnnnnnnnns$", }, { @@ -97,7 +97,7 @@ __tests__ = ( "#comment" : "Link + text post (with text disabled)", "#category": ("postmill", "raddle.me", "post"), "#class" : postmill.PostmillPostExtractor, - "#pattern" : "^https://fantasyanime\.com/anime/neo-tokyo-dub$", + "#pattern" : r"^https://fantasyanime\.com/anime/neo-tokyo-dub$", "#count" : 1, }, diff --git a/test/results/wikiart.py b/test/results/wikiart.py index 47eb3ec7..9ab13103 100644 --- a/test/results/wikiart.py +++ b/test/results/wikiart.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://www.wikiart.org/en/thomas-cole", "#category": ("", 
"wikiart", "artist"), "#class" : wikiart.WikiartArtistExtractor, - "#pattern" : "https://uploads\d+\.wikiart\.org/(\d+/)?images/thomas-cole/[\w()-]+\.(jpg|png)", + "#pattern" : r"https://uploads\d+\.wikiart\.org/(\d+/)?images/thomas-cole/[\w()-]+\.(jpg|png)", "#count" : "> 100", "albums" : None, From 63f649cd92a1aa2e70df8a2edbdb180ccae49f21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 1 Jan 2024 17:38:32 +0100 Subject: [PATCH 254/344] [idolcomplex] fix extraction & update URL patterns (#5002) --- gallery_dl/extractor/idolcomplex.py | 17 ++++++++------ test/results/idolcomplex.py | 36 ++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index b9e2c3dd..f70a948c 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -34,8 +34,11 @@ class IdolcomplexExtractor(SankakuExtractor): self.start_post = 0 def _init(self): + self.find_pids = re.compile( + r" href=[\"#]/\w\w/posts/([0-9a-f]+)" + ).findall self.find_tags = re.compile( - r'tag-type-([^"]+)">\s*
]+>\s*\s*]*?href="/[^?]*\?tags=([^"]+)' ).findall def items(self): @@ -149,8 +152,8 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)" - example = "https://idol.sankakucomplex.com/?tags=TAGS" + pattern = BASE_PATTERN + r"/(?:posts/?)?\?([^#]*)" + example = "https://idol.sankakucomplex.com/en/posts?tags=TAGS" per_page = 20 def __init__(self, match): @@ -196,7 +199,8 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor): page = self.request(self.root, params=params, retries=10).text pos = ((page.find('id="more-popular-posts-link"') + 1) or (page.find(' Date: Mon, 1 Jan 2024 22:05:21 +0100 Subject: [PATCH 255/344] [manganelo] fix extraction & recognize '.to' TLDs (#5005) --- gallery_dl/extractor/manganelo.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py index 46019ad8..232b98d4 100644 --- a/gallery_dl/extractor/manganelo.py +++ b/gallery_dl/extractor/manganelo.py @@ -10,7 +10,11 @@ from .common import ChapterExtractor, MangaExtractor from .. import text import re -BASE_PATTERN = r"(?:https?://)?((?:chap|read|www\.|m\.)?mangan(?:at|el)o\.com)" +BASE_PATTERN = ( + r"(?:https?://)?" 
+ r"((?:chap|read|www\.|m\.)?mangan(?:at|el)o" + r"\.(?:to|com))" +) class ManganeloBase(): @@ -67,10 +71,11 @@ class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor): def images(self, page): page = text.extr( - page, 'class="container-chapter-reader', '\n Date: Mon, 1 Jan 2024 22:58:42 +0100 Subject: [PATCH 256/344] [twitter] raise error for invalid 'strategy' values (#4953) --- gallery_dl/extractor/twitter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index fdcefddc..aa9ab9f6 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -552,9 +552,11 @@ class TwitterTimelineExtractor(TwitterExtractor): return self.api.user_media if strategy == "tweets": return self.api.user_tweets + if strategy == "media": + return self.api.user_media if strategy == "with_replies": return self.api.user_tweets_and_replies - return self.api.user_media + raise exception.StopExtraction("Invalid strategy '%s'", strategy) class TwitterTweetsExtractor(TwitterExtractor): From ee65f3de437b2d782d4d05765ecf93aa6ce19387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 2 Jan 2024 15:03:04 +0100 Subject: [PATCH 257/344] [docs] add parent>child example (#4621) --- docs/gallery-dl-example.conf | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf index c3f80493..cda584e3 100644 --- a/docs/gallery-dl-example.conf +++ b/docs/gallery-dl-example.conf @@ -176,16 +176,15 @@ "imgur": { - "#": "use different directory and filename formats when coming from a reddit post", - "directory": - { - "'_reddit' in locals()": [] - }, - "filename": - { - "'_reddit' in locals()": "{_reddit[id]} {id}.{extension}", - "" : "{id}.{extension}" - } + "#": "general imgur settings", + "filename": "{id}.{extension}" + }, + + "reddit>imgur": + { + "#": "special settings for imgur 
URLs found in reddit posts", + "directory": [], + "filename": "{_reddit[id]} {_reddit[title]} {id}.{extension}" }, "tumblr": From 4f3671458efc2d4f91baf31d2a1cfc54055872c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 2 Jan 2024 23:45:59 +0100 Subject: [PATCH 258/344] [deviantart] add 'avatar' and 'background' extractors (#4995) --- docs/configuration.rst | 8 ++- docs/supportedsites.md | 2 +- gallery_dl/extractor/deviantart.py | 56 ++++++++++++++-- test/results/deviantart.py | 100 ++++++++++++++++++++++++++++- 4 files changed, 154 insertions(+), 12 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index e922f813..cbc54a7d 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1401,7 +1401,13 @@ Description when processing a user profile. Possible values are - ``"gallery"``, ``"scraps"``, ``"journal"``, ``"favorite"``, ``"status"``. + ``"avatar"``, + ``"background"``, + ``"gallery"``, + ``"scraps"``, + ``"journal"``, + ``"favorite"``, + ``"status"``. It is possible to use ``"all"`` instead of listing all values separately. diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b538749b..dbdaac24 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -148,7 +148,7 @@ Consider all listed sites to potentially be NSFW.
- + diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 2ba47e1e..4b5f1d77 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -38,7 +38,7 @@ class DeviantartExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.user = match.group(1) or match.group(2) + self.user = (match.group(1) or match.group(2)).lower() self.offset = 0 def _init(self): @@ -104,7 +104,6 @@ class DeviantartExtractor(Extractor): raise exception.StopExtraction() else: self.subcategory = "group-" + self.subcategory - self.user = self.user.lower() self.group = True for deviation in self.deviations(): @@ -513,11 +512,13 @@ class DeviantartUserExtractor(DeviantartExtractor): def items(self): base = "{}/{}/".format(self.root, self.user) return self._dispatch_extractors(( - (DeviantartGalleryExtractor , base + "gallery"), - (DeviantartScrapsExtractor , base + "gallery/scraps"), - (DeviantartJournalExtractor , base + "posts"), - (DeviantartStatusExtractor , base + "posts/statuses"), - (DeviantartFavoriteExtractor, base + "favourites"), + (DeviantartAvatarExtractor , base + "avatar"), + (DeviantartBackgroundExtractor, base + "banner"), + (DeviantartGalleryExtractor , base + "gallery"), + (DeviantartScrapsExtractor , base + "gallery/scraps"), + (DeviantartJournalExtractor , base + "posts"), + (DeviantartStatusExtractor , base + "posts/statuses"), + (DeviantartFavoriteExtractor , base + "favourites"), ), ("gallery",)) @@ -538,6 +539,47 @@ class DeviantartGalleryExtractor(DeviantartExtractor): return self._folder_urls(folders, "gallery", DeviantartFolderExtractor) +class DeviantartAvatarExtractor(DeviantartExtractor): + """Extractor for an artist's avatar""" + subcategory = "avatar" + archive_fmt = "a_{_username}_{index}" + pattern = BASE_PATTERN + r"/avatar" + example = "https://www.deviantart.com/USER/avatar/" + + def deviations(self): + profile = self.api.user_profile(self.user.lower()) + if 
profile: + url = profile["user"]["usericon"] + return ({ + "author" : profile["user"], + "category" : "avatar", + "index" : text.parse_int(url.rpartition("?")[2]), + "is_deleted" : False, + "is_downloadable": False, + "published_time" : 0, + "title" : "avatar", + "content" : { + "src": url.replace("/avatars/", "/avatars-big/", 1), + }, + },) + return () + + +class DeviantartBackgroundExtractor(DeviantartExtractor): + """Extractor for an artist's banner""" + subcategory = "background" + archive_fmt = "b_{index}" + pattern = BASE_PATTERN + r"/ba(?:nner|ckground)" + example = "https://www.deviantart.com/USER/banner/" + + def deviations(self): + try: + return (self.api.user_profile(self.user.lower()) + ["cover_deviation"]["cover_deviation"],) + except Exception: + return () + + class DeviantartFolderExtractor(DeviantartExtractor): """Extractor for deviations inside an artist's gallery folder""" subcategory = "folder" diff --git a/test/results/deviantart.py b/test/results/deviantart.py index 4196f32c..45ee6c18 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -14,7 +14,7 @@ __tests__ = ( "#url" : "https://www.deviantart.com/shimoda7", "#category": ("", "deviantart", "user"), "#class" : deviantart.DeviantartUserExtractor, - "#pattern" : "/shimoda7/gallery$", + "#urls" : "https://www.deviantart.com/shimoda7/gallery", }, { @@ -22,8 +22,15 @@ __tests__ = ( "#category": ("", "deviantart", "user"), "#class" : deviantart.DeviantartUserExtractor, "#options" : {"include": "all"}, - "#pattern" : "/shimoda7/(gallery(/scraps)?|posts(/statuses)?|favourites)$", - "#count" : 5, + "#urls" : ( + "https://www.deviantart.com/shimoda7/avatar", + "https://www.deviantart.com/shimoda7/banner", + "https://www.deviantart.com/shimoda7/gallery", + "https://www.deviantart.com/shimoda7/gallery/scraps", + "https://www.deviantart.com/shimoda7/posts", + "https://www.deviantart.com/shimoda7/posts/statuses", + "https://www.deviantart.com/shimoda7/favourites", + ), }, { @@ 
-195,6 +202,93 @@ __tests__ = ( "#class" : deviantart.DeviantartGalleryExtractor, }, +{ + "#url" : "https://deviantart.com/shimoda7/avatar", + "#category": ("", "deviantart", "avatar"), + "#class" : deviantart.DeviantartAvatarExtractor, + "#urls" : "https://a.deviantart.net/avatars-big/s/h/shimoda7.jpg?4", + "#sha1_content": "abf2cc79b842315f2e54bfdd93bf794a0f612b6f", + + "author" : { + "type" : "premium", + "usericon": "https://a.deviantart.net/avatars/s/h/shimoda7.jpg?4", + "userid" : "9AE51FC7-0278-806C-3FFF-F4961ABF9E2B", + "username": "shimoda7", + }, + "content" : { + "src": "https://a.deviantart.net/avatars-big/s/h/shimoda7.jpg?4" + }, + "da_category" : "avatar", + "date" : "dt:1970-01-01 00:00:00", + "extension" : "jpg", + "filename" : "avatar_by_shimoda7-d4", + "index" : 4, + "index_base36" : "4", + "is_deleted" : False, + "is_downloadable": False, + "is_original" : True, + "published_time" : 0, + "target" : { + "extension": "jpg", + "filename" : "avatar_by_shimoda7-d4", + "src" : "https://a.deviantart.net/avatars-big/s/h/shimoda7.jpg?4" + }, + "title" : "avatar", + "username" : "shimoda7", +}, + +{ + "#url" : "https://deviantart.com/gdldev/banner", + "#category": ("", "deviantart", "background"), + "#class" : deviantart.DeviantartBackgroundExtractor, + "#pattern" : r"https://wixmp-\w+\.wixmp\.com/f/b042e0ae-a7ff-420b-a41a-b35503427360/dgntyqc-3deebb65-04b4-4085-992a-aa0c0e7e225d\.png\?token=ey[\w.-]+$", + "#sha1_content": "980eaa76ce515f1b6bef60dfadf26a5bbe9c583f", + + "allows_comments" : True, + "author" : { + "type" : "regular", + "usericon": "https://a.deviantart.net/avatars/g/d/gdldev.jpg?2", + "userid" : "1A12BA26-33C2-AA0A-7678-0B6DFBA7AC8E", + "username": "gdldev" + }, + "category_path" : "", + "content" : { + "filename" : "banner_by_gdldev_dgntyqc.png", + "filesize" : 84510, + "height" : 4000, + "src" : r"re:https://wixmp-\w+\.wixmp\.com/f/b042e0ae-a7ff-420b-a41a-b35503427360/dgntyqc-3deebb65-04b4-4085-992a-aa0c0e7e225d\.png\?token=ey[\w.-]+$", + 
"transparency": False, + "width" : 6400 + }, + "da_category" : "Uncategorized", + "date" : "dt:2024-01-02 21:16:06", + "deviationid" : "8C8D6B28-766A-DE21-7F7D-CE055C3BD50A", + "download_filesize": 84510, + "extension" : "png", + "filename" : "banner_by_gdldev-dgntyqc", + "index" : 1007488020, + "index_base36" : "gntyqc", + "is_blocked" : False, + "is_deleted" : False, + "is_downloadable" : True, + "is_favourited" : False, + "is_mature" : False, + "is_original" : True, + "is_published" : False, + "preview" : dict, + "printid" : None, + "published_time" : 1704230166, + "stats" : { + "comments" : 0, + "favourites": 0, + }, + "target" : dict, + "thumbs" : list, + "title" : "Banner", + "url" : "https://sta.sh/0198jippkeys", + "username" : "gdldev", +}, + { "#url" : "https://www.deviantart.com/shimoda7/gallery/722019/Miscellaneous", "#comment" : "user", From 00570028a365b514b8636d434112ad30e333f9a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 3 Jan 2024 01:25:50 +0100 Subject: [PATCH 259/344] [cookies] fix macOS Firefox profile path https://github.com/yt-dlp/yt-dlp/commit/85b33f5c163f60dbd089a6b9bc2ba1366d3ddf93 --- gallery_dl/cookies.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 416cc9a1..478abb63 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -215,9 +215,11 @@ def _firefox_cookies_database(profile=None, container=None): def _firefox_browser_directory(): if sys.platform in ("win32", "cygwin"): - return os.path.expandvars(r"%APPDATA%\Mozilla\Firefox\Profiles") + return os.path.expandvars( + r"%APPDATA%\Mozilla\Firefox\Profiles") if sys.platform == "darwin": - return os.path.expanduser("~/Library/Application Support/Firefox") + return os.path.expanduser( + "~/Library/Application Support/Firefox/Profiles") return os.path.expanduser("~/.mozilla/firefox") From 7eaf648f2e937114d5c3a2c60a98b3469acb5b40 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 4 Jan 2024 15:01:33 +0100 Subject: [PATCH 260/344] [fanbox] add 'metadata' option (#4921) extracts 'plan' and extended 'user' metadata --- docs/configuration.rst | 17 +++++++++- gallery_dl/extractor/fanbox.py | 59 +++++++++++++++++++++++++++++++--- test/results/fanbox.py | 37 +++++++++++++++++++++ 3 files changed, 107 insertions(+), 6 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index cbc54a7d..8a1752ee 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1569,7 +1569,7 @@ Default ``false`` Example * ``notes,pools`` - * ``["notes", "pools"`` + * ``["notes", "pools"]`` Description Extract additional metadata (notes, pool metadata) if available. @@ -1711,6 +1711,21 @@ Description * ``false``: Ignore embeds. +extractor.fanbox.metadata +------------------------- +Type + * ``bool`` + * ``string`` + * ``list`` of ``strings`` +Default + ``false`` +Example + * ``user,plan`` + * ``["user", "plan"]`` +Description + Extract ``plan`` and extended ``user`` metadata. + + extractor.flickr.access-token & .access-token-secret ---------------------------------------------------- Type diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 4572bea6..61a39283 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -8,6 +8,7 @@ from .common import Extractor, Message from .. 
import text +from ..cache import memcache import re BASE_PATTERN = ( @@ -27,8 +28,20 @@ class FanboxExtractor(Extractor): _warning = True def _init(self): + self.headers = {"Origin": self.root} self.embeds = self.config("embeds", True) + includes = self.config("metadata") + if includes: + if isinstance(includes, str): + includes = includes.split(",") + elif not isinstance(includes, (list, tuple)): + includes = ("user", "plan") + self._meta_user = ("user" in includes) + self._meta_plan = ("plan" in includes) + else: + self._meta_user = self._meta_plan = False + if self._warning: if not self.cookies_check(("FANBOXSESSID",)): self.log.warning("no 'FANBOXSESSID' cookie set") @@ -43,11 +56,9 @@ class FanboxExtractor(Extractor): """Return all relevant post objects""" def _pagination(self, url): - headers = {"Origin": self.root} - while url: url = text.ensure_http_scheme(url) - body = self.request(url, headers=headers).json()["body"] + body = self.request(url, headers=self.headers).json()["body"] for item in body["items"]: try: yield self._get_post_data(item["id"]) @@ -58,9 +69,8 @@ class FanboxExtractor(Extractor): def _get_post_data(self, post_id): """Fetch and process post data""" - headers = {"Origin": self.root} url = "https://api.fanbox.cc/post.info?postId="+post_id - post = self.request(url, headers=headers).json()["body"] + post = self.request(url, headers=self.headers).json()["body"] content_body = post.pop("body", None) if content_body: @@ -98,8 +108,47 @@ class FanboxExtractor(Extractor): post["text"] = content_body.get("text") if content_body else None post["isCoverImage"] = False + if self._meta_user: + post["user"] = self._get_user_data(post["creatorId"]) + if self._meta_plan: + plans = self._get_plan_data(post["creatorId"]) + post["plan"] = plans[post["feeRequired"]] + return content_body, post + @memcache(keyarg=1) + def _get_user_data(self, creator_id): + url = "https://api.fanbox.cc/creator.get" + params = {"creatorId": creator_id} + data = 
self.request(url, params=params, headers=self.headers).json() + + user = data["body"] + user.update(user.pop("user")) + + return user + + @memcache(keyarg=1) + def _get_plan_data(self, creator_id): + url = "https://api.fanbox.cc/plan.listCreator" + params = {"creatorId": creator_id} + data = self.request(url, params=params, headers=self.headers).json() + + plans = {0: { + "id" : "", + "title" : "", + "fee" : 0, + "description" : "", + "coverImageUrl" : "", + "creatorId" : creator_id, + "hasAdultContent": None, + "paymentMethod" : None, + }} + for plan in data["body"]: + del plan["user"] + plans[plan["fee"]] = plan + + return plans + def _get_urls_from_post(self, content_body, post): num = 0 cover_image = post.get("coverImageUrl") diff --git a/test/results/fanbox.py b/test/results/fanbox.py index 78f7fe54..32f13096 100644 --- a/test/results/fanbox.py +++ b/test/results/fanbox.py @@ -86,6 +86,43 @@ __tests__ = ( "content": r"re:(?s)^Greetings from FANBOX.\n \nAs of Monday, September 5th, 2022, we are happy to announce the start of the FANBOX hashtag event #MySetupTour ! \nAbout the event\nTo join this event .+ \nPlease check this page for further details regarding the Privacy & Terms.\nhttps://fanbox.pixiv.help/.+/10184952456601\n\n\nThank you for your continued support of FANBOX.$", }, +{ + "#url" : "https://official-en.fanbox.cc/posts/7022572", + "#comment" : "'plan' and 'user' metadata (#4921)", + "#category": ("", "fanbox", "post"), + "#class" : fanbox.FanboxPostExtractor, + "#options" : {"metadata": True}, + + "plan": { + "coverImageUrl" : "", + "creatorId" : "official-en", + "description" : "", + "fee" : 0, + "hasAdultContent": None, + "id" : "", + "paymentMethod" : None, + "title" : "", + }, + "user": { + "coverImageUrl" : "https://pixiv.pximg.net/c/1620x580_90_a2_g5/fanbox/public/images/creator/74349833/cover/n9mX8q4tUXHXXj7sK1RPWyUu.jpeg", + "creatorId" : "official-en", + "description" : "This is the official English pixivFANBOX account! 
\n(official Japanese account: https://official.fanbox.cc/ )\n\npixivFANBOX is a subscription service for building a reliable fan community where creators can nurture creative lifestyles together with their fans.\nFollowers can be notified of the updates from their favorite creators they are following. Supporters can enjoy closer communication with creators through exclusive content and their latest information.\n", + "hasAdultContent" : False, + "hasBoothShop" : False, + "iconUrl" : "https://pixiv.pximg.net/c/160x160_90_a2_g5/fanbox/public/images/user/74349833/icon/oJH0OoGoSixLrJXlnneNvC95.jpeg", + "isAcceptingRequest": False, + "isFollowed" : False, + "isStopped" : False, + "isSupported" : False, + "name" : "pixivFANBOX English", + "profileItems" : [], + "profileLinks" : [ + "https://twitter.com/pixivfanbox", + ], + "userId" : "74349833", + }, +}, + { "#url" : "https://mochirong.fanbox.cc/posts/3746116", "#comment" : "imageMap file order (#2718)", From a86775f6175460ea6ceb963567000c2b6e7002fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 4 Jan 2024 15:05:33 +0100 Subject: [PATCH 261/344] [gelbooru] fix 'favorite' extractor (#4903) lots of +1/-1 and = last: + pnum, last = divmod(count-1, self.per_page) + if self.offset > last: + # page number change self.offset -= last - diff, self.offset = divmod(self.offset, self.per_page) + diff, self.offset = divmod(self.offset-1, self.per_page) pnum -= diff + 1 skip = self.offset @@ -183,8 +184,8 @@ class GelbooruFavoriteExtractor(GelbooruBase, while True: favs = self._api_request(params, "favorite") - favs.reverse() + if skip: favs = favs[skip:] skip = 0 From 0f3013610997d67c6c952e2974597af7797e6957 Mon Sep 17 00:00:00 2001 From: enduser420 <91022934+enduser420@users.noreply.github.com> Date: Thu, 4 Jan 2024 21:38:59 +0530 Subject: [PATCH 262/344] [zzup] add 'gallery' extractor --- gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/zzup.py | 40 ++++++++++++++++++++++++++++++++ 
test/results/zzup.py | 31 +++++++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 gallery_dl/extractor/zzup.py create mode 100644 test/results/zzup.py diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 9c684bc0..8d974ecc 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -178,6 +178,7 @@ modules = [ "xhamster", "xvideos", "zerochan", + "zzup", "booru", "moebooru", "foolfuuka", diff --git a/gallery_dl/extractor/zzup.py b/gallery_dl/extractor/zzup.py new file mode 100644 index 00000000..45b0cd80 --- /dev/null +++ b/gallery_dl/extractor/zzup.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from .common import GalleryExtractor +from .. import text + + +class ZzupGalleryExtractor(GalleryExtractor): + category = "zzup" + directory_fmt = ("{category}", "{title}") + filename_fmt = "{slug}_{num:>03}.{extension}" + archive_fmt = "{slug}_{num}" + root = "https://zzup.com" + pattern = (r"(?:https?://)?(?:www\.)?zzup\.com(/content" + r"/[\w=]+/([^/?#]+)/[\w=]+)/(?:index|page-\d+)\.html") + example = "https://zzup.com/content/xyz=/12345_TITLE/123=/index.html" + + def __init__(self, match): + url = "{}/{}/index.html".format(self.root, match.group(1)) + GalleryExtractor.__init__(self, match, url) + self.slug = match.group(2) + + def metadata(self, page): + return { + "slug" : self.slug, + "title": text.unescape(text.extr( + page, "", ""))[:-11], + } + + def images(self, page): + path = text.extr(page, 'class="picbox">05}" + p2[4:] + return [(ufmt.format(num), None) for num in range(1, count + 1)] diff --git a/test/results/zzup.py b/test/results/zzup.py new file mode 100644 index 00000000..ad68e41c --- /dev/null +++ b/test/results/zzup.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- + +# This program is 
free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import zzup + + +__tests__ = ( +{ + "#url" : "https://zzup.com/content/NjM=/MetArt_20080206_viki_c_sensazioni_by_ingret/OTE=/index.html", + "#category": ("", "zzup", "gallery"), + "#class" : zzup.ZzupGalleryExtractor, + "#pattern" : r"https://zzup\.com/MjAxNjc3OTIyMjE5Nzk=/showimage/zzup-8769086487/image00\d\d\d-5896498214-1-9689595623/MetArt-20080206_viki_c_sensazioni_by_ingret/9879560327/zzup.com.jpg", + + "slug" : "MetArt_20080206_viki_c_sensazioni_by_ingret", + "title" : "MetArt 20080206 viki c sensazioni by ingret", + "num" : int, + "count" : 135, +}, + +{ + "#url" : "https://zzup.com/content/MTc2MDYxMw==/Courtesan/NDA=/page-1.html", + "#category": ("", "zzup", "gallery"), + "#class" : zzup.ZzupGalleryExtractor, + "#pattern" : r"https://zzup.com/MjAxNjc3OTIyMjE5Nzk=/showimage/zzup-8769086487/image000\d\d-5896498214-40-9689595623/Courtesan/9879560327/zzup.com.jpg", +}, + +) + From 0ab0a10d2dff8a32177f45c23f48c58e5493b725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 5 Jan 2024 02:26:22 +0100 Subject: [PATCH 263/344] [jpgfish] update domain --- gallery_dl/extractor/chevereto.py | 2 +- test/results/jpgfish.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py index 21166bdb..2bf200b0 100644 --- a/gallery_dl/extractor/chevereto.py +++ b/gallery_dl/extractor/chevereto.py @@ -35,7 +35,7 @@ class CheveretoExtractor(BaseExtractor): BASE_PATTERN = CheveretoExtractor.update({ "jpgfish": { - "root": "https://jpg2.su", + "root": "https://jpg4.su", "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)", }, "pixl": { diff --git a/test/results/jpgfish.py b/test/results/jpgfish.py index bf35bf7a..354e2ff5 100644 --- a/test/results/jpgfish.py +++ 
b/test/results/jpgfish.py @@ -9,7 +9,7 @@ from gallery_dl.extractor import chevereto __tests__ = ( { - "#url" : "https://jpg2.su/img/funnymeme.LecXGS", + "#url" : "https://jpg4.su/img/funnymeme.LecXGS", "#category": ("chevereto", "jpgfish", "image"), "#class" : chevereto.CheveretoImageExtractor, "#urls" : "https://simp3.jpg.church/images/funnymeme.jpg", From b4bcf40278e79628a81f58d7640f41630a9c66b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 5 Jan 2024 17:18:33 +0100 Subject: [PATCH 264/344] [weibo] fix AttributeError in 'user' extractor (#5022) yet another bug caused by a383eca7 --- gallery_dl/extractor/weibo.py | 3 --- test/results/weibo.py | 47 +++++++++++++++++++++++++++++------ 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 7413b5a0..3bd06489 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -225,9 +225,6 @@ class WeiboUserExtractor(WeiboExtractor): pattern = USER_PATTERN + r"(?:$|#)" example = "https://weibo.com/USER" - def initialize(self): - pass - def items(self): base = "{}/u/{}?tabtype=".format(self.root, self._user_id()) return self._dispatch_extractors(( diff --git a/test/results/weibo.py b/test/results/weibo.py index 639994c0..68e27f8f 100644 --- a/test/results/weibo.py +++ b/test/results/weibo.py @@ -13,7 +13,35 @@ __tests__ = ( "#url" : "https://weibo.com/1758989602", "#category": ("", "weibo", "user"), "#class" : weibo.WeiboUserExtractor, - "#pattern" : r"^https://weibo\.com/u/1758989602\?tabtype=feed$", + "#urls" : "https://weibo.com/u/1758989602?tabtype=feed", +}, + +{ + "#url" : "https://weibo.com/1758989602", + "#category": ("", "weibo", "user"), + "#class" : weibo.WeiboUserExtractor, + "#options" : {"include": "all"}, + "#urls" : ( + "https://weibo.com/u/1758989602?tabtype=home", + "https://weibo.com/u/1758989602?tabtype=feed", + "https://weibo.com/u/1758989602?tabtype=video", + 
"https://weibo.com/u/1758989602?tabtype=newVideo", + "https://weibo.com/u/1758989602?tabtype=album", + ), +}, + +{ + "#url" : "https://weibo.com/zhouyuxi77", + "#category": ("", "weibo", "user"), + "#class" : weibo.WeiboUserExtractor, + "#urls" : "https://weibo.com/u/7488709788?tabtype=feed", +}, + +{ + "#url" : "https://www.weibo.com/n/周于希Sally", + "#category": ("", "weibo", "user"), + "#class" : weibo.WeiboUserExtractor, + "#urls" : "https://weibo.com/u/7488709788?tabtype=feed", }, { @@ -69,9 +97,11 @@ __tests__ = ( "#class" : weibo.WeiboFeedExtractor, "#range" : "1", - "status": {"user": { - "id" : 7488709788, -}}, + "status": { + "user": { + "id": 7488709788, + }, + }, }, { @@ -80,9 +110,12 @@ __tests__ = ( "#class" : weibo.WeiboFeedExtractor, "#range" : "1", - "status": {"user": { - "id" : 7488709788, -}}, + + "status": { + "user": { + "id": 7488709788, + }, + }, }, { From e61f016465c0edd71725c11dadfb66da57decce5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 5 Jan 2024 17:56:39 +0100 Subject: [PATCH 265/344] [szurubooru] support 'snootbooru.com' (#5023) --- docs/supportedsites.md | 8 ++- gallery_dl/extractor/szurubooru.py | 4 ++ test/results/snootbooru.py | 79 ++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 test/results/snootbooru.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index dbdaac24..c0fee2a8 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1037,7 +1037,7 @@ Consider all listed sites to potentially be NSFW. - + @@ -1409,6 +1409,12 @@ Consider all listed sites to potentially be NSFW. 
+ + + + + + diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py index 5415bf30..08cccab6 100644 --- a/gallery_dl/extractor/szurubooru.py +++ b/gallery_dl/extractor/szurubooru.py @@ -87,6 +87,10 @@ BASE_PATTERN = SzurubooruExtractor.update({ "root": "https://booru.bcbnsfw.space", "pattern": r"booru\.bcbnsfw\.space", }, + "snootbooru": { + "root": "https://snootbooru.com", + "pattern": r"snootbooru\.com", + }, }) diff --git a/test/results/snootbooru.py b/test/results/snootbooru.py new file mode 100644 index 00000000..822bad6e --- /dev/null +++ b/test/results/snootbooru.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import szurubooru + + +__tests__ = ( +{ + "#url" : "https://snootbooru.com/posts/query=sport", + "#category": ("szurubooru", "snootbooru", "tag"), + "#class" : szurubooru.SzurubooruTagExtractor, + "#pattern" : r"https://snootbooru\.com/data/posts/\d+_[0-9a-f]{16}\.\w+", + "#count" : range(35, 50), +}, + +{ + "#url" : "https://snootbooru.com/post/14511", + "#category": ("szurubooru", "snootbooru", "post"), + "#class" : szurubooru.SzurubooruPostExtractor, + "#urls" : "https://snootbooru.com/data/posts/14511_e753313112755da6.png", + "#sha1_content": "e69e61e61c5372514808480aae3a8e355c9cd6fb", + + "canvasHeight" : 1000, + "canvasWidth" : 1414, + "checksum" : "e69e61e61c5372514808480aae3a8e355c9cd6fb", + "checksumMD5" : "f4f4ddfcbdf367f466ede0980acb3d7d", + "commentCount" : int, + "comments" : list, + "contentUrl" : "data/posts/14511_e753313112755da6.png", + "creationTime" : "2023-12-02T01:11:01.433664Z", + "date" : "dt:2023-12-02 01:11:01", + "extension" : "png", + "favoriteCount": int, + "favoritedBy" : list, + "featureCount" : int, + "fileSize" : 270639, + "filename" : "14511_e753313112755da6", + "flags" : [], + 
"hasCustomThumbnail": False, + "id" : 14511, + "lastEditTime" : "2023-12-02T01:12:09.500217Z", + "lastFeatureTime": None, + "mimeType" : "image/png", + "noteCount" : 0, + "notes" : [], + "ownFavorite" : False, + "ownScore" : 0, + "pools" : [], + "relationCount": 0, + "relations" : [], + "safety" : "safe", + "score" : 0, + "source" : None, + "tagCount" : 3, + "tags" : [ + "transparent", + "sport", + "text", + ], + "tags_default" : [ + "sport", + "text" + ], + "tags_type" : [ + "transparent" + ], + "thumbnailUrl" : "data/generated-thumbnails/14511_e753313112755da6.jpg", + "type" : "image", + "user" : { + "avatarUrl": "data/avatars/komp.png", + "name": "komp" + }, + "version" : 2, +}, + +) From 217fa7f8a1d42c53730807adfcbf9e4b730902d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 5 Jan 2024 18:16:33 +0100 Subject: [PATCH 266/344] include 'test/results' in flake8 checks --- setup.cfg | 3 +- test/results/__init__.py | 1 - test/results/blogspot.py | 1 - test/results/nitter1d4us.py | 6 +- test/results/pillowfort.py | 124 ++++++++++++++++++------------------ test/results/unsplash.py | 2 +- 6 files changed, 67 insertions(+), 70 deletions(-) diff --git a/setup.cfg b/setup.cfg index e115e874..a5e01b66 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,8 @@ [flake8] -exclude = .git,__pycache__,build,dist,archive,results +exclude = .git,__pycache__,build,dist,archive ignore = E203,E226,W504 per-file-ignores = setup.py: E501 gallery_dl/extractor/500px.py: E501 gallery_dl/extractor/mangapark.py: E501 + test/results/*.py: E122,E241,E402,E501 diff --git a/test/results/__init__.py b/test/results/__init__.py index c54bea56..0865693b 100644 --- a/test/results/__init__.py +++ b/test/results/__init__.py @@ -5,7 +5,6 @@ # published by the Free Software Foundation. 
import os -import sys import functools __directory__ = os.path.dirname(__file__) diff --git a/test/results/blogspot.py b/test/results/blogspot.py index 83f4e5f7..75ecff92 100644 --- a/test/results/blogspot.py +++ b/test/results/blogspot.py @@ -43,7 +43,6 @@ __tests__ = ( "extension": "jpg", "filename" : "Icy-Moonrise---For-Web", "num" : 1, - "num" : int, "url" : "https://3.bp.blogspot.com/-zlJddJtJOUo/Tt4WooTPNtI/AAAAAAAABG8/dGT2cGp2E7Y/s0/Icy-Moonrise---For-Web.jpg", }, diff --git a/test/results/nitter1d4us.py b/test/results/nitter1d4us.py index 4c6c3d12..b816b44f 100644 --- a/test/results/nitter1d4us.py +++ b/test/results/nitter1d4us.py @@ -41,10 +41,8 @@ __tests__ = ( "#category": ("nitter", "nitter.1d4.us", "tweet"), "#class" : nitter.NitterTweetExtractor, - "content": r"""re:Gear up for #PokemonSwordShieldEX with special Mystery Gifts! - -You’ll be able to receive four Galarian form Pokémon with Hidden Abilities, plus some very useful items. It’s our \(Mystery\) Gift to you, Trainers! - + "content": r"""re:Gear up for #PokemonSwordShieldEX with special Mystery Gifts! \n +You’ll be able to receive four Galarian form Pokémon with Hidden Abilities, plus some very useful items. It’s our \(Mystery\) Gift to you, Trainers! 
\n ❓🎁➡️ """, }, diff --git a/test/results/pillowfort.py b/test/results/pillowfort.py index b04be6f3..0d260b91 100644 --- a/test/results/pillowfort.py +++ b/test/results/pillowfort.py @@ -71,58 +71,58 @@ __tests__ = ( "#pattern" : r"https://img2\.pillowfort\.social/posts/c8e834bc09e6_Brandee\.png", "#count" : 1, - "avatar_frame" : None, - "avatar_id" : None, - "avatar_url" : "https://img3.pillowfort.social/avatars/000/037/139/original/437.jpg?1545015697", - "b2_lg_url" : "https://img2.pillowfort.social/posts/c8e834bc09e6_Brandee.png", - "b2_sm_url" : "https://img2.pillowfort.social/posts/c8e834bc09e6_Brandee_small.png", - "cached_tag_list": "art, digital art, mermaid, mermaids, underwater, seaweed, illustration, speed paint", - "col" : 0, - "comm_screening_status": "not_applicable", - "commentable" : True, - "comments_count": 0, - "community_id" : None, - "concealed_comment_warning": None, - "content" : "

Sea Bed

", - "created_at" : r"re:2020-02-.+", - "currentuser_default_avatar_url": None, - "currentuser_multi_avi": None, - "date" : "dt:2020-02-29 17:09:03", - "deleted" : None, - "deleted_at" : None, - "deleted_by_mod": None, - "deleted_for_flag_id": None, - "embed_code" : None, - "extension" : "png", - "filename" : "Brandee", - "hash" : "c8e834bc09e6", - "id" : 720167, - "last_activity" : r"re:2020-02-.+", - "last_activity_elapsed": r"re:\d+ months", - "last_edited_at": None, - "likes_count" : 8, - "media_type" : "picture", - "nsfw" : False, - "num" : 1, - "original_post_id": None, - "original_post_user_id": None, - "pic_row_last" : 1, - "picture_content_type": None, - "picture_file_name": None, - "picture_file_size": None, - "picture_updated_at": None, - "post_id" : 1124584, - "post_type" : "picture", - "privacy" : "public", - "reblog_copy_info": [], - "rebloggable" : True, - "reblogged_from_post_id": None, - "reblogged_from_user_id": None, - "reblogs_count" : int, - "row" : 1, - "small_image_url": None, - "tag_list" : None, - "tags" : [ + "avatar_frame" : None, + "avatar_id" : None, + "avatar_url" : "https://img3.pillowfort.social/avatars/000/037/139/original/437.jpg?1545015697", + "b2_lg_url" : "https://img2.pillowfort.social/posts/c8e834bc09e6_Brandee.png", + "b2_sm_url" : "https://img2.pillowfort.social/posts/c8e834bc09e6_Brandee_small.png", + "cached_tag_list": "art, digital art, mermaid, mermaids, underwater, seaweed, illustration, speed paint", + "col" : 0, + "comm_screening_status": "not_applicable", + "commentable" : True, + "comments_count": 0, + "community_id" : None, + "concealed_comment_warning": None, + "content" : "

Sea Bed

", + "created_at" : r"re:2020-02-.+", + "currentuser_default_avatar_url": None, + "currentuser_multi_avi": None, + "date" : "dt:2020-02-29 17:09:03", + "deleted" : None, + "deleted_at" : None, + "deleted_by_mod": None, + "deleted_for_flag_id": None, + "embed_code" : None, + "extension" : "png", + "filename" : "Brandee", + "hash" : "c8e834bc09e6", + "id" : 720167, + "last_activity" : r"re:2020-02-.+", + "last_activity_elapsed": r"re:\d+ months", + "last_edited_at": None, + "likes_count" : 8, + "media_type" : "picture", + "nsfw" : False, + "num" : 1, + "original_post_id": None, + "original_post_user_id": None, + "pic_row_last" : 1, + "picture_content_type": None, + "picture_file_name": None, + "picture_file_size": None, + "picture_updated_at": None, + "post_id" : 1124584, + "post_type" : "picture", + "privacy" : "public", + "reblog_copy_info": [], + "rebloggable" : True, + "reblogged_from_post_id": None, + "reblogged_from_user_id": None, + "reblogs_count" : int, + "row" : 1, + "small_image_url": None, + "tag_list" : None, + "tags" : [ "art", "digital art", "mermaid", @@ -130,16 +130,16 @@ __tests__ = ( "underwater", "seaweed", "illustration", - "speed paint" - ], - "time_elapsed" : r"re:\d+ months", - "timestamp" : str, - "title" : "", - "updated_at" : r"re:2020-02-.+", - "url" : "", - "user_concealed": None, - "user_id" : 37201, - "username" : "Maclanahan", + "speed paint", + ], + "time_elapsed" : r"re:\d+ months", + "timestamp" : str, + "title" : "", + "updated_at" : r"re:2020-02-.+", + "url" : "", + "user_concealed": None, + "user_id" : 37201, + "username" : "Maclanahan", }, { diff --git a/test/results/unsplash.py b/test/results/unsplash.py index e3413aff..01692eec 100644 --- a/test/results/unsplash.py +++ b/test/results/unsplash.py @@ -81,7 +81,7 @@ __tests__ = ( "full" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=srgb&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=85", "raw" : 
"https://images.unsplash.com/photo-1601823984263-b87b59798b70?ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3", "regular" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=1080", - "small" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=400", + "small" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=400", "small_s3": "https://s3.us-west-2.amazonaws.com/images.unsplash.com/small/photo-1601823984263-b87b59798b70", "thumb" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=200", }, From 11150a7d72a68647b5960702e984b15d784b061f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 5 Jan 2024 21:32:04 +0100 Subject: [PATCH 267/344] [nudecollect] remove module --- docs/supportedsites.md | 12 ++-- gallery_dl/extractor/__init__.py | 1 - gallery_dl/extractor/nudecollect.py | 87 ----------------------------- test/results/nudecollect.py | 56 ------------------- test/results/zzup.py | 1 - 5 files changed, 6 insertions(+), 151 deletions(-) delete mode 100644 gallery_dl/extractor/nudecollect.py delete mode 100644 test/results/nudecollect.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c0fee2a8..23459d0c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -589,12 +589,6 @@ Consider all listed sites to potentially be NSFW.
- - - - - - @@ -1003,6 +997,12 @@ Consider all listed sites to potentially be NSFW. + + + + + + diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 8d974ecc..6fca0120 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -107,7 +107,6 @@ modules = [ "nitter", "nozomi", "nsfwalbum", - "nudecollect", "paheal", "patreon", "philomena", diff --git a/gallery_dl/extractor/nudecollect.py b/gallery_dl/extractor/nudecollect.py deleted file mode 100644 index bda5d774..00000000 --- a/gallery_dl/extractor/nudecollect.py +++ /dev/null @@ -1,87 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://nudecollect.com/""" - -from .common import GalleryExtractor -from .. import text - - -class NudecollectExtractor(GalleryExtractor): - """Base class for Nudecollect extractors""" - category = "nudecollect" - directory_fmt = ("{category}", "{title}") - filename_fmt = "{slug}_{num:>03}.{extension}" - archive_fmt = "{slug}_{num}" - root = "https://www.nudecollect.com" - - def request(self, url, **kwargs): - kwargs["allow_redirects"] = False - return GalleryExtractor.request(self, url, **kwargs) - - @staticmethod - def get_title(page): - return text.unescape(text.extr(page, "", ""))[31:] - - @staticmethod - def get_image(page): - return text.extr(page, '05}" + p2[4:] - return [(ufmt.format(num), None) for num in range(1, self.count + 1)] diff --git a/test/results/nudecollect.py b/test/results/nudecollect.py deleted file mode 100644 index 423c915f..00000000 --- a/test/results/nudecollect.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. 
- -from gallery_dl.extractor import nudecollect - - -__tests__ = ( -{ - "#url" : "https://www.nudecollect.com/content/20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust/image-4-pics-108-mirror-43.html", - "#category": ("", "nudecollect", "image"), - "#class" : nudecollect.NudecollectImageExtractor, - "#pattern" : r"https://mirror\d+\.nudecollect\.com/showimage/nudecollect-8769086487/image00004-5896498214-43-9689595623/20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust/9879560327/nudecollect\.com\.jpg", - - "slug" : "20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust", - "title" : "20201220 Teenpornstorage Patritcy Vanessa Lesbian Lust", - "num" : 4, - "count" : 108, - "mirror": 43, -}, - -{ - "#url" : "https://www.nudecollect.com/content/20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust/image-10-pics-108-mirror-43.html", - "#category": ("", "nudecollect", "image"), - "#class" : nudecollect.NudecollectImageExtractor, -}, - -{ - "#url" : "https://www.nudecollect.com/content/20170219_TheWhiteBoxxx_Caprice_Tracy_Loves_Hot_ass_fingering_and_sensual_lesbian_sex_with_alluring_Czech_babes_x125_1080px/index-mirror-67-125.html", - "#category": ("", "nudecollect", "album"), - "#class" : nudecollect.NudecollectAlbumExtractor, - "#pattern" : r"https://mirror\d+\.nudecollect\.com/showimage/nudecollect-8769086487/image00\d\d\d-5896498214-67-9689595623/20170219_TheWhiteBoxxx_Caprice_Tracy_Loves_Hot_ass_fingering_and_sensual_lesbian_sex_with_alluring_Czech_babes_x125_1080px/9879560327/nudecollect\.com\.jpg", - "#count" : 125, - - "slug" : "20170219_TheWhiteBoxxx_Caprice_Tracy_Loves_Hot_ass_fingering_and_sensual_lesbian_sex_with_alluring_Czech_babes_x125_1080px", - "title" : "20170219 TheWhiteBoxxx Caprice Tracy Loves Hot ass fingering and sensual lesbian sex with alluring Czech babes x125 1080px", - "num" : int, - "mirror": 67, -}, - -{ - "#url" : 
"https://www.nudecollect.com/content/20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust/page-1-pics-108-mirror-43.html", - "#category": ("", "nudecollect", "album"), - "#class" : nudecollect.NudecollectAlbumExtractor, - "#pattern" : r"https://mirror\d+\.nudecollect\.com/showimage/nudecollect-8769086487/image00\d\d\d-5896498214-43-9689595623/20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust/9879560327/nudecollect\.com\.jpg", - "#count" : 108, - - "slug" : "20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust", - "title" : "20201220 Teenpornstorage Patritcy Vanessa Lesbian Lust", - "num" : int, - "mirror": 43, -}, - -) diff --git a/test/results/zzup.py b/test/results/zzup.py index ad68e41c..87b9bada 100644 --- a/test/results/zzup.py +++ b/test/results/zzup.py @@ -28,4 +28,3 @@ __tests__ = ( }, ) - From 3aa24c3744474a4fe06ebdec946a895c4f9d538c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 6 Jan 2024 00:51:52 +0100 Subject: [PATCH 268/344] [bato] simplify and update --- docs/supportedsites.md | 2 +- gallery_dl/extractor/bato.py | 149 ++++++++++++++++------------------- test/results/bato.py | 2 +- 3 files changed, 70 insertions(+), 83 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 6040cd47..c1acadd2 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -99,7 +99,7 @@ Consider all listed sites to potentially be NSFW. - + diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index d29a58bf..83404a75 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -4,61 +4,63 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://bato.to and aliases (v3x only)""" +"""Extractors for https://bato.to/""" -from .common import ChapterExtractor, MangaExtractor +from .common import Extractor, ChapterExtractor, MangaExtractor from .. 
import text, exception import re -BASE_PATTERN = r"(?:https?://)?" \ - r"(?:bato\.to|dto\.to|batotoo\.com|wto\.to|mangatoto\.com)" -MANGA_PATTERN = r"/title/\d+(?:-[0-9a-z]+)*/?" -CHAPTER_PATTERN = r"/\d+(?:-vol_\d+)?-ch_\d+\.?\d*/?" +BASE_PATTERN = (r"(?:https?://)?" + r"(?:(?:ba|d|w)to\.to|\.to|(?:batotoo|mangatoto)\.com)") class BatoBase(): - """Base class for bato v3x extractors""" + """Base class for bato extractors""" category = "bato" root = "https://bato.to" + def request(self, url, **kwargs): + kwargs["encoding"] = "utf-8" + return Extractor.request(self, url, **kwargs) + class BatoChapterExtractor(BatoBase, ChapterExtractor): - """Extractor for manga chapters from bato.to""" - pattern = BASE_PATTERN + "(" + MANGA_PATTERN + CHAPTER_PATTERN + ")" - # There are three possible patterns for a chapter - example = "https://bato.to/title/12345-manga-name-with-spaces/54212-ch_1.5" - example2 = \ - "https://bato.to/title/12345-manga-name-with-spaces/54212-vol1-ch_1.5" - example3 = "https://bato.to/title/12345/54212" - # v2x, not supported - example4 = "https://bato.to/chapter/54212" + """Extractor for bato.to manga chapters""" + pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)" + example = "https://bato.to/title/12345-MANGA/54321" def __init__(self, match): - self.path = match.group(1) - ChapterExtractor.__init__(self, match, self.root + self.path) + self.root = text.root_from_url(match.group(0)) + self.chapter_id = match.group(1) + url = "{}/title/0/{}".format(self.root, self.chapter_id) + ChapterExtractor.__init__(self, match, url) def metadata(self, page): - info = text.extr( - page, "", r" - Read Free Manga Online at Bato.To" - ) - info = info.encode('latin-1').decode('utf-8').replace("\n", "") + extr = text.extract_from(page) + manga, info, _ = extr("", "<").rsplit(" - ", 3) + manga_id = extr("/title/", "/") match = re.match( - r"(.+) - " - r"(?:Volume *(\d+) )?" 
- r"Chapter *([\d\.]+)", info) - manga, volume, chapter = match.groups() if match else ("", "", info) - chapter, sep, minor = chapter.partition(".") - title_section = text.extr(page, '<a href="' + self.path + '"', "</a>") - title = text.extr(title_section, "<!-- -->", "</span>") + r"(?:Volume\s+(\d+) )?" + r"\w+\s+(\d+)(.*)", info) + if match: + volume, chapter, minor = match.groups() + title = text.remove_html(extr( + "selected>", "</option")).partition(" : ")[2] + else: + volume = chapter = 0 + minor = "" + title = info return { "manga" : text.unescape(manga), + "manga_id" : text.parse_int(manga_id), "title" : text.unescape(title), - "author" : "", "volume" : text.parse_int(volume), "chapter" : text.parse_int(chapter), - "chapter_minor": sep + minor, + "chapter_minor": minor, + "chapter_id" : text.parse_int(self.chapter_id), + "date" : text.parse_timestamp(extr(' time="', '"')[:-3]), } def images(self, page): @@ -71,61 +73,46 @@ class BatoChapterExtractor(BatoBase, ChapterExtractor): class BatoMangaExtractor(BatoBase, MangaExtractor): - """Extractor for manga from bato.to""" + """Extractor for bato.to manga""" reverse = False chapterclass = BatoChapterExtractor - pattern = BASE_PATTERN + "(" + MANGA_PATTERN + "$" + ")" - # There are two possible patterns for a manga - example = "https://bato.to/title/12345-manga-name-with-spaces/" - example2 = "https://bato.to/title/12345/" - # v2x, not supported - example3 = "https://bato.to/series/12345/manga-name-with-space" + pattern = BASE_PATTERN + r"/(?:title|series)/(\d+)[^/?#]*/?$" + example = "https://bato.to/title/12345-MANGA/" + + def __init__(self, match): + self.root = text.root_from_url(match.group(0)) + self.manga_id = match.group(1) + url = "{}/title/{}".format(self.root, self.manga_id) + MangaExtractor.__init__(self, match, url) def chapters(self, page): - data = {} - num_chapters = text.extr(page, ">Chapters<", "</div>") - num_chapters = text.extr(num_chapters, r"<!-- -->", r"<!-- -->") - num_chapters = 
text.parse_int(num_chapters) - if num_chapters == 0: - raise exception.NotFoundError("chapter") - - manga = text.extr( - page, "<title>", r" - Read Free Manga Online at Bato.To" - ) - manga = manga.encode('latin-1').decode('utf-8').replace("\n", "") - data["manga"] = manga + extr = text.extract_from(page) + warning = extr(' class="alert alert-warning">', "<") + if warning: + raise exception.StopExtraction("'%s'", text.remove_html(warning)) + + data = { + "manga_id": text.parse_int(self.manga_id), + "manga" : text.unescape(extr( + "", "<").rpartition(" - ")[0]), + } + + extr('<div data-hk="0-0-0-0"', "") results = [] - for chapter_num in range(num_chapters): - chapter_info = text.extr( - page, - '<div data-hk="0-0-{}-0"'.format(chapter_num), - r"</time><!--/-->" - ) - chapter_info += r"</time><!--/-->" # so we can match the date - url, pos = text.extract(chapter_info, '<a href="', '"') - - chapter = re.search(r"-ch_([\d\.]+)", url) - if chapter: - chapt_major, sep, chapt_minor = chapter.group(1).partition(".") - title = text.extr( - chapter_info, - '<span data-hk="0-0-{}-1"'.format(chapter_num), - "</span>" - ) - title = text.extr(title, r"<!--#-->", r"<!--/-->") - if title is None or title == "" or title == "<!--/-->": - title, _ = text.extract(chapter_info, ">", "</a>", pos) - - date = text.extr(chapter_info, "<time", "</time>") - date = text.extr(date, 'time="', '"') - - data["date"] = date - data["title"] = title - data["chapter"] = text.parse_int(chapt_major) - data["chapter_minor"] = sep + chapt_minor - - if url.startswith("/"): - url = self.root + url + while True: + href = extr('<a href="/title/', '"') + if not href: + break + + chapter = href.rpartition("-ch_")[2] + chapter, sep, minor = chapter.partition(".") + + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = sep + minor + data["date"] = text.parse_datetime( + extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ") + + url = "{}/title/{}".format(self.root, href) results.append((url, 
data.copy())) return results diff --git a/test/results/bato.py b/test/results/bato.py index 18479f9a..672362f5 100644 --- a/test/results/bato.py +++ b/test/results/bato.py @@ -60,6 +60,6 @@ __tests__ = ( "#url" : "https://bato.to/title/134270-removed", "#category": ("", "bato", "manga"), "#class" : bato.BatoMangaExtractor, - "#exception": exception.NotFoundError + "#exception": exception.StopExtraction, } ) From b11c352d66b6f23a9cb03047d4b19f7092bb4b4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 6 Jan 2024 01:49:34 +0100 Subject: [PATCH 269/344] [bato] rename to 'batoto' to use the same category name as the previous bato.to site --- docs/supportedsites.md | 2 +- gallery_dl/extractor/__init__.py | 2 +- gallery_dl/extractor/{bato.py => batoto.py} | 12 +++++----- scripts/supportedsites.py | 2 +- test/results/{bato.py => batoto.py} | 26 ++++++++++----------- 5 files changed, 22 insertions(+), 22 deletions(-) rename gallery_dl/extractor/{bato.py => batoto.py} (93%) rename test/results/{bato.py => batoto.py} (73%) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c1acadd2..9dc174a8 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -98,7 +98,7 @@ Consider all listed sites to potentially be NSFW. 
<td></td> </tr> <tr> - <td>Bato</td> + <td>BATO.TO</td> <td>https://bato.to/</td> <td>Chapters, Manga</td> <td></td> diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 99de2169..4ab9db4d 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -24,7 +24,7 @@ modules = [ "architizer", "artstation", "aryion", - "bato", + "batoto", "bbc", "behance", "blogger", diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/batoto.py similarity index 93% rename from gallery_dl/extractor/bato.py rename to gallery_dl/extractor/batoto.py index 83404a75..cd6302e6 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/batoto.py @@ -14,9 +14,9 @@ BASE_PATTERN = (r"(?:https?://)?" r"(?:(?:ba|d|w)to\.to|\.to|(?:batotoo|mangatoto)\.com)") -class BatoBase(): - """Base class for bato extractors""" - category = "bato" +class BatotoBase(): + """Base class for batoto extractors""" + category = "batoto" root = "https://bato.to" def request(self, url, **kwargs): @@ -24,7 +24,7 @@ class BatoBase(): return Extractor.request(self, url, **kwargs) -class BatoChapterExtractor(BatoBase, ChapterExtractor): +class BatotoChapterExtractor(BatotoBase, ChapterExtractor): """Extractor for bato.to manga chapters""" pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)" example = "https://bato.to/title/12345-MANGA/54321" @@ -72,10 +72,10 @@ class BatoChapterExtractor(BatoBase, ChapterExtractor): ] -class BatoMangaExtractor(BatoBase, MangaExtractor): +class BatotoMangaExtractor(BatotoBase, MangaExtractor): """Extractor for bato.to manga""" reverse = False - chapterclass = BatoChapterExtractor + chapterclass = BatotoChapterExtractor pattern = BASE_PATTERN + r"/(?:title|series)/(\d+)[^/?#]*/?$" example = "https://bato.to/title/12345-MANGA/" diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index e3738b8b..ea6c2597 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -32,7 +32,7 
@@ CATEGORY_MAP = { "atfbooru" : "ATFBooru", "b4k" : "arch.b4k.co", "baraag" : "baraag", - "bato" : "Bato", + "batoto" : "BATO.TO", "bbc" : "BBC", "comicvine" : "Comic Vine", "coomerparty" : "Coomer", diff --git a/test/results/bato.py b/test/results/batoto.py similarity index 73% rename from test/results/bato.py rename to test/results/batoto.py index 672362f5..f3853247 100644 --- a/test/results/bato.py +++ b/test/results/batoto.py @@ -4,14 +4,14 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -from gallery_dl.extractor import bato +from gallery_dl.extractor import batoto from gallery_dl import exception __tests__ = ( { "#url" : "https://bato.to/title/86408-i-shall-master-this-family-official/1681030-ch_8", - "#category": ("", "bato", "chapter"), - "#class" : bato.BatoChapterExtractor, + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, "#count" : 66, "manga" : "I Shall Master this Family! 
[Official]", @@ -21,8 +21,8 @@ __tests__ = ( { "#url" : "https://bato.to/title/104929-86-eighty-six-official/1943513-vol_1-ch_5", "#comment" : "volume (vol) in url", - "#category": ("", "bato", "chapter"), - "#class" : bato.BatoChapterExtractor, + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, "#count" : 7, "manga" : "86--EIGHTY-SIX (Official)", @@ -32,8 +32,8 @@ __tests__ = ( }, { "#url" : "https://bato.to/title/113742-futsutsuka-na-akujo-de-wa-gozaimasu-ga-suuguu-chouso-torikae-den-official", - "#category": ("", "bato", "manga"), - "#class" : bato.BatoMangaExtractor, + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, "#count" : ">= 21", "manga" : "Futsutsuka na Akujo de wa Gozaimasu ga - Suuguu Chouso Torikae Den (Official)", @@ -41,8 +41,8 @@ __tests__ = ( { "#url" : "https://bato.to/title/104929-86-eighty-six-official", "#comment" : "Manga with number in name", - "#category": ("", "bato", "manga"), - "#class" : bato.BatoMangaExtractor, + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, "#count" : ">= 18", "manga" : "86--EIGHTY-SIX (Official)", @@ -50,16 +50,16 @@ __tests__ = ( { "#url" : "https://bato.to/title/140046-the-grand-duke-s-fox-princess-mgchan", "#comment" : "Non-English translation (Indonesian)", - "#category": ("", "bato", "manga"), - "#class" : bato.BatoMangaExtractor, + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, "#count" : ">= 29", "manga" : "The Grand Duke’s Fox Princess ⎝⎝MGCHAN⎠⎠", }, { "#url" : "https://bato.to/title/134270-removed", - "#category": ("", "bato", "manga"), - "#class" : bato.BatoMangaExtractor, + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, "#exception": exception.StopExtraction, } ) From 8e1a2b5446dd2b4e4933435da469ea2e76e04eb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 6 Jan 2024 02:16:43 +0100 Subject: 
[PATCH 270/344] [komikcast] update domain to 'komikcast.lol' (#5027) --- docs/supportedsites.md | 2 +- gallery_dl/extractor/komikcast.py | 14 ++++++------- test/results/komikcast.py | 35 ++++++++++++++++++++++++++++--- 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9aa51a08..d046aad4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -453,7 +453,7 @@ Consider all listed sites to potentially be NSFW. </tr> <tr> <td>Komikcast</td> - <td>https://komikcast.site/</td> + <td>https://komikcast.lol/</td> <td>Chapters, Manga</td> <td></td> </tr> diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index a3e01305..53411a2e 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -6,19 +6,19 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://komikcast.site/""" +"""Extractors for https://komikcast.lol/""" from .common import ChapterExtractor, MangaExtractor from .. 
import text import re -BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:site|me|com)" +BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:lol|site|me|com)" class KomikcastBase(): """Base class for komikcast extractors""" category = "komikcast" - root = "https://komikcast.site" + root = "https://komikcast.lol" @staticmethod def parse_chapter_string(chapter_string, data=None): @@ -46,9 +46,9 @@ class KomikcastBase(): class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): - """Extractor for manga-chapters from komikcast.site""" + """Extractor for manga-chapters from komikcast.lol""" pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)" - example = "https://komikcast.site/chapter/TITLE/" + example = "https://komikcast.lol/chapter/TITLE/" def metadata(self, page): info = text.extr(page, "<title>", " - Komikcast<") @@ -65,10 +65,10 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): - """Extractor for manga from komikcast.site""" + """Extractor for manga from komikcast.lol""" chapterclass = KomikcastChapterExtractor pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$" - example = "https://komikcast.site/komik/TITLE" + example = "https://komikcast.lol/komik/TITLE" def chapters(self, page): results = [] diff --git a/test/results/komikcast.py b/test/results/komikcast.py index 9a246009..89fcbf10 100644 --- a/test/results/komikcast.py +++ b/test/results/komikcast.py @@ -8,19 +8,48 @@ from gallery_dl.extractor import komikcast __tests__ = ( +{ + "#url" : "https://komikcast.lol/chapter/apotheosis-chapter-02-2-bahasa-indonesia/", + "#category": ("", "komikcast", "chapter"), + "#class" : komikcast.KomikcastChapterExtractor, + "#pattern" : r"https://svr\d+\.imgkc\d+\.my\.id/wp-content/img/A/Apotheosis/002-2/\d{3}\.jpg", + "#count" : 18, + + "chapter" : 2, + "chapter_minor": ".2", + "count" : 18, + "extension": "jpg", + "filename" : r"re:0\d{2}", + "lang" : "id", + "language" : 
"Indonesian", + "manga" : "Apotheosis", + "page" : range(1, 18), + "title" : "", +}, + { "#url" : "https://komikcast.site/chapter/apotheosis-chapter-02-2-bahasa-indonesia/", "#category": ("", "komikcast", "chapter"), "#class" : komikcast.KomikcastChapterExtractor, - "#sha1_url" : "f6b43fbc027697749b3ea1c14931c83f878d7936", - "#sha1_metadata": "f3938e1aff9ad1f302f52447e9781b21f6da26d4", +}, + +{ + "#url" : "https://komikcast.me/chapter/apotheosis-chapter-02-2-bahasa-indonesia/", + "#category": ("", "komikcast", "chapter"), + "#class" : komikcast.KomikcastChapterExtractor, +}, + +{ + "#url" : "https://komikcast.com/chapter/apotheosis-chapter-02-2-bahasa-indonesia/", + "#category": ("", "komikcast", "chapter"), + "#class" : komikcast.KomikcastChapterExtractor, }, { "#url" : "https://komikcast.me/chapter/soul-land-ii-chapter-300-1-bahasa-indonesia/", "#category": ("", "komikcast", "chapter"), "#class" : komikcast.KomikcastChapterExtractor, - "#sha1_url" : "efd00a9bd95461272d51990d7bc54b79ff3ff2e6", + "#sha1_url" : "f2674e31b41a7f009f2f292652be2aefb6612d3f", "#sha1_metadata": "cb646cfed3d45105bd645ab38b2e9f7d8c436436", }, From c25bdbae91f172112b5be7f1ea926ed07ac0c370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 6 Jan 2024 14:19:44 +0100 Subject: [PATCH 271/344] [komikcast] fix 'manga' extractor (#5027) --- gallery_dl/extractor/komikcast.py | 6 ++++-- test/results/komikcast.py | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index 53411a2e..7a19be50 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -76,8 +76,10 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): for item in text.extract_iter( page, '<a class="chapter-link-item" href="', '</a'): - url, _, chapter_string = item.rpartition('">Chapter ') - self.parse_chapter_string(chapter_string, data) + url, _, 
chapter = item.rpartition('">Chapter') + chapter, sep, minor = chapter.strip().partition(".") + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = sep + minor results.append((url, data.copy())) return results diff --git a/test/results/komikcast.py b/test/results/komikcast.py index 89fcbf10..fa35c95f 100644 --- a/test/results/komikcast.py +++ b/test/results/komikcast.py @@ -57,8 +57,22 @@ __tests__ = ( "#url" : "https://komikcast.site/komik/090-eko-to-issho/", "#category": ("", "komikcast", "manga"), "#class" : komikcast.KomikcastMangaExtractor, - "#sha1_url" : "19d3d50d532e84be6280a3d61ff0fd0ca04dd6b4", - "#sha1_metadata": "837a7e96867344ff59d840771c04c20dc46c0ab1", + "#pattern" : komikcast.KomikcastChapterExtractor.pattern, + "#count" : 12, + + "author" : "Asakura Maru", + "chapter": range(1, 12), + "chapter_minor": "", + "genres" : [ + "Comedy", + "Drama", + "Romance", + "School Life", + "Sci-Fi", + "Shounen" + ], + "manga" : "090 Eko to Issho", + "type" : "Manga", }, { From cbfb7bfdf175d29beb655c2e96107956c2df346b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 6 Jan 2024 14:26:46 +0100 Subject: [PATCH 272/344] [gelbooru] display error for invalid API responses (#4903) --- gallery_dl/extractor/gelbooru.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index d9da7bc3..eba15390 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -32,10 +32,13 @@ class GelbooruBase(): url = self.root + "/index.php?page=dapi&q=index&json=1" data = self.request(url, params=params).json() - if key not in data: - return () + try: + posts = data[key] + except KeyError: + self.log.error("Incomplete API response (missing '%s')", key) + self.log.debug("%s", data) + return [] - posts = data[key] if not isinstance(posts, list): return (posts,) return posts From 
d0d199414f2eb77c5e19fa103740dbbccf015568 Mon Sep 17 00:00:00 2001 From: Se AKi <seaki@sastudio.jp> Date: Sat, 6 Jan 2024 23:15:15 +0900 Subject: [PATCH 273/344] modify useragent of pixiv --- gallery_dl/extractor/pixiv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 4414c71c..b9821f23 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -826,9 +826,9 @@ class PixivAppAPI(): extractor.session.headers.update({ "App-OS" : "ios", - "App-OS-Version": "13.1.2", - "App-Version" : "7.7.6", - "User-Agent" : "PixivIOSApp/7.7.6 (iOS 13.1.2; iPhone11,8)", + "App-OS-Version": "16.7.2", + "App-Version" : "7.19.1", + "User-Agent" : "PixivIOSApp/7.19.1 (iOS 16.7.2; iPhone12,8)", "Referer" : "https://app-api.pixiv.net/", }) From 6e10260fb071ce7625e79c0e59d8c004c29b501e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 6 Jan 2024 17:48:58 +0100 Subject: [PATCH 274/344] release version 1.26.6 --- CHANGELOG.md | 32 ++++++++++++++++++++++++++++++++ README.rst | 4 ++-- gallery_dl/version.py | 2 +- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8907e07b..7b135b74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,37 @@ # Changelog +## 1.26.6 - 2024-01-06 +### Extractors +#### Additions +- [batoto] add `chapter` and `manga` extractors ([#1434](https://github.com/mikf/gallery-dl/issues/1434), [#2111](https://github.com/mikf/gallery-dl/issues/2111), [#4979](https://github.com/mikf/gallery-dl/issues/4979)) +- [deviantart] add `avatar` and `background` extractors ([#4995](https://github.com/mikf/gallery-dl/issues/4995)) +- [poringa] add support ([#4675](https://github.com/mikf/gallery-dl/issues/4675), [#4962](https://github.com/mikf/gallery-dl/issues/4962)) +- [szurubooru] support `snootbooru.com` ([#5023](https://github.com/mikf/gallery-dl/issues/5023)) +- [zzup] add 
`gallery` extractor ([#4517](https://github.com/mikf/gallery-dl/issues/4517), [#4604](https://github.com/mikf/gallery-dl/issues/4604), [#4659](https://github.com/mikf/gallery-dl/issues/4659), [#4863](https://github.com/mikf/gallery-dl/issues/4863), [#5016](https://github.com/mikf/gallery-dl/issues/5016)) +#### Fixes +- [gelbooru] fix `favorite` extractor ([#4903](https://github.com/mikf/gallery-dl/issues/4903)) +- [idolcomplex] fix extraction & update URL patterns ([#5002](https://github.com/mikf/gallery-dl/issues/5002)) +- [imagechest] fix loading more than 10 images in a gallery ([#4469](https://github.com/mikf/gallery-dl/issues/4469)) +- [jpgfish] update domain +- [komikcast] fix `manga` extractor ([#5027](https://github.com/mikf/gallery-dl/issues/5027)) +- [komikcast] update domain ([#5027](https://github.com/mikf/gallery-dl/issues/5027)) +- [lynxchan] update `bbw-chan` domain ([#4970](https://github.com/mikf/gallery-dl/issues/4970)) +- [manganelo] fix extraction & recognize `.to` TLDs ([#5005](https://github.com/mikf/gallery-dl/issues/5005)) +- [paheal] restore `extension` metadata ([#4976](https://github.com/mikf/gallery-dl/issues/4976)) +- [rule34us] add fallback for `video-cdn1` videos ([#4985](https://github.com/mikf/gallery-dl/issues/4985)) +- [weibo] fix AttributeError in `user` extractor ([#5022](https://github.com/mikf/gallery-dl/issues/5022)) +#### Improvements +- [gelbooru] show error for invalid API responses ([#4903](https://github.com/mikf/gallery-dl/issues/4903)) +- [rule34] recognize URLs with `www` subdomain ([#4984](https://github.com/mikf/gallery-dl/issues/4984)) +- [twitter] raise error for invalid `strategy` values ([#4953](https://github.com/mikf/gallery-dl/issues/4953)) +#### Metadata +- [fanbox] add `metadata` option ([#4921](https://github.com/mikf/gallery-dl/issues/4921)) +- [nijie] add `count` metadata ([#146](https://github.com/mikf/gallery-dl/issues/146)) +- [pinterest] add `count` metadata 
([#4981](https://github.com/mikf/gallery-dl/issues/4981)) +### Miscellaneous +- fix and update zsh completion ([#4972](https://github.com/mikf/gallery-dl/issues/4972)) +- fix `--cookies-from-browser` macOS Firefox profile path + ## 1.26.5 - 2023-12-23 ### Extractors #### Additions diff --git a/README.rst b/README.rst index 782c8ded..ee165e52 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.5/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.6/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.5/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.6/gallery-dl.bin>`__ Nightly Builds diff --git a/gallery_dl/version.py b/gallery_dl/version.py index d4ab3f64..15905d6b 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.26.6-dev" +__version__ = "1.26.6" From db8de135376d5c55ed685518024ed827161ebbc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 7 Jan 2024 00:12:52 +0100 Subject: [PATCH 275/344] [vk] transform image URLs to non-blurred versions (#5017) apply the same filter from before d85e66bc --- gallery_dl/extractor/vk.py | 8 +++++++- gallery_dl/version.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index c22e67e6..95eeafe8 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -10,6 +10,7 @@ from .common import Extractor, Message from .. import text, exception +import re BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com" @@ -24,6 +25,7 @@ class VkExtractor(Extractor): request_interval = (0.5, 1.5) def items(self): + sub = re.compile(r"/imp[fg]/").sub sizes = "wzyxrqpo" data = self.metadata() @@ -40,11 +42,15 @@ class VkExtractor(Extractor): continue try: - photo["url"] = photo[size + "src"] + url = photo[size + "src"] except KeyError: self.log.warning("no photo URL found (%s)", photo.get("id")) continue + photo["url"] = sub("/", url.partition("?")[0]) + # photo["url"] = url + photo["_fallback"] = (url,) + try: _, photo["width"], photo["height"] = photo[size] except ValueError: diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 15905d6b..d348b548 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.26.6" +__version__ = "1.26.7-dev" From 33f228756ace7efe282c924dbab4fb1c5801283a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 7 Jan 2024 02:59:35 +0100 Subject: [PATCH 276/344] [mangadex] add 'list' extractor (#5025) supports listing manga and chapters from list feed --- docs/supportedsites.md | 2 +- gallery_dl/extractor/mangadex.py | 31 +++++++++++++++++++++++++++++++ test/results/mangadex.py | 27 +++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d046aad4..057515c9 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -502,7 +502,7 @@ Consider all listed sites to potentially be NSFW. <tr> <td>MangaDex</td> <td>https://mangadex.org/</td> - <td>Chapters, Followed Feed, Manga</td> + <td>Chapters, Followed Feed, Lists, Manga</td> <td>Supported</td> </tr> <tr> diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 94bea570..d287d5cf 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -148,6 +148,31 @@ class MangadexFeedExtractor(MangadexExtractor): return self.api.user_follows_manga_feed() +class MangadexListExtractor(MangadexExtractor): + """Extractor for mangadex lists""" + subcategory = "list" + pattern = (BASE_PATTERN + + r"/list/([0-9a-f-]+)(?:/[^/?#]*)?(?:\?tab=(\w+))?") + example = ("https://mangadex.org/list" + "/01234567-89ab-cdef-0123-456789abcdef/NAME") + + def __init__(self, match): + MangadexExtractor.__init__(self, match) + if match.group(2) != "feed": + self.subcategory = "list-feed" + self.items = self._items_titles + + def chapters(self): + return self.api.list_feed(self.uuid) + + def _items_titles(self): + data = {"_extractor": MangadexMangaExtractor} + for item in self.api.list(self.uuid)["relationships"]: + if item["type"] == "manga": + url = "{}/title/{}".format(self.root, item["id"]) + yield 
Message.Queue, url, data + + class MangadexAPI(): """Interface for the MangaDex API v5 @@ -173,6 +198,12 @@ class MangadexAPI(): params = {"includes[]": ("scanlation_group",)} return self._call("/chapter/" + uuid, params)["data"] + def list(self, uuid): + return self._call("/list/" + uuid)["data"] + + def list_feed(self, uuid): + return self._pagination("/list/" + uuid + "/feed") + @memcache(keyarg=1) def manga(self, uuid): params = {"includes[]": ("artist", "author")} diff --git a/test/results/mangadex.py b/test/results/mangadex.py index 17b2157c..ae1c7ab1 100644 --- a/test/results/mangadex.py +++ b/test/results/mangadex.py @@ -113,4 +113,31 @@ __tests__ = ( "#class" : mangadex.MangadexFeedExtractor, }, +{ + "#url" : "https://mangadex.org/list/3a0982c5-65aa-4de2-8a4a-2175be7383ab/test", + "#category": ("", "mangadex", "list"), + "#class" : mangadex.MangadexListExtractor, + "#urls" : ( + "https://mangadex.org/title/cba4e5d6-67a0-47a0-b37a-c06e9bf25d93", + "https://mangadex.org/title/cad76ec6-ca22-42f6-96f8-eca164da6545", + ), +}, + +{ + "#url" : "https://mangadex.org/list/3a0982c5-65aa-4de2-8a4a-2175be7383ab/test?tab=titles", + "#category": ("", "mangadex", "list"), + "#class" : mangadex.MangadexListExtractor, +}, + +{ + "#url" : "https://mangadex.org/list/3a0982c5-65aa-4de2-8a4a-2175be7383ab/test?tab=feed", + "#category": ("", "mangadex", "list-feed"), + "#class" : mangadex.MangadexListExtractor, + "#urls" : ( + "https://mangadex.org/chapter/c765d6d5-5712-4360-be0b-0c8e0914fc94", + "https://mangadex.org/chapter/fa8a695d-260f-4dcc-95a3-1f30e66d6571", + "https://mangadex.org/chapter/788766b9-41c6-422e-97ba-552f03ba9655", + ), +}, + ) From 657ed93a22a7eeb2e45c5a7787fb1e8069576951 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 7 Jan 2024 22:23:30 +0100 Subject: [PATCH 277/344] [batoto] improve v2 manga URL pattern and add tests --- gallery_dl/extractor/batoto.py | 7 +-- test/results/batoto.py | 85 
++++++++++++++++++++++++++++++---- 2 files changed, 81 insertions(+), 11 deletions(-) diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index cd6302e6..9cc6494a 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -11,7 +11,7 @@ from .. import text, exception import re BASE_PATTERN = (r"(?:https?://)?" - r"(?:(?:ba|d|w)to\.to|\.to|(?:batotoo|mangatoto)\.com)") + r"(?:(?:ba|d|w)to\.to|(?:batotoo|mangatoto)\.com)") class BatotoBase(): @@ -76,12 +76,13 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor): """Extractor for bato.to manga""" reverse = False chapterclass = BatotoChapterExtractor - pattern = BASE_PATTERN + r"/(?:title|series)/(\d+)[^/?#]*/?$" + pattern = (BASE_PATTERN + + r"/(?:title/(\d+)[^/?#]*|series/(\d+)(?:/[^/?#]*)?)/?$") example = "https://bato.to/title/12345-MANGA/" def __init__(self, match): self.root = text.root_from_url(match.group(0)) - self.manga_id = match.group(1) + self.manga_id = match.group(1) or match.group(2) url = "{}/title/{}".format(self.root, self.manga_id) MangaExtractor.__init__(self, match, url) diff --git a/test/results/batoto.py b/test/results/batoto.py index f3853247..d61f7c87 100644 --- a/test/results/batoto.py +++ b/test/results/batoto.py @@ -14,10 +14,21 @@ __tests__ = ( "#class" : batoto.BatotoChapterExtractor, "#count" : 66, + "chapter" : 8, + "chapter_id" : 1681030, + "chapter_minor": "", + "count" : 66, + "date" : "dt:2021-05-15 18:51:37", + "extension" : "webp", + "filename" : str, "manga" : "I Shall Master this Family! 
[Official]", + "manga_id" : 86408, + "page" : range(1, 66), "title" : "Observing", - "chapter" : 8, + "volume" : 0, + }, + { "#url" : "https://bato.to/title/104929-86-eighty-six-official/1943513-vol_1-ch_5", "#comment" : "volume (vol) in url", @@ -25,19 +36,25 @@ __tests__ = ( "#class" : batoto.BatotoChapterExtractor, "#count" : 7, - "manga" : "86--EIGHTY-SIX (Official)", - "title" : "The Spearhead Squadron's Power", - "volume" : 1, - "chapter" : 5, + "manga" : "86--EIGHTY-SIX (Official)", + "title" : "The Spearhead Squadron's Power", + "volume" : 1, + "chapter": 5, }, + { "#url" : "https://bato.to/title/113742-futsutsuka-na-akujo-de-wa-gozaimasu-ga-suuguu-chouso-torikae-den-official", "#category": ("", "batoto", "manga"), "#class" : batoto.BatotoMangaExtractor, "#count" : ">= 21", + "chapter" : int, + "chapter_minor": str, + "date" : "type:datetime", "manga" : "Futsutsuka na Akujo de wa Gozaimasu ga - Suuguu Chouso Torikae Den (Official)", + "manga_id" : 113742, }, + { "#url" : "https://bato.to/title/104929-86-eighty-six-official", "#comment" : "Manga with number in name", @@ -45,8 +62,9 @@ __tests__ = ( "#class" : batoto.BatotoMangaExtractor, "#count" : ">= 18", - "manga" : "86--EIGHTY-SIX (Official)", + "manga": "86--EIGHTY-SIX (Official)", }, + { "#url" : "https://bato.to/title/140046-the-grand-duke-s-fox-princess-mgchan", "#comment" : "Non-English translation (Indonesian)", @@ -54,12 +72,63 @@ __tests__ = ( "#class" : batoto.BatotoMangaExtractor, "#count" : ">= 29", - "manga" : "The Grand Duke’s Fox Princess ⎝⎝MGCHAN⎠⎠", + "manga": "The Grand Duke’s Fox Princess ⎝⎝MGCHAN⎠⎠", }, + { "#url" : "https://bato.to/title/134270-removed", + "#comment" : "Deleted/removed manga", "#category": ("", "batoto", "manga"), "#class" : batoto.BatotoMangaExtractor, "#exception": exception.StopExtraction, -} +}, + +{ + "#url" : "https://bato.to/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : 
"https://bato.to/chapter/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://dto.to/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://wto.to/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://batotoo.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://mangatoto.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://bato.to/title/86408-i-shall-master-this-family-official", + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, +}, + +{ + "#url" : "https://bato.to/series/86408/i-shall-master-this-family-official", + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, +}, + ) From 61f3b2f820f4687837e10fa9b067782807d49a4c Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Tue, 9 Jan 2024 01:29:47 +1100 Subject: [PATCH 278/344] [hatenablog] add support --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/hatenablog.py | 167 +++++++++++++++++++++++++++++ scripts/supportedsites.py | 7 +- test/results/hatenablog.py | 144 +++++++++++++++++++++++++ 5 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 gallery_dl/extractor/hatenablog.py create mode 100644 test/results/hatenablog.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d046aad4..188d8294 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -259,6 +259,12 @@ Consider all listed sites to potentially be NSFW. 
<td>Folders</td> <td></td> </tr> +<tr> + <td>HatenaBlog</td> + <td>https://hatenablog.com</td> + <td>Archive, Individual Posts, Home Feed, Search Results</td> + <td></td> +</tr> <tr> <td>HBrowse</td> <td>https://www.hbrowse.com/</td> diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 9e33f2c3..26ce2093 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -53,6 +53,7 @@ modules = [ "gelbooru_v01", "gelbooru_v02", "gofile", + "hatenablog", "hbrowse", "hentai2read", "hentaicosplays", diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py new file mode 100644 index 00000000..59e2f94e --- /dev/null +++ b/gallery_dl/extractor/hatenablog.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hatenablog.com""" + +import re +from .common import Extractor, Message +from .. import text + + +BASE_PATTERN = ( + r"(?:hatenablog:https?://([^/]+)|(?:https?://)?" + r"([\w-]+\.(?:hatenablog\.com|hatenablog\.jp" + r"|hatenadiary\.com|hateblo\.jp)))" +) +QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$" + + +class HatenaBlogExtractor(Extractor): + """Base class for HatenaBlog extractors""" + category = "hatenablog" + directory_fmt = ("{category}", "{domain}") + filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}" + archive_fmt = "{filename}" + + def __init__(self, match): + Extractor.__init__(self, match) + + self.domain = match.group(1) or match.group(2) + self._find_img = re.compile(r'<img +(.+?) 
*/?>').finditer + self._is_image = re.compile( + r'(?: |^)class="hatena-fotolife"(?: |$)').search + self._find_img_src = re.compile(r'(?: |^)src="(.+?)"(?: |$)').search + + def _handle_article(self, article: str): + extr = text.extract_from(article) + date = text.parse_datetime(extr('<time datetime="', '"')) + entry_link = text.unescape(extr( + '<a href="', '" class="entry-title-link bookmark">')) + entry = entry_link.partition("/entry/")[2] + title = extr('', '</a>') + content = extr( + '<div class="entry-content hatenablog-entry">', '</div>') + + images = [] + for i in self._find_img(content): + attributes = i.group(1) + if not self._is_image(attributes): + continue + image = text.unescape(self._find_img_src(attributes).group(1)) + images.append(image) + + data = { + "domain": self.domain, + "date": date, + "entry": entry, + "title": title, + "count": len(images), + } + yield Message.Directory, data + for data["num"], url in enumerate(images, 1): + yield Message.Url, url, text.nameext_from_url(url, data) + + +class HatenaBlogEntriesExtractor(HatenaBlogExtractor): + """Base class for a list of entries""" + allowed_parameters = () + + def __init__(self, match): + HatenaBlogExtractor.__init__(self, match) + self.path = match.group(3) + self.query = {key: value for key, value in text.parse_query( + match.group(4)).items() if self._acceptable_query(key)} + self._find_pager_url = re.compile( + r'<span class="pager-next">\s*<a href="(.+?)"').search + + def items(self): + url = "https://" + self.domain + self.path + query = self.query + + while url: + page = self.request(url, params=query).text + + extr = text.extract_from(page) + attributes = extr('<body ', '>') + if "page-archive" in attributes: + yield from self._handle_partial_articles(extr) + else: + yield from self._handle_full_articles(extr) + + match = self._find_pager_url(page) + url = text.unescape(match.group(1)) if match else None + query = None + + def _handle_partial_articles(self, extr): + while True: + 
section = extr('<section class="archive-entry', '</section>') + if not section: + break + + url = "hatenablog:" + text.unescape(text.extr(section, + '<a class="entry-title-link" href="', '"')) + data = {"_extractor": HatenaBlogEntryExtractor} + yield Message.Queue, url, data + + def _handle_full_articles(self, extr): + while True: + attributes = extr('<article ', '>') + if not attributes: + break + if "no-entry" in attributes: + continue + + article = extr('', '</article>') + yield from self._handle_article(article) + + def _acceptable_query(self, key: str) -> bool: + return key == "page" or key in self.allowed_parameters + + +class HatenaBlogEntryExtractor(HatenaBlogExtractor): + """Extractor for a single entry URL""" + subcategory = "entry" + pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE + example = "https://BLOG.hatenablog.com/entry/PATH" + + def __init__(self, match): + HatenaBlogExtractor.__init__(self, match) + self.path = match.group(3) + + def items(self): + url = "https://" + self.domain + "/entry/" + self.path + page = self.request(url).text + + extr = text.extract_from(page) + while True: + attributes = extr('<article ', '>') + if "no-entry" in attributes: + continue + article = extr('', '</article>') + return self._handle_article(article) + + +class HatenaBlogHomeExtractor(HatenaBlogEntriesExtractor): + """Extractor for a blog's home page""" + subcategory = "home" + pattern = BASE_PATTERN + r"(/?)" + QUERY_RE + example = "https://BLOG.hatenablog.com" + + +class HatenaBlogArchiveExtractor(HatenaBlogEntriesExtractor): + """Extractor for a blog's archive page""" + subcategory = "archive" + pattern = BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?" 
+ \ + r"|/category/[^?#]+)?)" + QUERY_RE + example = "https://BLOG.hatenablog.com/archive/2024" + + +class HatenaBlogSearchExtractor(HatenaBlogEntriesExtractor): + """Extractor for a blog's search results""" + subcategory = "search" + pattern = BASE_PATTERN + r"(/search)" + QUERY_RE + example = "https://BLOG.hatenablog.com/search?q=QUERY" + allowed_parameters = ("q",) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 798a6830..d29001c7 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -50,6 +50,7 @@ CATEGORY_MAP = { "fanbox" : "pixivFANBOX", "fashionnova" : "Fashion Nova", "furaffinity" : "Fur Affinity", + "hatenablog" : "HatenaBlog", "hbrowse" : "HBrowse", "hentai2read" : "Hentai2Read", "hentaicosplays" : "Hentai Cosplay", @@ -102,7 +103,6 @@ CATEGORY_MAP = { "pornimagesxxx" : "Porn Image", "pornpics" : "PornPics.com", "pornreactor" : "PornReactor", - "postmill" : "Postmill", "readcomiconline": "Read Comic Online", "rbt" : "RebeccaBlackTech", "redgifs" : "RedGIFs", @@ -189,6 +189,11 @@ SUBCATEGORY_MAP = { "fapello": { "path": "Videos, Trending Posts, Popular Videos, Top Models", }, + "hatenablog": { + "archive": "Archive", + "entry" : "Individual Posts", + "home" : "Home Feed", + }, "hentaifoundry": { "story": "", }, diff --git a/test/results/hatenablog.py b/test/results/hatenablog.py new file mode 100644 index 00000000..8ca7876f --- /dev/null +++ b/test/results/hatenablog.py @@ -0,0 +1,144 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import hatenablog + + +__tests__ = ( +{ + "#url" : "https://cosmiclatte.hatenablog.com/entry/2020/05/28/003227", + "#category": ("", "hatenablog", "entry"), + "#class" : hatenablog.HatenaBlogEntryExtractor, + "#count" : 20, +}, + +{ + "#url" : "https://moko0908.hatenablog.jp/entry/2023/12/31/083846", + "#category": ("", "hatenablog", "entry"), + "#class" : hatenablog.HatenaBlogEntryExtractor, +}, + +{ + "#url" : "https://p-shirokuma.hatenadiary.com/entry/20231227/1703685600", + "#category": ("", "hatenablog", "entry"), + "#class" : hatenablog.HatenaBlogEntryExtractor, +}, + +{ + "#url" : "https://urakatahero.hateblo.jp/entry/2ndlife", + "#category": ("", "hatenablog", "entry"), + "#class" : hatenablog.HatenaBlogEntryExtractor, +}, + +{ + "#url" : "hatenablog:https://blog.hyouhon.com/entry/2023/12/22/133549", + "#category": ("", "hatenablog", "entry"), + "#class" : hatenablog.HatenaBlogEntryExtractor, +}, + +{ + "#url" : "https://cetriolo.hatenablog.com", + "#category": ("", "hatenablog", "home"), + "#class" : hatenablog.HatenaBlogHomeExtractor, + "#range" : "1-7", + "#count" : 7, +}, + +{ + "#url" : "https://moko0908.hatenablog.jp/", + "#category": ("", "hatenablog", "home"), + "#class" : hatenablog.HatenaBlogHomeExtractor, +}, + +{ + "#url" : "https://p-shirokuma.hatenadiary.com/", + "#category": ("", "hatenablog", "home"), + "#class" : hatenablog.HatenaBlogHomeExtractor, +}, + +{ + "#url" : "https://urakatahero.hateblo.jp/", + "#category": ("", "hatenablog", "home"), + "#class" : hatenablog.HatenaBlogHomeExtractor, +}, + +{ + "#url" : "hatenablog:https://blog.hyouhon.com/", + "#category": ("", "hatenablog", "home"), + "#class" : hatenablog.HatenaBlogHomeExtractor, +}, + +{ + "#url" : ("https://8saki.hatenablog.com/archive/category/%E3%82%BB%E3" + "%83%AB%E3%83%95%E3%82%B8%E3%82%A7%E3%83%AB%E3%83%8D%E3%82" + "%A4%E3%83%AB"), + "#category": ("", "hatenablog", "archive"), + "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#range" : 
"1-30", + "#count" : 30, +}, + +{ + "#url" : "https://moko0908.hatenablog.jp/archive/2023", + "#category": ("", "hatenablog", "archive"), + "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#count" : 13, +}, + +{ + "#url" : "https://p-shirokuma.hatenadiary.com/archive/2023/01", + "#category": ("", "hatenablog", "archive"), + "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#count" : 5, +}, + +{ + "#url" : "https://urakatahero.hateblo.jp/archive", + "#category": ("", "hatenablog", "archive"), + "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#range" : "1-30", + "#count" : 30, +}, + +{ + "#url" : "hatenablog:https://blog.hyouhon.com/archive/2024/01/01", + "#category": ("", "hatenablog", "archive"), + "#class" : hatenablog.HatenaBlogArchiveExtractor, +}, + +{ + "#url" : "hatenablog:https://blog.hyouhon.com/search?q=a", + "#category": ("", "hatenablog", "search"), + "#class" : hatenablog.HatenaBlogSearchExtractor, + "#range" : "1-30", + "#count" : 30, +}, + +{ + "#url" : "https://cosmiclatte.hatenablog.com/search?q=a", + "#category": ("", "hatenablog", "search"), + "#class" : hatenablog.HatenaBlogSearchExtractor, +}, + +{ + "#url" : "https://moko0908.hatenablog.jp/search?q=a", + "#category": ("", "hatenablog", "search"), + "#class" : hatenablog.HatenaBlogSearchExtractor, +}, + +{ + "#url" : "https://p-shirokuma.hatenadiary.com/search?q=a", + "#category": ("", "hatenablog", "search"), + "#class" : hatenablog.HatenaBlogSearchExtractor, +}, + +{ + "#url" : "https://urakatahero.hateblo.jp/search?q=a", + "#category": ("", "hatenablog", "search"), + "#class" : hatenablog.HatenaBlogSearchExtractor, +}, + +) From be6949c55d994d4a62d783d20c3a9d92bc81a53a Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Tue, 9 Jan 2024 01:36:52 +1100 Subject: [PATCH 279/344] [hatenablog] fix linting error --- gallery_dl/extractor/hatenablog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/hatenablog.py 
b/gallery_dl/extractor/hatenablog.py index 59e2f94e..322f2ca5 100644 --- a/gallery_dl/extractor/hatenablog.py +++ b/gallery_dl/extractor/hatenablog.py @@ -101,8 +101,8 @@ class HatenaBlogEntriesExtractor(HatenaBlogExtractor): if not section: break - url = "hatenablog:" + text.unescape(text.extr(section, - '<a class="entry-title-link" href="', '"')) + url = "hatenablog:" + text.unescape(text.extr( + section, '<a class="entry-title-link" href="', '"')) data = {"_extractor": HatenaBlogEntryExtractor} yield Message.Queue, url, data From 2cfe788f936a532784e66e7906dfb54c7c678e1f Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Tue, 9 Jan 2024 01:42:57 +1100 Subject: [PATCH 280/344] [hatenablog] fix extractor naming errors --- gallery_dl/extractor/hatenablog.py | 18 +++++++------- test/results/hatenablog.py | 40 +++++++++++++++--------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py index 322f2ca5..dd1e45a5 100644 --- a/gallery_dl/extractor/hatenablog.py +++ b/gallery_dl/extractor/hatenablog.py @@ -19,7 +19,7 @@ BASE_PATTERN = ( QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$" -class HatenaBlogExtractor(Extractor): +class HatenablogExtractor(Extractor): """Base class for HatenaBlog extractors""" category = "hatenablog" directory_fmt = ("{category}", "{domain}") @@ -65,12 +65,12 @@ class HatenaBlogExtractor(Extractor): yield Message.Url, url, text.nameext_from_url(url, data) -class HatenaBlogEntriesExtractor(HatenaBlogExtractor): +class HatenablogEntriesExtractor(HatenablogExtractor): """Base class for a list of entries""" allowed_parameters = () def __init__(self, match): - HatenaBlogExtractor.__init__(self, match) + HatenablogExtractor.__init__(self, match) self.path = match.group(3) self.query = {key: value for key, value in text.parse_query( match.group(4)).items() if self._acceptable_query(key)} @@ -103,7 +103,7 @@ class 
HatenaBlogEntriesExtractor(HatenaBlogExtractor): url = "hatenablog:" + text.unescape(text.extr( section, '<a class="entry-title-link" href="', '"')) - data = {"_extractor": HatenaBlogEntryExtractor} + data = {"_extractor": HatenablogEntryExtractor} yield Message.Queue, url, data def _handle_full_articles(self, extr): @@ -121,14 +121,14 @@ class HatenaBlogEntriesExtractor(HatenaBlogExtractor): return key == "page" or key in self.allowed_parameters -class HatenaBlogEntryExtractor(HatenaBlogExtractor): +class HatenablogEntryExtractor(HatenablogExtractor): """Extractor for a single entry URL""" subcategory = "entry" pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE example = "https://BLOG.hatenablog.com/entry/PATH" def __init__(self, match): - HatenaBlogExtractor.__init__(self, match) + HatenablogExtractor.__init__(self, match) self.path = match.group(3) def items(self): @@ -144,14 +144,14 @@ class HatenaBlogEntryExtractor(HatenaBlogExtractor): return self._handle_article(article) -class HatenaBlogHomeExtractor(HatenaBlogEntriesExtractor): +class HatenablogHomeExtractor(HatenablogEntriesExtractor): """Extractor for a blog's home page""" subcategory = "home" pattern = BASE_PATTERN + r"(/?)" + QUERY_RE example = "https://BLOG.hatenablog.com" -class HatenaBlogArchiveExtractor(HatenaBlogEntriesExtractor): +class HatenablogArchiveExtractor(HatenablogEntriesExtractor): """Extractor for a blog's archive page""" subcategory = "archive" pattern = BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?" 
+ \ @@ -159,7 +159,7 @@ class HatenaBlogArchiveExtractor(HatenaBlogEntriesExtractor): example = "https://BLOG.hatenablog.com/archive/2024" -class HatenaBlogSearchExtractor(HatenaBlogEntriesExtractor): +class HatenablogSearchExtractor(HatenablogEntriesExtractor): """Extractor for a blog's search results""" subcategory = "search" pattern = BASE_PATTERN + r"(/search)" + QUERY_RE diff --git a/test/results/hatenablog.py b/test/results/hatenablog.py index 8ca7876f..4a306f9a 100644 --- a/test/results/hatenablog.py +++ b/test/results/hatenablog.py @@ -11,38 +11,38 @@ __tests__ = ( { "#url" : "https://cosmiclatte.hatenablog.com/entry/2020/05/28/003227", "#category": ("", "hatenablog", "entry"), - "#class" : hatenablog.HatenaBlogEntryExtractor, + "#class" : hatenablog.HatenablogEntryExtractor, "#count" : 20, }, { "#url" : "https://moko0908.hatenablog.jp/entry/2023/12/31/083846", "#category": ("", "hatenablog", "entry"), - "#class" : hatenablog.HatenaBlogEntryExtractor, + "#class" : hatenablog.HatenablogEntryExtractor, }, { "#url" : "https://p-shirokuma.hatenadiary.com/entry/20231227/1703685600", "#category": ("", "hatenablog", "entry"), - "#class" : hatenablog.HatenaBlogEntryExtractor, + "#class" : hatenablog.HatenablogEntryExtractor, }, { "#url" : "https://urakatahero.hateblo.jp/entry/2ndlife", "#category": ("", "hatenablog", "entry"), - "#class" : hatenablog.HatenaBlogEntryExtractor, + "#class" : hatenablog.HatenablogEntryExtractor, }, { "#url" : "hatenablog:https://blog.hyouhon.com/entry/2023/12/22/133549", "#category": ("", "hatenablog", "entry"), - "#class" : hatenablog.HatenaBlogEntryExtractor, + "#class" : hatenablog.HatenablogEntryExtractor, }, { "#url" : "https://cetriolo.hatenablog.com", "#category": ("", "hatenablog", "home"), - "#class" : hatenablog.HatenaBlogHomeExtractor, + "#class" : hatenablog.HatenablogHomeExtractor, "#range" : "1-7", "#count" : 7, }, @@ -50,25 +50,25 @@ __tests__ = ( { "#url" : "https://moko0908.hatenablog.jp/", "#category": ("", 
"hatenablog", "home"), - "#class" : hatenablog.HatenaBlogHomeExtractor, + "#class" : hatenablog.HatenablogHomeExtractor, }, { "#url" : "https://p-shirokuma.hatenadiary.com/", "#category": ("", "hatenablog", "home"), - "#class" : hatenablog.HatenaBlogHomeExtractor, + "#class" : hatenablog.HatenablogHomeExtractor, }, { "#url" : "https://urakatahero.hateblo.jp/", "#category": ("", "hatenablog", "home"), - "#class" : hatenablog.HatenaBlogHomeExtractor, + "#class" : hatenablog.HatenablogHomeExtractor, }, { "#url" : "hatenablog:https://blog.hyouhon.com/", "#category": ("", "hatenablog", "home"), - "#class" : hatenablog.HatenaBlogHomeExtractor, + "#class" : hatenablog.HatenablogHomeExtractor, }, { @@ -76,7 +76,7 @@ __tests__ = ( "%83%AB%E3%83%95%E3%82%B8%E3%82%A7%E3%83%AB%E3%83%8D%E3%82" "%A4%E3%83%AB"), "#category": ("", "hatenablog", "archive"), - "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#class" : hatenablog.HatenablogArchiveExtractor, "#range" : "1-30", "#count" : 30, }, @@ -84,21 +84,21 @@ __tests__ = ( { "#url" : "https://moko0908.hatenablog.jp/archive/2023", "#category": ("", "hatenablog", "archive"), - "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#class" : hatenablog.HatenablogArchiveExtractor, "#count" : 13, }, { "#url" : "https://p-shirokuma.hatenadiary.com/archive/2023/01", "#category": ("", "hatenablog", "archive"), - "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#class" : hatenablog.HatenablogArchiveExtractor, "#count" : 5, }, { "#url" : "https://urakatahero.hateblo.jp/archive", "#category": ("", "hatenablog", "archive"), - "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#class" : hatenablog.HatenablogArchiveExtractor, "#range" : "1-30", "#count" : 30, }, @@ -106,13 +106,13 @@ __tests__ = ( { "#url" : "hatenablog:https://blog.hyouhon.com/archive/2024/01/01", "#category": ("", "hatenablog", "archive"), - "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#class" : hatenablog.HatenablogArchiveExtractor, }, { "#url" : 
"hatenablog:https://blog.hyouhon.com/search?q=a", "#category": ("", "hatenablog", "search"), - "#class" : hatenablog.HatenaBlogSearchExtractor, + "#class" : hatenablog.HatenablogSearchExtractor, "#range" : "1-30", "#count" : 30, }, @@ -120,25 +120,25 @@ __tests__ = ( { "#url" : "https://cosmiclatte.hatenablog.com/search?q=a", "#category": ("", "hatenablog", "search"), - "#class" : hatenablog.HatenaBlogSearchExtractor, + "#class" : hatenablog.HatenablogSearchExtractor, }, { "#url" : "https://moko0908.hatenablog.jp/search?q=a", "#category": ("", "hatenablog", "search"), - "#class" : hatenablog.HatenaBlogSearchExtractor, + "#class" : hatenablog.HatenablogSearchExtractor, }, { "#url" : "https://p-shirokuma.hatenadiary.com/search?q=a", "#category": ("", "hatenablog", "search"), - "#class" : hatenablog.HatenaBlogSearchExtractor, + "#class" : hatenablog.HatenablogSearchExtractor, }, { "#url" : "https://urakatahero.hateblo.jp/search?q=a", "#category": ("", "hatenablog", "search"), - "#class" : hatenablog.HatenaBlogSearchExtractor, + "#class" : hatenablog.HatenablogSearchExtractor, }, ) From ec958a26bc0f6664a8bf54bbb24412fdb49ada3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 8 Jan 2024 19:18:12 +0100 Subject: [PATCH 281/344] [fuskator] make metadata extraction non-fatal (#5039) - prevent KeyErrors - prevent HTTP redirect - return file URLs as list --- gallery_dl/extractor/fuskator.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/fuskator.py b/gallery_dl/extractor/fuskator.py index 20afb5a4..beecbff2 100644 --- a/gallery_dl/extractor/fuskator.py +++ b/gallery_dl/extractor/fuskator.py @@ -22,7 +22,7 @@ class FuskatorGalleryExtractor(GalleryExtractor): def __init__(self, match): self.gallery_hash = match.group(1) - url = "{}/thumbs/{}/".format(self.root, self.gallery_hash) + url = "{}/thumbs/{}/index.html".format(self.root, self.gallery_hash) 
GalleryExtractor.__init__(self, match, url) def metadata(self, page): @@ -50,15 +50,16 @@ class FuskatorGalleryExtractor(GalleryExtractor): "gallery_id" : text.parse_int(gallery_id), "gallery_hash": self.gallery_hash, "title" : text.unescape(title[:-15]), - "views" : data["hits"], - "score" : data["rating"], - "tags" : data["tags"].split(","), - "count" : len(data["images"]), + "views" : data.get("hits"), + "score" : data.get("rating"), + "tags" : (data.get("tags") or "").split(","), } def images(self, page): - for image in self.data["images"]: - yield "https:" + image["imageUrl"], image + return [ + ("https:" + image["imageUrl"], image) + for image in self.data["images"] + ] class FuskatorSearchExtractor(Extractor): From 2ccb7d3bd3f071c6923ca6eb9baedd196665d769 Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Tue, 9 Jan 2024 17:12:56 +1100 Subject: [PATCH 282/344] [steamgriddb] add support --- docs/configuration.rst | 170 ++++++++++++++++++++++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/steamgriddb.py | 210 ++++++++++++++++++++++++++++ test/results/steamgriddb.py | 124 ++++++++++++++++ 4 files changed, 505 insertions(+) create mode 100644 gallery_dl/extractor/steamgriddb.py create mode 100644 test/results/steamgriddb.py diff --git a/docs/configuration.rst b/docs/configuration.rst index 8a1752ee..cfd67b3d 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3076,6 +3076,176 @@ Description Download video files. +extractor.steamgriddb.animated +------------------------------ +Type + ``bool`` +Default + ``true`` +Description + Include animated assets when downloading from a list of assets. + + +extractor.steamgriddb.epilepsy +------------------------------ +Type + ``bool`` +Default + ``true`` +Description + Include assets tagged with epilepsy when downloading from a list of assets. 
+ + +extractor.steamgriddb.dimensions +-------------------------------- +Type + * ``string`` + * ``list`` of ``strings`` +Default + ``"all"`` +Examples + * ``"1024x512,512x512"`` + * ``["460x215", "920x430"]`` +Description + Only include assets that are in the specified dimensions. ``all`` can be + used to specify all dimensions. Valid values are: + + * Grids: ``460x215``, ``920x430``, ``600x900``, ``342x482``, ``660x930``, + ``512x512``, ``1024x1024`` + * Heroes: ``1920x620``, ``3840x1240``, ``1600x650`` + * Logos: N/A (will be ignored) + * Icons: ``8x8``, ``10x10``, ``14x14``, ``16x16``, ``20x20``, ``24x24``, + ``28x28``, ``32x32``, ``35x35``, ``40x40``, ``48x48``, ``54x54``, + ``56x56``, ``57x57``, ``60x60``, ``64x64``, ``72x72``, ``76x76``, + ``80x80``, ``90x90``, ``96x96``, ``100x100``, ``114x114``, ``120x120``, + ``128x128``, ``144x144``, ``150x150``, ``152x152``, ``160x160``, + ``180x180``, ``192x192``, ``194x194``, ``256x256``, ``310x310``, + ``512x512``, ``768x768``, ``1024x1024`` + + +extractor.steamgriddb.file-types +-------------------------------- +Type + * ``string`` + * ``list`` of ``strings`` +Default + ``"all"`` +Examples + * ``"png,jpg"`` + * ``["jpeg", "webp"]`` +Description + Only include assets that are in the specified file types. ``all`` can be + used to specify all file types. Valid values are: + + * Grids: ``png``, ``jpeg``, ``jpg``, ``webp`` + * Heroes: ``png``, ``jpeg``, ``jpg``, ``webp`` + * Logos: ``png``, ``webp`` + * Icons: ``png``, ``ico`` + + +extractor.steamgriddb.download-fake-png +--------------------------------------- +Type + ``bool`` +Default + ``true`` +Description + Download fake PNGs alongside the real file. + + +extractor.steamgriddb.humor +--------------------------- +Type + ``bool`` +Default + ``true`` +Description + Include assets tagged with humor when downloading from a list of assets.
+ + +extractor.steamgriddb.languages +------------------------------- +Type + * ``string`` + * ``list`` of ``strings`` +Default + ``"all"`` +Examples + * ``"en,km"`` + * ``["fr", "it"]`` +Description + Only include assets that are in the specified languages. ``all`` can be + used to specify all languages. Valid values are `ISO 639-1 <https://en.wikipedia.org/wiki/ISO_639-1>`__ + language codes. + + +extractor.steamgriddb.nsfw +-------------------------- +Type + ``bool`` +Default + ``true`` +Description + Include assets tagged with adult content when downloading from a list of assets. + + +extractor.steamgriddb.sort +-------------------------- +Type + ``string`` +Default + ``"score_desc"`` +Description + Set the chosen sorting method when downloading from a list of assets. Can be one of: + + * ``score_desc`` (Highest Score (Beta)) + * ``score_asc`` (Lowest Score (Beta)) + * ``score_old_desc`` (Highest Score (Old)) + * ``score_old_asc`` (Lowest Score (Old)) + * ``age_desc`` (Newest First) + * ``age_asc`` (Oldest First) + + +extractor.steamgriddb.static +---------------------------- +Type + ``bool`` +Default + ``true`` +Description + Include static assets when downloading from a list of assets. + + +extractor.steamgriddb.styles +---------------------------- +Type + * ``string`` + * ``list`` of ``strings`` +Default + ``"all"`` +Examples + * ``"white,black"`` + * ``["no_logo", "white_logo"]`` +Description + Only include assets that are in the specified styles. ``all`` can be used + to specify all styles. Valid values are: + + * Grids: ``alternate``, ``blurred``, ``no_logo``, ``material``, ``white_logo`` + * Heroes: ``alternate``, ``blurred``, ``material`` + * Logos: ``official``, ``white``, ``black``, ``custom`` + * Icons: ``official``, ``custom`` + + +extractor.steamgriddb.untagged +------------------------------ +Type + ``bool`` +Default + ``true`` +Description + Include untagged assets when downloading from a list of assets.
+ + extractor.[szurubooru].username & .token ---------------------------------------- Type diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 9e33f2c3..be3ca649 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -145,6 +145,7 @@ modules = [ "smugmug", "soundgasm", "speakerdeck", + "steamgriddb", "subscribestar", "szurubooru", "tapas", diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py new file mode 100644 index 00000000..516c422b --- /dev/null +++ b/gallery_dl/extractor/steamgriddb.py @@ -0,0 +1,210 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.steamgriddb.com""" + +from .common import Extractor, Message +from .. import text, exception + + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?steamgriddb\.com" +LANGUAGE_CODES = ( + "aa", "ab", "ae", "af", "ak", "am", "an", "ar", "as", "av", "ay", "az", + "ba", "be", "bg", "bh", "bi", "bm", "bn", "bo", "br", "bs", "ca", "ce", + "ch", "co", "cr", "cs", "cu", "cv", "cy", "da", "de", "dv", "dz", "ee", + "el", "en", "eo", "es", "et", "eu", "fa", "ff", "fi", "fj", "fo", "fr", + "fy", "ga", "gd", "gl", "gn", "gu", "gv", "ha", "he", "hi", "ho", "hr", + "ht", "hu", "hy", "hz", "ia", "id", "ie", "ig", "ii", "ik", "io", "is", + "it", "iu", "ja", "jv", "ka", "kg", "ki", "kj", "kk", "kl", "km", "kn", + "ko", "kr", "ks", "ku", "kv", "kw", "ky", "la", "lb", "lg", "li", "ln", + "lo", "lt", "lu", "lv", "mg", "mh", "mi", "mk", "ml", "mn", "mr", "ms", + "mt", "my", "na", "nb", "nd", "ne", "ng", "nl", "nn", "no", "nr", "nv", + "ny", "oc", "oj", "om", "or", "os", "pa", "pi", "pl", "ps", "pt", "qu", + "rm", "rn", "ro", "ru", "rw", "sa", "sc", "sd", "se", "sg", "si", "sk", + "sl", "sm", "sn", "so", "sq", "sr", "ss", "st", "su", "sv", "sw", 
"ta", + "te", "tg", "th", "ti", "tk", "tl", "tn", "to", "tr", "ts", "tt", "tw", + "ty", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", + "yo", "za", "zh", "zu", +) +FILE_EXT_TO_MIME = { + "png": "image/png", + "jpeg": "image/jpeg", + "jpg": "image/jpeg", + "webp": "image/webp", + "ico": "image/vnd.microsoft.icon", + "all": "all", +} + + +class SteamgriddbExtractor(Extractor): + """Base class for SteamGridDB""" + category = "steamgriddb" + directory_fmt = ("{category}", "{subcategory}", "{game[id]}") + filename_fmt = "{game[id]}_{id}_{num:>02}.{extension}" + archive_fmt = "{filename}" + root = "https://www.steamgriddb.com" + + def _init(self): + self.cookies_update({ + "userprefs": "%7B%22adult%22%3Afalse%7D", + }) + + def items(self): + download_fake_png = self.config("download-fake-png", True) + + for asset in self.assets(): + urls = [asset["url"]] + if download_fake_png and asset.get("fake_png"): + urls.append(asset["fake_png"]) + + asset["count"] = len(urls) + yield Message.Directory, asset + for asset["num"], url in enumerate(urls, 1): + yield Message.Url, url, text.nameext_from_url(url, asset) + + def _call(self, endpoint: str, **kwargs): + data = self.request(self.root + endpoint, **kwargs).json() + if not data["success"]: + raise exception.StopExtraction(data["error"]) + return data["data"] + + +class SteamgriddbAssetsExtractor(SteamgriddbExtractor): + """Base class for extracting a list of assets""" + + def __init__(self, match): + SteamgriddbExtractor.__init__(self, match) + list_type = match.group(1) + id = int(match.group(2)) + self.game_id = id if list_type == "game" else None + self.collection_id = id if list_type == "collection" else None + self.page = int(match.group(3) or 1) + + def assets(self): + limit = 48 + page = max(self.page - 1, 0) + + sort = self.config("sort", "score_desc") + if sort not in ("score_desc", "score_asc", "score_old_desc", + "score_old_asc", "age_desc", "age_asc"): + raise exception.StopExtraction("Invalid
sort '%s'", sort) + + json = { + "static" : self.config("static", True), + "animated": self.config("animated", True), + "humor" : self.config("humor", True), + "nsfw" : self.config("nsfw", True), + "epilepsy": self.config("epilepsy", True), + "untagged": self.config("untagged", True), + + "asset_type": self.asset_type, + "limit": limit, + "order": sort, + } + if self.valid_dimensions: + json["dimensions"] = self.config_list( + "dimensions", "dimension", self.valid_dimensions) + json["styles"] = self.config_list("styles", "style", self.valid_styles) + json["languages"] = self.config_list( + "languages", "language", LANGUAGE_CODES) + file_types = self.config_list( + "file-types", "file type", self.valid_file_types) + json["mime"] = [FILE_EXT_TO_MIME[i] for i in file_types] + + if self.game_id: + json["game_id"] = [self.game_id] + else: + json["collection_id"] = self.collection_id + + while True: + json["page"] = page + + data = self._call( + "/api/public/search/assets", method="POST", json=json) + for asset in data["assets"]: + if not asset.get("game"): + asset["game"] = data["game"] + yield asset + + if data["total"] > limit * page: + page += 1 + else: + break + + def config_list(self, key, type_name, valid_values): + value = self.config(key, ["all"]) + if isinstance(value, str): + value = value.split(",") + + if "all" in value: + return ["all"] + + for i in value: + if i not in valid_values: + raise exception.StopExtraction("Invalid %s '%s'", type_name, i) + + return value + + +class SteamgriddbAssetExtractor(SteamgriddbExtractor): + """Extractor for a single asset""" + subcategory = "asset" + pattern = BASE_PATTERN + r"/(grid|hero|logo|icon)/(\d+)" + example = "https://www.steamgriddb.com/grid/1234" + + def __init__(self, match): + SteamgriddbExtractor.__init__(self, match) + self.asset_type = match.group(1) + self.asset_id = match.group(2) + + def assets(self): + endpoint = "/api/public/asset/" + self.asset_type + "/" + self.asset_id + asset = 
self._call(endpoint)["asset"] + return (asset,) + + +class SteamgriddbGridsExtractor(SteamgriddbAssetsExtractor): + subcategory = "grids" + asset_type = "grid" + pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/grids(?:/(\d+))?" + example = "https://www.steamgriddb.com/game/1234/grids" + valid_dimensions = ("460x215", "920x430", "600x900", "342x482", "660x930", + "512x512", "1024x1024") + valid_styles = ("alternate", "blurred", "no_logo", "material", "white_logo") + valid_file_types = ("png", "jpeg", "jpg", "webp") + + +class SteamgriddbHeroesExtractor(SteamgriddbAssetsExtractor): + subcategory = "heroes" + asset_type = "hero" + pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/heroes(?:/(\d+))?" + example = "https://www.steamgriddb.com/game/1234/heroes" + valid_dimensions = ("1920x620", "3840x1240", "1600x650") + valid_styles = ("alternate", "blurred", "material") + valid_file_types = ("png", "jpeg", "jpg", "webp") + + +class SteamgriddbLogosExtractor(SteamgriddbAssetsExtractor): + subcategory = "logos" + asset_type = "logo" + pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/logos(?:/(\d+))?" + example = "https://www.steamgriddb.com/game/1234/logos" + valid_dimensions = None + valid_styles = ("official", "white", "black", "custom") + valid_file_types = ("png", "webp") + + +class SteamgriddbIconsExtractor(SteamgriddbAssetsExtractor): + subcategory = "icons" + asset_type = "icon" + pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/icons(?:/(\d+))?" 
+ example = "https://www.steamgriddb.com/game/1234/icons" + valid_dimensions = ["{0}x{0}".format(i) for i in (8, 10, 14, 16, 20, 24, + 28, 32, 35, 40, 48, 54, 56, 57, 60, 64, 72, 76, 80, 90, + 96, 100, 114, 120, 128, 144, 150, 152, 160, 180, 192, + 194, 256, 310, 512, 768, 1024)] + valid_styles = ("official", "custom") + valid_file_types = ("png", "ico") diff --git a/test/results/steamgriddb.py b/test/results/steamgriddb.py new file mode 100644 index 00000000..06c1c22b --- /dev/null +++ b/test/results/steamgriddb.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import steamgriddb + + +__tests__ = ( +{ + "#url" : "https://www.steamgriddb.com/grid/368023", + "#category": ("", "steamgriddb", "asset"), + "#class" : steamgriddb.SteamgriddbAssetExtractor, + "#urls" : ("https://cdn2.steamgriddb.com/grid/" + "82fee171d62c044898d99ba0fddeb203.png"), + "#count" : 1, + "#sha1_content": "0bffaccae6f35f9fab529684a5b158d1cec4186b", + + "game": { + "id" : 5259324, + "name": "Helltaker", + }, +}, + +{ + "#url" : "https://www.steamgriddb.com/grid/132605", + "#category": ("", "steamgriddb", "asset"), + "#class" : steamgriddb.SteamgriddbAssetExtractor, + "#count" : 2, + "#sha1_url" : "4ff9158c008a1f01921d7553bcabf5e6204cdc79", + "#sha1_content": "bc16c5eebf71463abdb33cfbf4b45a2fe092a2b2", + + "game": { + "id" : 5247997, + "name": "OMORI", + }, +}, + +{ + "#url" : "https://www.steamgriddb.com/grid/132605", + "#category": ("", "steamgriddb", "asset"), + "#class" : steamgriddb.SteamgriddbAssetExtractor, + "#options" : {"download-fake-png": False}, + "#count" : 1, + "#sha1_url" : "f6819c593ff65f15864796fb89581f05d21adddb", + "#sha1_content": "0d9e6114dd8bb9699182fbb7c6bd9064d8b0b6cd", + + "game": { + "id" : 5247997, + "name": "OMORI", + }, +}, + +{ + "#url" : 
"https://www.steamgriddb.com/hero/61104", + "#category": ("", "steamgriddb", "asset"), + "#class" : steamgriddb.SteamgriddbAssetExtractor, +}, + +{ + "#url" : "https://www.steamgriddb.com/logo/9610", + "#category": ("", "steamgriddb", "asset"), + "#class" : steamgriddb.SteamgriddbAssetExtractor, +}, + +{ + "#url" : "https://www.steamgriddb.com/icon/173", + "#category": ("", "steamgriddb", "asset"), + "#class" : steamgriddb.SteamgriddbAssetExtractor, +}, + +{ + "#url" : "https://www.steamgriddb.com/game/5259324/grids", + "#category": ("", "steamgriddb", "grids"), + "#class" : steamgriddb.SteamgriddbGridsExtractor, + "#range" : "1-10", + "#count" : 10, +}, + +{ + "#url" : "https://www.steamgriddb.com/game/5259324/grids", + "#category": ("", "steamgriddb", "grids"), + "#class" : steamgriddb.SteamgriddbGridsExtractor, + "#options" : {"humor": False, "epilepsy": False, "untagged": False}, + "#range" : "1-33", + "#count" : 33, +}, + +{ + "#url" : "https://www.steamgriddb.com/game/5331605/heroes", + "#category": ("", "steamgriddb", "heroes"), + "#class" : steamgriddb.SteamgriddbHeroesExtractor, +}, + +{ + "#url" : "https://www.steamgriddb.com/game/5255394/logos", + "#category": ("", "steamgriddb", "logos"), + "#class" : steamgriddb.SteamgriddbLogosExtractor, +}, + +{ + "#url" : "https://www.steamgriddb.com/game/5279790/icons", + "#category": ("", "steamgriddb", "icons"), + "#class" : steamgriddb.SteamgriddbIconsExtractor, +}, + +{ + "#url" : "https://www.steamgriddb.com/collection/332/grids", + "#category": ("", "steamgriddb", "grids"), + "#class" : steamgriddb.SteamgriddbGridsExtractor, + "#range" : "1-10", + "#count" : 10, +}, + +{ + "#url" : "https://www.steamgriddb.com/collection/332/heroes", + "#category": ("", "steamgriddb", "heroes"), + "#class" : steamgriddb.SteamgriddbHeroesExtractor, + "#options" : {"animated": False}, + "#count" : 0, +}, + +) From 100966b122cd90ca139593cf8ff21fd0f777243a Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Tue, 
9 Jan 2024 17:18:31 +1100 Subject: [PATCH 283/344] [steamgriddb] fix linting error --- gallery_dl/extractor/steamgriddb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py index 516c422b..1f803ffd 100644 --- a/gallery_dl/extractor/steamgriddb.py +++ b/gallery_dl/extractor/steamgriddb.py @@ -173,7 +173,8 @@ class SteamgriddbGridsExtractor(SteamgriddbAssetsExtractor): example = "https://www.steamgriddb.com/game/1234/grids" valid_dimensions = ("460x215", "920x430", "600x900", "342x482", "660x930", "512x512", "1024x1024") - valid_styles = ("alternate", "blurred", "no_logo", "material", "white_logo") + valid_styles = ("alternate", "blurred", "no_logo", "material", + "white_logo") valid_file_types = ("png", "jpeg", "jpg", "webp") From 0c88373a219a646ab100a6ad89a42cb041cf1fad Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Tue, 9 Jan 2024 17:22:25 +1100 Subject: [PATCH 284/344] [docs] add steamgriddb to supportedsites.md --- docs/supportedsites.md | 6 ++++++ scripts/supportedsites.py | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 057515c9..92a4cee0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -811,6 +811,12 @@ Consider all listed sites to potentially be NSFW. 
<td>Presentations</td> <td></td> </tr> +<tr> + <td>SteamGridDB</td> + <td>https://www.steamgriddb.com</td> + <td>Individual Assets, Grids, Heroes, Icons, Logos</td> + <td></td> +</tr> <tr> <td>SubscribeStar</td> <td>https://www.subscribestar.com/</td> diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 798a6830..d4ce3eed 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -121,6 +121,7 @@ CATEGORY_MAP = { "slideshare" : "SlideShare", "smugmug" : "SmugMug", "speakerdeck" : "Speaker Deck", + "steamgriddb" : "SteamGridDB", "subscribestar" : "SubscribeStar", "tbib" : "The Big ImageBoard", "tcbscans" : "TCB Scans", @@ -262,6 +263,9 @@ SUBCATEGORY_MAP = { "smugmug": { "path": "Images from Users and Folders", }, + "steamgriddb": { + "asset": "Individual Assets", + }, "tumblr": { "day": "Days", }, From 0a382a5092d275658a6a32e454ec3ff800b8d853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 9 Jan 2024 17:25:04 +0100 Subject: [PATCH 285/344] [batoto] improve 'manga_id' extraction (#5042) --- gallery_dl/extractor/batoto.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index 9cc6494a..72b5b6e5 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -38,7 +38,8 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor): def metadata(self, page): extr = text.extract_from(page) manga, info, _ = extr("<title>", "<").rsplit(" - ", 3) - manga_id = extr("/title/", "/") + manga_id = text.extr( + extr('rel="canonical" href="', '"'), "/title/", "/") match = re.match( r"(?:Volume\s+(\d+) )?" 
From 887ade30a51edeea150fd1a95b33c86208319289 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 9 Jan 2024 18:02:49 +0100 Subject: [PATCH 286/344] [batoto] support more mirror domains (#5042) --- gallery_dl/extractor/batoto.py | 7 +- test/results/batoto.py | 130 ++++++++++++++++++++++++++++++--- 2 files changed, 123 insertions(+), 14 deletions(-) diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index 72b5b6e5..e82cd09f 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -10,8 +10,11 @@ from .common import Extractor, ChapterExtractor, MangaExtractor from .. import text, exception import re -BASE_PATTERN = (r"(?:https?://)?" - r"(?:(?:ba|d|w)to\.to|(?:batotoo|mangatoto)\.com)") +BASE_PATTERN = (r"(?:https?://)?(?:" + r"(?:ba|d|h|m|w)to\.to|" + r"(?:(?:manga|read)toto|batocomic|[xz]bato)\.(?:com|net|org)|" + r"comiko\.(?:net|org)|" + r"bat(?:otoo|o?two)\.com)") class BatotoBase(): diff --git a/test/results/batoto.py b/test/results/batoto.py index d61f7c87..4992bda1 100644 --- a/test/results/batoto.py +++ b/test/results/batoto.py @@ -42,6 +42,19 @@ __tests__ = ( "chapter": 5, }, +{ + "#url" : "https://bato.to/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://bato.to/chapter/1681030", + "#comment" : "v2 URL", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + { "#url" : "https://bato.to/title/113742-futsutsuka-na-akujo-de-wa-gozaimasu-ga-suuguu-chouso-torikae-den-official", "#category": ("", "batoto", "manga"), @@ -84,51 +97,144 @@ __tests__ = ( }, { - "#url" : "https://bato.to/title/86408/1681030", + "#url" : "https://bato.to/title/86408-i-shall-master-this-family-official", + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, +}, + +{ + "#url" : 
"https://bato.to/series/86408/i-shall-master-this-family-official", + "#comment" : "v2 URL", + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, +}, + +{ + "#url" : "https://dto.to/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://hto.to/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://mto.to/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://wto.to/title/86408/1681030", "#category": ("", "batoto", "chapter"), "#class" : batoto.BatotoChapterExtractor, }, { - "#url" : "https://bato.to/chapter/1681030", + "#url" : "https://mangatoto.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://mangatoto.net/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://mangatoto.org/title/86408/1681030", "#category": ("", "batoto", "chapter"), "#class" : batoto.BatotoChapterExtractor, }, { - "#url" : "https://dto.to/title/86408/1681030", + "#url" : "https://batocomic.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://batocomic.net/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://batocomic.org/title/86408/1681030", "#category": ("", "batoto", "chapter"), "#class" : batoto.BatotoChapterExtractor, }, { - "#url" : "https://wto.to/title/86408/1681030", + "#url" : "https://readtoto.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://readtoto.net/title/86408/1681030", + "#category": ("", 
"batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://readtoto.org/title/86408/1681030", "#category": ("", "batoto", "chapter"), "#class" : batoto.BatotoChapterExtractor, }, { - "#url" : "https://batotoo.com/title/86408/1681030", + "#url" : "https://xbato.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://xbato.net/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://xbato.org/title/86408/1681030", "#category": ("", "batoto", "chapter"), "#class" : batoto.BatotoChapterExtractor, }, { - "#url" : "https://mangatoto.com/title/86408/1681030", + "#url" : "https://zbato.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://zbato.net/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://zbato.org/title/86408/1681030", "#category": ("", "batoto", "chapter"), "#class" : batoto.BatotoChapterExtractor, }, { - "#url" : "https://bato.to/title/86408-i-shall-master-this-family-official", - "#category": ("", "batoto", "manga"), - "#class" : batoto.BatotoMangaExtractor, + "#url" : "https://comiko.net/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://comiko.org/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, }, { - "#url" : "https://bato.to/series/86408/i-shall-master-this-family-official", - "#category": ("", "batoto", "manga"), - "#class" : batoto.BatotoMangaExtractor, + "#url" : "https://batotoo.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://batotwo.com/title/86408/1681030", + 
"#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://battwo.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, }, ) From 5f9a98cf0fded6dac8efcc02b4f2cbc39ebc614a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 9 Jan 2024 20:04:46 +0100 Subject: [PATCH 287/344] [deviantart:avatar] fix exception when 'comments' are enabled (#4995) --- gallery_dl/extractor/deviantart.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 4b5f1d77..32dedacf 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -558,6 +558,7 @@ class DeviantartAvatarExtractor(DeviantartExtractor): "is_downloadable": False, "published_time" : 0, "title" : "avatar", + "stats" : {"comments": 0}, "content" : { "src": url.replace("/avatars/", "/avatars-big/", 1), }, From 5c43098a1ae062456e040246459f063bc84eefff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 9 Jan 2024 23:19:39 +0100 Subject: [PATCH 288/344] [twitter] revert to using 'media' timeline by default (#4953) This reverts commit a94f9441487573ea84700936117f4535e78d32c0. 
--- docs/configuration.rst | 2 +- gallery_dl/extractor/twitter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 8a1752ee..ba3cc413 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3496,7 +3496,7 @@ Description * ``"tweets"``: `/tweets <https://twitter.com/USER/tweets>`__ timeline + search * ``"media"``: `/media <https://twitter.com/USER/media>`__ timeline + search * ``"with_replies"``: `/with_replies <https://twitter.com/USER/with_replies>`__ timeline + search - * ``"auto"``: ``"tweets"`` or ``"media"``, depending on `retweets <extractor.twitter.retweets_>`__, `replies <extractor.twitter.replies_>`__, and `text-tweets <extractor.twitter.text-tweets_>`__ settings + * ``"auto"``: ``"tweets"`` or ``"media"``, depending on `retweets <extractor.twitter.retweets_>`__ and `text-tweets <extractor.twitter.text-tweets_>`__ settings extractor.twitter.text-tweets diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index aa9ab9f6..cf759e0f 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -546,7 +546,7 @@ class TwitterTimelineExtractor(TwitterExtractor): def _select_tweet_source(self): strategy = self.config("strategy") if strategy is None or strategy == "auto": - if self.retweets or self.replies or self.textonly: + if self.retweets or self.textonly: return self.api.user_tweets else: return self.api.user_media From 39904c9e4eb2fe664ff5855bc2d3c2d749dcb690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 10 Jan 2024 17:13:34 +0100 Subject: [PATCH 289/344] [deviantart:avatar] add 'formats' option (#4995) --- docs/configuration.rst | 13 +++++++ gallery_dl/extractor/deviantart.py | 56 +++++++++++++++++++++--------- test/results/deviantart.py | 17 ++++++++- 3 files changed, 68 insertions(+), 18 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 
ba3cc413..00be43a7 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1559,6 +1559,19 @@ Description Minimum wait time in seconds before API requests. +extractor.deviantart.avatar.formats +----------------------------------- +Type + ``list`` of ``strings`` +Example + ``["original.jpg", "big.jpg", "big.gif", ".png"]`` +Description + Avatar URL formats to return. + + | Each format is parsed as ``SIZE.EXT``. + | Leave ``SIZE`` empty to download the regular, small avatar format. + + extractor.[E621].metadata ------------------------- Type diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 32dedacf..7df1890e 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -547,23 +547,45 @@ class DeviantartAvatarExtractor(DeviantartExtractor): example = "https://www.deviantart.com/USER/avatar/" def deviations(self): - profile = self.api.user_profile(self.user.lower()) - if profile: - url = profile["user"]["usericon"] - return ({ - "author" : profile["user"], - "category" : "avatar", - "index" : text.parse_int(url.rpartition("?")[2]), - "is_deleted" : False, - "is_downloadable": False, - "published_time" : 0, - "title" : "avatar", - "stats" : {"comments": 0}, - "content" : { - "src": url.replace("/avatars/", "/avatars-big/", 1), - }, - },) - return () + name = self.user.lower() + profile = self.api.user_profile(name) + if not profile: + return () + + user = profile["user"] + icon = user["usericon"] + index = icon.rpartition("?")[2] + + formats = self.config("formats") + if not formats: + url = icon.replace("/avatars/", "/avatars-big/", 1) + return (self._make_deviation(url, user, index, ""),) + + if isinstance(formats, str): + formats = formats.replace(" ", "").split(",") + + results = [] + for fmt in formats: + fmt, _, ext = fmt.rpartition(".") + if fmt: + fmt = "-" + fmt + url = "https://a.deviantart.net/avatars{}/{}/{}/{}.{}?{}".format( + fmt, name[0], name[1], name, ext, index) + 
results.append(self._make_deviation(url, user, index, fmt)) + return results + + def _make_deviation(self, url, user, index, fmt): + return { + "author" : user, + "category" : "avatar", + "index" : text.parse_int(index), + "is_deleted" : False, + "is_downloadable": False, + "published_time" : 0, + "title" : "avatar" + fmt, + "stats" : {"comments": 0}, + "content" : {"src": url}, + } class DeviantartBackgroundExtractor(DeviantartExtractor): diff --git a/test/results/deviantart.py b/test/results/deviantart.py index 45ee6c18..41cb3219 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -210,7 +210,7 @@ __tests__ = ( "#sha1_content": "abf2cc79b842315f2e54bfdd93bf794a0f612b6f", "author" : { - "type" : "premium", + "type" : "regular", "usericon": "https://a.deviantart.net/avatars/s/h/shimoda7.jpg?4", "userid" : "9AE51FC7-0278-806C-3FFF-F4961ABF9E2B", "username": "shimoda7", @@ -237,6 +237,21 @@ __tests__ = ( "username" : "shimoda7", }, +{ + "#url" : "https://deviantart.com/shimoda7/avatar", + "#comment" : "'formats' option", + "#category": ("", "deviantart", "avatar"), + "#class" : deviantart.DeviantartAvatarExtractor, + "#archive" : False, + "#options" : {"formats": ["original.jpg", "big.jpg", "big.png", "big.gif"]}, + "#urls" : ( + "https://a.deviantart.net/avatars-original/s/h/shimoda7.jpg?4", + "https://a.deviantart.net/avatars-big/s/h/shimoda7.jpg?4", + "https://a.deviantart.net/avatars-big/s/h/shimoda7.png?4", + "https://a.deviantart.net/avatars-big/s/h/shimoda7.gif?4", + ), +}, + { "#url" : "https://deviantart.com/gdldev/banner", "#category": ("", "deviantart", "background"), From bbf96753e2ab8d02adb4682a5a2d607943914627 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 10 Jan 2024 17:21:30 +0100 Subject: [PATCH 290/344] [gelbooru] only log "Incomplete API response" for favorites (#5045) --- gallery_dl/extractor/gelbooru.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff 
--git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index eba15390..e37b2e92 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -23,7 +23,7 @@ class GelbooruBase(): root = "https://gelbooru.com" offset = 0 - def _api_request(self, params, key="post"): + def _api_request(self, params, key="post", log=False): if "s" not in params: params["s"] = "post" params["api_key"] = self.api_key @@ -35,8 +35,9 @@ class GelbooruBase(): try: posts = data[key] except KeyError: - self.log.error("Incomplete API response (missing '%s')", key) - self.log.debug("%s", data) + if log: + self.log.error("Incomplete API response (missing '%s')", key) + self.log.debug("%s", data) return [] if not isinstance(posts, list): @@ -169,7 +170,7 @@ class GelbooruFavoriteExtractor(GelbooruBase, "limit": "1", } - count = self._api_request(params, "@attributes")[0]["count"] + count = self._api_request(params, "@attributes", True)[0]["count"] if count <= self.offset: return @@ -186,7 +187,7 @@ class GelbooruFavoriteExtractor(GelbooruBase, params["limit"] = self.per_page while True: - favs = self._api_request(params, "favorite") + favs = self._api_request(params, "favorite", True) favs.reverse() if skip: From 2191e29e14ab138da8347744c993df0b40b85a56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 10 Jan 2024 23:27:10 +0100 Subject: [PATCH 291/344] [nijie] fix image URL for single image posts (#5049) --- gallery_dl/extractor/nijie.py | 3 ++- test/results/nijie.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index b9917057..96145130 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -116,7 +116,8 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): yield from text.extract_iter( page, 'href="javascript:void(0);"><img src="', '"') else: - yield text.extr(page, 
'itemprop="image" src="', '"') + pos = page.find('id="view-center"') + 1 + yield text.extract(page, 'itemprop="image" src="', '"', pos)[0] @staticmethod def _extract_user_name(page): diff --git a/test/results/nijie.py b/test/results/nijie.py index a2c05c81..1f86bcb1 100644 --- a/test/results/nijie.py +++ b/test/results/nijie.py @@ -157,6 +157,14 @@ __tests__ = ( "user_name" : "黒川 竜", }, +{ + "#url" : "https://nijie.info/view.php?id=37078", + "#comment" : "'view_side_dojin' thumbnails (#5049)", + "#category": ("Nijie", "nijie", "image"), + "#class" : nijie.NijieImageExtractor, + "#urls" : "https://pic.nijie.net/03/nijie/13/98/498/illust/0_0_703023d18ca8d058_bca943.jpg", +}, + { "#url" : "https://nijie.info/view.php?id=70724", "#category": ("Nijie", "nijie", "image"), From 1c68b7df010913cb661f06224bbbf7b610c79590 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 11 Jan 2024 17:56:47 +0100 Subject: [PATCH 292/344] [patreon] fix KeyError (#5048) --- gallery_dl/extractor/patreon.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 6c2f39dc..c175ab83 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -56,15 +56,16 @@ class PatreonExtractor(Extractor): else: self.log.debug("skipping %s (%s %s)", url, fhash, kind) - @staticmethod - def _postfile(post): + def _postfile(self, post): postfile = post.get("post_file") if postfile: - return (("postfile", postfile["url"], postfile["name"]),) + url = postfile["url"] + name = postfile.get("name") or self._filename(url) or url + return (("postfile", url, name),) return () def _images(self, post): - for image in post["images"]: + for image in post.get("images") or (): url = image.get("download_url") if url: name = image.get("file_name") or self._filename(url) or url @@ -80,7 +81,7 @@ class PatreonExtractor(Extractor): return () def _attachments(self, 
post): - for attachment in post["attachments"]: + for attachment in post.get("attachments") or (): url = self.request( attachment["url"], method="HEAD", allow_redirects=False, fatal=False, From 2dcfb012ea0b773d22d1898c7f28e6bf3fa90eed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 12 Jan 2024 02:33:27 +0100 Subject: [PATCH 293/344] [patreon] download 'm3u8' manifests with ytdl --- gallery_dl/extractor/patreon.py | 6 +++++- test/results/patreon.py | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index c175ab83..dfcfe24b 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -52,7 +52,11 @@ class PatreonExtractor(Extractor): post["hash"] = fhash post["type"] = kind post["num"] += 1 - yield Message.Url, url, text.nameext_from_url(name, post) + text.nameext_from_url(name, post) + if text.ext_from_url(url) == "m3u8": + url = "ytdl:" + url + post["extension"] = "mp4" + yield Message.Url, url, post else: self.log.debug("skipping %s (%s %s)", url, fhash, kind) diff --git a/test/results/patreon.py b/test/results/patreon.py index d4557173..79c0a603 100644 --- a/test/results/patreon.py +++ b/test/results/patreon.py @@ -103,6 +103,14 @@ __tests__ = ( "tags": ["AWMedia"], }, +{ + "#url" : "https://www.patreon.com/posts/meu8-94714289", + "#category": ("", "patreon", "post"), + "#class" : patreon.PatreonPostExtractor, + "#range" : "2", + "#pattern" : r"ytdl:https://stream\.mux\.com/NLrxTLdxyGStpOgapJAtB8uPGAaokEcj8YovML00y2DY\.m3u8\?token=ey.+", +}, + { "#url" : "https://www.patreon.com/posts/not-found-123", "#category": ("", "patreon", "post"), From 58e0665fbcefe050d90e3b629bfb52559f9f7670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 12 Jan 2024 03:21:44 +0100 Subject: [PATCH 294/344] [tests] load config from external file --- test/test_results.py | 76 
+++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 50 deletions(-) diff --git a/test/test_results.py b/test/test_results.py index c7a50019..6b60e9d8 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -28,6 +28,16 @@ BROKEN = { "photobucket", } +CONFIG = { + "cache": { + "file": None, + }, + "downloader": { + "adjust-extensions": False, + "part": False, + }, +} + class TestExtractorResults(unittest.TestCase): @@ -348,56 +358,21 @@ class TestFormatter(formatter.StringFormatter): def setup_test_config(): - name = "gallerydl" - email = "gallerydl@openaliasbox.org" - email2 = "gallerydl@protonmail.com" - - config.clear() - config.set(("cache",), "file", None) - config.set(("downloader",), "part", False) - config.set(("downloader",), "adjust-extensions", False) - config.set(("extractor" ,), "timeout" , 60) - config.set(("extractor" ,), "username", name) - config.set(("extractor" ,), "password", name) - - config.set(("extractor", "nijie") , "username", email) - config.set(("extractor", "seiga") , "username", email) - config.set(("extractor", "horne") , "username", email2) - config.set(("extractor", "pinterest") , "username", email2) - config.set(("extractor", "pinterest") , "username", None) # login broken - - config.set(("extractor", "newgrounds"), "username", "d1618111") - config.set(("extractor", "newgrounds"), "password", "d1618111") - - config.set(("extractor", "mangoxo") , "username", "LiQiang3") - config.set(("extractor", "mangoxo") , "password", "5zbQF10_5u25259Ma") - - for category in ("danbooru", "atfbooru", "aibooru", "booruvar", - "e621", "e926", "e6ai", - "instagram", "twitter", "subscribestar", "deviantart", - "inkbunny", "tapas", "pillowfort", "mangadex", - "vipergirls"): - config.set(("extractor", category), "username", None) - - config.set(("extractor", "mastodon.social"), "access-token", - "Blf9gVqG7GytDTfVMiyYQjwVMQaNACgf3Ds3IxxVDUQ") - - config.set(("extractor", "nana"), "favkey", - 
"9237ddb82019558ea7d179e805100805" - "ea6aa1c53ca6885cd4c179f9fb22ead2") - - config.set(("extractor", "deviantart"), "client-id", "7777") - config.set(("extractor", "deviantart"), "client-secret", - "ff14994c744d9208e5caeec7aab4a026") - - config.set(("extractor", "tumblr"), "api-key", - "0cXoHfIqVzMQcc3HESZSNsVlulGxEXGDTTZCDrRrjaa0jmuTc6") - config.set(("extractor", "tumblr"), "api-secret", - "6wxAK2HwrXdedn7VIoZWxGqVhZ8JdYKDLjiQjL46MLqGuEtyVj") - config.set(("extractor", "tumblr"), "access-token", - "N613fPV6tOZQnyn0ERTuoEZn0mEqG8m2K8M3ClSJdEHZJuqFdG") - config.set(("extractor", "tumblr"), "access-token-secret", - "sgOA7ZTT4FBXdOGGVV331sSp0jHYp4yMDRslbhaQf7CaS71i4O") + config._config.update(CONFIG) + + +def load_test_config(): + try: + path = os.path.join( + os.path.dirname(os.path.dirname(__file__)), + "archive", "config.json") + with open(path) as fp: + CONFIG.update(json.loads(fp.read())) + except FileNotFoundError: + pass + except Exception as exc: + print("Error when loading {}: {}: {}".format( + path, exc.__class__.__name__, exc)) def generate_tests(): @@ -446,6 +421,7 @@ def generate_tests(): setattr(TestExtractorResults, method.__name__, method) +load_test_config() generate_tests() if __name__ == "__main__": unittest.main(warnings="ignore") From b97af09e03ada519aeed0a6f723fd2e733732811 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 12 Jan 2024 03:23:21 +0100 Subject: [PATCH 295/344] [tests] include URL in failure report --- test/test_results.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_results.py b/test/test_results.py index 6b60e9d8..575fc0f3 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -417,6 +417,7 @@ def generate_tests(): enum[name] += 1 method = _generate_method(result) + method.__doc__ = result["#url"] method.__name__ = "test_{}_{}".format(name, enum[name]) setattr(TestExtractorResults, method.__name__, method) From b1c175fdd1a5f5258d0f6aace5d3639446847e22 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 12 Jan 2024 16:38:18 +0100 Subject: [PATCH 296/344] allow using an empty string as argument for -D/--directory --- gallery_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index fff53eb5..19ea77b2 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -45,7 +45,7 @@ def main(): elif filename.startswith("\\f"): filename = "\f" + filename[2:] config.set((), "filename", filename) - if args.directory: + if args.directory is not None: config.set((), "base-directory", args.directory) config.set((), "directory", ()) if args.postprocessors: From 8995fd5f0114695732bb994a61abe317f72d9bde Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Sat, 13 Jan 2024 09:55:39 +1100 Subject: [PATCH 297/344] [steamgriddb] implement suggestions --- docs/configuration.rst | 2 +- gallery_dl/extractor/steamgriddb.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index cfd67b3d..e54b2e6e 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3131,7 +3131,7 @@ Type Default ``"all"`` Examples - * ``"png,jpg"`` + * ``"png,jpeg"`` * ``["jpeg", "webp"]`` Description Only include assets that are in the specified file types. 
``all`` can be diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py index 1f803ffd..eb00a9f4 100644 --- a/gallery_dl/extractor/steamgriddb.py +++ b/gallery_dl/extractor/steamgriddb.py @@ -56,16 +56,16 @@ class SteamgriddbExtractor(Extractor): download_fake_png = self.config("download-fake-png", True) for asset in self.assets(): - urls = [asset["url"]] + urls = (asset["url"],) if download_fake_png and asset.get("fake_png"): - urls.append(asset["fake_png"]) + urls = (asset["url"], asset["fake_png"]) asset["count"] = len(urls) yield Message.Directory, asset for asset["num"], url in enumerate(urls, 1): yield Message.Url, url, text.nameext_from_url(url, asset) - def _call(self, endpoint: str, **kwargs): + def _call(self, endpoint, **kwargs): data = self.request(self.root + endpoint, **kwargs).json() if not data["success"]: raise exception.StopExtraction(data["error"]) @@ -129,17 +129,16 @@ class SteamgriddbAssetsExtractor(SteamgriddbExtractor): asset["game"] = data["game"] yield asset - if data["total"] > limit * page: - page += 1 - else: + if data["total"] <= limit * page: break + page += 1 def config_list(self, key, type_name, valid_values): - value = self.config(key, ["all"]) + value = self.config(key) if isinstance(value, str): value = value.split(",") - if "all" in value: + if value is None or "all" in value: return ["all"] for i in value: From 65f42442f562e91f74e3f4881f059007552be41e Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Sat, 13 Jan 2024 10:12:15 +1100 Subject: [PATCH 298/344] [steamgriddb] implement another suggestion --- gallery_dl/extractor/steamgriddb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py index eb00a9f4..9d46fd6b 100644 --- a/gallery_dl/extractor/steamgriddb.py +++ b/gallery_dl/extractor/steamgriddb.py @@ -56,9 +56,10 @@ class SteamgriddbExtractor(Extractor): download_fake_png = 
self.config("download-fake-png", True) for asset in self.assets(): - urls = (asset["url"],) if download_fake_png and asset.get("fake_png"): urls = (asset["url"], asset["fake_png"]) + else: + urls = (asset["url"],) asset["count"] = len(urls) yield Message.Directory, asset From 293f1559dfb24ccdb823f4bd023f6a9d1b88fb6f Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Sat, 13 Jan 2024 10:42:22 +1100 Subject: [PATCH 299/344] [hatenablog] implement suggestions --- gallery_dl/extractor/hatenablog.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py index dd1e45a5..40c36bb6 100644 --- a/gallery_dl/extractor/hatenablog.py +++ b/gallery_dl/extractor/hatenablog.py @@ -13,7 +13,7 @@ from .. import text BASE_PATTERN = ( r"(?:hatenablog:https?://([^/]+)|(?:https?://)?" - r"([\w-]+\.(?:hatenablog\.com|hatenablog\.jp" + r"([\w-]+\.(?:hatenablog\.(?:com|jp)" r"|hatenadiary\.com|hateblo\.jp)))" ) QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$" @@ -28,29 +28,26 @@ class HatenablogExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.domain = match.group(1) or match.group(2) - self._find_img = re.compile(r'<img +(.+?) 
*/?>').finditer - self._is_image = re.compile( - r'(?: |^)class="hatena-fotolife"(?: |$)').search - self._find_img_src = re.compile(r'(?: |^)src="(.+?)"(?: |$)').search + + def _init(self): + self._find_img = re.compile(r'<img +([^>]+)').finditer def _handle_article(self, article: str): extr = text.extract_from(article) date = text.parse_datetime(extr('<time datetime="', '"')) - entry_link = text.unescape(extr( - '<a href="', '" class="entry-title-link bookmark">')) + entry_link = text.unescape(extr('<a href="', '"')) entry = entry_link.partition("/entry/")[2] - title = extr('', '</a>') + title = text.unescape(extr('>', '<')) content = extr( '<div class="entry-content hatenablog-entry">', '</div>') images = [] for i in self._find_img(content): attributes = i.group(1) - if not self._is_image(attributes): + if 'class="hatena-fotolife"' not in attributes: continue - image = text.unescape(self._find_img_src(attributes).group(1)) + image = text.unescape(text.extr(attributes, 'src="', '"')) images.append(image) data = { @@ -74,8 +71,11 @@ class HatenablogEntriesExtractor(HatenablogExtractor): self.path = match.group(3) self.query = {key: value for key, value in text.parse_query( match.group(4)).items() if self._acceptable_query(key)} + + def _init(self): + HatenablogExtractor._init(self) self._find_pager_url = re.compile( - r'<span class="pager-next">\s*<a href="(.+?)"').search + r' class="pager-next">\s*<a href="([^"]+)').search def items(self): url = "https://" + self.domain + self.path @@ -117,7 +117,7 @@ class HatenablogEntriesExtractor(HatenablogExtractor): article = extr('', '</article>') yield from self._handle_article(article) - def _acceptable_query(self, key: str) -> bool: + def _acceptable_query(self, key): return key == "page" or key in self.allowed_parameters @@ -154,8 +154,8 @@ class HatenablogHomeExtractor(HatenablogEntriesExtractor): class HatenablogArchiveExtractor(HatenablogEntriesExtractor): """Extractor for a blog's archive page""" subcategory = 
"archive" - pattern = BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?" + \ - r"|/category/[^?#]+)?)" + QUERY_RE + pattern = (BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?" + r"|/category/[^?#]+)?)" + QUERY_RE) example = "https://BLOG.hatenablog.com/archive/2024" From 9f53daabb8e031871a604707bcc46f5359818910 Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Sat, 13 Jan 2024 10:43:25 +1100 Subject: [PATCH 300/344] [hatenablog] implement additional suggestion --- gallery_dl/extractor/hatenablog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py index 40c36bb6..792f6664 100644 --- a/gallery_dl/extractor/hatenablog.py +++ b/gallery_dl/extractor/hatenablog.py @@ -12,7 +12,7 @@ from .. import text BASE_PATTERN = ( - r"(?:hatenablog:https?://([^/]+)|(?:https?://)?" + r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?" r"([\w-]+\.(?:hatenablog\.(?:com|jp)" r"|hatenadiary\.com|hateblo\.jp)))" ) From bb446b15983ff0c09245fe58e9e9b997b73c4d77 Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Sun, 14 Jan 2024 19:26:49 +1100 Subject: [PATCH 301/344] [webtoons] extract more metadata --- gallery_dl/extractor/webtoons.py | 37 +++++++++++++++++++------------- test/results/webtoons.py | 12 +++++++++++ 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index 3f2f410d..1c7af470 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -87,23 +87,30 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): self.episode_no = params.get("episode_no") def metadata(self, page): - keywords, pos = text.extract( - page, '<meta name="keywords" content="', '"') - title, pos = text.extract( - page, '<meta property="og:title" content="', '"', pos) - descr, pos = text.extract( - page, '<meta property="og:description" content="', '"', pos) + extr 
= text.extract_from(page) + keywords = extr('<meta name="keywords" content="', '"').split(", ") + title = extr('<meta property="og:title" content="', '"') + descr = extr('<meta property="og:description" content="', '"') + + author_area = extr('<div class="author_area">', '</div>') + aa_extr = text.extract_from(author_area) + username = aa_extr('/creator/', '"') + author_name = aa_extr('<span>', '</span>') return { - "genre" : self.genre, - "comic" : self.comic, - "title_no" : self.title_no, - "episode_no" : self.episode_no, - "title" : text.unescape(title), - "episode" : keywords.split(", ")[1], - "description": text.unescape(descr), - "lang" : self.lang, - "language" : util.code_to_language(self.lang), + "genre" : self.genre, + "comic" : self.comic, + "title_no" : self.title_no, + "episode_no" : self.episode_no, + "title" : text.unescape(title), + "episode" : keywords[1], + "comic_name" : text.unescape(keywords[0]), + "episode_name": text.unescape(keywords[2]), + "username" : username, + "author_name" : text.unescape(author_name), + "description" : text.unescape(descr), + "lang" : self.lang, + "language" : util.code_to_language(self.lang), } @staticmethod diff --git a/test/results/webtoons.py b/test/results/webtoons.py index d2a177fd..9ca93446 100644 --- a/test/results/webtoons.py +++ b/test/results/webtoons.py @@ -37,6 +37,18 @@ __tests__ = ( "title_no" : "312584", }, +{ + "#url" : "https://www.webtoons.com/en/canvas/i-want-to-be-a-cute-anime-girl/209-the-storys-story/viewer?title_no=349416&episode_no=214", + "#category": ("", "webtoons", "episode"), + "#class" : webtoons.WebtoonsEpisodeExtractor, + "#count" : 4, + + "comic_name" : "I want to be a cute anime girl", + "episode_name": "209 - The story's story", + "username" : "m9huj", + "author_name" : "Azul Crescent", +}, + { "#url" : "https://www.webtoons.com/en/comedy/live-with-yourself/list?title_no=919", "#comment" : "english", From 69726fc82c96d54af72746b379948ffef103070a Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 14 Jan 2024 22:09:26 +0100 Subject: [PATCH 302/344] [tests] skip tests requiring auth when non is provided --- test/results/coomerparty.py | 11 +++++++++-- test/results/kemonoparty.py | 3 +++ test/test_results.py | 25 +++++++++++++++++++++++-- 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/test/results/coomerparty.py b/test/results/coomerparty.py index dfc4a188..87c932e8 100644 --- a/test/results/coomerparty.py +++ b/test/results/coomerparty.py @@ -8,12 +8,19 @@ from gallery_dl.extractor import kemonoparty __tests__ = ( +{ + "#url" : "https://coomer.su/onlyfans/user/alinity/post/125962203", + "#comment" : "coomer (#2100)", + "#category": ("", "coomerparty", "onlyfans"), + "#class" : kemonoparty.KemonopartyPostExtractor, + "#urls" : "https://coomer.su/data/7d/3f/7d3fd9804583dc224968c0591163ec91794552b04f00a6c2f42a15b68231d5a8.jpg", +}, + { "#url" : "https://coomer.party/onlyfans/user/alinity/post/125962203", - "#comment" : "coomer.party (#2100)", "#category": ("", "coomerparty", "onlyfans"), "#class" : kemonoparty.KemonopartyPostExtractor, - "#pattern" : r"https://coomer\.party/data/7d/3f/7d3fd9804583dc224968c0591163ec91794552b04f00a6c2f42a15b68231d5a8\.jpg", + "#urls" : "https://coomer.party/data/7d/3f/7d3fd9804583dc224968c0591163ec91794552b04f00a6c2f42a15b68231d5a8.jpg", }, ) diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index ad94a496..5bd541a3 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -297,6 +297,7 @@ __tests__ = ( "#category": ("", "kemonoparty", "favorite"), "#class" : kemonoparty.KemonopartyFavoriteExtractor, "#pattern" : kemonoparty.KemonopartyUserExtractor.pattern, + "#auth" : True, "#count" : 3, "#sha1_url": "f4b5b796979bcba824af84206578c79101c7f0e1", }, @@ -306,6 +307,7 @@ __tests__ = ( "#category": ("", "kemonoparty", "favorite"), "#class" : kemonoparty.KemonopartyFavoriteExtractor, "#pattern" : 
kemonoparty.KemonopartyPostExtractor.pattern, + "#auth" : True, "#count" : 3, "#sha1_url": "ecfccf5f0d50b8d14caa7bbdcf071de5c1e5b90f", }, @@ -315,6 +317,7 @@ __tests__ = ( "#category": ("", "kemonoparty", "favorite"), "#class" : kemonoparty.KemonopartyFavoriteExtractor, "#pattern" : kemonoparty.KemonopartyPostExtractor.pattern, + "#auth" : True, "#count" : 3, "#sha1_url": "4be8e84cb384a907a8e7997baaf6287b451783b5", }, diff --git a/test/test_results.py b/test/test_results.py index 575fc0f3..12fe59d5 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -38,6 +38,15 @@ CONFIG = { }, } +AUTH = { + "pixiv", + "nijie", + "horne", + "seiga", + "instagram", + "twitter", +} + class TestExtractorResults(unittest.TestCase): @@ -76,6 +85,18 @@ class TestExtractorResults(unittest.TestCase): for key, value in result["#options"].items(): key = key.split(".") config.set(key[:-1], key[-1], value) + + requires_auth = result.get("#auth") + if requires_auth is None: + requires_auth = (result["#category"][1] in AUTH) + if requires_auth: + extr = result["#class"].from_url(result["#url"]) + if not any(extr.config(key) for key in ( + "username", "cookies", "api-key", "client-id")): + msg = "no auth" + self._skipped.append((result["#url"], msg)) + self.skipTest(msg) + if "#range" in result: config.set((), "image-range" , result["#range"]) config.set((), "chapter-range", result["#range"]) @@ -371,7 +392,7 @@ def load_test_config(): except FileNotFoundError: pass except Exception as exc: - print("Error when loading {}: {}: {}".format( + sys.exit("Error when loading {}: {}: {}".format( path, exc.__class__.__name__, exc)) @@ -422,7 +443,7 @@ def generate_tests(): setattr(TestExtractorResults, method.__name__, method) -load_test_config() generate_tests() if __name__ == "__main__": + load_test_config() unittest.main(warnings="ignore") From 6c4abc982e79b3f7b65bebbeddee01e32ec3f36d Mon Sep 17 00:00:00 2001 From: hunter-gatherer8 <hunter.gatherer8@proton.me> Date: Fri, 18 Aug 2023 
00:23:22 +0300 Subject: [PATCH 303/344] [2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites --- docs/supportedsites.md | 6 +++ gallery_dl/extractor/2ch.py | 84 ++++++++++++++++++++++++++++++++ gallery_dl/extractor/__init__.py | 1 + 3 files changed, 91 insertions(+) create mode 100644 gallery_dl/extractor/2ch.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3a704cf4..53c88335 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -13,6 +13,12 @@ Consider all listed sites to potentially be NSFW. </tr> </thead> <tbody valign="top"> +<tr> + <td>2ch</td> + <td>https://2ch.hk/</td> + <td>Boards, Threads</td> + <td></td> +</tr> <tr> <td>2chen</td> <td>https://sturdychan.help/</td> diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py new file mode 100644 index 00000000..f841dd3c --- /dev/null +++ b/gallery_dl/extractor/2ch.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.2ch.hk/""" + +from .common import Extractor, Message +from .. 
import text + + +class _2chThreadExtractor(Extractor): + """Extractor for 2ch threads""" + category = "2ch" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} {title}") + filename_fmt = "{file_id} - {filename}.{extension}" + archive_fmt = "{board}_{thread}_{file_id}" + pattern = r"(?:https?://)?2ch\.hk/([^/]+)/res/(\d+)\.html" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = f"https://2ch.hk/{self.board}/res/{self.thread}.json" + thread_data = self.request(url).json() + + posts = thread_data["threads"][0]["posts"] + post = posts[0] + title = post.get("subject") or text.remove_html(post["comment"]) + + thread_metadata = { + "board": self.board, + "thread": self.thread, + "title": text.unescape(title)[:50], + } + + yield Message.Directory, thread_metadata + for post in posts: + if "files" in post and post['files']: + for file in post['files']: + file_metadata = { + "post_num": post["num"], + "file_id": file["name"].split('.')[0], + "filename": ".".join(file["fullname"].split('.')[:-1]), + "extension": file["name"].split('.')[-1], + } + file_metadata.update(thread_metadata) + + url = f"https://2ch.hk/{file['path']}" + yield Message.Url, url, file_metadata + + +class _2chBoardExtractor(Extractor): + """Extractor for 2ch boards""" + category = "2ch" + subcategory = "board" + pattern = r"(?:https?://)?2ch\.hk/([a-z]+)/?$" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board = match.group(1) + + def get_pages(self): + url = f"https://2ch.hk/{self.board}/index.json" + index_page = self.request(url).json() + pages_total = len(index_page['pages']) + + yield index_page + for i in range(1, pages_total): + url = f"https://2ch.hk/{self.board}/{i}.json" + yield self.request(url).json() + + def get_thread_nums(self): + for page in self.get_pages(): + for thread in page["threads"]: + yield thread["thread_num"] + + def items(self): + for 
thread_num in self.get_thread_nums(): + url = f"https://2ch.hk/{self.board}/res/{thread_num}.html" + yield Message.Queue, url, {"_extractor": _2chThreadExtractor} diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 13d7b38b..8e712961 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -10,6 +10,7 @@ import sys import re modules = [ + "2ch", "2chan", "2chen", "35photo", From 68196589c42bf3fadea2437cf996293da1892176 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 8 Jan 2024 02:04:34 +0100 Subject: [PATCH 304/344] [2ch] update - simplify extractor code - more metadata - add tests --- gallery_dl/extractor/2ch.py | 95 ++++++++++++++++++++----------------- test/results/2ch.py | 64 +++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 44 deletions(-) create mode 100644 test/results/2ch.py diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py index f841dd3c..dbbf21b6 100644 --- a/gallery_dl/extractor/2ch.py +++ b/gallery_dl/extractor/2ch.py @@ -4,81 +4,88 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://www.2ch.hk/""" +"""Extractors for https://2ch.hk/""" from .common import Extractor, Message -from .. import text +from .. import text, util class _2chThreadExtractor(Extractor): """Extractor for 2ch threads""" category = "2ch" subcategory = "thread" + root = "https://2ch.hk" directory_fmt = ("{category}", "{board}", "{thread} {title}") - filename_fmt = "{file_id} - {filename}.{extension}" - archive_fmt = "{board}_{thread}_{file_id}" - pattern = r"(?:https?://)?2ch\.hk/([^/]+)/res/(\d+)\.html" + filename_fmt = "{tim}{filename:? 
//}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" + pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)" + example = "https://2ch.hk/a/res/12345.html" def __init__(self, match): Extractor.__init__(self, match) self.board, self.thread = match.groups() def items(self): - url = f"https://2ch.hk/{self.board}/res/{self.thread}.json" - thread_data = self.request(url).json() + url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread) + posts = self.request(url).json()["threads"][0]["posts"] - posts = thread_data["threads"][0]["posts"] - post = posts[0] - title = post.get("subject") or text.remove_html(post["comment"]) + op = posts[0] + title = op.get("subject") or text.remove_html(op["comment"]) - thread_metadata = { - "board": self.board, + thread = { + "board" : self.board, "thread": self.thread, - "title": text.unescape(title)[:50], + "title" : text.unescape(title)[:50], } - yield Message.Directory, thread_metadata + yield Message.Directory, thread for post in posts: - if "files" in post and post['files']: - for file in post['files']: - file_metadata = { - "post_num": post["num"], - "file_id": file["name"].split('.')[0], - "filename": ".".join(file["fullname"].split('.')[:-1]), - "extension": file["name"].split('.')[-1], - } - file_metadata.update(thread_metadata) + files = post.get("files") + if files: + post["post_name"] = post["name"] + post["date"] = text.parse_timestamp(post["timestamp"]) + del post["files"] + del post["name"] - url = f"https://2ch.hk/{file['path']}" - yield Message.Url, url, file_metadata + for file in files: + file.update(thread) + file.update(post) + + file["filename"] = file["fullname"].rpartition(".")[0] + file["tim"], _, file["extension"] = \ + file["name"].rpartition(".") + + yield Message.Url, self.root + file["path"], file class _2chBoardExtractor(Extractor): """Extractor for 2ch boards""" category = "2ch" subcategory = "board" - pattern = r"(?:https?://)?2ch\.hk/([a-z]+)/?$" + root = "https://2ch.hk" + pattern = 
r"(?:https?://)?2ch\.hk/([^/?#]+)/?$" + example = "https://2ch.hk/a/" def __init__(self, match): Extractor.__init__(self, match) self.board = match.group(1) - def get_pages(self): - url = f"https://2ch.hk/{self.board}/index.json" - index_page = self.request(url).json() - pages_total = len(index_page['pages']) - - yield index_page - for i in range(1, pages_total): - url = f"https://2ch.hk/{self.board}/{i}.json" - yield self.request(url).json() - - def get_thread_nums(self): - for page in self.get_pages(): - for thread in page["threads"]: - yield thread["thread_num"] - def items(self): - for thread_num in self.get_thread_nums(): - url = f"https://2ch.hk/{self.board}/res/{thread_num}.html" - yield Message.Queue, url, {"_extractor": _2chThreadExtractor} + # index page + url = "{}/{}/index.json".format(self.root, self.board) + index = self.request(url).json() + index["_extractor"] = _2chThreadExtractor + for thread in index["threads"]: + url = "{}/{}/res/{}.html".format( + self.root, self.board, thread["thread_num"]) + yield Message.Queue, url, index + + # pages 1..n + for n in util.advance(index["pages"], 1): + url = "{}/{}/{}.json".format(self.root, self.board, n) + page = self.request(url).json() + page["_extractor"] = _2chThreadExtractor + for thread in page["threads"]: + url = "{}/{}/res/{}.html".format( + self.root, self.board, thread["thread_num"]) + yield Message.Queue, url, page diff --git a/test/results/2ch.py b/test/results/2ch.py new file mode 100644 index 00000000..5400292c --- /dev/null +++ b/test/results/2ch.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +gallery_dl = __import__("gallery_dl.extractor.2ch") +_2ch = getattr(gallery_dl.extractor, "2ch") + + +__tests__ = ( +{ + "#url" : "https://2ch.hk/a/res/6202876.html", + "#category": ("", "2ch", "thread"), + "#class" : _2ch._2chThreadExtractor, + "#pattern" : r"https://2ch\.hk/a/src/6202876/\d+\.\w+", + "#count" : range(450, 1000), + + "banned" : 0, + "board" : "a", + "closed" : 0, + "comment" : str, + "date" : "type:datetime", + "displayname": str, + "email" : "", + "endless" : 1, + "extension": str, + "filename" : str, + "fullname" : str, + "height" : int, + "lasthit" : 1705273977, + "md5" : r"re:[0-9a-f]{32}", + "name" : r"re:\d+\.\w+", + "num" : int, + "number" : range(1, 1000), + "op" : 0, + "parent" : int, + "path" : r"re:/a/src/6202876/\d+\.\w+", + "post_name": "Аноним", + "size" : int, + "sticky" : 0, + "subject" : str, + "thread" : "6202876", + "thumbnail": str, + "tim" : r"re:\d+", + "timestamp": int, + "title" : "MP4/WEBM", + "tn_height": int, + "tn_width" : int, + "trip" : "", + "type" : int, + "views" : int, + "width" : int, +}, + +{ + "#url" : "https://2ch.hk/a/", + "#category": ("", "2ch", "board"), + "#class" : _2ch._2chBoardExtractor, + "#pattern" : _2ch._2chThreadExtractor.pattern, + "#count" : range(200, 300), +}, + +) From 4cedf378d5548889256b0192ab4e081e5c570f03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 15 Jan 2024 16:28:57 +0100 Subject: [PATCH 305/344] [deviantart] fix AttributeError for URLs without username (#5065) caused by 4f367145 --- gallery_dl/extractor/deviantart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 7df1890e..a46517cd 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -38,7 +38,7 @@ class DeviantartExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.user = (match.group(1) or 
match.group(2)).lower() + self.user = (match.group(1) or match.group(2) or "").lower() self.offset = 0 def _init(self): From 90b382304a1e8580f888a2c84ca95f74c2827710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 15 Jan 2024 17:30:03 +0100 Subject: [PATCH 306/344] [deviantart] fix KeyError: 'premium_folder_data' (#5063) --- gallery_dl/extractor/deviantart.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index a46517cd..bcfbe73b 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -452,9 +452,11 @@ class DeviantartExtractor(Extractor): return None dev = self.api.deviation(deviation["deviationid"], False) - folder = dev["premium_folder_data"] + folder = deviation["premium_folder_data"] username = dev["author"]["username"] - has_access = folder["has_access"] + + # premium_folder_data is no longer present when user has access (#5063) + has_access = ("premium_folder_data" not in dev) or folder["has_access"] if not has_access and folder["type"] == "watchers" and \ self.config("auto-watch"): From 8ffa0cd3c8c4f6d15ad281bf449812b7bf415bcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 15 Jan 2024 18:24:47 +0100 Subject: [PATCH 307/344] [webtoons] small optimization don't extract the entire 'author_area' and avoid creating a second 'text.extract_from()' object --- gallery_dl/extractor/webtoons.py | 9 +++++---- test/results/webtoons.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index 1c7af470..a4259358 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -92,10 +92,11 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): title = extr('<meta property="og:title" content="', '"') descr = 
extr('<meta property="og:description" content="', '"') - author_area = extr('<div class="author_area">', '</div>') - aa_extr = text.extract_from(author_area) - username = aa_extr('/creator/', '"') - author_name = aa_extr('<span>', '</span>') + if extr('<div class="author_area"', '\n'): + username = extr('/creator/', '"') + author_name = extr('<span>', '</span>') + else: + username = author_name = "" return { "genre" : self.genre, diff --git a/test/results/webtoons.py b/test/results/webtoons.py index 9ca93446..82831f02 100644 --- a/test/results/webtoons.py +++ b/test/results/webtoons.py @@ -20,6 +20,22 @@ __tests__ = ( "42055e44659f6ffc410b3fb6557346dfbb993df3", "49e1f2def04c6f7a6a3dacf245a1cd9abe77a6a9", ], + + "author_name" : "Chris McCoy", + "comic" : "safely-endangered", + "comic_name" : "Safely Endangered", + "count" : 5, + "description" : "Silly comics for silly people.", + "episode" : "572", + "episode_name": "Ep. 572 - Earth", + "episode_no" : "572", + "genre" : "comedy", + "lang" : "en", + "language" : "English", + "num" : range(1, 5), + "title" : "Safely Endangered - Ep. 
572 - Earth", + "title_no" : "352", + "username" : "safelyendangered", }, { From 4d6ec6958d29bd22739ba5fe27086e715d51fbc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 15 Jan 2024 22:37:33 +0100 Subject: [PATCH 308/344] [scripts] add 'push --force' to pull-request --- scripts/pull-request | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/pull-request b/scripts/pull-request index defdc11f..dea9b292 100755 --- a/scripts/pull-request +++ b/scripts/pull-request @@ -41,6 +41,10 @@ case "${2,,}" in call git push "$USER" HEAD:"$BRANCH" ;; +"pf"|"push-force") + call git push --force "$USER" HEAD:"$BRANCH" + ;; + "d"|"delete") call git switch master call git branch -D "$USER-$BRANCH" From 3d68eda4abcfde18ecf377f140b8ad6ec4c2de6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 16 Jan 2024 00:24:30 +0100 Subject: [PATCH 309/344] [kemonoparty] add 'revision_hash' metadata (#4706, #4727, #5013) A SHA1 hexdigest of other relevant metadata fields like title, content, file and attachment URLs. This value does NOT reflect which revisions are listed on the website. Neither does 'edited' or any other metadata field (combinations). --- gallery_dl/extractor/kemonoparty.py | 26 ++++++++++++++++++++++---- test/results/kemonoparty.py | 2 ++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index c24e57d1..10228b5c 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -9,9 +9,10 @@ """Extractors for https://kemono.party/""" from .common import Extractor, Message -from .. import text, exception +from .. 
import text, util, exception from ..cache import cache, memcache import itertools +import json import re BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)" @@ -37,10 +38,14 @@ class KemonopartyExtractor(Extractor): Extractor.__init__(self, match) def _init(self): + self.revisions = self.config("revisions") self._prepare_ddosguard_cookies() self._find_inline = re.compile( r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+' r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall + self._json_dumps = json.JSONEncoder( + ensure_ascii=False, check_circular=False, + sort_keys=True, separators=(",", ":")).encode def items(self): find_hash = re.compile(HASH_PATTERN).match @@ -223,11 +228,23 @@ class KemonopartyExtractor(Extractor): idx = len(revs) for rev in revs: + rev["revision_hash"] = self._revision_hash(rev) rev["revision_index"] = idx idx -= 1 return revs + def _revision_hash(self, revision): + rev = revision.copy() + rev.pop("revision_id", None) + rev.pop("added", None) + rev.pop("next", None) + rev.pop("prev", None) + rev["file"].pop("name", None) + for a in rev["attachments"]: + a.pop("name", None) + return util.sha1(self._json_dumps(rev)) + def _validate(response): return (response.headers["content-length"] != "9" or @@ -252,13 +269,13 @@ class KemonopartyUserExtractor(KemonopartyExtractor): url = self.api_url params = text.parse_query(self.query) params["o"] = text.parse_int(params.get("o")) - revisions = self.config("revisions") while True: posts = self.request(url, params=params).json() - if revisions: + if self.revisions: for post in posts: + post["revision_hash"] = self._revision_hash(post) post["revision_id"] = 0 post_url = "{}/post/{}".format(self.api_url, post["id"]) try: @@ -296,7 +313,8 @@ class KemonopartyPostExtractor(KemonopartyExtractor): def posts(self): if not self.revision: post = self.request(self.api_url).json() - if self.config("revisions"): + if self.revisions: + post["revision_hash"] = 
self._revision_hash(post) post["revision_id"] = 0 try: revs = self._post_revisions(self.api_url) diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index 5bd541a3..c3dbdf73 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -177,6 +177,7 @@ __tests__ = ( "revision_id": 142470, "revision_index": 2, + "revision_hash": "e0e93281495e151b11636c156e52bfe9234c2a40", }, { @@ -190,6 +191,7 @@ __tests__ = ( "revision_id": range(134996, 3052965), "revision_index": range(1, 9), + "revision_hash": r"re:^[0-9a-f]{40}$", }, From e33056adcd1469a80f1f7656848d1cf6cde5b3f6 Mon Sep 17 00:00:00 2001 From: Ailothaen <mail@ailothaen.fr> Date: Sun, 27 Feb 2022 19:40:15 +0100 Subject: [PATCH 310/344] [wikimedia] Add Wikipedia/Wikimedia extractor --- gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/wikimedia.py | 172 ++++++++++++++++++++++++++++++ 2 files changed, 173 insertions(+) create mode 100644 gallery_dl/extractor/wikimedia.py diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 8e712961..86308917 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -178,6 +178,7 @@ modules = [ "weibo", "wikiart", "wikifeet", + "wikimedia", "xhamster", "xvideos", "zerochan", diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py new file mode 100644 index 00000000..41cc1c9e --- /dev/null +++ b/gallery_dl/extractor/wikimedia.py @@ -0,0 +1,172 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022-2022 Ailothaen +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Wikimedia and Wikipedia. 
+(Other Mediawiki instances use the same API,so a similar extractor +could be written) + +Various reference: +https://www.mediawiki.org/wiki/API:Query +https://opendata.stackexchange.com/questions/13381/wikimedia-commons-api-image-by-category +""" + +from .common import Extractor, Message +import time +import re + + +class WikimediaArticleExtractor(Extractor): + category = "wikimedia" + subcategory = "article" + filename_fmt = "{filename}.{extension}" + archive_fmt = "{filename}" + pattern = r"https?://([a-z]{2,})\.wikipedia\.org/wiki/([^#/\?]+)" + directory_fmt = ("{category}", "{page}") + test = ( + ("https://en.wikipedia.org/wiki/Athena"), + ("https://zh.wikipedia.org/wiki/太阳"), + ("https://simple.wikipedia.org/wiki/Hydrogen", { + "count": ">= 2" + }) + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.lang, self.page = match.groups() + + def items(self): + continuation = None + gimcontinuation = None + + while True: + if continuation is None: + file_list_request = self.request( + "https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format( # noqa + lang=self.lang, page=self.page + ) + ) + else: + file_list_request = self.request( + "https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gimcontinue={gimcontinuation}".format( # noqa + lang=self.lang, + page=self.page, + continuation=continuation, + gimcontinuation=gimcontinuation, + ) + ) + file_list = file_list_request.json() + + for file_index in list(file_list["query"]["pages"]): + image = file_list["query"]["pages"][file_index]["imageinfo"][0] + + metadata = image + metadata["filename"] = WikimediaUtils.clean_name( + 
image["canonicaltitle"] + )[0] + metadata["extension"] = WikimediaUtils.clean_name( + image["canonicaltitle"] + )[1] + + yield Message.Directory, {"page": self.page, "lang": self.lang} + yield Message.Url, image["url"], image + else: + # We arrived at the end of the response + # checking if there are more files to retrieve + try: + continuation_info = file_list["continue"] + except KeyError: + # No more continuation info: all files were retrieved + break + else: + # Continuation info is present + # there are still files to retrieve + continuation = continuation_info["continue"] + gimcontinuation = continuation_info["gimcontinue"] + + # giving a rest to Wikipedia API + time.sleep(1) + + +class WikimediaCategoryExtractor(Extractor): + category = "wikimedia" + subcategory = "category" + filename_fmt = "{filename}.{extension}" + archive_fmt = "{filename}" + pattern = r"https?://commons.wikimedia.org/wiki/Category:([^#/\?]+)" + directory_fmt = ("{category}", "{page}") + + test = ( + ("https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro"), # noqa + ("https://commons.wikimedia.org/wiki/Category:Tyto_alba_in_flight_(captive)", { # noqa + "count": ">= 21" + }) + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.page = match.groups()[0] + + def items(self): + continuation = None + gcmcontinuation = None + + while True: + if continuation is None: + file_list_request = self.request( + "https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format( # noqa + page=self.page + ) + ) + else: + file_list_request = self.request( + 
"https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gcmcontinue={gcmcontinuation}".format( # noqa + page=self.page, + continuation=continuation, + gcmcontinuation=gcmcontinuation, + ) + ) + file_list = file_list_request.json() + + for file_index in list(file_list["query"]["pages"]): + image = file_list["query"]["pages"][file_index]["imageinfo"][0] + + metadata = image + metadata["filename"] = WikimediaUtils.clean_name( + image["canonicaltitle"] + )[0] + metadata["extension"] = WikimediaUtils.clean_name( + image["canonicaltitle"] + )[1] + + yield Message.Directory, {"page": self.page, "lang": "common"} + yield Message.Url, image["url"], image + else: + # We arrived at the end of the response + # checking if there are more files to retrieve + try: + continuation_info = file_list["continue"] + except KeyError: + # No more continuation info: all files were retrieved + break + else: + # Continuation info is present + # there are still files to retrieve + continuation = continuation_info["continue"] + gcmcontinuation = continuation_info["gcmcontinue"] + + # giving a rest to Wikipedia API + time.sleep(1) + + +class WikimediaUtils: + @staticmethod + def clean_name(name): + name = re.sub(r"^\w+:", "", name) + filename = ".".join(name.split(".")[:-1]) + extension = name.split(".")[-1] + return filename, extension From 221f54309cf5437ad887e89a5c71d1a4263294d6 Mon Sep 17 00:00:00 2001 From: Ailothaen <mail@ailothaen.fr> Date: Mon, 25 Apr 2022 23:14:16 +0200 Subject: [PATCH 311/344] [wikimedia] Improved archive identifiers --- gallery_dl/extractor/wikimedia.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 41cc1c9e..a2ddfa2c 100644 --- 
a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -24,7 +24,7 @@ class WikimediaArticleExtractor(Extractor): category = "wikimedia" subcategory = "article" filename_fmt = "{filename}.{extension}" - archive_fmt = "{filename}" + archive_fmt = "a_{sha1}" pattern = r"https?://([a-z]{2,})\.wikipedia\.org/wiki/([^#/\?]+)" directory_fmt = ("{category}", "{page}") test = ( @@ -96,7 +96,7 @@ class WikimediaCategoryExtractor(Extractor): category = "wikimedia" subcategory = "category" filename_fmt = "{filename}.{extension}" - archive_fmt = "{filename}" + archive_fmt = "c_{sha1}" pattern = r"https?://commons.wikimedia.org/wiki/Category:([^#/\?]+)" directory_fmt = ("{category}", "{page}") From c3c1635ef35df7ef3f8884bd933578e79a2ade8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 16 Jan 2024 22:08:03 +0100 Subject: [PATCH 312/344] [wikimedia] update - rewrite using BaseExtractor - support most Wiki* domains - update docs/supportedsites - add tests --- docs/supportedsites.md | 58 +++++++ gallery_dl/extractor/wikimedia.py | 274 ++++++++++++++---------------- scripts/supportedsites.py | 1 + test/results/wikibooks.py | 23 +++ test/results/wikimediacommons.py | 23 +++ test/results/wikinews.py | 23 +++ test/results/wikipedia.py | 53 ++++++ test/results/wikiquote.py | 23 +++ test/results/wikisource.py | 23 +++ test/results/wikispecies.py | 25 +++ test/results/wikiversity.py | 23 +++ test/results/wiktionary.py | 23 +++ 12 files changed, 421 insertions(+), 151 deletions(-) create mode 100644 test/results/wikibooks.py create mode 100644 test/results/wikimediacommons.py create mode 100644 test/results/wikinews.py create mode 100644 test/results/wikipedia.py create mode 100644 test/results/wikiquote.py create mode 100644 test/results/wikisource.py create mode 100644 test/results/wikispecies.py create mode 100644 test/results/wikiversity.py create mode 100644 test/results/wiktionary.py diff --git 
a/docs/supportedsites.md b/docs/supportedsites.md index 53c88335..d3d2a8a3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1478,6 +1478,64 @@ Consider all listed sites to potentially be NSFW. <td></td> </tr> +<tr> + <td colspan="4"><strong>Wikimedia Instances</strong></td> +</tr> +<tr> + <td>Wikipedia</td> + <td>https://www.wikipedia.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wiktionary</td> + <td>https://www.wiktionary.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikiquote</td> + <td>https://www.wikiquote.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikibooks</td> + <td>https://www.wikibooks.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikisource</td> + <td>https://www.wikisource.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikinews</td> + <td>https://www.wikinews.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikiversity</td> + <td>https://www.wikiversity.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikispecies</td> + <td>https://species.wikimedia.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikimedia Commons</td> + <td>https://commons.wikimedia.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> + <tr> <td colspan="4"><strong>Moebooru and MyImouto</strong></td> </tr> diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index a2ddfa2c..1a896515 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -1,172 +1,144 @@ # -*- coding: utf-8 -*- -# Copyright 2022-2022 Ailothaen +# Copyright 2022 Ailothaen +# Copyright 2024 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-"""Extractors for Wikimedia and Wikipedia. -(Other Mediawiki instances use the same API,so a similar extractor -could be written) +"""Extractors for Wikimedia and Wikipedia""" -Various reference: -https://www.mediawiki.org/wiki/API:Query -https://opendata.stackexchange.com/questions/13381/wikimedia-commons-api-image-by-category -""" +from .common import BaseExtractor, Message +from .. import text -from .common import Extractor, Message -import time -import re - -class WikimediaArticleExtractor(Extractor): - category = "wikimedia" - subcategory = "article" - filename_fmt = "{filename}.{extension}" - archive_fmt = "a_{sha1}" - pattern = r"https?://([a-z]{2,})\.wikipedia\.org/wiki/([^#/\?]+)" +class WikimediaExtractor(BaseExtractor): + """Base class for wikimedia extractors""" + basecategory = "wikimedia" directory_fmt = ("{category}", "{page}") - test = ( - ("https://en.wikipedia.org/wiki/Athena"), - ("https://zh.wikipedia.org/wiki/太阳"), - ("https://simple.wikipedia.org/wiki/Hydrogen", { - "count": ">= 2" - }) - ) + archive_fmt = "{sha1}" + request_interval = (1.0, 2.0) def __init__(self, match): - Extractor.__init__(self, match) - self.lang, self.page = match.groups() + BaseExtractor.__init__(self, match) + self.title = match.group(match.lastindex) def items(self): - continuation = None - gimcontinuation = None + for info in self._pagination(self.params): + image = info["imageinfo"][0] + + image["metadata"] = { + m["name"]: m["value"] + for m in image["metadata"]} + image["commonmetadata"] = { + m["name"]: m["value"] + for m in image["commonmetadata"]} + + filename = image["canonicaltitle"] + image["filename"], _, image["extension"] = \ + filename.partition(":")[2].rpartition(".") + image["date"] = text.parse_datetime( + image["timestamp"], "%Y-%m-%dT%H:%M:%SZ") + image["page"] = self.title + + yield Message.Directory, image + yield Message.Url, image["url"], image + + def _pagination(self, params): + """ + https://www.mediawiki.org/wiki/API:Query + 
https://opendata.stackexchange.com/questions/13381 + """ + + url = self.root + "/w/api.php" + params["action"] = "query" + params["format"] = "json" while True: - if continuation is None: - file_list_request = self.request( - "https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format( # noqa - lang=self.lang, page=self.page - ) - ) - else: - file_list_request = self.request( - "https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gimcontinue={gimcontinuation}".format( # noqa - lang=self.lang, - page=self.page, - continuation=continuation, - gimcontinuation=gimcontinuation, - ) - ) - file_list = file_list_request.json() - - for file_index in list(file_list["query"]["pages"]): - image = file_list["query"]["pages"][file_index]["imageinfo"][0] - - metadata = image - metadata["filename"] = WikimediaUtils.clean_name( - image["canonicaltitle"] - )[0] - metadata["extension"] = WikimediaUtils.clean_name( - image["canonicaltitle"] - )[1] - - yield Message.Directory, {"page": self.page, "lang": self.lang} - yield Message.Url, image["url"], image - else: - # We arrived at the end of the response - # checking if there are more files to retrieve - try: - continuation_info = file_list["continue"] - except KeyError: - # No more continuation info: all files were retrieved - break - else: - # Continuation info is present - # there are still files to retrieve - continuation = continuation_info["continue"] - gimcontinuation = continuation_info["gimcontinue"] - - # giving a rest to Wikipedia API - time.sleep(1) - - -class WikimediaCategoryExtractor(Extractor): - category = "wikimedia" - subcategory = "category" - 
filename_fmt = "{filename}.{extension}" - archive_fmt = "c_{sha1}" - pattern = r"https?://commons.wikimedia.org/wiki/Category:([^#/\?]+)" - directory_fmt = ("{category}", "{page}") + data = self.request(url, params=params).json() - test = ( - ("https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro"), # noqa - ("https://commons.wikimedia.org/wiki/Category:Tyto_alba_in_flight_(captive)", { # noqa - "count": ">= 21" - }) - ) + try: + pages = data["query"]["pages"] + except KeyError: + pass + else: + yield from pages.values() + + try: + continuation = data["continue"] + except KeyError: + break + params.update(continuation) + + +BASE_PATTERN = WikimediaExtractor.update({ + "wikipedia": { + "root": None, + "pattern": r"[a-z]{2,}\.wikipedia\.org", + }, + "wiktionary": { + "root": None, + "pattern": r"[a-z]{2,}\.wiktionary\.org", + }, + "wikiquote": { + "root": None, + "pattern": r"[a-z]{2,}\.wikiquote\.org", + }, + "wikibooks": { + "root": None, + "pattern": r"[a-z]{2,}\.wikibooks\.org", + }, + "wikisource": { + "root": None, + "pattern": r"[a-z]{2,}\.wikisource\.org", + }, + "wikinews": { + "root": None, + "pattern": r"[a-z]{2,}\.wikinews\.org", + }, + "wikiversity": { + "root": None, + "pattern": r"[a-z]{2,}\.wikiversity\.org", + }, + "wikispecies": { + "root": "https://species.wikimedia.org", + "pattern": r"species\.wikimedia\.org", + }, + "wikimediacommons": { + "root": "https://commons.wikimedia.org", + "pattern": r"commons\.wikimedia\.org", + }, +}) + + +class WikimediaArticleExtractor(WikimediaExtractor): + """Extractor for wikimedia articles""" + subcategory = "article" + pattern = BASE_PATTERN + r"/wiki/(?!Category:)([^/?#]+)" + example = "https://en.wikipedia.org/wiki/TITLE" - def __init__(self, match): - Extractor.__init__(self, match) - self.page = match.groups()[0] + def _init(self): + self.params = { + "generator": "images", + "titles" : self.title, + "prop" : "imageinfo", + "iiprop": 
"timestamp|user|userid|comment|canonicaltitle|url|size|" + "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth", + } - def items(self): - continuation = None - gcmcontinuation = None - while True: - if continuation is None: - file_list_request = self.request( - "https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format( # noqa - page=self.page - ) - ) - else: - file_list_request = self.request( - "https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gcmcontinue={gcmcontinuation}".format( # noqa - page=self.page, - continuation=continuation, - gcmcontinuation=gcmcontinuation, - ) - ) - file_list = file_list_request.json() - - for file_index in list(file_list["query"]["pages"]): - image = file_list["query"]["pages"][file_index]["imageinfo"][0] - - metadata = image - metadata["filename"] = WikimediaUtils.clean_name( - image["canonicaltitle"] - )[0] - metadata["extension"] = WikimediaUtils.clean_name( - image["canonicaltitle"] - )[1] - - yield Message.Directory, {"page": self.page, "lang": "common"} - yield Message.Url, image["url"], image - else: - # We arrived at the end of the response - # checking if there are more files to retrieve - try: - continuation_info = file_list["continue"] - except KeyError: - # No more continuation info: all files were retrieved - break - else: - # Continuation info is present - # there are still files to retrieve - continuation = continuation_info["continue"] - gcmcontinuation = continuation_info["gcmcontinue"] - - # giving a rest to Wikipedia API - time.sleep(1) - - -class WikimediaUtils: - 
@staticmethod - def clean_name(name): - name = re.sub(r"^\w+:", "", name) - filename = ".".join(name.split(".")[:-1]) - extension = name.split(".")[-1] - return filename, extension +class WikimediaCategoryExtractor(WikimediaExtractor): + subcategory = "category" + pattern = BASE_PATTERN + r"/wiki/(Category:[^/?#]+)" + example = "https://commons.wikimedia.org/wiki/Category:NAME" + + def _init(self): + self.params = { + "generator": "categorymembers", + "gcmtitle" : self.title, + "gcmtype" : "file", + "prop" : "imageinfo", + "iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|" + "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth", + } diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index d3107b47..34566465 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -139,6 +139,7 @@ CATEGORY_MAP = { "webmshare" : "webmshare", "webtoons" : "Webtoon", "wikiart" : "WikiArt.org", + "wikimediacommons": "Wikimedia Commons", "xbunkr" : "xBunkr", "xhamster" : "xHamster", "xvideos" : "XVideos", diff --git a/test/results/wikibooks.py b/test/results/wikibooks.py new file mode 100644 index 00000000..882741d5 --- /dev/null +++ b/test/results/wikibooks.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikibooks.org/wiki/Title", + "#category": ("wikimedia", "wikibooks", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikibooks.org/wiki/Category:Title", + "#category": ("wikimedia", "wikibooks", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikimediacommons.py b/test/results/wikimediacommons.py new file mode 100644 index 00000000..6cc03e34 --- /dev/null +++ b/test/results/wikimediacommons.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://commons.wikimedia.org/wiki/File:Starr-050516-1367-Pimenta_dioica-flowers-Maunaloa-Molokai_(24762757525).jpg", + "#category": ("wikimedia", "wikimediacommons", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro", + "#category": ("wikimedia", "wikimediacommons", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikinews.py b/test/results/wikinews.py new file mode 100644 index 00000000..8a2af25e --- /dev/null +++ b/test/results/wikinews.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikinews.org/wiki/Title", + "#category": ("wikimedia", "wikinews", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikinews.org/wiki/Category:Title", + "#category": ("wikimedia", "wikinews", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikipedia.py b/test/results/wikipedia.py new file mode 100644 index 00000000..87499878 --- /dev/null +++ b/test/results/wikipedia.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikipedia.org/wiki/Title", + "#category": ("wikimedia", "wikipedia", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikipedia.org/wiki/Athena", + "#category": ("wikimedia", "wikipedia", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, + "#pattern" : r"https://upload.wikimedia.org/wikipedia/.+", + "#count" : range(50, 100), + + "bitdepth" : int, + "canonicaltitle": str, + "comment" : str, + "commonmetadata": dict, + "date" : "type:datetime", + "descriptionshorturl": str, + "descriptionurl": str, + "extension" : str, + "extmetadata" : dict, + "filename" : str, + "height" : int, + "metadata" : dict, + "mime" : r"re:image/\w+", + "page" : "Athena", + "sha1" : r"re:^[0-9a-f]{40}$", + "size" : int, + "timestamp" : str, + "url" : str, + "user" : str, + "userid" : int, + "width" : int, +}, + +{ + "#url" : "https://en.wikipedia.org/wiki/Category:Physics", + "#category": ("wikimedia", "wikipedia", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikiquote.py b/test/results/wikiquote.py new file mode 
100644 index 00000000..5e6fb321 --- /dev/null +++ b/test/results/wikiquote.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikiquote.org/wiki/Title", + "#category": ("wikimedia", "wikiquote", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikiquote.org/wiki/Category:Title", + "#category": ("wikimedia", "wikiquote", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikisource.py b/test/results/wikisource.py new file mode 100644 index 00000000..afdee23e --- /dev/null +++ b/test/results/wikisource.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikisource.org/wiki/Title", + "#category": ("wikimedia", "wikisource", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikisource.org/wiki/Category:Title", + "#category": ("wikimedia", "wikisource", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikispecies.py b/test/results/wikispecies.py new file mode 100644 index 00000000..d455fbac --- /dev/null +++ b/test/results/wikispecies.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://species.wikimedia.org/wiki/Geranospiza", + "#category": ("wikimedia", "wikispecies", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, + "#urls" : "https://upload.wikimedia.org/wikipedia/commons/0/01/Geranospiza_caerulescens.jpg", + "#sha1_content": "3a17c14b15489928e4154f826af1c42afb5a523e", +}, + +{ + "#url" : "https://species.wikimedia.org/wiki/Category:Names", + "#category": ("wikimedia", "wikispecies", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikiversity.py b/test/results/wikiversity.py new file mode 100644 index 00000000..58565f49 --- /dev/null +++ b/test/results/wikiversity.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikiversity.org/wiki/Title", + "#category": ("wikimedia", "wikiversity", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikiversity.org/wiki/Category:Title", + "#category": ("wikimedia", "wikiversity", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wiktionary.py b/test/results/wiktionary.py new file mode 100644 index 00000000..c7a016f5 --- /dev/null +++ b/test/results/wiktionary.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wiktionary.org/wiki/Word", + "#category": ("wikimedia", "wiktionary", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wiktionary.org/wiki/Category:Words", + "#category": ("wikimedia", "wiktionary", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) From 89066844f4937bcaa3b15efcd199431ab9d6d246 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 18 Jan 2024 03:20:36 +0100 Subject: [PATCH 313/344] add 'config_instance' method to allow for a more streamlined access to BaseExtractor instance options --- gallery_dl/extractor/common.py | 8 ++++++-- gallery_dl/extractor/gelbooru_v02.py | 12 +++--------- gallery_dl/extractor/mastodon.py | 12 ++++-------- gallery_dl/extractor/oauth.py | 4 ++-- gallery_dl/extractor/philomena.py | 11 +++-------- gallery_dl/extractor/shimmie2.py | 15 ++++----------- gallery_dl/extractor/urlshortener.py | 11 +++-------- scripts/supportedsites.py | 2 +- 8 files changed, 26 insertions(+), 49 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 0dd05ef2..cf0f8c90 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -102,6 +102,9 @@ class Extractor(): def config_accumulate(self, key): return config.accumulate(self._cfgpath, key) + def config_instance(self, key, default=None): + return default + def _config_shared(self, key, default=None): return config.interpolate_common( ("extractor",), self._cfgpath, key, default) @@ -735,9 +738,10 @@ class BaseExtractor(Extractor): for index, group in enumerate(match.groups()): if group is not None: if index: - self.category, self.root = self.instances[index-1] + self.category, self.root, info = self.instances[index-1] if not self.root: self.root = text.root_from_url(match.group(0)) + self.config_instance = info.get else: self.root = 
group self.category = group.partition("://")[2] @@ -757,7 +761,7 @@ class BaseExtractor(Extractor): root = info["root"] if root: root = root.rstrip("/") - instance_list.append((category, root)) + instance_list.append((category, root, info)) pattern = info.get("pattern") if not pattern: diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 0c8af3d5..f8ab71c9 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -22,11 +22,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): def _init(self): self.api_key = self.config("api-key") self.user_id = self.config("user-id") - - try: - self.api_root = INSTANCES[self.category]["api_root"] - except KeyError: - self.api_root = self.root + self.api_root = self.config_instance("api_root") or self.root if self.category == "realbooru": self.items = self._items_realbooru @@ -161,7 +157,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): post["tags_" + key] = " ".join(value) -INSTANCES = { +BASE_PATTERN = GelbooruV02Extractor.update({ "realbooru": { "root": "https://realbooru.com", "pattern": r"realbooru\.com", @@ -187,9 +183,7 @@ INSTANCES = { "root": "https://xbooru.com", "pattern": r"xbooru\.com", }, -} - -BASE_PATTERN = GelbooruV02Extractor.update(INSTANCES) +}) class GelbooruV02TagExtractor(GelbooruV02Extractor): diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 0b63d6c1..68b41961 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -75,7 +75,7 @@ class MastodonExtractor(BaseExtractor): account["acct"], account["moved"]["acct"]) -INSTANCES = { +BASE_PATTERN = MastodonExtractor.update({ "mastodon.social": { "root" : "https://mastodon.social", "pattern" : r"mastodon\.social", @@ -100,9 +100,7 @@ INSTANCES = { "client-id" : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o", "client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY", } -} - -BASE_PATTERN = 
MastodonExtractor.update(INSTANCES) + "(?:/web)?" +}) + "(?:/web)?" class MastodonUserExtractor(MastodonExtractor): @@ -174,10 +172,8 @@ class MastodonAPI(): if access_token is None or access_token == "cache": access_token = _access_token_cache(extractor.instance) if not access_token: - try: - access_token = INSTANCES[extractor.category]["access-token"] - except (KeyError, TypeError): - pass + access_token = extractor.config_instance("access-token") + if access_token: self.headers = {"Authorization": "Bearer " + access_token} else: diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 16901607..8c8a5a99 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -358,8 +358,8 @@ class OAuthMastodon(OAuthBase): yield Message.Version, 1 from . import mastodon - for application in mastodon.INSTANCES.values(): - if self.instance == application["root"].partition("://")[2]: + for _, root, application in mastodon.MastodonExtractor.instances: + if self.instance == root.partition("://")[2]: break else: application = self._register(self.instance) diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index ac6a391e..339646ff 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -32,7 +32,7 @@ class PhilomenaExtractor(BooruExtractor): post["date"] = text.parse_datetime(post["created_at"]) -INSTANCES = { +BASE_PATTERN = PhilomenaExtractor.update({ "derpibooru": { "root": "https://derpibooru.org", "pattern": r"(?:www\.)?derpibooru\.org", @@ -48,9 +48,7 @@ INSTANCES = { "pattern": r"furbooru\.org", "filter_id": "2", }, -} - -BASE_PATTERN = PhilomenaExtractor.update(INSTANCES) +}) class PhilomenaPostExtractor(PhilomenaExtractor): @@ -176,10 +174,7 @@ class PhilomenaAPI(): if filter_id: params["filter_id"] = filter_id elif not api_key: - try: - params["filter_id"] = INSTANCES[extr.category]["filter_id"] - except (KeyError, TypeError): - params["filter_id"] = "2" + 
params["filter_id"] = extr.config_instance("filter_id") or "2" params["page"] = extr.page_start params["per_page"] = extr.per_page diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py index 8a08fabb..3da8d42e 100644 --- a/gallery_dl/extractor/shimmie2.py +++ b/gallery_dl/extractor/shimmie2.py @@ -19,17 +19,12 @@ class Shimmie2Extractor(BaseExtractor): archive_fmt = "{id}" def _init(self): - try: - instance = INSTANCES[self.category] - except KeyError: - return - - cookies = instance.get("cookies") + cookies = self.config_instance("cookies") if cookies: domain = self.root.rpartition("/")[2] self.cookies_update_dict(cookies, domain=domain) - file_url = instance.get("file_url") + file_url = self.config_instance("file_url") if file_url: self.file_url_fmt = file_url @@ -73,7 +68,7 @@ class Shimmie2Extractor(BaseExtractor): return "'" -INSTANCES = { +BASE_PATTERN = Shimmie2Extractor.update({ "loudbooru": { "root": "https://loudbooru.com", "pattern": r"loudbooru\.com", @@ -97,9 +92,7 @@ INSTANCES = { "root": "https://rule34hentai.net", "pattern": r"rule34hentai\.net", }, -} - -BASE_PATTERN = Shimmie2Extractor.update(INSTANCES) + r"/(?:index\.php\?q=/?)?" +}) + r"/(?:index\.php\?q=/?)?" 
class Shimmie2TagExtractor(Shimmie2Extractor): diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py index f2e65214..49a3debd 100644 --- a/gallery_dl/extractor/urlshortener.py +++ b/gallery_dl/extractor/urlshortener.py @@ -15,7 +15,7 @@ class UrlshortenerExtractor(BaseExtractor): basecategory = "urlshortener" -INSTANCES = { +BASE_PATTERN = UrlshortenerExtractor.update({ "bitly": { "root": "https://bit.ly", "pattern": r"bit\.ly", @@ -26,9 +26,7 @@ INSTANCES = { "root": "https://t.co", "pattern": r"t\.co", }, -} - -BASE_PATTERN = UrlshortenerExtractor.update(INSTANCES) +}) class UrlshortenerLinkExtractor(UrlshortenerExtractor): @@ -42,10 +40,7 @@ class UrlshortenerLinkExtractor(UrlshortenerExtractor): self.id = match.group(match.lastindex) def _init(self): - try: - self.headers = INSTANCES[self.category]["headers"] - except Exception: - self.headers = None + self.headers = self.config_instance("headers") def items(self): response = self.request( diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 34566465..859cbbad 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -457,7 +457,7 @@ def build_extractor_list(): domains[category] = domain(extr) else: base = categories[extr.basecategory] - for category, root in extr.instances: + for category, root, info in extr.instances: base[category].append(extr.subcategory) if category not in domains: if not root: From ea553a1d55e8e633019a3128ef6337c54b8f9031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 18 Jan 2024 15:36:16 +0100 Subject: [PATCH 314/344] [wikimedia] generalize (#1443) - support mediawiki.org - support mariowiki.com (#3660) - combine code into a single extractor (use prefix as subcategory) - handle non-wiki instances - unescape titles --- docs/supportedsites.md | 30 ++++++++---- gallery_dl/extractor/wikimedia.py | 80 ++++++++++++++++++++----------- scripts/supportedsites.py | 2 + 
test/results/mariowiki.py | 19 ++++++++ test/results/mediawiki.py | 24 ++++++++++ test/results/wikibooks.py | 2 +- test/results/wikimediacommons.py | 2 +- test/results/wikinews.py | 2 +- test/results/wikipedia.py | 2 +- test/results/wikiquote.py | 2 +- test/results/wikisource.py | 2 +- test/results/wikispecies.py | 2 +- test/results/wikiversity.py | 2 +- test/results/wiktionary.py | 2 +- 14 files changed, 126 insertions(+), 47 deletions(-) create mode 100644 test/results/mariowiki.py create mode 100644 test/results/mediawiki.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d3d2a8a3..4a6d8bd2 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1484,55 +1484,67 @@ Consider all listed sites to potentially be NSFW. <tr> <td>Wikipedia</td> <td>https://www.wikipedia.org/</td> - <td>Articles, Categories</td> + <td>Articles</td> <td></td> </tr> <tr> <td>Wiktionary</td> <td>https://www.wiktionary.org/</td> - <td>Articles, Categories</td> + <td>Articles</td> <td></td> </tr> <tr> <td>Wikiquote</td> <td>https://www.wikiquote.org/</td> - <td>Articles, Categories</td> + <td>Articles</td> <td></td> </tr> <tr> <td>Wikibooks</td> <td>https://www.wikibooks.org/</td> - <td>Articles, Categories</td> + <td>Articles</td> <td></td> </tr> <tr> <td>Wikisource</td> <td>https://www.wikisource.org/</td> - <td>Articles, Categories</td> + <td>Articles</td> <td></td> </tr> <tr> <td>Wikinews</td> <td>https://www.wikinews.org/</td> - <td>Articles, Categories</td> + <td>Articles</td> <td></td> </tr> <tr> <td>Wikiversity</td> <td>https://www.wikiversity.org/</td> - <td>Articles, Categories</td> + <td>Articles</td> <td></td> </tr> <tr> <td>Wikispecies</td> <td>https://species.wikimedia.org/</td> - <td>Articles, Categories</td> + <td>Articles</td> <td></td> </tr> <tr> <td>Wikimedia Commons</td> <td>https://commons.wikimedia.org/</td> - <td>Articles, Categories</td> + <td>Articles</td> + <td></td> +</tr> +<tr> + <td>MediaWiki</td> + 
<td>https://www.mediawiki.org/</td> + <td>Articles</td> + <td></td> +</tr> +<tr> + <td>Super Mario Wiki</td> + <td>https://www.mariowiki.com/</td> + <td>Articles</td> <td></td> </tr> diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 1a896515..ffbf950e 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -7,7 +7,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for Wikimedia and Wikipedia""" +"""Extractors for Wikimedia sites""" from .common import BaseExtractor, Message from .. import text @@ -22,7 +22,41 @@ class WikimediaExtractor(BaseExtractor): def __init__(self, match): BaseExtractor.__init__(self, match) - self.title = match.group(match.lastindex) + path = match.group(match.lastindex) + + if path.startswith("wiki/"): + path = path[5:] + self.api_path = "/w/api.php" + else: + self.api_path = "/api.php" + + pre, sep, _ = path.partition(":") + prefix = pre.lower() if sep else None + + self.title = path = text.unquote(path) + self.subcategory = prefix + + if prefix == "category": + self.params = { + "generator": "categorymembers", + "gcmtitle" : path, + "gcmtype" : "file", + } + else: + self.params = { + "generator": "images", + "titles" : path, + } + + def _init(self): + api_path = self.config_instance("api-path") + if api_path: + if api_path[0] == "/": + self.api_url = self.root + api_path + else: + self.api_url = api_path + else: + self.api_url = self.root + self.api_path def items(self): for info in self._pagination(self.params): @@ -51,9 +85,14 @@ class WikimediaExtractor(BaseExtractor): https://opendata.stackexchange.com/questions/13381 """ - url = self.root + "/w/api.php" + url = self.api_url params["action"] = "query" params["format"] = "json" + params["prop"] = "imageinfo" + params["iiprop"] = ( + "timestamp|user|userid|comment|canonicaltitle|url|size|" + 
"sha1|mime|metadata|commonmetadata|extmetadata|bitdepth" + ) while True: data = self.request(url, params=params).json() @@ -109,36 +148,19 @@ BASE_PATTERN = WikimediaExtractor.update({ "root": "https://commons.wikimedia.org", "pattern": r"commons\.wikimedia\.org", }, + "mediawiki": { + "root": "https://www.mediawiki.org", + "pattern": r"(?:www\.)?mediawiki\.org", + }, + "mariowiki": { + "root": "https://www.mariowiki.com", + "pattern": r"(?:www\.)?mariowiki\.com", + }, }) class WikimediaArticleExtractor(WikimediaExtractor): """Extractor for wikimedia articles""" subcategory = "article" - pattern = BASE_PATTERN + r"/wiki/(?!Category:)([^/?#]+)" + pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)" example = "https://en.wikipedia.org/wiki/TITLE" - - def _init(self): - self.params = { - "generator": "images", - "titles" : self.title, - "prop" : "imageinfo", - "iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|" - "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth", - } - - -class WikimediaCategoryExtractor(WikimediaExtractor): - subcategory = "category" - pattern = BASE_PATTERN + r"/wiki/(Category:[^/?#]+)" - example = "https://commons.wikimedia.org/wiki/Category:NAME" - - def _init(self): - self.params = { - "generator": "categorymembers", - "gcmtitle" : self.title, - "gcmtype" : "file", - "prop" : "imageinfo", - "iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|" - "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth", - } diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 859cbbad..50b6e5d8 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -88,7 +88,9 @@ CATEGORY_MAP = { "mangapark" : "MangaPark", "mangaread" : "MangaRead", "mangasee" : "MangaSee", + "mariowiki" : "Super Mario Wiki", "mastodon.social": "mastodon.social", + "mediawiki" : "MediaWiki", "micmicidol" : "MIC MIC IDOL", "myhentaigallery": "My Hentai Gallery", "myportfolio" : "Adobe Portfolio", diff --git 
a/test/results/mariowiki.py b/test/results/mariowiki.py new file mode 100644 index 00000000..ebb8d6e6 --- /dev/null +++ b/test/results/mariowiki.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.mariowiki.com/Rabbit", + "#category": ("wikimedia", "mariowiki", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, + "#pattern" : r"https://mario\.wiki\.gallery/images/.+", + "#count" : range(20, 50), +}, + +) diff --git a/test/results/mediawiki.py b/test/results/mediawiki.py new file mode 100644 index 00000000..683d0d36 --- /dev/null +++ b/test/results/mediawiki.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation.
+ +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.mediawiki.org/wiki/Help:Navigation", + "#category": ("wikimedia", "mediawiki", "help"), + "#class" : wikimedia.WikimediaArticleExtractor, + "#urls" : ( + "https://upload.wikimedia.org/wikipedia/commons/e/ec/OOjs_UI_icon_information-progressive.svg", + "https://upload.wikimedia.org/wikipedia/commons/6/62/PD-icon.svg", + "https://upload.wikimedia.org/wikipedia/commons/0/0e/Vector_Sidebar.png", + "https://upload.wikimedia.org/wikipedia/commons/7/77/Vector_page_tabs.png", + "https://upload.wikimedia.org/wikipedia/commons/6/6e/Vector_user_links.png", + ), +}, + +) diff --git a/test/results/wikibooks.py b/test/results/wikibooks.py index 882741d5..da4d761d 100644 --- a/test/results/wikibooks.py +++ b/test/results/wikibooks.py @@ -17,7 +17,7 @@ __tests__ = ( { "#url" : "https://en.wikibooks.org/wiki/Category:Title", "#category": ("wikimedia", "wikibooks", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wikimediacommons.py b/test/results/wikimediacommons.py index 6cc03e34..a16d069a 100644 --- a/test/results/wikimediacommons.py +++ b/test/results/wikimediacommons.py @@ -17,7 +17,7 @@ __tests__ = ( { "#url" : "https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro", "#category": ("wikimedia", "wikimediacommons", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wikinews.py b/test/results/wikinews.py index 8a2af25e..79817fdb 100644 --- a/test/results/wikinews.py +++ b/test/results/wikinews.py @@ -17,7 +17,7 @@ __tests__ = ( { "#url" : "https://en.wikinews.org/wiki/Category:Title", "#category": ("wikimedia", "wikinews", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wikipedia.py 
b/test/results/wikipedia.py index 87499878..e8e8f694 100644 --- a/test/results/wikipedia.py +++ b/test/results/wikipedia.py @@ -47,7 +47,7 @@ __tests__ = ( { "#url" : "https://en.wikipedia.org/wiki/Category:Physics", "#category": ("wikimedia", "wikipedia", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wikiquote.py b/test/results/wikiquote.py index 5e6fb321..8365e3b7 100644 --- a/test/results/wikiquote.py +++ b/test/results/wikiquote.py @@ -17,7 +17,7 @@ __tests__ = ( { "#url" : "https://en.wikiquote.org/wiki/Category:Title", "#category": ("wikimedia", "wikiquote", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wikisource.py b/test/results/wikisource.py index afdee23e..0ac1bb0f 100644 --- a/test/results/wikisource.py +++ b/test/results/wikisource.py @@ -17,7 +17,7 @@ __tests__ = ( { "#url" : "https://en.wikisource.org/wiki/Category:Title", "#category": ("wikimedia", "wikisource", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wikispecies.py b/test/results/wikispecies.py index d455fbac..26aca84b 100644 --- a/test/results/wikispecies.py +++ b/test/results/wikispecies.py @@ -19,7 +19,7 @@ __tests__ = ( { "#url" : "https://species.wikimedia.org/wiki/Category:Names", "#category": ("wikimedia", "wikispecies", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wikiversity.py b/test/results/wikiversity.py index 58565f49..2e64ca31 100644 --- a/test/results/wikiversity.py +++ b/test/results/wikiversity.py @@ -17,7 +17,7 @@ __tests__ = ( { "#url" : "https://en.wikiversity.org/wiki/Category:Title", "#category": ("wikimedia", "wikiversity", "category"), - "#class" : 
wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wiktionary.py b/test/results/wiktionary.py index c7a016f5..4a643ab5 100644 --- a/test/results/wiktionary.py +++ b/test/results/wiktionary.py @@ -17,7 +17,7 @@ __tests__ = ( { "#url" : "https://en.wiktionary.org/wiki/Category:Words", "#category": ("wikimedia", "wiktionary", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) From a416d4c3d5b5b315342d19514085006c47bcc323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 18 Jan 2024 16:05:41 +0100 Subject: [PATCH 315/344] [sankaku] support post URLs with alphanumeric IDs (#5073) --- gallery_dl/extractor/sankaku.py | 2 +- test/results/sankaku.py | 35 ++++++++++++++++++++++++++------- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 602895c4..b3b7a9cc 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -143,7 +143,7 @@ class SankakuPostExtractor(SankakuExtractor): """Extractor for single posts from sankaku.app""" subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/posts?(?:/show)?/([0-9a-f]+)" + pattern = BASE_PATTERN + r"/posts?(?:/show)?/(\w+)" example = "https://sankaku.app/post/show/12345" def __init__(self, match): diff --git a/test/results/sankaku.py b/test/results/sankaku.py index 89396daa..361fd7a2 100644 --- a/test/results/sankaku.py +++ b/test/results/sankaku.py @@ -118,18 +118,38 @@ __tests__ = ( }, { - "#url" : "https://sankaku.app/post/show/360451", + "#url" : "https://sankaku.app/posts/y0abGlDOr2o", "#category": ("booru", "sankaku", "post"), "#class" : sankaku.SankakuPostExtractor, "#options" : {"tags": True}, "#sha1_content": "5e255713cbf0a8e0801dc423563c34d896bb9229", - "tags_artist" : ["bonocho"], - "tags_studio" : ["dc_comics"], - 
"tags_medium" : list, - "tags_copyright": list, - "tags_character": list, - "tags_general" : list, + "tags_artist": [ + "bonocho", + ], + "tags_character": [ + "batman", + "letty_whiterock", + "bruce_wayne", + "the_joker", + "heath_ledger", + ], + "tags_copyright": [ + "batman_(series)", + "the_dark_knight", + ], + "tags_studio": [ + "dc_comics", + ], + "tags_general": list, +}, + +{ + "#url" : "https://sankaku.app/post/show/360451", + "#comment" : "legacy post URL", + "#category": ("booru", "sankaku", "post"), + "#class" : sankaku.SankakuPostExtractor, + "#pattern" : r"https://s\.sankakucomplex\.com/data/ac/8e/ac8e3b92ea328ce9cf7211e69c905bf9\.jpg\?e=.+", }, { @@ -137,6 +157,7 @@ __tests__ = ( "#comment" : "'contentious_content'", "#category": ("booru", "sankaku", "post"), "#class" : sankaku.SankakuPostExtractor, + "#auth" : True, "#pattern" : r"https://s\.sankakucomplex\.com/data/13/3c/133cda3bfde249c504284493903fb985\.jpg", }, From 93b4120e77f23bafac8ae6bc85c0db52d4f9f35c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 18 Jan 2024 21:49:33 +0100 Subject: [PATCH 316/344] [gelbooru] support 'all' and empty tag (#5076) --- gallery_dl/extractor/gelbooru.py | 2 +- gallery_dl/extractor/gelbooru_v02.py | 4 +++- test/results/gelbooru.py | 14 ++++++++++++++ test/results/safebooru.py | 14 ++++++++++++++ 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index e37b2e92..83f13922 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -118,7 +118,7 @@ class GelbooruBase(): class GelbooruTagExtractor(GelbooruBase, gelbooru_v02.GelbooruV02TagExtractor): """Extractor for images from gelbooru.com based on search-tags""" - pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]+)" + pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]*)" example = "https://gelbooru.com/index.php?page=post&s=list&tags=TAG" diff --git 
a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index f8ab71c9..c7866bc4 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -190,7 +190,7 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)" + pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]*)" example = "https://safebooru.org/index.php?page=post&s=list&tags=TAG" def __init__(self, match): @@ -202,6 +202,8 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor): return {"search_tags": self.tags} def posts(self): + if self.tags == "all": + self.tags = "" return self._pagination({"tags": self.tags}) diff --git a/test/results/gelbooru.py b/test/results/gelbooru.py index 8d9ead27..b2f99ed1 100644 --- a/test/results/gelbooru.py +++ b/test/results/gelbooru.py @@ -15,6 +15,20 @@ __tests__ = ( "#count" : 5, }, +{ + "#url" : "https://gelbooru.com/index.php?page=post&s=list&tags=all", + "#category": ("booru", "gelbooru", "tag"), + "#class" : gelbooru.GelbooruTagExtractor, + "#range" : "1-3", + "#count" : 3, +}, + +{ + "#url" : "https://gelbooru.com/index.php?page=post&s=list&tags=", + "#category": ("booru", "gelbooru", "tag"), + "#class" : gelbooru.GelbooruTagExtractor, +}, + { "#url" : "https://gelbooru.com/index.php?page=post&s=list&tags=meiya_neon", "#category": ("booru", "gelbooru", "tag"), diff --git a/test/results/safebooru.py b/test/results/safebooru.py index 0c183204..f82f9681 100644 --- a/test/results/safebooru.py +++ b/test/results/safebooru.py @@ -16,6 +16,20 @@ __tests__ = ( "#sha1_content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb", }, +{ + "#url" : "https://safebooru.org/index.php?page=post&s=list&tags=all", + "#category": ("gelbooru_v02", "safebooru", "tag"), + "#class" : gelbooru_v02.GelbooruV02TagExtractor, + "#range" : "1-3", + 
"#count" : 3, +}, + +{ + "#url" : "https://safebooru.org/index.php?page=post&s=list&tags=", + "#category": ("gelbooru_v02", "safebooru", "tag"), + "#class" : gelbooru_v02.GelbooruV02TagExtractor, +}, + { "#url" : "https://safebooru.org/index.php?page=pool&s=show&id=11", "#category": ("gelbooru_v02", "safebooru", "pool"), From 44f2c15a042c5dea02a80d7130dc925ffe45c126 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 19 Jan 2024 03:05:45 +0100 Subject: [PATCH 317/344] [wikimedia] handle 'File:' paths --- gallery_dl/extractor/wikimedia.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index ffbf950e..e4331315 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -34,7 +34,8 @@ class WikimediaExtractor(BaseExtractor): prefix = pre.lower() if sep else None self.title = path = text.unquote(path) - self.subcategory = prefix + if prefix: + self.subcategory = prefix if prefix == "category": self.params = { @@ -42,6 +43,10 @@ class WikimediaExtractor(BaseExtractor): "gcmtitle" : path, "gcmtype" : "file", } + elif prefix == "file": + self.params = { + "titles" : path, + } else: self.params = { "generator": "images", From fc4e737f673bd2a98fdcda9c3b68a90864b6f31d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 19 Jan 2024 03:08:43 +0100 Subject: [PATCH 318/344] [wikimedia] include 'sha1' in default filenames --- gallery_dl/extractor/wikimedia.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index e4331315..49843111 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -16,6 +16,7 @@ from .. 
import text class WikimediaExtractor(BaseExtractor): """Base class for wikimedia extractors""" basecategory = "wikimedia" + filename_fmt = "{filename} ({sha1[:8]}).{extension}" directory_fmt = ("{category}", "{page}") archive_fmt = "{sha1}" request_interval = (1.0, 2.0) From 2007cb2f59f0d509c4fe08a8e7c131b3067f3a85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 19 Jan 2024 03:15:30 +0100 Subject: [PATCH 319/344] [tests] check extractor category values --- gallery_dl/extractor/mangadex.py | 3 ++- test/results/blogger.py | 2 +- test/results/mariowiki.py | 2 +- test/results/raddle.py | 24 ++++++++++++------------ test/results/wikimediacommons.py | 3 ++- test/test_extractor.py | 23 +++++++++++++++++------ 6 files changed, 35 insertions(+), 22 deletions(-) diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index d287d5cf..bca7e4db 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -158,8 +158,9 @@ class MangadexListExtractor(MangadexExtractor): def __init__(self, match): MangadexExtractor.__init__(self, match) - if match.group(2) != "feed": + if match.group(2) == "feed": self.subcategory = "list-feed" + else: self.items = self._items_titles def chapters(self): diff --git a/test/results/blogger.py b/test/results/blogger.py index aeb82f76..eef96459 100644 --- a/test/results/blogger.py +++ b/test/results/blogger.py @@ -24,7 +24,7 @@ __tests__ = ( { "#url" : "blogger:http://www.julianbunker.com/search?q=400mm", - "#category": ("blogger", "1www.julianbunker.com", "search"), + "#category": ("blogger", "www.julianbunker.com", "search"), "#class" : blogger.BloggerSearchExtractor, }, diff --git a/test/results/mariowiki.py b/test/results/mariowiki.py index ebb8d6e6..72c4cd52 100644 --- a/test/results/mariowiki.py +++ b/test/results/mariowiki.py @@ -10,7 +10,7 @@ from gallery_dl.extractor import wikimedia __tests__ = ( { "#url" : 
"https://www.mariowiki.com/Rabbit", - "#category": ("wikimedia", "wikibooks", "article"), + "#category": ("wikimedia", "mariowiki", "article"), "#class" : wikimedia.WikimediaArticleExtractor, "#pattern" : r"https://mario\.wiki\.gallery/images/.+", "#count" : range(20, 50), diff --git a/test/results/raddle.py b/test/results/raddle.py index 0c9de429..24710e94 100644 --- a/test/results/raddle.py +++ b/test/results/raddle.py @@ -10,7 +10,7 @@ from gallery_dl.extractor import postmill __tests__ = ( { "#url" : "https://raddle.me/", - "#category": ("postmill", "raddle.me", "home"), + "#category": ("postmill", "raddle", "home"), "#class" : postmill.PostmillHomeExtractor, "#range" : "1-25", "#count" : 25, @@ -18,7 +18,7 @@ __tests__ = ( { "#url" : "https://raddle.me/f/traa", - "#category": ("postmill", "raddle.me", "forum"), + "#category": ("postmill", "raddle", "forum"), "#class" : postmill.PostmillForumExtractor, "#count" : 1, "#pattern" : r"^https://raddle\.me/f/traa/156646/click-here-to-go-to-f-traaaaaaannnnnnnnnns$", @@ -26,7 +26,7 @@ __tests__ = ( { "#url" : "https://raddle.me/user/Sam_the_enby/submissions", - "#category": ("postmill", "raddle.me", "usersubmissions"), + "#category": ("postmill", "raddle", "usersubmissions"), "#class" : postmill.PostmillUserSubmissionsExtractor, "#range" : "1-25", "#count" : 25, @@ -34,13 +34,13 @@ __tests__ = ( { "#url" : "https://raddle.me/tag/Trans", - "#category": ("postmill", "raddle.me", "tag"), + "#category": ("postmill", "raddle", "tag"), "#class" : postmill.PostmillTagExtractor, }, { "#url" : "https://raddle.me/search?q=tw", - "#category": ("postmill", "raddle.me", "search"), + "#category": ("postmill", "raddle", "search"), "#class" : postmill.PostmillSearchExtractor, "#range" : "1-50", "#count" : 50, @@ -48,7 +48,7 @@ __tests__ = ( { "#url" : "https://raddle.me/160845", - "#category": ("postmill", "raddle.me", "shorturl"), + "#category": ("postmill", "raddle", "shorturl"), "#class" : postmill.PostmillShortURLExtractor, 
"#pattern" : r"^https://raddle\.me/f/egg_irl/160845/egg_irl$", }, @@ -56,7 +56,7 @@ __tests__ = ( { "#url" : "https://raddle.me/f/NonBinary/179017/scattered-thoughts-would-appreciate-advice-immensely-tw", "#comment" : "Text post", - "#category": ("postmill", "raddle.me", "post"), + "#category": ("postmill", "raddle", "post"), "#class" : postmill.PostmillPostExtractor, "#sha1_url" : "99277f815820810d9d7e219d455f818601858378", "#sha1_content": "7a1159e1e45f2ce8e2c8b5959f6d66b042776f3b", @@ -66,7 +66,7 @@ __tests__ = ( { "#url" : "https://raddle.me/f/egg_irl/160845", "#comment" : "Image post", - "#category": ("postmill", "raddle.me", "post"), + "#category": ("postmill", "raddle", "post"), "#class" : postmill.PostmillPostExtractor, "#sha1_url" : "48663f767ea258fcd545ab5aa0e734f98f434388", "#sha1_content": "431e938082c2b59c44888a83cfc711cd1f0e910a", @@ -76,7 +76,7 @@ __tests__ = ( { "#url" : "https://raddle.me/f/trans/177042/tw-vent-nsfw-suicide-i-lost-no-nut-november-tw-trauma", "#comment" : "Image + text post (with text enabled)", - "#category": ("postmill", "raddle.me", "post"), + "#category": ("postmill", "raddle", "post"), "#class" : postmill.PostmillPostExtractor, "#options" : {"save-link-post-body": True}, "#pattern" : r"^(text:[\s\S]+|https://raddle\.me/submission_images/[0-9a-f]+\.png)$", @@ -86,7 +86,7 @@ __tests__ = ( { "#url" : "https://raddle.me/f/videos/179541/raisins-and-sprite", "#comment" : "Link post", - "#category": ("postmill", "raddle.me", "post"), + "#category": ("postmill", "raddle", "post"), "#class" : postmill.PostmillPostExtractor, "#urls" : "https://m.youtube.com/watch?v=RFJCA5zcZxI", "#count" : 1, @@ -95,7 +95,7 @@ __tests__ = ( { "#url" : "https://raddle.me/f/Anime/150698/neo-tokyo-1987-link-to-the-english-dub-version-last-link", "#comment" : "Link + text post (with text disabled)", - "#category": ("postmill", "raddle.me", "post"), + "#category": ("postmill", "raddle", "post"), "#class" : postmill.PostmillPostExtractor, "#pattern" : 
r"^https://fantasyanime\.com/anime/neo-tokyo-dub$", "#count" : 1, @@ -104,7 +104,7 @@ __tests__ = ( { "#url" : "https://raddle.me/f/egg_irl/166855/4th-wall-breaking-please-let-this-be-a-flair-egg-irl", "#comment" : "Post with multiple flairs", - "#category": ("postmill", "raddle.me", "post"), + "#category": ("postmill", "raddle", "post"), "#class" : postmill.PostmillPostExtractor, "flair" : ["Gender non-specific", "4th wall breaking"], }, diff --git a/test/results/wikimediacommons.py b/test/results/wikimediacommons.py index a16d069a..b61a9061 100644 --- a/test/results/wikimediacommons.py +++ b/test/results/wikimediacommons.py @@ -10,8 +10,9 @@ from gallery_dl.extractor import wikimedia __tests__ = ( { "#url" : "https://commons.wikimedia.org/wiki/File:Starr-050516-1367-Pimenta_dioica-flowers-Maunaloa-Molokai_(24762757525).jpg", - "#category": ("wikimedia", "wikimediacommons", "article"), + "#category": ("wikimedia", "wikimediacommons", "file"), "#class" : wikimedia.WikimediaArticleExtractor, + "#urls" : "https://upload.wikimedia.org/wikipedia/commons/f/fa/Starr-050516-1367-Pimenta_dioica-flowers-Maunaloa-Molokai_%2824762757525%29.jpg", }, { diff --git a/test/test_extractor.py b/test/test_extractor.py index d2dd643c..3b590ff4 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -24,6 +24,11 @@ from gallery_dl.extractor.directlink import DirectlinkExtractor # noqa E402 _list_classes = extractor._list_classes +try: + from test import results +except ImportError: + results = None + class FakeExtractor(Extractor): category = "fake" @@ -92,17 +97,23 @@ class TestExtractorModule(unittest.TestCase): with self.assertRaises(TypeError): FakeExtractor.from_url(invalid) + @unittest.skipIf(not results, "no test data") + def test_categories(self): + for result in results.all(): + url = result["#url"] + extr = result["#class"].from_url(url) + base, cat, sub = result["#category"] + self.assertEqual(extr.category, cat, url) + self.assertEqual(extr.subcategory, sub, 
url) + self.assertEqual(extr.basecategory, base, url) + + @unittest.skipIf(not results, "no test data") def test_unique_pattern_matches(self): - try: - import test.results - except ImportError: - raise unittest.SkipTest("no test data") - # collect testcase URLs test_urls = [] append = test_urls.append - for result in test.results.all(): + for result in results.all(): append((result["#url"], result["#class"])) # iterate over all testcase URLs From a1c1e80f67a18576650c9eacc0c3c89377760bf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 19 Jan 2024 14:18:46 +0100 Subject: [PATCH 320/344] [giantessbooru] update domain --- docs/supportedsites.md | 2 +- gallery_dl/extractor/shimmie2.py | 24 ++++++++++++------------ test/results/giantessbooru.py | 22 +++++++++++++++++----- 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 4a6d8bd2..1f9602ad 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1395,7 +1395,7 @@ Consider all listed sites to potentially be NSFW. 
</tr> <tr> <td>Giantessbooru</td> - <td>https://giantessbooru.com/</td> + <td>https://sizechangebooru.com/</td> <td>Posts, Tag Searches</td> <td></td> </tr> diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py index 3da8d42e..67f38c4b 100644 --- a/gallery_dl/extractor/shimmie2.py +++ b/gallery_dl/extractor/shimmie2.py @@ -75,8 +75,8 @@ BASE_PATTERN = Shimmie2Extractor.update({ "cookies": {"ui-tnc-agreed": "true"}, }, "giantessbooru": { - "root": "https://giantessbooru.com", - "pattern": r"giantessbooru\.com", + "root": "https://sizechangebooru.com", + "pattern": r"(?:sizechange|giantess)booru\.com", "cookies": {"agreed": "true"}, }, "tentaclerape": { @@ -176,25 +176,25 @@ class Shimmie2TagExtractor(Shimmie2Extractor): extr = text.extract_from(self.request(url).text) while True: - pid = extr('href="./index.php?q=/post/view/', '&') + pid = extr("href='./index.php?q=/post/view/", "&") if not pid: break - tags, dimensions, size = extr('title="', '"').split(" // ") + tags, dimensions, size = extr("title='", "'").split(" // ") width, _, height = dimensions.partition("x") yield { "file_url": file_url_fmt(pid), - "id": pid, - "md5": "", - "tags": tags, - "width": width, - "height": height, - "size": text.parse_bytes(size[:-1]), + "id" : pid, + "md5" : "", + "tags" : tags, + "width" : width, + "height" : height, + "size" : text.parse_bytes(size[:-1]), } pnum += 1 - if not extr('/{}">{}<'.format(pnum, pnum), ">"): + if not extr("/{0}'>{0}<".format(pnum), ">"): return @@ -241,7 +241,7 @@ class Shimmie2PostExtractor(Shimmie2Extractor): "id" : self.post_id, "tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"), "md5" : "", - "file_url": self.root + extr('id="main_image" src=".', '"'), + "file_url": self.root + extr("id='main_image' src='.", "'"), "width" : extr("orig_width =", ";"), "height" : 0, "size" : 0, diff --git a/test/results/giantessbooru.py b/test/results/giantessbooru.py index 5909c191..e35b9d10 100644 --- 
a/test/results/giantessbooru.py +++ b/test/results/giantessbooru.py @@ -9,14 +9,20 @@ from gallery_dl.extractor import shimmie2 __tests__ = ( { - "#url" : "https://giantessbooru.com/index.php?q=/post/list/drawing/1", + "#url" : "https://sizechangebooru.com/index.php?q=/post/list/drawing/1", "#category": ("shimmie2", "giantessbooru", "tag"), "#class" : shimmie2.Shimmie2TagExtractor, - "#pattern" : r"https://giantessbooru\.com/index\.php\?q=/image/\d+\.jpg", + "#pattern" : r"https://sizechangebooru\.com/index\.php\?q=/image/\d+\.jpg", "#range" : "1-100", "#count" : 100, }, +{ + "#url" : "https://giantessbooru.com/index.php?q=/post/list/drawing/1", + "#category": ("shimmie2", "giantessbooru", "tag"), + "#class" : shimmie2.Shimmie2TagExtractor, +}, + { "#url" : "https://giantessbooru.com/post/list/drawing/1", "#category": ("shimmie2", "giantessbooru", "tag"), @@ -24,14 +30,14 @@ __tests__ = ( }, { - "#url" : "https://giantessbooru.com/index.php?q=/post/view/41", + "#url" : "https://sizechangebooru.com/index.php?q=/post/view/41", "#category": ("shimmie2", "giantessbooru", "post"), "#class" : shimmie2.Shimmie2PostExtractor, - "#pattern" : r"https://giantessbooru\.com/index\.php\?q=/image/41\.jpg", + "#urls" : "https://sizechangebooru.com/index.php?q=/image/41.jpg", "#sha1_content": "79115ed309d1f4e82e7bead6948760e889139c91", "extension": "jpg", - "file_url" : "https://giantessbooru.com/index.php?q=/image/41.jpg", + "file_url" : "https://sizechangebooru.com/index.php?q=/image/41.jpg", "filename" : "41", "height" : 0, "id" : 41, @@ -41,6 +47,12 @@ __tests__ = ( "width" : 1387, }, +{ + "#url" : "https://giantessbooru.com/index.php?q=/post/view/41", + "#category": ("shimmie2", "giantessbooru", "post"), + "#class" : shimmie2.Shimmie2PostExtractor, +}, + { "#url" : "https://giantessbooru.com/post/view/41", "#category": ("shimmie2", "giantessbooru", "post"), From b0a441f1e37add81874726a9b01e6266472e36c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= 
<mike_faehrmann@web.de> Date: Fri, 19 Jan 2024 19:34:16 +0100 Subject: [PATCH 321/344] [nitter] remove 'nitter.lacontrevoie.fr' "Fermeture de Nitter / Closing down Nitter" --- docs/supportedsites.md | 6 --- gallery_dl/extractor/nitter.py | 4 -- test/results/nitter1d4us.py | 35 +++++++++++++ test/results/nitterlacontrevoiefr.py | 73 ---------------------------- 4 files changed, 35 insertions(+), 83 deletions(-) delete mode 100644 test/results/nitterlacontrevoiefr.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1f9602ad..ba9e5b91 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1293,12 +1293,6 @@ Consider all listed sites to potentially be NSFW. <td>Media Files, Replies, Search Results, Tweets</td> <td></td> </tr> -<tr> - <td>Nitter.lacontrevoie.fr</td> - <td>https://nitter.lacontrevoie.fr/</td> - <td>Media Files, Replies, Search Results, Tweets</td> - <td></td> -</tr> <tr> <td>Nitter.1d4.us</td> <td>https://nitter.1d4.us/</td> diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index bc7b308b..d36f5098 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -235,10 +235,6 @@ BASE_PATTERN = NitterExtractor.update({ "root": "https://nitter.net", "pattern": r"nitter\.net", }, - "nitter.lacontrevoie.fr": { - "root": "https://nitter.lacontrevoie.fr", - "pattern": r"nitter\.lacontrevoie\.fr", - }, "nitter.1d4.us": { "root": "https://nitter.1d4.us", "pattern": r"nitter\.1d4\.us", diff --git a/test/results/nitter1d4us.py b/test/results/nitter1d4us.py index b816b44f..a54f8cbb 100644 --- a/test/results/nitter1d4us.py +++ b/test/results/nitter1d4us.py @@ -57,4 +57,39 @@ You’ll be able to receive four Galarian form Pokémon with Hidden Abilities, p "filename": r"re:EaK.{12}", }, +{ + "#url" : "https://nitter.1d4.us/i/status/894001459754180609", + "#comment" : "4 images", + "#category": ("nitter", "nitter.1d4.us", "tweet"), + "#class" : nitter.NitterTweetExtractor, + "#sha1_url": 
"bc6a91792ff6ec3ab9046f4f27299cc0e7ca7ce3", +}, + +{ + "#url" : "https://nitter.1d4.us/i/status/1065692031626829824", + "#comment" : "video", + "#category": ("nitter", "nitter.1d4.us", "tweet"), + "#class" : nitter.NitterTweetExtractor, + "#pattern" : r"ytdl:https://nitter\.1d4\.us/video/enc/F00083CDE8D74/aHR0cHM6Ly92aWRlby50d2ltZy5jb20vZXh0X3R3X3ZpZGVvLzEwNjU2OTE4Njg0MzkwMDcyMzIvcHUvcGwvbnY4aFVRQzFSMFNqaHpjWi5tM3U4P3RhZz01", + + "extension": "mp4", + "filename" : "nv8hUQC1R0SjhzcZ", +}, + +{ + "#url" : "https://nitter.1d4.us/i/status/1460044411165888515", + "#comment" : "deleted quote tweet (#2225)", + "#category": ("nitter", "nitter.1d4.us", "tweet"), + "#class" : nitter.NitterTweetExtractor, + "#count" : 0, +}, + +{ + "#url" : "https://nitter.1d4.us/i/status/1486373748911575046", + "#comment" : "'Misleading' content", + "#category": ("nitter", "nitter.1d4.us", "tweet"), + "#class" : nitter.NitterTweetExtractor, + "#count" : 4, +}, + ) diff --git a/test/results/nitterlacontrevoiefr.py b/test/results/nitterlacontrevoiefr.py deleted file mode 100644 index b2a81ec1..00000000 --- a/test/results/nitterlacontrevoiefr.py +++ /dev/null @@ -1,73 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. 
- -from gallery_dl.extractor import nitter - - -__tests__ = ( -{ - "#url" : "https://nitter.lacontrevoie.fr/supernaturepics", - "#category": ("nitter", "nitter.lacontrevoie.fr", "tweets"), - "#class" : nitter.NitterTweetsExtractor, - "#pattern" : r"https://nitter\.lacontrevoie\.fr/pic/orig/media%2FCGMNYZvW0AIVoom\.jpg", - "#range" : "1", - "#sha1_url": "54f4b55f2099dcc248f3fb7bfacf1349e08d8e2d", -}, - -{ - "#url" : "https://nitter.lacontrevoie.fr/supernaturepics/with_replies", - "#category": ("nitter", "nitter.lacontrevoie.fr", "replies"), - "#class" : nitter.NitterRepliesExtractor, -}, - -{ - "#url" : "https://nitter.lacontrevoie.fr/supernaturepics/media", - "#category": ("nitter", "nitter.lacontrevoie.fr", "media"), - "#class" : nitter.NitterMediaExtractor, -}, - -{ - "#url" : "https://nitter.lacontrevoie.fr/supernaturepics/search", - "#category": ("nitter", "nitter.lacontrevoie.fr", "search"), - "#class" : nitter.NitterSearchExtractor, -}, - -{ - "#url" : "https://nitter.lacontrevoie.fr/i/status/894001459754180609", - "#comment" : "4 images", - "#category": ("nitter", "nitter.lacontrevoie.fr", "tweet"), - "#class" : nitter.NitterTweetExtractor, - "#sha1_url": "9c51b3a4a1114535eb9b168bba97ad95db0d59ff", -}, - -{ - "#url" : "https://nitter.lacontrevoie.fr/i/status/1065692031626829824", - "#comment" : "video", - "#category": ("nitter", "nitter.lacontrevoie.fr", "tweet"), - "#class" : nitter.NitterTweetExtractor, - "#pattern" : r"ytdl:https://nitter\.lacontrevoie\.fr/video/[0-9A-F]{10,}/https%3A%2F%2Fvideo.twimg.com%2Fext_tw_video%2F1065691868439007232%2Fpu%2Fpl%2Fnv8hUQC1R0SjhzcZ.m3u8%3Ftag%3D5", - - "extension": "mp4", - "filename" : "nv8hUQC1R0SjhzcZ", -}, - -{ - "#url" : "https://nitter.lacontrevoie.fr/i/status/1460044411165888515", - "#comment" : "deleted quote tweet (#2225)", - "#category": ("nitter", "nitter.lacontrevoie.fr", "tweet"), - "#class" : nitter.NitterTweetExtractor, - "#count" : 0, -}, - -{ - "#url" : 
"https://nitter.lacontrevoie.fr/i/status/1486373748911575046", - "#comment" : "'Misleading' content", - "#category": ("nitter", "nitter.lacontrevoie.fr", "tweet"), - "#class" : nitter.NitterTweetExtractor, - "#count" : 4, -}, - -) From b41d9bf616b0ccd2ddcd19479917d79ba039305c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 19 Jan 2024 22:24:39 +0100 Subject: [PATCH 322/344] [paheal] fix 'source' metadata --- gallery_dl/extractor/paheal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index 89c0d2f7..52267241 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -56,7 +56,7 @@ class PahealExtractor(Extractor): "date" : text.parse_datetime( extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"), "source" : text.unescape(text.extr( - extr(">Source Link<", "</td>"), "href='", "'")), + extr(">Source Link<", "</td>"), "href='", "'")), } dimensions, size, ext = extr("Info</th><td>", "<").split(" // ") From 321861af7ece04e4cd6e4e2243c292805e128fb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 20 Jan 2024 00:26:41 +0100 Subject: [PATCH 323/344] [erome] fix 'count' metadata --- gallery_dl/extractor/erome.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 6a0e069a..8c9da2f6 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -44,24 +44,26 @@ class EromeExtractor(Extractor): pos = page.index('<div class="user-profile', pos) user, pos = text.extract( page, 'href="https://www.erome.com/', '"', pos) - count, pos = text.extract( - page, 'fa-camera"></i>', '</span>', pos) + + urls = [] + groups = page.split('<div class="media-group"') + for group in util.advance(groups, 1): + url = (text.extr(group, '<source src="', '"') or + 
text.extr(group, 'data-src="', '"')) + if url: + urls.append(url) data = { "album_id" : album_id, "title" : text.unescape(title), "user" : text.unquote(user), + "count" : len(urls), "_http_headers": {"Referer": url}, - "count" : text.parse_int(count), } yield Message.Directory, data - groups = page.split('<div class="media-group"') - for data["num"], group in enumerate(util.advance(groups, 1), 1): - url = (text.extr(group, '<source src="', '"') or - text.extr(group, 'data-src="', '"')) - if url: - yield Message.Url, url, text.nameext_from_url(url, data) + for data["num"], url in enumerate(urls, 1): + yield Message.Url, url, text.nameext_from_url(url, data) def albums(self): return () From 375eefb8869c59a14ecec69a1014b00480ea890b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 20 Jan 2024 02:21:40 +0100 Subject: [PATCH 324/344] [chevereto] remove 'pixl.li' "Pixl is closing down" "All images will be deleted January 1st." --- docs/supportedsites.md | 6 --- gallery_dl/extractor/chevereto.py | 4 -- test/results/pixl.py | 63 ------------------------------- 3 files changed, 73 deletions(-) delete mode 100644 test/results/pixl.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ba9e5b91..4da570e8 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1065,12 +1065,6 @@ Consider all listed sites to potentially be NSFW. 
<td>Albums, individual Images, User Profiles</td> <td></td> </tr> -<tr> - <td>Pixl</td> - <td>https://pixl.li/</td> - <td>Albums, individual Images, User Profiles</td> - <td></td> -</tr> <tr> <td>IMG.Kiwi</td> <td>https://img.kiwi/</td> diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py index 2bf200b0..ef5a44c3 100644 --- a/gallery_dl/extractor/chevereto.py +++ b/gallery_dl/extractor/chevereto.py @@ -38,10 +38,6 @@ BASE_PATTERN = CheveretoExtractor.update({ "root": "https://jpg4.su", "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)", }, - "pixl": { - "root": "https://pixl.li", - "pattern": r"pixl\.(?:li|is)", - }, "imgkiwi": { "root": "https://img.kiwi", "pattern": r"img\.kiwi", diff --git a/test/results/pixl.py b/test/results/pixl.py deleted file mode 100644 index e82353ee..00000000 --- a/test/results/pixl.py +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. 
- -from gallery_dl.extractor import chevereto - - -__tests__ = ( -{ - "#url" : "https://pixl.li/image/894x1023-1c8d6dd3b1b0cd4b0d286b229157a7de.z3DwHB", - "#category": ("chevereto", "pixl", "image"), - "#class" : chevereto.CheveretoImageExtractor, - "#urls" : "https://i.pixl.li/894x1023_1c8d6dd3b1b0cd4b0d286b229157a7de.jpg", - "#sha1_content": "3279b86d0ac42348c703770c4781ecdc300fc13c", - - "album": "", - "extension": "jpg", - "filename": "894x1023_1c8d6dd3b1b0cd4b0d286b229157a7de", - "id": "z3DwHB", - "url": "https://i.pixl.li/894x1023_1c8d6dd3b1b0cd4b0d286b229157a7de.jpg", - "user": "matafaka1", -}, - -{ - "#url" : "https://pixl.is/image/894x1023-1c8d6dd3b1b0cd4b0d286b229157a7de.z3DwHB", - "#category": ("chevereto", "pixl", "image"), - "#class" : chevereto.CheveretoImageExtractor, -}, - -{ - "#url" : "https://pixl.li/album/estelasaubi.D0bJf", - "#category": ("chevereto", "pixl", "album"), - "#class" : chevereto.CheveretoAlbumExtractor, - "#pattern" : chevereto.CheveretoImageExtractor.pattern, - "#count" : 173, -}, - -{ - "#url" : "https://pixl.li/mjstik", - "#category": ("chevereto", "pixl", "user"), - "#class" : chevereto.CheveretoUserExtractor, - "#pattern" : chevereto.CheveretoImageExtractor.pattern, - "#range" : "1-20", - "#count" : 20, -}, - -{ - "#url" : "https://pixl.li/mjstik/albums", - "#category": ("chevereto", "pixl", "user"), - "#class" : chevereto.CheveretoUserExtractor, - "#pattern" : chevereto.CheveretoAlbumExtractor.pattern, - "#count" : 285, -}, - -{ - "#url" : "https://pixl.is/renford/albums", - "#category": ("chevereto", "pixl", "user"), - "#class" : chevereto.CheveretoUserExtractor, -}, - -) From 9ca6117c67a8305fd02c0960dc7bed9c26754d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 20 Jan 2024 02:53:44 +0100 Subject: [PATCH 325/344] [hbrowse] remove module website gone --- docs/supportedsites.md | 6 --- gallery_dl/extractor/hbrowse.py | 92 --------------------------------- 
test/results/hbrowse.py | 28 ---------- 3 files changed, 126 deletions(-) delete mode 100644 gallery_dl/extractor/hbrowse.py delete mode 100644 test/results/hbrowse.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 4da570e8..c6756c6a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -271,12 +271,6 @@ Consider all listed sites to potentially be NSFW. <td>Archive, Individual Posts, Home Feed, Search Results</td> <td></td> </tr> -<tr> - <td>HBrowse</td> - <td>https://www.hbrowse.com/</td> - <td>Chapters, Manga</td> - <td></td> -</tr> <tr> <td>Hentai Cosplay</td> <td>https://hentai-cosplays.com/</td> diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py deleted file mode 100644 index a5221409..00000000 --- a/gallery_dl/extractor/hbrowse.py +++ /dev/null @@ -1,92 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2015-2023 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://www.hbrowse.com/""" - -from .common import ChapterExtractor, MangaExtractor -from .. 
import text, util, exception - - -class HbrowseBase(): - """Base class for hbrowse extractors""" - category = "hbrowse" - root = "https://www.hbrowse.com" - - def parse_page(self, page, data): - """Parse metadata on 'page' and add it to 'data'""" - data, pos = text.extract_all(page, ( - ('manga' , '<td class="listLong">', '</td>'), - ('artist', '<td class="listLong">', '</td>'), - ('total' , '<td class="listLong">', ' '), - ('origin', '<td class="listLong">', '</td>'), - ), values=data) - - if not data["manga"] and "<b>Warning</b>" in page: - msg = page.rpartition(">")[2].strip() - raise exception.StopExtraction("Site is not accessible: '%s'", msg) - - tags = text.extract(page, 'class="listTable"', '</table>', pos)[0] - - data["manga"] = text.unescape(data["manga"]) - data["total"] = text.parse_int(data["total"]) - data["artist"] = text.remove_html(data["artist"]) - data["origin"] = text.remove_html(data["origin"]) - data["tags"] = list(text.extract_iter(tags, 'href="/browse/', '"')) - return data - - -class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor): - """Extractor for manga-chapters from hbrowse.com""" - directory_fmt = ("{category}", "{manga_id} {manga}", "c{chapter:>05}") - filename_fmt = ("{category}_{manga_id}_{chapter:>05}_" - "{page:>03}.{extension}") - archive_fmt = "{manga_id}_{chapter}_{page}" - pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))" - example = "https://www.hbrowse.com/12345/c00000" - - def __init__(self, match): - self.path, self.gid, self.chapter = match.groups() - self.path += "/" - ChapterExtractor.__init__(self, match) - - def metadata(self, page): - return self.parse_page(page, { - "manga_id": text.parse_int(self.gid), - "chapter": text.parse_int(self.chapter) - }) - - def images(self, page): - base = self.root + "/data" + self.path - json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]" - return [(base + name, None) for name in util.json_loads(json_data)] - - -class HbrowseMangaExtractor(HbrowseBase, 
MangaExtractor): - """Extractor for manga from hbrowse.com""" - chapterclass = HbrowseChapterExtractor - reverse = False - pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$" - example = "https://www.hbrowse.com/12345" - - def chapters(self, page): - results = [] - data = self.parse_page(page, { - "manga_id": text.parse_int( - self.manga_url.rstrip("/").rpartition("/")[2]) - }) - - pos = 0 - needle = '<td class="listMiddle">\n<a class="listLink" href="' - while True: - url, pos = text.extract(page, needle, '"', pos) - if not url: - return results - title, pos = text.extract(page, '>View ', '<', pos) - data["chapter"] = text.parse_int(url.rpartition("/")[2][1:]) - data["title"] = title - results.append((text.urljoin(self.root, url), data.copy())) diff --git a/test/results/hbrowse.py b/test/results/hbrowse.py deleted file mode 100644 index c4b3dcd6..00000000 --- a/test/results/hbrowse.py +++ /dev/null @@ -1,28 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. 
- -from gallery_dl.extractor import hbrowse - - -__tests__ = ( -{ - "#url" : "https://www.hbrowse.com/10363/c00000", - "#category": ("", "hbrowse", "chapter"), - "#class" : hbrowse.HbrowseChapterExtractor, - "#sha1_url" : "6feefbc9f4b98e20d8425ddffa9dd111791dc3e6", - "#sha1_metadata": "274996f6c809e5250b6ff3abbc5147e29f89d9a5", - "#sha1_content" : "44578ebbe176c2c27434966aef22945787e2781e", -}, - -{ - "#url" : "https://www.hbrowse.com/10363", - "#category": ("", "hbrowse", "manga"), - "#class" : hbrowse.HbrowseMangaExtractor, - "#sha1_url" : "b89682bfb86c11d2af0dc47463804ec3ac4aadd6", - "#sha1_metadata": "4b15fda1858a69de1fbf5afddfe47dd893397312", -}, - -) From 95991511182a30f87c95e93a876142525ebd1c78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 20 Jan 2024 16:44:48 +0100 Subject: [PATCH 326/344] [issuu] fix extraction --- gallery_dl/extractor/issuu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py index f6170c28..54c6539f 100644 --- a/gallery_dl/extractor/issuu.py +++ b/gallery_dl/extractor/issuu.py @@ -29,8 +29,9 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor): example = "https://issuu.com/issuu/docs/TITLE/" def metadata(self, page): + pos = page.rindex('id="initial-data"') data = util.json_loads(text.rextract( - page, '<script data-json="', '"')[0].replace(""", '"')) + page, '<script data-json="', '"', pos)[0].replace(""", '"')) doc = data["initialDocumentData"]["document"] doc["date"] = text.parse_datetime( From acc94ac18791d19288d4d7f19846b80f2d103e98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 20 Jan 2024 17:56:07 +0100 Subject: [PATCH 327/344] [realbooru] fix extraction revert ac97aca99c6f59ee679a19440360eed46b7c3d18 --- gallery_dl/extractor/gelbooru_v02.py | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git 
a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index c7866bc4..7ab6d023 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -25,7 +25,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): self.api_root = self.config_instance("api_root") or self.root if self.category == "realbooru": - self.items = self._items_realbooru + self._file_url = self._file_url_realbooru self._tags = self._tags_realbooru def _api_request(self, params): @@ -124,28 +124,6 @@ class GelbooruV02Extractor(booru.BooruExtractor): self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2]) return url - def _items_realbooru(self): - from .common import Message - data = self.metadata() - - for post in self.posts(): - try: - html = self._html(post) - url = post["file_url"] = text.rextract( - html, 'href="', '"', html.index(">Original<"))[0] - except Exception: - self.log.debug("Unable to fetch download URL for post %s " - "(md5: %s)", post.get("id"), post.get("md5")) - continue - - text.nameext_from_url(url, post) - post.update(data) - self._prepare(post) - self._tags(post, html) - - yield Message.Directory, post - yield Message.Url, url, post - def _tags_realbooru(self, post, page): tag_container = text.extr(page, 'id="tagLink"', '</div>') tags = collections.defaultdict(list) From 0d367ce1b98292d543d5fe48c60eab48bdb0056b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 20 Jan 2024 18:02:36 +0100 Subject: [PATCH 328/344] [tests] update extractor results --- test/results/2ch.py | 2 +- test/results/500px.py | 12 ++--- test/results/8muses.py | 5 +- test/results/artstation.py | 6 +-- test/results/deviantart.py | 10 ++-- test/results/endchan.py | 7 ++- test/results/exhentai.py | 5 +- test/results/fanbox.py | 4 +- test/results/fantia.py | 6 +-- test/results/fashionnova.py | 6 +-- test/results/idolcomplex.py | 6 +-- test/results/imagefap.py | 29 ++++++----- test/results/imgur.py | 6 +-- 
test/results/itaku.py | 6 +-- test/results/kemonoparty.py | 18 ++----- test/results/komikcast.py | 3 +- test/results/mangaread.py | 2 +- test/results/mastodonsocial.py | 3 +- test/results/micmicidol.py | 17 ++++--- test/results/myhentaigallery.py | 2 +- test/results/newgrounds.py | 37 ++++++++------ test/results/paheal.py | 8 +-- test/results/pinterest.py | 1 + test/results/pixiv.py | 90 +++++++++++++++++---------------- test/results/pornpics.py | 1 + test/results/realbooru.py | 46 +++++++++++++++-- test/results/rule34us.py | 2 +- test/results/snootbooru.py | 3 -- test/results/tumblr.py | 2 +- test/results/twibooru.py | 2 +- test/results/twitter.py | 8 +-- test/results/unsplash.py | 19 +++---- test/results/vsco.py | 2 +- test/results/wikifeet.py | 2 +- test/results/xbooru.py | 2 +- test/results/xhamster.py | 46 ++++++++++------- test/results/zerochan.py | 1 + test/test_results.py | 7 ++- 38 files changed, 241 insertions(+), 193 deletions(-) diff --git a/test/results/2ch.py b/test/results/2ch.py index 5400292c..a15af39c 100644 --- a/test/results/2ch.py +++ b/test/results/2ch.py @@ -28,7 +28,7 @@ __tests__ = ( "filename" : str, "fullname" : str, "height" : int, - "lasthit" : 1705273977, + "lasthit" : range(1705000000, 1900000000), "md5" : r"re:[0-9a-f]{32}", "name" : r"re:\d+\.\w+", "num" : int, diff --git a/test/results/500px.py b/test/results/500px.py index 39b618bc..5630e78e 100644 --- a/test/results/500px.py +++ b/test/results/500px.py @@ -10,22 +10,22 @@ _500px = getattr(gallery_dl.extractor, "500px") __tests__ = ( { - "#url" : "https://500px.com/p/light_expression_photography", + "#url" : "https://500px.com/p/fashvamp", "#category": ("", "500px", "user"), "#class" : _500px._500pxUserExtractor, - "#pattern" : r"https?://drscdn.500px.org/photo/\d+/m%3D4096/v2", + "#pattern" : r"https?://drscdn.500px.org/photo/\d+/m%3D4096(_k%3D1)?/v2\?sig=", "#range" : "1-99", "#count" : 99, }, { - "#url" : "https://500px.com/light_expression_photography", + "#url" : 
"https://500px.com/fashvamp", "#category": ("", "500px", "user"), "#class" : _500px._500pxUserExtractor, }, { - "#url" : "https://web.500px.com/light_expression_photography", + "#url" : "https://web.500px.com/fashvamp", "#category": ("", "500px", "user"), "#class" : _500px._500pxUserExtractor, }, @@ -34,8 +34,8 @@ __tests__ = ( "#url" : "https://500px.com/p/fashvamp/galleries/lera", "#category": ("", "500px", "gallery"), "#class" : _500px._500pxGalleryExtractor, + "#pattern" : r"https?://drscdn.500px.org/photo/\d+/m%3D4096_k%3D1/v2\?sig=", "#count" : 3, - "#sha1_url": "002dc81dee5b4a655f0e31ad8349e8903b296df6", "gallery": dict, "user" : dict, @@ -57,8 +57,8 @@ __tests__ = ( "#url" : "https://500px.com/photo/222049255/queen-of-coasts", "#category": ("", "500px", "image"), "#class" : _500px._500pxImageExtractor, + "#pattern" : r"https://drscdn\.500px\.org/photo/222049255/m%3D4096_k%3D1/v2\?sig=\w+", "#count" : 1, - "#sha1_url": "fbdf7df39325cae02f5688e9f92935b0e7113315", "camera" : "Canon EOS 600D", "camera_info" : dict, diff --git a/test/results/8muses.py b/test/results/8muses.py index da19e4ef..7dfb8460 100644 --- a/test/results/8muses.py +++ b/test/results/8muses.py @@ -6,6 +6,7 @@ gallery_dl = __import__("gallery_dl.extractor.8muses") _8muses = getattr(gallery_dl.extractor, "8muses") +from gallery_dl import exception __tests__ = ( @@ -65,9 +66,7 @@ __tests__ = ( "#comment" : "non-ASCII characters", "#category": ("", "8muses", "album"), "#class" : _8muses._8musesAlbumExtractor, - "#count" : 2, - - "name": r"re:From Trainers to Pokémons", + "#exception": exception.HttpError, }, ) diff --git a/test/results/artstation.py b/test/results/artstation.py index 262bad88..d34437f4 100644 --- a/test/results/artstation.py +++ b/test/results/artstation.py @@ -107,7 +107,7 @@ __tests__ = ( "#category": ("", "artstation", "image"), "#class" : artstation.ArtstationImageExtractor, "#pattern" : r"https?://\w+\.artstation\.com/p/assets/images/images/008/760/279/4k/.+", - 
"#sha1_content": "7b113871465fdc09d127adfdc2767d51cf45a7e9", + "#sha1_content": "3f211ce0d6ecdb502db2cdf7bbeceb11d8421170", }, { @@ -124,7 +124,7 @@ __tests__ = ( "#category": ("", "artstation", "image"), "#class" : artstation.ArtstationImageExtractor, "#options" : {"external": True}, - "#pattern" : "ytdl:https://www.youtube.com/embed/JNFfJtwwrU0", + "#pattern" : r"ytdl:https://www\.youtube(-nocookie)?\.com/embed/JNFfJtwwrU0", "#range" : "2", }, @@ -154,7 +154,7 @@ __tests__ = ( "#category": ("", "artstation", "following"), "#class" : artstation.ArtstationFollowingExtractor, "#pattern" : artstation.ArtstationUserExtractor.pattern, - "#count" : ">= 50", + "#count" : ">= 40", }, ) diff --git a/test/results/deviantart.py b/test/results/deviantart.py index 41cb3219..e8050246 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -262,7 +262,7 @@ __tests__ = ( "allows_comments" : True, "author" : { "type" : "regular", - "usericon": "https://a.deviantart.net/avatars/g/d/gdldev.jpg?2", + "usericon": "https://a.deviantart.net/avatars/g/d/gdldev.jpg?12", "userid" : "1A12BA26-33C2-AA0A-7678-0B6DFBA7AC8E", "username": "gdldev" }, @@ -530,7 +530,7 @@ __tests__ = ( "#category": ("", "deviantart", "status"), "#class" : deviantart.DeviantartStatusExtractor, "#count" : 4, - "#sha1_url": "bf4c44c0c60ff2648a880f4c3723464ad3e7d074", + "#sha1_url": "62ee48ff3405c7714dca70abf42e8e39731012fc", }, { @@ -538,7 +538,7 @@ __tests__ = ( "#category": ("", "deviantart", "status"), "#class" : deviantart.DeviantartStatusExtractor, "#options" : {"journals": "none"}, - "#pattern" : r"https://images-wixmp-\w+\.wixmp\.com/f/[^/]+/[^.]+\.jpg\?token=", + "#pattern" : r"https://images-wixmp-\w+\.wixmp\.com/intermediary/f/[^/]+/[^.]+\.jpg", "#count" : 1, }, @@ -580,7 +580,7 @@ __tests__ = ( "#category": ("", "deviantart", "status"), "#class" : deviantart.DeviantartStatusExtractor, "#options" : {"journals": "text"}, - "#sha1_url": "c8744f7f733a3029116607b826321233c5ca452d", + 
"#sha1_url": "10a336bdee7b9692919461443a7dde44d495818c", }, { @@ -766,7 +766,7 @@ __tests__ = ( "#url" : "https://www.deviantart.com/view/706871727", "#category": ("", "deviantart", "deviation"), "#class" : deviantart.DeviantartDeviationExtractor, - "#sha1_content": "3f62ae0c2fca2294ac28e41888ea06bb37c22c65", + "#sha1_content": "87dff6056fc9a2bf77f75317a1e00e18451b3c80", }, { diff --git a/test/results/endchan.py b/test/results/endchan.py index 293a8569..97d34c3b 100644 --- a/test/results/endchan.py +++ b/test/results/endchan.py @@ -9,11 +9,10 @@ from gallery_dl.extractor import lynxchan __tests__ = ( { - "#url" : "https://endchan.org/yuri/res/193483.html", + "#url" : "https://endchan.org/yuri/res/33621.html", "#category": ("lynxchan", "endchan", "thread"), "#class" : lynxchan.LynxchanThreadExtractor, - "#pattern" : r"https://endchan\.org/\.media/[^.]+(\.\w+)?$", - "#count" : ">= 19", + "#urls" : "https://endchan.org/.media/358c089df4be990e9f7b636e1ce83d3e-imagejpeg.jpg", }, { @@ -27,7 +26,7 @@ __tests__ = ( "#category": ("lynxchan", "endchan", "board"), "#class" : lynxchan.LynxchanBoardExtractor, "#pattern" : lynxchan.LynxchanThreadExtractor.pattern, - "#count" : ">= 9", + "#count" : ">= 8", }, { diff --git a/test/results/exhentai.py b/test/results/exhentai.py index 9165e764..f7967f1a 100644 --- a/test/results/exhentai.py +++ b/test/results/exhentai.py @@ -108,6 +108,7 @@ __tests__ = ( "#category": ("", "exhentai", "search"), "#class" : exhentai.ExhentaiSearchExtractor, "#pattern" : exhentai.ExhentaiGalleryExtractor.pattern, + "#auth" : True, "#range" : "1-30", "#count" : 30, @@ -119,8 +120,8 @@ __tests__ = ( "#url" : "https://e-hentai.org/favorites.php", "#category": ("", "exhentai", "favorite"), "#class" : exhentai.ExhentaiFavoriteExtractor, - "#pattern" : r"https?://e-hentai\.org/g/1200119/d55c44d3d0", - "#count" : 1, + "#auth" : True, + "#urls" : "https://e-hentai.org/g/1200119/d55c44d3d0", }, { diff --git a/test/results/fanbox.py b/test/results/fanbox.py index 
32f13096..b47b72f8 100644 --- a/test/results/fanbox.py +++ b/test/results/fanbox.py @@ -69,9 +69,9 @@ __tests__ = ( "#category": ("", "fanbox", "post"), "#class" : fanbox.FanboxPostExtractor, "#options" : {"embeds": True}, - "#count" : 10, + "#count" : 8, - "title" : "イラスト+SS|義足の炭鉱少年が義足を見せてくれるだけ 【全体公開版】", + "title" : "イラスト+SS|【全体公開版】義足の探鉱夫男子が義足を見せてくれるだけ ", "tags" : list, "articleBody" : dict, "hasAdultContent": True, diff --git a/test/results/fantia.py b/test/results/fantia.py index 70773fb0..5867e786 100644 --- a/test/results/fantia.py +++ b/test/results/fantia.py @@ -17,7 +17,7 @@ __tests__ = ( "fanclub_user_id": 52152, "tags" : list, - "title" : str, + "post_title" : str, }, { @@ -26,7 +26,7 @@ __tests__ = ( "#class" : fantia.FantiaPostExtractor, "#pattern" : r"https://(c\.fantia\.jp/uploads/post/file/1166373/|cc\.fantia\.jp/uploads/post_content_photo/file/732549[01]|fantia\.jp/posts/1166373/album_image\?)", - "blogpost_text" : r"re:^$|This is a test.\n\nThis is a test.\n\n|Link to video:\nhttps://www.youtube.com/watch\?v=5SSdvNcAagI\n\nhtml img from another site:\n\n\n\n\n\n", + "blogpost_text" : r"re:^$|This is a test.\n\n(This is a test.)?\n\n|Link to video:\nhttps://www.youtube.com/watch\?v=5SSdvNcAagI\n\nhtml img from another site:\n\n\n\n\n\n", "comment" : "\n\n", "content_category": r"re:thumb|blog|photo_gallery", "content_comment" : str, @@ -56,7 +56,7 @@ __tests__ = ( "#url" : "https://fantia.jp/posts/508363", "#category": ("", "fantia", "post"), "#class" : fantia.FantiaPostExtractor, - "#count" : 0, + "#count" : 6, "post_title": "zunda逆バニーでおしりコッショリ", "tags" : list, diff --git a/test/results/fashionnova.py b/test/results/fashionnova.py index 4225264f..9cee0e23 100644 --- a/test/results/fashionnova.py +++ b/test/results/fashionnova.py @@ -29,11 +29,11 @@ __tests__ = ( }, { - "#url" : "https://www.fashionnova.com/products/essential-slide-red", + "#url" : "https://www.fashionnova.com/products/all-my-life-legging-black", "#category": ("shopify", 
"fashionnova", "product"), "#class" : shopify.ShopifyProductExtractor, - "#pattern" : r"https?://cdn\d*\.shopify.com/", - "#count" : 3, + "#pattern" : r"https?://cdn\d*\.shopify\.com/s/files/", + "#count" : 8, }, { diff --git a/test/results/idolcomplex.py b/test/results/idolcomplex.py index 6f6f883e..0152ee88 100644 --- a/test/results/idolcomplex.py +++ b/test/results/idolcomplex.py @@ -75,12 +75,12 @@ __tests__ = ( "id" : 694215, "md5" : "509eccbba54a43cea6b275a65b93c51d", "rating" : "g", - "tags" : "lyumos the_witcher shani_(the_witcher) 1girl cosplay green_eyes non-asian redhead waistcoat wreath 3:2_aspect_ratio", + "tags" : "lyumos the_witcher shani_(the_witcher) 1girl green_eyes non-asian redhead waistcoat wreath cosplay 3:2_aspect_ratio", "tags_character": "shani_(the_witcher)", "tags_copyright": "the_witcher", - "tags_general" : "1girl cosplay green_eyes non-asian redhead waistcoat wreath", + "tags_general" : "1girl green_eyes non-asian redhead waistcoat wreath", "tags_idol" : "lyumos", - "tags_medium" : "3:2_aspect_ratio", + "tags_medium" : "cosplay 3:2_aspect_ratio", "vote_average" : range(4, 5), "vote_count" : range(25, 40), "width" : 1024, diff --git a/test/results/imagefap.py b/test/results/imagefap.py index b4f3ab81..bec94011 100644 --- a/test/results/imagefap.py +++ b/test/results/imagefap.py @@ -46,25 +46,30 @@ __tests__ = ( }, { - "#url" : "https://www.imagefap.com/gallery/6180555", + "#url" : "https://www.imagefap.com/gallery/6706356", "#comment" : "description (#3905)", "#category": ("", "imagefap", "gallery"), "#class" : imagefap.ImagefapGalleryExtractor, "#range" : "1", "categories" : [ - "Amateur", - "Softcore", - "Homemade", + "Lesbian", + "Fetish", + "Animated GIFS", ], - "count" : 36, - "description": "Nude and dressed sluts showing off the goods", - "gallery_id" : 6180555, - "image_id" : int, - "num" : int, - "tags" : [], - "title" : "Dressed or Undressed MG*", - "uploader" : "splitopen", + "count" : 75, + "description": "A mixed collection 
of pics and gifs depicting lesbian femdom.\n\nAll images originally found on various Tumblr blogs and through the internet.\n\nObviously I don't own any of the images so if you do and you would like them removed please just let me know and I shall remove them straight away.", + "gallery_id" : 6706356, + "tags" : [ + "lesbian", + "femdom", + "lesbian femdom", + "lezdom", + "dominant women", + "submissive women", + ], + "title" : "Lezdom, Lesbian Femdom, Lesbian Domination - 3", + "uploader" : "pussysimon", }, { diff --git a/test/results/imgur.py b/test/results/imgur.py index 9024bad2..36fb53c4 100644 --- a/test/results/imgur.py +++ b/test/results/imgur.py @@ -232,7 +232,7 @@ __tests__ = ( "#comment" : "large album", "#category": ("", "imgur", "album"), "#class" : imgur.ImgurAlbumExtractor, - "#sha1_url": "de748c181a04d18bef1de9d4f4866ef0a06d632b", + "#exception": exception.HttpError, }, { @@ -255,7 +255,7 @@ __tests__ = ( "#comment" : "empty, no 'media' (#2557)", "#category": ("", "imgur", "album"), "#class" : imgur.ImgurAlbumExtractor, - "#count" : 0, + "#exception": exception.HttpError, }, { @@ -291,7 +291,7 @@ __tests__ = ( "#url" : "https://imgur.com/gallery/eD9CT", "#category": ("", "imgur", "gallery"), "#class" : imgur.ImgurGalleryExtractor, - "#pattern" : "https://imgur.com/a/eD9CT", + "#exception": exception.HttpError, }, { diff --git a/test/results/itaku.py b/test/results/itaku.py index 8a5b5066..27b59414 100644 --- a/test/results/itaku.py +++ b/test/results/itaku.py @@ -55,12 +55,12 @@ __tests__ = ( "tags_character" : ["hatsune_miku"], "tags_copyright" : ["vocaloid"], "tags_general": [ - "female", - "green_eyes", "twintails", "green_hair", - "gloves", "flag", + "gloves", + "green_eyes", + "female", "racing_miku", ], "title" : "Racing Miku 2022 Ver.", diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index c3dbdf73..a320dd65 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -60,7 +60,7 @@ __tests__ = ( 
"added" : "2020-05-06T20:28:02.302000", "content" : str, "count" : 1, - "date" : "dt:2019-08-11 02:09:04", + "date" : "dt:2019-08-10 17:09:04", "edited" : None, "embed" : dict, "extension" : "jpeg", @@ -68,7 +68,7 @@ __tests__ = ( "hash" : "210f35388e28bbcf756db18dd516e2d82ce758e0d32881eeee76d43e1716d382", "id" : "506575", "num" : 1, - "published" : "2019-08-11T02:09:04", + "published" : "2019-08-10T17:09:04", "service" : "fanbox", "shared_file": False, "subcategory": "fanbox", @@ -295,23 +295,13 @@ __tests__ = ( }, { - "#url" : "https://kemono.party/favorites", + "#url" : "https://kemono.su/favorites", "#category": ("", "kemonoparty", "favorite"), "#class" : kemonoparty.KemonopartyFavoriteExtractor, "#pattern" : kemonoparty.KemonopartyUserExtractor.pattern, "#auth" : True, "#count" : 3, - "#sha1_url": "f4b5b796979bcba824af84206578c79101c7f0e1", -}, - -{ - "#url" : "https://kemono.party/favorites?type=post", - "#category": ("", "kemonoparty", "favorite"), - "#class" : kemonoparty.KemonopartyFavoriteExtractor, - "#pattern" : kemonoparty.KemonopartyPostExtractor.pattern, - "#auth" : True, - "#count" : 3, - "#sha1_url": "ecfccf5f0d50b8d14caa7bbdcf071de5c1e5b90f", + "#sha1_url": "902c656c8002a3257ef9e255cb69bca1937373d4", }, { diff --git a/test/results/komikcast.py b/test/results/komikcast.py index fa35c95f..bfbd3e47 100644 --- a/test/results/komikcast.py +++ b/test/results/komikcast.py @@ -49,7 +49,8 @@ __tests__ = ( "#url" : "https://komikcast.me/chapter/soul-land-ii-chapter-300-1-bahasa-indonesia/", "#category": ("", "komikcast", "chapter"), "#class" : komikcast.KomikcastChapterExtractor, - "#sha1_url" : "f2674e31b41a7f009f2f292652be2aefb6612d3f", + "#pattern" : r"https://svr\d\.imgkc\.my\.id/wp-content/img/S/Soul_Land_II/300\.1/\d\d\.jpg", + "#count" : 9, "#sha1_metadata": "cb646cfed3d45105bd645ab38b2e9f7d8c436436", }, diff --git a/test/results/mangaread.py b/test/results/mangaread.py index 40fe4e21..4330a13d 100644 --- a/test/results/mangaread.py +++ 
b/test/results/mangaread.py @@ -69,7 +69,7 @@ __tests__ = ( "#url" : "https://www.mangaread.org/manga/kanan-sama-wa-akumade-choroi", "#category": ("", "mangaread", "manga"), "#class" : mangaread.MangareadMangaExtractor, - "#pattern" : r"https://www\.mangaread\.org/manga/kanan-sama-wa-akumade-choroi/chapter-\d+(-.+)?/", + "#pattern" : r"https://www\.mangaread\.org/manga/kanan-sama-wa-akumade-choroi/chapter-\d+([_-].+)?/", "#count" : ">= 13", "manga" : "Kanan-sama wa Akumade Choroi", diff --git a/test/results/mastodonsocial.py b/test/results/mastodonsocial.py index d97d8976..ffd46a41 100644 --- a/test/results/mastodonsocial.py +++ b/test/results/mastodonsocial.py @@ -82,6 +82,7 @@ __tests__ = ( "#class" : mastodon.MastodonFollowingExtractor, "#extractor": False, "#urls" : ( + "https://ravenation.club/@soundwarrior20", "https://mastodon.social/@0x4f", "https://mastodon.social/@RustyBertrand", "https://mastodon.social/@christianselig", @@ -105,7 +106,7 @@ __tests__ = ( "header_static" : str, "id" : r"re:\d+", "last_status_at": r"re:\d{4}-\d{2}-\d{2}", - "locked" : False, + "locked" : bool, "note" : str, "statuses_count": int, "uri" : str, diff --git a/test/results/micmicidol.py b/test/results/micmicidol.py index f66bbd75..70ef9d26 100644 --- a/test/results/micmicidol.py +++ b/test/results/micmicidol.py @@ -15,19 +15,19 @@ __tests__ = ( "#urls" : "https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhgtpSSdrol9aKP_ztcc_mp9TUUS0U_t2DYJuGX3XCs6X5CkxIb-pM98QlxbkgJFvQj-0e6RbXNBf047qyMDZLcPJsm9dTqAn2XkTVfLhWRaxxVvIYnHYu0R0d7WsAUSFs0MDe4Sotpuqp5DQnjr45T17CXKbWtq9cR3op9dDQh3yiw2a6_HInIjLRm5io/s0/000-micmicidol.jpg", "blog": { - "date" : "dt:2023-09-18 19:48:53", + "date" : "dt:2023-12-13 08:31:54", "description": "", - "id" : "7192714164191173242", + "id" : "4995780788834875038", "kind" : "blogger#blog", "locale" : { - "country" : "TW", - "language": "zh", + "country" : "", + "language": "en", "variant" : "", }, "name" : "MIC MIC IDOL", "pages" : int, "posts" : int, - 
"published" : "2023-09-18T12:48:53-07:00", + "published" : "2023-12-13T16:31:54+08:00", "updated" : str, "url" : "http://www.micmicidol.club/" }, @@ -36,17 +36,17 @@ __tests__ = ( "content" : " ", "date" : "dt:2023-11-18 08:01:00", "etag" : str, - "id" : "5395888649239375388", + "id" : "2417999400144347191", "kind" : "blogger#post", "labels" : [ "- Cover", "Weekly Taishu", "Weekly Taishu Cover", ], - "published": "2023-11-18T00:01:00-08:00", + "published": "2023-11-18T16:01:00+08:00", "replies" : "0", "title" : "Weekly Taishu 週刊大衆 2023.11.13 Cover", - "updated" : "2023-11-18T03:00:42-08:00", + "updated" : "2023-12-13T16:41:54+08:00", "url" : "http://www.micmicidol.club/2023/11/weekly-taishu-20231113-cover.html" }, "num" : 1, @@ -67,6 +67,7 @@ __tests__ = ( "#class" : blogger.BloggerSearchExtractor, "#range" : "1-25", "#count" : 25, + "#archive" : False, "query" : "cover", }, diff --git a/test/results/myhentaigallery.py b/test/results/myhentaigallery.py index 283d6d01..0fa4e558 100644 --- a/test/results/myhentaigallery.py +++ b/test/results/myhentaigallery.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://myhentaigallery.com/g/16247", "#category": ("", "myhentaigallery", "gallery"), "#class" : myhentaigallery.MyhentaigalleryGalleryExtractor, - "#pattern" : r"https://images\.myhentaicomics\.com/mhg/images/[^/]+/original/\d+\.jpg", + "#pattern" : r"https://images\.myhentaicomics\.com/m\w\w/images/[^/]+/original/\d+\.jpg", "artist" : list, "count" : 11, diff --git a/test/results/newgrounds.py b/test/results/newgrounds.py index a50dbe36..e3740309 100644 --- a/test/results/newgrounds.py +++ b/test/results/newgrounds.py @@ -72,13 +72,14 @@ __tests__ = ( "#comment" : "extra files in 'art-image-row' elements - native PNG files (#4642)", "#category": ("", "newgrounds", "image"), "#class" : newgrounds.NewgroundsImageExtractor, + "#auth" : True, "#urls" : ( 
"https://art.ngfiles.com/images/5009000/5009916_14628_zedrinbot_nazrin-tanlines.265f7b6beec5855a349e2646e90cbc01.png?f1695698131", - "https://art.ngfiles.com/images/5009000/5009916_14632_zedrinbot_nazrin-tanlines.40bd62fbf5875806cda6b004b348114a.png?f1695698148", - "https://art.ngfiles.com/images/5009000/5009916_14634_zedrinbot_nazrin-tanlines.40bd62fbf5875806cda6b004b348114a.png?f1695698148", - "https://art.ngfiles.com/images/5009000/5009916_14633_zedrinbot_nazrin-tanlines.40bd62fbf5875806cda6b004b348114a.png?f1695698148", - "https://art.ngfiles.com/images/5009000/5009916_14635_zedrinbot_nazrin-tanlines.6a7aa4fd63e5f8077ad29314568246cc.png?f1695698149", - "https://art.ngfiles.com/images/5009000/5009916_14636_zedrinbot_nazrin-tanlines.6a7aa4fd63e5f8077ad29314568246cc.png?f1695698149", + "https://art.ngfiles.com/images/5009000/5009916_14632_zedrinbot_nazrin-tanlines.40bd62fbf5875806cda6b004b348114a.png?f1695727318", + "https://art.ngfiles.com/images/5009000/5009916_14634_zedrinbot_nazrin-tanlines.40bd62fbf5875806cda6b004b348114a.png?f1695727321", + "https://art.ngfiles.com/images/5009000/5009916_14633_zedrinbot_nazrin-tanlines.40bd62fbf5875806cda6b004b348114a.png?f1695727318", + "https://art.ngfiles.com/images/5009000/5009916_14635_zedrinbot_nazrin-tanlines.6a7aa4fd63e5f8077ad29314568246cc.png?f1695727321", + "https://art.ngfiles.com/images/5009000/5009916_14636_zedrinbot_nazrin-tanlines.6a7aa4fd63e5f8077ad29314568246cc.png?f1695727322", ), }, @@ -165,7 +166,7 @@ From The ZJ "Late """, "#comment" : "flash animation (#1257)", "#category": ("", "newgrounds", "media"), "#class" : newgrounds.NewgroundsMediaExtractor, - "#pattern" : r"https://uploads\.ungrounded\.net/161000/161181_ddautta_mask__550x281_\.swf\?f1081628129", + "#urls" : "https://uploads.ungrounded.net/161000/161181_ddautta_mask__550x281_.swf", "type": "movie", }, @@ -193,8 +194,10 @@ From The ZJ "Late """, "#comment" : "flash game", "#category": ("", "newgrounds", "media"), "#class" : 
newgrounds.NewgroundsMediaExtractor, - "#pattern" : r"https://uploads\.ungrounded\.net/829000/829032_picovsbeardx\.swf\?f1641968445", - "#range" : "1", + "#urls" : ( + "https://uploads.ungrounded.net/829000/829032_picovsbeardx.swf", + "https://uploads.ungrounded.net/tmp/img/521000/iu_521265_5431202.gif", + ), "artist" : [ "dungeonation", @@ -229,7 +232,7 @@ From The ZJ "Late """, ], "title" : "PICO VS BEAR DX", "type" : "game", - "url" : "https://uploads.ungrounded.net/829000/829032_picovsbeardx.swf?f1641968445", + "url" : "https://uploads.ungrounded.net/829000/829032_picovsbeardx.swf", }, { @@ -244,8 +247,8 @@ From The ZJ "Late """, "#url" : "https://tomfulp.newgrounds.com/audio", "#category": ("", "newgrounds", "audio"), "#class" : newgrounds.NewgroundsAudioExtractor, - "#pattern" : r"https://audio.ngfiles.com/\d+/\d+_.+\.mp3", - "#count" : ">= 4", + "#pattern" : r"https://(audio\.ngfiles\.com/\d+/\d+_.+\.mp3|uploads\.ungrounded\.net/.+\.png)", + "#count" : ">= 10", }, { @@ -261,7 +264,7 @@ From The ZJ "Late """, "#url" : "https://tomfulp.newgrounds.com/games", "#category": ("", "newgrounds", "games"), "#class" : newgrounds.NewgroundsGamesExtractor, - "#pattern" : r"https://uploads.ungrounded.net(/alternate)?/\d+/\d+_.+", + "#pattern" : r"https://uploads.ungrounded.net(/alternate)?/(\d+/\d+_.+|tmp/.+)", "#range" : "1-10", "#count" : 10, }, @@ -270,7 +273,7 @@ From The ZJ "Late """, "#url" : "https://tomfulp.newgrounds.com", "#category": ("", "newgrounds", "user"), "#class" : newgrounds.NewgroundsUserExtractor, - "#pattern" : "https://tomfulp.newgrounds.com/art$", + "#urls" : "https://tomfulp.newgrounds.com/art", }, { @@ -278,8 +281,12 @@ From The ZJ "Late """, "#category": ("", "newgrounds", "user"), "#class" : newgrounds.NewgroundsUserExtractor, "#options" : {"include": "all"}, - "#pattern" : "https://tomfulp.newgrounds.com/(art|audio|movies)$", - "#count" : 3, + "#urls" : ( + "https://tomfulp.newgrounds.com/art", + "https://tomfulp.newgrounds.com/audio", + 
"https://tomfulp.newgrounds.com/games", + "https://tomfulp.newgrounds.com/movies", + ), }, { diff --git a/test/results/paheal.py b/test/results/paheal.py index 46b210f6..0d826297 100644 --- a/test/results/paheal.py +++ b/test/results/paheal.py @@ -44,7 +44,7 @@ __tests__ = ( "id" : 2446128, "md5" : "b0ceda9d860df1d15b60293a7eb465c1", "search_tags": "Ayane_Suzuki", - "size" : 205312, + "size" : 204800, "source" : "https://www.pixiv.net/member_illust.php?mode=medium&illust_id=19957280", "tags" : "Ayane_Suzuki Idolmaster idolmaster_dearly_stars Zanzi", "uploader" : "XXXname", @@ -65,7 +65,7 @@ __tests__ = ( "height" : 660, "id" : 481609, "md5" : "bbdc1c33410c2cdce7556c7990be26b7", - "size" : 157389, + "size" : 157696, "source" : "", "tags" : "Ayumu_Kasuga Azumanga_Daioh inanimate Vuvuzela", "uploader" : "CaptainButtface", @@ -80,7 +80,7 @@ __tests__ = ( "date" : "dt:2010-06-25 13:51:17", "height" : 800, "md5" : "b39edfe455a0381110c710d6ed2ef57d", - "size" : 758989, + "size" : 758784, "source" : "http://www.furaffinity.net/view/4057821/", "tags" : "inanimate thelost-dragon Vuvuzela", "uploader": "leacheate_soup", @@ -100,7 +100,7 @@ __tests__ = ( "height" : 2500, "id" : 3864982, "md5" : "7629fc0ff77e32637dde5bf4f992b2cb", - "size" : 18454938, + "size" : 18874368, "source" : "https://twitter.com/VG_Worklog/status/1302407696294055936", "tags" : "animated Metal_Gear Metal_Gear_Solid_V Quiet Vg_erotica webm", "uploader" : "justausername", diff --git a/test/results/pinterest.py b/test/results/pinterest.py index 26671bff..e92e50ac 100644 --- a/test/results/pinterest.py +++ b/test/results/pinterest.py @@ -57,6 +57,7 @@ __tests__ = ( "#comment" : "secret board (#1055)", "#category": ("", "pinterest", "board"), "#class" : pinterest.PinterestBoardExtractor, + "#auth" : True, "#count" : 2, }, diff --git a/test/results/pixiv.py b/test/results/pixiv.py index 34c44f8c..87a69513 100644 --- a/test/results/pixiv.py +++ b/test/results/pixiv.py @@ -407,50 +407,54 @@ __tests__ = ( }, { - 
"#url" : "https://www.pixiv.net/novel/show.php?id=19612040", - "#comment" : "full series", + "#url" : "https://www.pixiv.net/novel/show.php?id=12101012", "#category": ("", "pixiv", "novel"), "#class" : pixiv.PixivNovelExtractor, "#count" : 1, - "#sha1_content": "8c818474153cbd2f221ee08766e1d634c821d8b4", - - "caption" : r"re:「無能な名無し」と呼ばれ虐げられて育った鈴\(すず\)は、", - "comment_access_control": 0, - "create_date" : "2023-04-02T15:18:58+09:00", - "date" : "dt:2023-04-02 06:18:58", - "id" : 19612040, - "is_bookmarked" : False, - "is_muted" : False, - "is_mypixiv_only" : False, - "is_original" : True, - "is_x_restricted" : False, - "novel_ai_type" : 1, - "page_count" : 1, - "rating" : "General", - "restrict" : 0, - "series" : { - "id" : 10278364, - "title": "龍の贄嫁〜無能な名無しと虐げられていましたが、どうやら異母妹に霊力を搾取されていたようです〜", + "#sha1_content": "20f4a62f0e87ae2cb9f5a787b6c641bfa4eabf93", + + "caption" : "<br />第一印象から決めてました!<br /><br />素敵な表紙はいもこは妹さん(<strong><a href=\"pixiv://illusts/53802907\">illust/53802907</a></strong>)からお借りしました。<br /><br />たくさんのコメント、タグありがとうございます、本当に嬉しいです。お返事できていませんが、一つ一つ目を通させていただいてます。タイトルも込みで読んでくださってすごく嬉しいです。ありがとうございます……!!<br /><br />■12/19付けルキラン20位を頂きました…!大変混乱していますがすごく嬉しいです。ありがとうございます! 
<br /><br />■2019/12/20デイリー15位、女子に人気8位をを頂きました…!?!?!?!?て、手が震える…。ありがとうございます…ひえええ。感謝してもしきれないです…!", + "create_date" : "2019-12-19T23:14:36+09:00", + "date" : "dt:2019-12-19 14:14:36", + "extension" : "txt", + "id" : 12101012, + "image_urls" : dict, + "is_bookmarked" : False, + "is_muted" : False, + "is_mypixiv_only": False, + "is_original" : False, + "is_x_restricted": False, + "novel_ai_type" : 0, + "page_count" : 1, + "rating" : "General", + "restrict" : 0, + "series" : { + "id" : 1479656, + "title": "一目惚れした彼らの話", }, - "tags" : [ - "和風ファンタジー", - "溺愛", - "神様", - "ヤンデレ", - "執着", - "異能", - "ざまぁ", - "学園", - "神嫁", + "tags" : [ + "鬼滅の夢", + "女主人公", + "煉獄杏寿郎", + "涙腺崩壊", + "なにこれすごい", + "来世で幸せになって欲しい", + "キメ学世界線できっと幸せになってる!!", + "あなたが神か!!", + "キメ学編を·····", + "鬼滅の夢小説10000users入り", ], - "text_length" : 5974, - "title" : "異母妹から「無能な名無し」と虐げられていた私、どうやら異母妹に霊力を搾取されていたようです(1)", - "user" : { - "account": "yukinaga_chifuyu", - "id" : 77055466, + "text_length" : 9569, + "title" : "本当は、一目惚れだった", + "total_bookmarks": range(17900, 20000), + "total_comments" : range(200, 400), + "total_view" : range(158000, 300000), + "user" : { + "account": "46_maru", + "id" : 888268, }, - "visible" : True, - "x_restrict" : 0, + "visible" : True, + "x_restrict" : 0, }, { @@ -463,12 +467,12 @@ __tests__ = ( }, { - "#url" : "https://www.pixiv.net/novel/show.php?id=19612040", + "#url" : "https://www.pixiv.net/novel/show.php?id=12101012", "#comment" : "full series", "#category": ("", "pixiv", "novel"), "#class" : pixiv.PixivNovelExtractor, "#options" : {"full-series": True}, - "#count" : 4, + "#count" : 2, }, { @@ -488,11 +492,11 @@ __tests__ = ( }, { - "#url" : "https://www.pixiv.net/novel/series/10278364", + "#url" : "https://www.pixiv.net/novel/series/1479656", "#category": ("", "pixiv", "novel-series"), "#class" : pixiv.PixivNovelSeriesExtractor, - "#count" : 4, - "#sha1_content": "b06abed001b3f6ccfb1579699e9a238b46d38ea2", + "#count" : 2, + "#sha1_content": "243ce593333bbfe26e255e3372d9c9d8cea22d5b", }, { 
diff --git a/test/results/pornpics.py b/test/results/pornpics.py index 2bcdcfec..47677c3a 100644 --- a/test/results/pornpics.py +++ b/test/results/pornpics.py @@ -27,6 +27,7 @@ __tests__ = ( "num" : int, "slug" : "british-beauty-danielle-flashes-hot-breasts-ass-and-snatch-in-the-forest", "tags" : [ + "MILF Outdoor", "Amateur MILF", "Nature", "Amateur Outdoor", diff --git a/test/results/realbooru.py b/test/results/realbooru.py index e99634b4..5eb26399 100644 --- a/test/results/realbooru.py +++ b/test/results/realbooru.py @@ -30,15 +30,51 @@ __tests__ = ( }, { - "#url" : "https://realbooru.com/index.php?page=post&s=view&id=668483", + "#url" : "https://realbooru.com/index.php?page=post&s=view&id=862054", + "#comment" : "regular post", "#category": ("gelbooru_v02", "realbooru", "post"), "#class" : gelbooru_v02.GelbooruV02PostExtractor, "#options" : {"tags": True}, - "#pattern" : r"https://realbooru\.com//?images/dc/b5/dcb5c0ce9ec0bf74a6930608985f4719\.jpeg", - "#sha1_content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da", + "#urls" : "https://realbooru.com/images/8a/34/8a345820da989637c21ac013d522bf69.jpeg", + "#sha1_content": "f6213e6f25c3cb9e3cfefa6d4b3a78e44b9dea5b", - "tags_general": "1girl blonde blonde_hair blue_eyes cute female female_only looking_at_viewer smile solo solo_female teeth", - "tags_model" : "jennifer_lawrence", + "change" : "1705562002", + "created_at" : "Thu Jan 18 01:12:50 -0600 2024", + "creator_id" : "32011", + "date" : "dt:2024-01-18 07:12:50", + "file_url" : "https://realbooru.com/images/8a/34/8a345820da989637c21ac013d522bf69.jpeg", + "filename" : "8a345820da989637c21ac013d522bf69", + "has_children" : "false", + "has_comments" : "false", + "has_notes" : "false", + "height" : "1800", + "id" : "862054", + "md5" : "8a345820da989637c21ac013d522bf69", + "parent_id" : "", + "preview_height": "150", + "preview_url" : "https://realbooru.com/thumbnails/8a/34/thumbnail_8a345820da989637c21ac013d522bf69.jpg", + "preview_width" : "120", + "rating" : "e", + 
"sample_height" : "1063", + "sample_url" : "https://realbooru.com/samples/8a/34/sample_8a345820da989637c21ac013d522bf69.jpg", + "sample_width" : "850", + "score" : "", + "source" : "https://www.instagram.com/p/CwAO1UyJBnw", + "status" : "active", + "tags" : " 1girl asian bikini black_hair breasts cleavage female female_only floral_print instagram japanese kurita_emi large_breasts looking_at_viewer navel sauna short_hair side-tie_bikini sitting solo ", + "tags_copyright": "instagram", + "tags_general" : "1girl asian bikini black_hair breasts cleavage female female_only floral_print japanese large_breasts looking_at_viewer navel sauna short_hair side-tie_bikini sitting solo", + "tags_model" : "kurita_emi", + "width" : "1440", +}, + +{ + "#url" : "https://realbooru.com/index.php?page=post&s=view&id=568145", + "#comment" : "older post", + "#category": ("gelbooru_v02", "realbooru", "post"), + "#class" : gelbooru_v02.GelbooruV02PostExtractor, + "#urls" : "https://realbooru.com/images/f9/c8/f9c80c00a6add48b1d0abd3bd3ed75af.jpg", + "#sha1_content": "4a7424810f5f846c161b5d3b7c8b0a85a03368c8", }, ) diff --git a/test/results/rule34us.py b/test/results/rule34us.py index 43d8f66a..c3e193ba 100644 --- a/test/results/rule34us.py +++ b/test/results/rule34us.py @@ -28,7 +28,7 @@ __tests__ = ( "#url" : "https://rule34.us/index.php?r=posts/view&id=4576310", "#category": ("booru", "rule34us", "post"), "#class" : rule34us.Rule34usPostExtractor, - "#pattern" : r"https://video\.rule34\.us/images/a2/94/a294ff8e1f8e0efa041e5dc9d1480011\.mp4", + "#pattern" : r"https://video-cdn\d\.rule34\.us/images/a2/94/a294ff8e1f8e0efa041e5dc9d1480011\.mp4", "extension" : "mp4", "file_url" : str, diff --git a/test/results/snootbooru.py b/test/results/snootbooru.py index 822bad6e..25a294eb 100644 --- a/test/results/snootbooru.py +++ b/test/results/snootbooru.py @@ -64,9 +64,6 @@ __tests__ = ( "sport", "text" ], - "tags_type" : [ - "transparent" - ], "thumbnailUrl" : 
"data/generated-thumbnails/14511_e753313112755da6.jpg", "type" : "image", "user" : { diff --git a/test/results/tumblr.py b/test/results/tumblr.py index 70e334b5..12374e4a 100644 --- a/test/results/tumblr.py +++ b/test/results/tumblr.py @@ -103,7 +103,7 @@ __tests__ = ( "date-max" : "2015-04-25T00:00:00", "date-min" : "2015-04-01T00:00:00", }, - "#count" : 197, + "#count" : 193, }, { diff --git a/test/results/twibooru.py b/test/results/twibooru.py index ff87deec..a3aec152 100644 --- a/test/results/twibooru.py +++ b/test/results/twibooru.py @@ -44,7 +44,7 @@ __tests__ = ( "tag_ids" : list, "tags" : list, "thumbnails_generated": True, - "updated_at" : "2023-07-24T03:18:48.153Z", + "updated_at" : "2023-12-25T06:58:33.986Z", "upvotes" : int, "view_url" : "https://cdn.twibooru.org/img/2020/7/8/1/full.png", "width" : 576, diff --git a/test/results/twitter.py b/test/results/twitter.py index 4e228424..926c8021 100644 --- a/test/results/twitter.py +++ b/test/results/twitter.py @@ -187,7 +187,7 @@ __tests__ = ( "#url" : "https://twitter.com/i/lists/784214683683127296/members", "#category": ("", "twitter", "list-members"), "#class" : twitter.TwitterListMembersExtractor, - "#pattern" : twitter.TwitterTimelineExtractor.pattern, + "#pattern" : twitter.TwitterUserExtractor.pattern, "#range" : "1-40", "#count" : 40, }, @@ -463,7 +463,7 @@ You’ll be able to receive four Galarian form Pokémon with Hidden Abilities, p "#category": ("", "twitter", "tweet"), "#class" : twitter.TwitterTweetExtractor, "#options" : {"retweets": True}, - "#count" : 0, + "#count" : 4, }, { @@ -509,8 +509,8 @@ You’ll be able to receive four Galarian form Pokémon with Hidden Abilities, p "#category": ("", "twitter", "tweet"), "#class" : twitter.TwitterTweetExtractor, "#options" : {"cards": True}, - "#pattern" : r"https://pbs.twimg.com/card_img/157\d+/[\w-]+\?format=(jpg|png)&name=orig$", - "#range" : "1-2", + "#pattern" : r"https://pbs.twimg.com/card_img/174\d+/[\w-]+\?format=(jpg|png)&name=orig$", + "#range" 
: "1,3", }, { diff --git a/test/results/unsplash.py b/test/results/unsplash.py index 01692eec..29f793c7 100644 --- a/test/results/unsplash.py +++ b/test/results/unsplash.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://unsplash.com/photos/red-wooden-cross-on-gray-concrete-pathway-between-green-trees-during-daytime-kaoHI0iHJPM", "#category": ("", "unsplash", "image"), "#class" : unsplash.UnsplashImageExtractor, - "#urls" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3", + "#pattern" : r"https://images\.unsplash\.com/photo-1601823984263-b87b59798b", "alt_description": "red wooden cross on gray concrete pathway between green trees during daytime", "blur_hash" : "LIAwhq%e4TRjXAIBMyt89GRj%fj[", @@ -66,25 +66,18 @@ __tests__ = ( "sunrise", "traditional", "shrine", - "grey", "wallpaper", + "grey", "arbour", - "garden", "outdoors", + "garden", "gate", ], "tags_preview": list, "topic_submissions": {}, "topics" : [], - "updated_at" : "2023-11-24T08:17:36Z", - "urls": { - "full" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=srgb&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=85", - "raw" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3", - "regular" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=1080", - "small" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=400", - "small_s3": "https://s3.us-west-2.amazonaws.com/images.unsplash.com/small/photo-1601823984263-b87b59798b70", - "thumb" : 
"https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=200", - }, + "updated_at" : "2024-01-19T08:21:54Z", + "urls": dict, "user": { "accepted_tos" : True, "bio" : "Professional photographer.\r\nBased in Japan.", @@ -121,7 +114,7 @@ __tests__ = ( "total_photos" : 86, "total_promoted_photos": 24, "twitter_username" : None, - "updated_at" : "2023-11-24T19:15:32Z", + "updated_at" : "2023-11-27T07:10:52Z", "username" : "_______life_" }, "views": range(2000000, 10000000), diff --git a/test/results/vsco.py b/test/results/vsco.py index 16f501d9..4e38435d 100644 --- a/test/results/vsco.py +++ b/test/results/vsco.py @@ -33,7 +33,7 @@ __tests__ = ( "#url" : "https://vsco.co/vsco/collection/1", "#category": ("", "vsco", "collection"), "#class" : vsco.VscoCollectionExtractor, - "#pattern" : r"https://image(-aws.+)?\.vsco\.co/[0-9a-f/]+/[\w-]+\.\w+", + "#pattern" : r"https://image(-aws.+)?\.vsco\.co/[0-9a-f/]+/[\w\s-]+\.\w+", "#range" : "1-80", "#count" : 80, }, diff --git a/test/results/wikifeet.py b/test/results/wikifeet.py index 56e391c5..2a8b849b 100644 --- a/test/results/wikifeet.py +++ b/test/results/wikifeet.py @@ -23,7 +23,7 @@ __tests__ = ( "pid" : int, "width" : int, "height" : int, - "shoesize" : "10 US", + "shoesize" : r"re:\d+ US", "type" : "women", "tags" : list, }, diff --git a/test/results/xbooru.py b/test/results/xbooru.py index 11160b42..978d3227 100644 --- a/test/results/xbooru.py +++ b/test/results/xbooru.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://xbooru.com/index.php?page=post&s=list&tags=konoyan", "#category": ("gelbooru_v02", "xbooru", "tag"), "#class" : gelbooru_v02.GelbooruV02TagExtractor, - "#count" : 11, + "#count" : 24, }, { diff --git a/test/results/xhamster.py b/test/results/xhamster.py index 93436d06..44675c50 100644 --- a/test/results/xhamster.py +++ b/test/results/xhamster.py @@ -9,11 +9,11 @@ from 
gallery_dl.extractor import xhamster __tests__ = ( { - "#url" : "https://xhamster.com/photos/gallery/11748968", + "#url" : "https://xhamster.com/photos/gallery/take-me-to-the-carwash-at-digitaldesire-15860946", "#category": ("", "xhamster", "gallery"), "#class" : xhamster.XhamsterGalleryExtractor, - "#pattern" : r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$", - "#count" : ">= 144", + "#pattern" : r"https://ic-ph-ah\.xhcdn\.com/a/\w+/webp/000/\d+/\d+/\d+_1000\.jpg$", + "#count" : 19, "comments": int, "count" : int, @@ -26,32 +26,40 @@ __tests__ = ( "pageURL" : str, "thumbURL": str, "gallery" : { - "date" : "dt:2019-04-16 00:07:31", - "description": "", + "date" : "dt:2022-02-02 06:30:09", + "description": "Alina Henessy loves to wash her car, and we love seeing every inch of her gorgeous body. More at DigitalDesire.com", "dislikes" : int, - "id" : 11748968, + "id" : 15860946, "likes" : int, - "tags" : ["NON-Porn"], + "tags" : [ + "Babe", + "Public Nudity", + "Take", + "Taking", + "Masturbation", + "Take Me", + ], "thumbnail" : str, - "title" : "Make the world better.", - "views" : int, + "title" : "Take me to the carwash at DigitalDesire", + "views" : range(100000, 200000), + }, "user" : { - "id" : 16874672, - "name" : "Anonymousrants", - "retired" : bool, - "subscribers": int, - "url" : "https://xhamster.com/users/anonymousrants", - "verified" : bool, + "id" : 4741860, + "name" : "DaringSex", + "retired" : False, + "subscribers": range(25000, 50000), + "url" : "https://xhamster.com/users/daringsex", + "verified" : False, }, }, { - "#url" : "https://jp.xhamster2.com/photos/gallery/11748968", + "#url" : "https://jp.xhamster2.com/photos/gallery/take-me-to-the-carwash-at-digitaldesire-15860946", "#category": ("", "xhamster", "gallery"), "#class" : xhamster.XhamsterGalleryExtractor, - "#pattern" : r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$", - "#count" : ">= 144", + "#pattern" : r"https://ic-ph-ah\.xhcdn\.com/a/\w+/webp/000/\d+/\d+/\d+_1000\.jpg$", + 
"#count" : 19, }, { @@ -97,7 +105,7 @@ __tests__ = ( }, { - "#url" : "https://xhamster.com/users/goldenpalomino/photos", + "#url" : "https://xhamster.com/users/daringsex/photos", "#category": ("", "xhamster", "user"), "#class" : xhamster.XhamsterUserExtractor, "#pattern" : xhamster.XhamsterGalleryExtractor.pattern, diff --git a/test/results/zerochan.py b/test/results/zerochan.py index 67b9da21..dac7890b 100644 --- a/test/results/zerochan.py +++ b/test/results/zerochan.py @@ -31,6 +31,7 @@ __tests__ = ( "#category": ("booru", "zerochan", "image"), "#class" : zerochan.ZerochanImageExtractor, "#pattern" : r"https://static\.zerochan\.net/Perth\.%28Kantai\.Collection%29\.full.2920445\.jpg", + "#auth" : True, "author" : "YeFan 葉凡", "date" : "dt:2020-04-24 21:33:44", diff --git a/test/test_results.py b/test/test_results.py index 12fe59d5..680b0f94 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -42,7 +42,9 @@ AUTH = { "pixiv", "nijie", "horne", + "reddit", "seiga", + "fantia", "instagram", "twitter", } @@ -92,7 +94,8 @@ class TestExtractorResults(unittest.TestCase): if requires_auth: extr = result["#class"].from_url(result["#url"]) if not any(extr.config(key) for key in ( - "username", "cookies", "api-key", "client-id")): + "username", "cookies", "api-key", "client-id", + "refresh-token")): msg = "no auth" self._skipped.append((result["#url"], msg)) self.skipTest(msg) @@ -410,7 +413,7 @@ def generate_tests(): if v in ("f", "fail"): self.fail("manual test failure") else: - self._skipped.append((result["#url"], exc)) + self._skipped.append((result["#url"], "manual skip")) self.skipTest(exc) return test From 6f8592eaff0c84304d1f904d6ff3a8d15d67c202 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 20 Jan 2024 18:25:38 +0100 Subject: [PATCH 329/344] [hbrowse] remove from modules list --- gallery_dl/extractor/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gallery_dl/extractor/__init__.py 
b/gallery_dl/extractor/__init__.py index 86308917..d6247362 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -55,7 +55,6 @@ modules = [ "gelbooru_v02", "gofile", "hatenablog", - "hbrowse", "hentai2read", "hentaicosplays", "hentaifoundry", From a6fed628dde288cdd680de426fa3c6ea094196cc Mon Sep 17 00:00:00 2001 From: Wiiplay123 <9746793+Wiiplay123@users.noreply.github.com> Date: Sat, 20 Jan 2024 15:07:52 -0600 Subject: [PATCH 330/344] [blogger] Fix lh*.googleusercontent.com forward slash bug, add support for lh*-**.googleusercontent.com Some URLs use "lh(number)-(locale).googleusercontent.com" format, so I added support for those. Also, "lh(number).googleusercontent.com" formats were broken because the regex was looking for a second forward slash. Examples: lh7.googleusercontent.com lh7-us.googleusercontent.com --- gallery_dl/extractor/blogger.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index 58ae59db..b3b405de 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -37,7 +37,8 @@ class BloggerExtractor(BaseExtractor): findall_image = re.compile( r'src="(https?://(?:' r'blogger\.googleusercontent\.com/img|' - r'lh\d+\.googleusercontent\.com/|' + r'lh\d+\.googleusercontent\.com|' + r'lh\d+-\w+\.googleusercontent\.com|' r'\d+\.bp\.blogspot\.com)/[^"]+)').findall findall_video = re.compile( r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall From 6eb62f21401ad112aaf23734201f3908eabb0316 Mon Sep 17 00:00:00 2001 From: Wiiplay123 <9746793+Wiiplay123@users.noreply.github.com> Date: Sat, 20 Jan 2024 15:53:11 -0600 Subject: [PATCH 331/344] Combine lh*(-**).googleusercontent.com URL regex into one line. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mike Fährmann <mike_faehrmann@web.de> --- gallery_dl/extractor/blogger.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index b3b405de..402408e6 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -37,8 +37,7 @@ class BloggerExtractor(BaseExtractor): findall_image = re.compile( r'src="(https?://(?:' r'blogger\.googleusercontent\.com/img|' - r'lh\d+\.googleusercontent\.com|' - r'lh\d+-\w+\.googleusercontent\.com|' + r'lh\d+(?:-\w+)?\.googleusercontent\.com|' r'\d+\.bp\.blogspot\.com)/[^"]+)').findall findall_video = re.compile( r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall From df718887c2110a97f3f3a0dcc175401d4bc7c75f Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Sun, 21 Jan 2024 09:50:27 +1100 Subject: [PATCH 332/344] [webtoons] fix extracting comic and episode name with commas --- gallery_dl/extractor/webtoons.py | 18 ++++++++++++++---- test/results/webtoons.py | 27 +++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index a4259358..949c7cb7 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -88,10 +88,20 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): def metadata(self, page): extr = text.extract_from(page) - keywords = extr('<meta name="keywords" content="', '"').split(", ") title = extr('<meta property="og:title" content="', '"') descr = extr('<meta property="og:description" content="', '"') + if extr('<div class="subj_info"', '\n'): + comic_name = extr('>', '<') + episode_name = extr('<h1 class="subj_episode" title="', '"') + else: + comic_name = episode_name = "" + + if extr('<span class="tx _btnOpenEpisodeList ', '"'): + episode 
= extr('>#', '<') + else: + episode = "" + if extr('<div class="author_area"', '\n'): username = extr('/creator/', '"') author_name = extr('<span>', '</span>') @@ -104,9 +114,9 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): "title_no" : self.title_no, "episode_no" : self.episode_no, "title" : text.unescape(title), - "episode" : keywords[1], - "comic_name" : text.unescape(keywords[0]), - "episode_name": text.unescape(keywords[2]), + "episode" : episode, + "comic_name" : text.unescape(comic_name), + "episode_name": text.unescape(episode_name), "username" : username, "author_name" : text.unescape(author_name), "description" : text.unescape(descr), diff --git a/test/results/webtoons.py b/test/results/webtoons.py index 82831f02..3e6beec3 100644 --- a/test/results/webtoons.py +++ b/test/results/webtoons.py @@ -61,10 +61,37 @@ __tests__ = ( "comic_name" : "I want to be a cute anime girl", "episode_name": "209 - The story's story", + "episode" : "214", "username" : "m9huj", "author_name" : "Azul Crescent", }, +{ + "#url" : "https://www.webtoons.com/en/canvas/i-want-to-be-a-cute-anime-girl/174-not-194-it-was-a-typo-later/viewer?title_no=349416&episode_no=179", + "#category": ("", "webtoons", "episode"), + "#class" : webtoons.WebtoonsEpisodeExtractor, + "#count" : 4, + + "comic_name" : "I want to be a cute anime girl", + "episode_name": "174 (not 194, it was a typo) - Later", + "episode" : "179", + "username" : "m9huj", + "author_name" : "Azul Crescent", +}, + +{ + "#url" : "https://www.webtoons.com/en/canvas/us-over-here/1-the-wheel/viewer?title_no=919536&episode_no=1", + "#category": ("", "webtoons", "episode"), + "#class" : webtoons.WebtoonsEpisodeExtractor, + "#count" : 59, + + "comic_name" : "Us, over here", + "episode_name": "1. 
The Wheel", + "episode" : "1", + "username" : "i94q8", + "author_name" : "spin.ani", +}, + { "#url" : "https://www.webtoons.com/en/comedy/live-with-yourself/list?title_no=919", "#comment" : "english", From c7a42880abe3802c48cc7bbb93fcadd13ee58f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 21 Jan 2024 00:21:40 +0100 Subject: [PATCH 333/344] [wikimedia] support fandom wikis (#1443, #2677, #3378) Wikis hosted on fandom.com are just wikimedia instances and support its API. --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/wikimedia.py | 9 +++ test/results/fandom.py | 92 +++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+) create mode 100644 test/results/fandom.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c6756c6a..54771eff 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1523,6 +1523,12 @@ Consider all listed sites to potentially be NSFW. <td>Articles</td> <td></td> </tr> +<tr> + <td>Fandom</td> + <td>https://www.fandom.com/</td> + <td>Articles</td> + <td></td> +</tr> <tr> <td>Super Mario Wiki</td> <td>https://www.mariowiki.com/</td> diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 49843111..1eafc296 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -25,6 +25,10 @@ class WikimediaExtractor(BaseExtractor): BaseExtractor.__init__(self, match) path = match.group(match.lastindex) + if self.category == "fandom": + self.category = \ + "fandom-" + self.root.partition(".")[0].rpartition("/")[2] + if path.startswith("wiki/"): path = path[5:] self.api_path = "/w/api.php" @@ -158,6 +162,11 @@ BASE_PATTERN = WikimediaExtractor.update({ "root": "https://www.mediawiki.org", "pattern": r"(?:www\.)?mediawiki\.org", }, + "fandom": { + "root": None, + "pattern": r"[\w-]+\.fandom\.com", + "api-path": "/api.php", + }, "mariowiki": { "root": "https://www.mariowiki.com", "pattern": 
r"(?:www\.)?mariowiki\.com", diff --git a/test/results/fandom.py b/test/results/fandom.py new file mode 100644 index 00000000..40d82e93 --- /dev/null +++ b/test/results/fandom.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.fandom.com/wiki/Title", + "#comment" : "for scripts/supportedsites.py", + "#category": ("wikimedia", "fandom-www", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://mushishi.fandom.com/wiki/Yahagi", + "#category": ("wikimedia", "fandom-mushishi", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, + "#urls" : "https://static.wikia.nocookie.net/mushi-shi/images/f/f8/Yahagi.png/revision/latest?cb=20150128052255", + + "bitdepth" : 8, + "canonicaltitle": "File:Yahagi.png", + "comment" : "", + "commonmetadata": { + "ResolutionUnit": 3, + "XResolution" : "3779/100", + "YResolution" : "3779/100", + }, + "date" : "dt:2015-01-28 05:22:55", + "descriptionshorturl": "https://mushishi.fandom.com/index.php?curid=2595", + "descriptionurl": "https://mushishi.fandom.com/wiki/File:Yahagi.png", + "extension" : "png", + "extmetadata" : { + "DateTime": { + "hidden": "", + "source": "mediawiki-metadata", + "value": "2015-01-28T05:22:55Z", + }, + "ObjectName": { + "hidden": "", + "source": "mediawiki-metadata", + "value": "Yahagi", + }, + }, + "filename" : "Yahagi", + "height" : 410, + "metadata" : { + "bitDepth" : 8, + "colorType" : "truecolour", + "duration" : 0, + "frameCount": 0, + "loopCount" : 1, + "metadata" : [ + { + "name" : "XResolution", + "value": "3779/100", + }, + { + "name" : "YResolution", + "value": "3779/100", + }, + { + "name" : "ResolutionUnit", + "value": 3, + }, + { + "name" : "_MW_PNG_VERSION", + "value": 1, + }, + ], + 
}, + "mime" : "image/png", + "page" : "Yahagi", + "sha1" : "e3078a97976215323dbabb0c86b7acc55b512d16", + "size" : 429912, + "timestamp" : "2015-01-28T05:22:55Z", + "url" : "https://static.wikia.nocookie.net/mushi-shi/images/f/f8/Yahagi.png/revision/latest?cb=20150128052255", + "user" : "ITHYRIAL", + "userid" : 4637089, + "width" : 728, +}, + +{ + "#url" : "https://projectsekai.fandom.com/wiki/Project_SEKAI_Wiki", + "#category": ("wikimedia", "fandom-projectsekai", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +) From f3ad91b44f24589a5f6d31e772a56da4993c11c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 21 Jan 2024 03:00:57 +0100 Subject: [PATCH 334/344] [bunkr] update domain (#5088) --- docs/supportedsites.md | 2 +- gallery_dl/extractor/bunkr.py | 14 +++++++------- test/results/bunkr.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 54771eff..d6c88dd6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -123,7 +123,7 @@ Consider all listed sites to potentially be NSFW. </tr> <tr> <td>Bunkr</td> - <td>https://bunkrr.su/</td> + <td>https://bunkrr.ru/</td> <td>Albums, Media Files</td> <td></td> </tr> diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 26123b8b..e7fc14bb 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -6,13 +6,13 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://bunkrr.su/""" +"""Extractors for https://bunkrr.ru/""" from .lolisafe import LolisafeAlbumExtractor from .. 
import text from urllib.parse import urlsplit, urlunsplit -BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)" +BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:[rs]u|la|is|to)" MEDIA_DOMAIN_OVERRIDES = { "cdn9.bunkr.ru" : "c9.bunkr.ru", @@ -27,11 +27,11 @@ CDN_HOSTED_EXTENSIONS = ( class BunkrAlbumExtractor(LolisafeAlbumExtractor): - """Extractor for bunkrr.su albums""" + """Extractor for bunkrr.ru albums""" category = "bunkr" - root = "https://bunkrr.su" + root = "https://bunkrr.ru" pattern = BASE_PATTERN + r"/a/([^/?#]+)" - example = "https://bunkrr.su/a/ID" + example = "https://bunkrr.ru/a/ID" def fetch_album(self, album_id): # album metadata @@ -84,11 +84,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): class BunkrMediaExtractor(BunkrAlbumExtractor): - """Extractor for bunkrr.su media links""" + """Extractor for bunkrr.ru media links""" subcategory = "media" directory_fmt = ("{category}",) pattern = BASE_PATTERN + r"/[vid]/([^/?#]+)" - example = "https://bunkrr.su/v/FILENAME" + example = "https://bunkrr.ru/v/FILENAME" def fetch_album(self, album_id): try: diff --git a/test/results/bunkr.py b/test/results/bunkr.py index 66247622..317cf44f 100644 --- a/test/results/bunkr.py +++ b/test/results/bunkr.py @@ -9,7 +9,7 @@ from gallery_dl.extractor import bunkr __tests__ = ( { - "#url" : "https://bunkrr.su/a/Lktg9Keq", + "#url" : "https://bunkrr.ru/a/Lktg9Keq", "#category": ("lolisafe", "bunkr", "album"), "#class" : bunkr.BunkrAlbumExtractor, "#urls" : "https://i-burger.bunkr.ru/test-テスト-\"&>-QjgneIQv.png", From 0d3af0d35b684e78e2ea8a8b44d60fd3a186362f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 21 Jan 2024 15:16:25 +0100 Subject: [PATCH 335/344] [tests] ignore 'ytdl' categories when import fails (#5095) --- test/test_extractor.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/test_extractor.py b/test/test_extractor.py index 3b590ff4..75a0b876 100644 --- 
a/test/test_extractor.py +++ b/test/test_extractor.py @@ -101,8 +101,14 @@ class TestExtractorModule(unittest.TestCase): def test_categories(self): for result in results.all(): url = result["#url"] - extr = result["#class"].from_url(url) base, cat, sub = result["#category"] + try: + extr = result["#class"].from_url(url) + except ImportError as exc: + if exc.name in ("youtube_dl", "yt_dlp"): + print("Skipping '{}' category checks".format(cat)) + continue + raise self.assertEqual(extr.category, cat, url) self.assertEqual(extr.subcategory, sub, url) self.assertEqual(extr.basecategory, base, url) From 67c99b13668cf388f59515670466f56040979c2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 21 Jan 2024 22:50:40 +0100 Subject: [PATCH 336/344] [patreon] prevent HttpError for stream.mux.com URLs --- gallery_dl/extractor/patreon.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index dfcfe24b..62d11f23 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -64,7 +64,12 @@ class PatreonExtractor(Extractor): postfile = post.get("post_file") if postfile: url = postfile["url"] - name = postfile.get("name") or self._filename(url) or url + name = postfile.get("name") + if not name: + if url.startswith("https://stream.mux.com/"): + name = url + else: + name = self._filename(url) or url return (("postfile", url, name),) return () From 0502256251fc36d6317173c12f40e7c3efde11c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 21 Jan 2024 23:02:50 +0100 Subject: [PATCH 337/344] release version 1.26.7 --- CHANGELOG.md | 43 +++++++++++++++++++++++++++++++++++++++++++ README.rst | 4 ++-- gallery_dl/version.py | 2 +- 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b135b74..277250d5 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -1,5 +1,48 @@ # Changelog +## 1.26.7 - 2024-01-21 +### Extractors +#### Additions +- [2ch] add support ([#1009](https://github.com/mikf/gallery-dl/issues/1009), [#3540](https://github.com/mikf/gallery-dl/issues/3540), [#4444](https://github.com/mikf/gallery-dl/issues/4444)) +- [deviantart:avatar] add `formats` option ([#4995](https://github.com/mikf/gallery-dl/issues/4995)) +- [hatenablog] add support ([#5036](https://github.com/mikf/gallery-dl/issues/5036), [#5037](https://github.com/mikf/gallery-dl/issues/5037)) +- [mangadex] add `list` extractor ([#5025](https://github.com/mikf/gallery-dl/issues/5025)) +- [steamgriddb] add support ([#5033](https://github.com/mikf/gallery-dl/issues/5033), [#5041](https://github.com/mikf/gallery-dl/issues/5041)) +- [wikimedia] add support ([#1443](https://github.com/mikf/gallery-dl/issues/1443), [#2906](https://github.com/mikf/gallery-dl/issues/2906), [#3660](https://github.com/mikf/gallery-dl/issues/3660), [#2340](https://github.com/mikf/gallery-dl/issues/2340)) +- [wikimedia] support `fandom` wikis ([#2677](https://github.com/mikf/gallery-dl/issues/2677), [#3378](https://github.com/mikf/gallery-dl/issues/3378)) +#### Fixes +- [blogger] fix `lh-*.googleusercontent.com` URLs ([#5091](https://github.com/mikf/gallery-dl/issues/5091)) +- [bunkr] update domain ([#5088](https://github.com/mikf/gallery-dl/issues/5088)) +- [deviantart] fix AttributeError for URLs without username ([#5065](https://github.com/mikf/gallery-dl/issues/5065)) +- [deviantart] fix `KeyError: 'premium_folder_data'` ([#5063](https://github.com/mikf/gallery-dl/issues/5063)) +- [deviantart:avatar] fix exception when `comments` are enabled ([#4995](https://github.com/mikf/gallery-dl/issues/4995)) +- [fuskator] make metadata extraction non-fatal ([#5039](https://github.com/mikf/gallery-dl/issues/5039)) +- [gelbooru] only log "Incomplete API response" for favorites ([#5045](https://github.com/mikf/gallery-dl/issues/5045)) +- [giantessbooru] update 
domain +- [issuu] fix extraction +- [nijie] fix download URLs of single image posts ([#5049](https://github.com/mikf/gallery-dl/issues/5049)) +- [patreon] fix `KeyError: 'name'` ([#5048](https://github.com/mikf/gallery-dl/issues/5048), [#5069](https://github.com/mikf/gallery-dl/issues/5069), [#5093](https://github.com/mikf/gallery-dl/issues/5093)) +- [pixiv] update API headers ([#5029](https://github.com/mikf/gallery-dl/issues/5029)) +- [realbooru] fix download URLs of older posts +- [twitter] revert to using `media` timeline by default ([#4953](https://github.com/mikf/gallery-dl/issues/4953)) +- [vk] transform image URLs to non-blurred versions ([#5017](https://github.com/mikf/gallery-dl/issues/5017)) +#### Improvements +- [batoto] support more mirror domains ([#5042](https://github.com/mikf/gallery-dl/issues/5042)) +- [batoto] improve v2 manga URL pattern +- [gelbooru] support `all` tag and URLs with empty tags ([#5076](https://github.com/mikf/gallery-dl/issues/5076)) +- [patreon] download `m3u8` manifests with ytdl +- [sankaku] support post URLs with alphanumeric IDs ([#5073](https://github.com/mikf/gallery-dl/issues/5073)) +#### Metadata +- [batoto] improve `manga_id` extraction ([#5042](https://github.com/mikf/gallery-dl/issues/5042)) +- [erome] fix `count` metadata +- [kemonoparty] add `revision_hash` metadata ([#4706](https://github.com/mikf/gallery-dl/issues/4706), [#4727](https://github.com/mikf/gallery-dl/issues/4727), [#5013](https://github.com/mikf/gallery-dl/issues/5013)) +- [paheal] fix `source` metadata +- [webtoons] extract more metadata ([#5061](https://github.com/mikf/gallery-dl/issues/5061), [#5094](https://github.com/mikf/gallery-dl/issues/5094)) +#### Removals +- [chevereto] remove `pixl.li` +- [hbrowse] remove module +- [nitter] remove `nitter.lacontrevoie.fr` + ## 1.26.6 - 2024-01-06 ### Extractors #### Additions diff --git a/README.rst b/README.rst index ee165e52..490b54cf 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone 
Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.6/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.7/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.6/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.7/gallery-dl.bin>`__ Nightly Builds diff --git a/gallery_dl/version.py b/gallery_dl/version.py index d348b548..f99beaab 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.26.7-dev" +__version__ = "1.26.7" From beacfa7436188899db7666a1503b087ba13eb998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 25 Jan 2024 20:22:05 +0100 Subject: [PATCH 338/344] [bunkr] update domain to 'bunkr.sk' (#5114) --- docs/supportedsites.md | 2 +- gallery_dl/extractor/bunkr.py | 14 +++++++------- gallery_dl/version.py | 2 +- test/results/bunkr.py | 8 +++++++- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d6c88dd6..e810f422 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -123,7 +123,7 @@ Consider all listed sites to potentially be NSFW. 
</tr> <tr> <td>Bunkr</td> - <td>https://bunkrr.ru/</td> + <td>https://bunkr.sk/</td> <td>Albums, Media Files</td> <td></td> </tr> diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index e7fc14bb..a6eb76ec 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -6,13 +6,13 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://bunkrr.ru/""" +"""Extractors for https://bunkr.sk/""" from .lolisafe import LolisafeAlbumExtractor from .. import text from urllib.parse import urlsplit, urlunsplit -BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:[rs]u|la|is|to)" +BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:sk|[rs]u|la|is|to)" MEDIA_DOMAIN_OVERRIDES = { "cdn9.bunkr.ru" : "c9.bunkr.ru", @@ -27,11 +27,11 @@ CDN_HOSTED_EXTENSIONS = ( class BunkrAlbumExtractor(LolisafeAlbumExtractor): - """Extractor for bunkrr.ru albums""" + """Extractor for bunkr.sk albums""" category = "bunkr" - root = "https://bunkrr.ru" + root = "https://bunkr.sk" pattern = BASE_PATTERN + r"/a/([^/?#]+)" - example = "https://bunkrr.ru/a/ID" + example = "https://bunkr.sk/a/ID" def fetch_album(self, album_id): # album metadata @@ -84,11 +84,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): class BunkrMediaExtractor(BunkrAlbumExtractor): - """Extractor for bunkrr.ru media links""" + """Extractor for bunkr.sk media links""" subcategory = "media" directory_fmt = ("{category}",) pattern = BASE_PATTERN + r"/[vid]/([^/?#]+)" - example = "https://bunkrr.ru/v/FILENAME" + example = "https://bunkr.sk/v/FILENAME" def fetch_album(self, album_id): try: diff --git a/gallery_dl/version.py b/gallery_dl/version.py index f99beaab..0f850903 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.26.7" +__version__ = "1.26.8-dev" diff --git a/test/results/bunkr.py b/test/results/bunkr.py index 317cf44f..99e88d0e 100644 --- a/test/results/bunkr.py +++ b/test/results/bunkr.py @@ -9,7 +9,7 @@ from gallery_dl.extractor import bunkr __tests__ = ( { - "#url" : "https://bunkrr.ru/a/Lktg9Keq", + "#url" : "https://bunkr.sk/a/Lktg9Keq", "#category": ("lolisafe", "bunkr", "album"), "#class" : bunkr.BunkrAlbumExtractor, "#urls" : "https://i-burger.bunkr.ru/test-テスト-\"&>-QjgneIQv.png", @@ -64,6 +64,12 @@ __tests__ = ( "#count" : 9, }, +{ + "#url" : "https://bunkrr.ru/a/Lktg9Keq", + "#category": ("lolisafe", "bunkr", "album"), + "#class" : bunkr.BunkrAlbumExtractor, +}, + { "#url" : "https://bunkrr.su/a/Lktg9Keq", "#category": ("lolisafe", "bunkr", "album"), From c28475d325a341c3552ded616071a3627c662d2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 25 Jan 2024 20:30:37 +0100 Subject: [PATCH 339/344] [kemonoparty] fix deleting 'name' in orginal objects (#5103) ... when computing 'revision_hash' regression caused by 3d68eda4 dict.copy() only creates a shallow copy I know that and still managed to get I wrong ... 
--- gallery_dl/extractor/kemonoparty.py | 2 ++ test/results/kemonoparty.py | 23 ++++++++++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 10228b5c..7d227476 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -240,7 +240,9 @@ class KemonopartyExtractor(Extractor): rev.pop("added", None) rev.pop("next", None) rev.pop("prev", None) + rev["file"] = rev["file"].copy() rev["file"].pop("name", None) + rev["attachments"] = [a.copy() for a in rev["attachments"]] for a in rev["attachments"]: a.pop("name", None) return util.sha1(self._json_dumps(rev)) diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index a320dd65..deae0cb7 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -175,9 +175,26 @@ __tests__ = ( "#class" : kemonoparty.KemonopartyPostExtractor, "#urls" : "https://kemono.su/data/88/52/88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86.jpg", - "revision_id": 142470, + "file" : { + "hash": "88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86", + "name": "wip update.jpg", + "path": "/88/52/88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86.jpg", + "type": "file", + }, + "attachments": [ + { + "hash": "88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86", + "name": "wip update.jpg", + "path": "/88/52/88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86.jpg", + "type": "attachment", + }, + ], + "filename" : "wip update", + "extension" : "jpg", + "hash" : "88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86", + "revision_id" : 142470, "revision_index": 2, - "revision_hash": "e0e93281495e151b11636c156e52bfe9234c2a40", + "revision_hash" : "e0e93281495e151b11636c156e52bfe9234c2a40", }, { @@ -191,7 +208,7 @@ __tests__ = ( "revision_id": range(134996, 3052965), "revision_index": range(1, 9), - 
"revision_hash": r"re:^[0-9a-f]{40}$", + "revision_hash": "e0e93281495e151b11636c156e52bfe9234c2a40", }, From afd20ef42c4a7ec4f1e364963633555eb7c4517b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 25 Jan 2024 23:37:12 +0100 Subject: [PATCH 340/344] [kemonoparty] implement filtering duplicate revisions (#5013) set 'revisions' to '"unique"' to have it ignore duplicate revisions --- docs/configuration.rst | 5 ++- gallery_dl/extractor/kemonoparty.py | 59 ++++++++++++++++++----------- test/results/kemonoparty.py | 15 ++++++++ 3 files changed, 55 insertions(+), 24 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index b3344722..6244437f 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2212,12 +2212,15 @@ Description extractor.kemonoparty.revisions ------------------------------- Type - ``bool`` + * ``bool`` + * ``string`` Default ``false`` Description Extract post revisions. + Set this to ``"unique"`` to filter out duplicate revisions. + Note: This requires 1 additional HTTP request per post. 
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 7d227476..41fde5c3 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -39,6 +39,8 @@ class KemonopartyExtractor(Extractor): def _init(self): self.revisions = self.config("revisions") + if self.revisions: + self.revisions_unique = (self.revisions == "unique") self._prepare_ddosguard_cookies() self._find_inline = re.compile( r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+' @@ -222,8 +224,37 @@ class KemonopartyExtractor(Extractor): self.root, server) return self.request(url).json() - @memcache(keyarg=1) - def _post_revisions(self, url): + def _revisions_post(self, post, url): + post["revision_id"] = 0 + + try: + revs = self.request(url + "/revisions").json() + except exception.HttpError: + post["revision_hash"] = self._revision_hash(post) + post["revision_index"] = 1 + return (post,) + revs.insert(0, post) + + for rev in revs: + rev["revision_hash"] = self._revision_hash(rev) + + if self.revisions_unique: + uniq = [] + last = None + for rev in revs: + if last != rev["revision_hash"]: + last = rev["revision_hash"] + uniq.append(rev) + revs = uniq + + idx = len(revs) + for rev in revs: + rev["revision_index"] = idx + idx -= 1 + + return revs + + def _revisions_all(self, url): revs = self.request(url + "/revisions").json() idx = len(revs) @@ -277,18 +308,8 @@ class KemonopartyUserExtractor(KemonopartyExtractor): if self.revisions: for post in posts: - post["revision_hash"] = self._revision_hash(post) - post["revision_id"] = 0 post_url = "{}/post/{}".format(self.api_url, post["id"]) - try: - revs = self._post_revisions(post_url) - except exception.HttpError: - post["revision_index"] = 1 - yield post - else: - post["revision_index"] = len(revs) + 1 - yield post - yield from revs + yield from self._revisions_post(post, post_url) else: yield from posts @@ -316,18 +337,10 @@ class 
KemonopartyPostExtractor(KemonopartyExtractor): if not self.revision: post = self.request(self.api_url).json() if self.revisions: - post["revision_hash"] = self._revision_hash(post) - post["revision_id"] = 0 - try: - revs = self._post_revisions(self.api_url) - except exception.HttpError: - post["revision_index"] = 1 - else: - post["revision_index"] = len(revs) + 1 - return itertools.chain((post,), revs) + return self._revisions_post(post, self.api_url) return (post,) - revs = self._post_revisions(self.api_url) + revs = self._revisions_all(self.api_url) if not self.revision_id: return revs diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index deae0cb7..c726781a 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -197,6 +197,21 @@ __tests__ = ( "revision_hash" : "e0e93281495e151b11636c156e52bfe9234c2a40", }, +{ + "#url" : "https://kemono.su/patreon/user/3161935/post/68231671", + "#comment" : "unique revisions (#5013)", + "#category": ("", "kemonoparty", "patreon"), + "#class" : kemonoparty.KemonopartyPostExtractor, + "#options" : {"revisions": "unique"}, + "#urls" : "https://kemono.su/data/88/52/88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86.jpg", + + "filename" : "wip update", + "hash" : "88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86", + "revision_id" : 0, + "revision_index": 1, + "revision_hash" : "e0e93281495e151b11636c156e52bfe9234c2a40", +}, + { "#url" : "https://kemono.party/patreon/user/3161935/post/68231671/revisions", "#comment" : "revisions (#4498)", From 34a4ddc3996b19cacea8c8c88a432ec9104a068b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 26 Jan 2024 17:56:08 +0100 Subject: [PATCH 341/344] [sankaku] add 'id-format' option (#5073) --- docs/configuration.rst | 13 +++++++++++++ gallery_dl/extractor/sankaku.py | 12 ++++++++---- test/results/sankaku.py | 10 +++++++++- 3 files changed, 30 insertions(+), 5 deletions(-) diff 
--git a/docs/configuration.rst b/docs/configuration.rst index 6244437f..304ddb4f 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3009,6 +3009,19 @@ Description restrict it to only one possible format. +extractor.sankaku.id-format +--------------------------- +Type + ``string`` +Default + ``"numeric"`` +Description + Format of ``id`` metadata fields. + + * ``"alphanumeric"`` or ``"alnum"``: 11-character alphanumeric IDs (``y0abGlDOr2o``) + * ``"numeric"`` or ``"legacy"``: numeric IDs (``360451``) + + extractor.sankaku.refresh ------------------------- Type diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index b3b7a9cc..caf3e169 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -179,12 +179,16 @@ class SankakuAPI(): def __init__(self, extractor): self.extractor = extractor self.headers = { - "Accept" : "application/vnd.sankaku.api+json;v=2", - "Platform": "web-app", - "Origin" : extractor.root, + "Accept" : "application/vnd.sankaku.api+json;v=2", + "Platform" : "web-app", + "Api-Version": None, + "Origin" : extractor.root, } - self.username, self.password = self.extractor._get_auth_info() + if extractor.config("id-format") in ("alnum", "alphanumeric"): + self.headers["Api-Version"] = "2" + + self.username, self.password = extractor._get_auth_info() if not self.username: self.authenticate = util.noop diff --git a/test/results/sankaku.py b/test/results/sankaku.py index 361fd7a2..cfdea0c4 100644 --- a/test/results/sankaku.py +++ b/test/results/sankaku.py @@ -119,11 +119,16 @@ __tests__ = ( { "#url" : "https://sankaku.app/posts/y0abGlDOr2o", + "#comment" : "extended tag categories; alphanumeric ID (#5073)", "#category": ("booru", "sankaku", "post"), "#class" : sankaku.SankakuPostExtractor, - "#options" : {"tags": True}, + "#options" : { + "tags" : True, + "id-format": "alphanumeric", + }, "#sha1_content": "5e255713cbf0a8e0801dc423563c34d896bb9229", + "id": "y0abGlDOr2o", 
"tags_artist": [ "bonocho", ], @@ -150,6 +155,8 @@ __tests__ = ( "#category": ("booru", "sankaku", "post"), "#class" : sankaku.SankakuPostExtractor, "#pattern" : r"https://s\.sankakucomplex\.com/data/ac/8e/ac8e3b92ea328ce9cf7211e69c905bf9\.jpg\?e=.+", + + "id": 360451, }, { @@ -169,6 +176,7 @@ __tests__ = ( "#options" : {"tags": True}, "#count" : 1, + "id" : 20758561, "tags" : list, "tags_general": [ "key(mangaka)", From 1f7101d606405c257ada8c15a69bf1fefe023e6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 27 Jan 2024 00:24:41 +0100 Subject: [PATCH 342/344] [archivedmoe] fix thebarchive webm URLs (#5116) --- gallery_dl/extractor/foolfuuka.py | 9 +++++++-- test/results/archivedmoe.py | 12 ++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index cedac0c3..715abcb7 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -24,6 +24,8 @@ class FoolfuukaExtractor(BaseExtractor): BaseExtractor.__init__(self, match) if self.category == "b4k": self.remote = self._remote_direct + elif self.category == "archivedmoe": + self.referer = False def items(self): yield Message.Directory, self.metadata() @@ -53,9 +55,12 @@ class FoolfuukaExtractor(BaseExtractor): def remote(self, media): """Resolve a remote media link""" - needle = '<meta http-equiv="Refresh" content="0; url=' page = self.request(media["remote_media_link"]).text - return text.extr(page, needle, '"') + url = text.extr(page, 'http-equiv="Refresh" content="0; url=', '"') + if url.endswith(".webm") and \ + url.startswith("https://thebarchive.com/"): + return url[:-1] + return url @staticmethod def _remote_direct(media): diff --git a/test/results/archivedmoe.py b/test/results/archivedmoe.py index 90b48877..42aae181 100644 --- a/test/results/archivedmoe.py +++ b/test/results/archivedmoe.py @@ -23,6 +23,18 @@ __tests__ = ( "#sha1_url": 
"ffec05a1a1b906b5ca85992513671c9155ee9e87", }, +{ + "#url" : "https://archived.moe/b/thread/912594917/", + "#comment" : "broken thebarchive .webm URLs (#5116)", + "#category": ("foolfuuka", "archivedmoe", "thread"), + "#class" : foolfuuka.FoolfuukaThreadExtractor, + "#urls" : ( + "https://thebarchive.com/b/full_image/1705625299234839.gif", + "https://thebarchive.com/b/full_image/1705625431133806.web", + "https://thebarchive.com/b/full_image/1705626190307840.web", + ), +}, + { "#url" : "https://archived.moe/gd/", "#category": ("foolfuuka", "archivedmoe", "board"), From 3433481dd2b01969a74da2e9e0c76f32c2058ca9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 27 Jan 2024 01:10:14 +0100 Subject: [PATCH 343/344] [gofile] update 'website_token' extraction --- gallery_dl/extractor/gofile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index 3928792e..289f91cb 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -73,7 +73,7 @@ class GofileFolderExtractor(Extractor): def _get_website_token(self): self.log.debug("Fetching website token") page = self.request(self.root + "/dist/js/alljs.js").text - return text.extr(page, 'fetchData.websiteToken = "', '"') + return text.extr(page, 'fetchData.wt = "', '"') def _get_content(self, content_id, password=None): if password is not None: @@ -81,7 +81,7 @@ class GofileFolderExtractor(Extractor): return self._api_request("getContent", { "contentId" : content_id, "token" : self.api_token, - "websiteToken": self.website_token, + "wt" : self.website_token, "password" : password, }) From 22647c2626eb8e4387407383ce9ee2508a507ec1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 27 Jan 2024 16:24:03 +0100 Subject: [PATCH 344/344] [naverwebtoon] fix 'title' for comics with empty tags (#5120) --- 
gallery_dl/extractor/naverwebtoon.py | 2 +- test/results/naverwebtoon.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py index 72ee5b06..18eef4fd 100644 --- a/gallery_dl/extractor/naverwebtoon.py +++ b/gallery_dl/extractor/naverwebtoon.py @@ -46,7 +46,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor): "episode" : self.episode, "comic" : extr('titleName: "', '"'), "tags" : [t.strip() for t in text.extract_iter( - extr("tagList: [", "}],"), '"tagName":"', '"')], + extr("tagList: [", "],"), '"tagName":"', '"')], "title" : extr('"subtitle":"', '"'), "author" : [a.strip() for a in text.extract_iter( extr('"writers":[', ']'), '"name":"', '"')], diff --git a/test/results/naverwebtoon.py b/test/results/naverwebtoon.py index 889e5bce..ba5ca97e 100644 --- a/test/results/naverwebtoon.py +++ b/test/results/naverwebtoon.py @@ -61,6 +61,24 @@ __tests__ = ( "title_id" : "765124", }, +{ + "#url" : "https://comic.naver.com/bestChallenge/detail?titleId=620732&no=334", + "#comment" : "empty tags (#5120)", + "#category": ("", "naverwebtoon", "episode"), + "#class" : naverwebtoon.NaverwebtoonEpisodeExtractor, + "#count" : 9, + + "artist" : [], + "author" : ["안트로anthrokim"], + "comic" : "백일몽화원", + "count" : 9, + "episode" : "334", + "num" : range(1, 9), + "tags" : [], + "title" : "321화... 성(省)", + "title_id": "620732", +}, + { "#url" : "https://comic.naver.com/bestChallenge/detail.nhn?titleId=771467&no=3", "#category": ("", "naverwebtoon", "episode"),
Games
JPG Fishhttps://jpg1.su/Albums, individual Images, User Profiles
Keenspot http://www.keenspot.com/
Chevereto Instances
JPG Fishhttps://jpg2.su/Albums, individual Images, User Profiles
Pixlhttps://pixl.li/Albums, individual Images, User Profiles
Danbooru Instances
Albums, individual Images, User Profiles
IMG.Kiwihttps://img.kiwi/Albums, individual Images, User Profiles
DeltaPornohttps://gallery.deltaporno.com/Albums, individual Images, User Profiles
Danbooru Instances
Bunkr https://bunkrr.su/AlbumsAlbums, Media Files
Reddit https://www.reddit.com/Home Feed, individual Images, Submissions, Subreddits, User ProfilesHome Feed, individual Images, Redirects, Submissions, Subreddits, User Profiles OAuth
Favorites, Followed Users, Images from Notes, User Profiles
Misskey.designhttps://misskey.design/Favorites, Followed Users, Images from Notes, User Profiles
Lesbian.energy https://lesbian.energy/Pools, Popular Images, Posts, Tag Searches
4archivehttps://4archive.org/Boards, Threads
4chan https://www.4chan.org/', '<'), "language" : extr('>Language:', ' '), "filesize" : text.parse_bytes(extr( - '>File Size:', '<').rstrip("Bb")), + '>File Size:', '<').rstrip("Bbi")), "filecount" : extr('>Length:', ' '), "favorites" : extr('id="favcount">', ' '), "rating" : extr(">Average: ", "<"), @@ -251,14 +256,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): return data def metadata_from_api(self): - url = self.root + "/api.php" data = { - "method": "gdata", - "gidlist": ((self.gallery_id, self.gallery_token),), + "method" : "gdata", + "gidlist" : ((self.gallery_id, self.gallery_token),), "namespace": 1, } - data = self.request(url, method="POST", json=data).json() + data = self.request(self.api_url, method="POST", json=data).json() if "error" in data: raise exception.StopExtraction(data["error"]) @@ -269,7 +273,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): pos = page.index('
= 0: + origurl, pos = text.rextract(i6, '"', '"', pos) url = text.unescape(origurl) data = self._parse_original_info(text.extract( - page["i7"], "ownload original", "<", pos)[0]) + i6, "ownload original", "<", pos)[0]) else: url = imgurl data = self._parse_image_info(url) diff --git a/test/results/exhentai.py b/test/results/exhentai.py index 293449c8..9165e764 100644 --- a/test/results/exhentai.py +++ b/test/results/exhentai.py @@ -40,7 +40,7 @@ __tests__ = ( "group:seventh lowlife", "other:sample", ], - "thumb" : "https://exhentai.org/t/ce/0a/ce0a5bcb583229a9b07c0f83bcb1630ab1350640-624622-736-1036-jpg_250.jpg", + "thumb" : "https://s.exhentai.org/t/ce/0a/ce0a5bcb583229a9b07c0f83bcb1630ab1350640-624622-736-1036-jpg_250.jpg", "title" : "C93 [Seventh_Lowlife] Komi-san ha Tokidoki Daitan desu (Komi-san wa Komyushou desu) [Sample]", "title_jpn" : "(C93) [Comiketjack (わ!)] 古見さんは、時々大胆です。 (古見さんは、コミュ症です。) [見本]", "token" : "d55c44d3d0", From 56cd9d408de9eec5d406f7763a3b162a8b529c08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 30 Oct 2023 22:14:52 +0100 Subject: [PATCH 089/344] [weibo] fix Sina Visitor request --- gallery_dl/extractor/weibo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 168d5a0f..ed05e1f2 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -191,7 +191,7 @@ class WeiboExtractor(Extractor): headers = {"Referer": response.url} data = { "cb": "gen_callback", - "fp": '{"os":"1","browser":"Gecko91,0,0,0","fonts":"undefined",' + "fp": '{"os":"1","browser":"Gecko109,0,0,0","fonts":"undefined",' '"screenInfo":"1920*1080*24","plugins":""}', } @@ -203,8 +203,8 @@ class WeiboExtractor(Extractor): params = { "a" : "incarnate", "t" : data["tid"], - "w" : "2", - "c" : "{:>03}".format(data["confidence"]), + "w" : "3" if data.get("new_tid") else "2", + "c" : "{:>03}".format(data.get("confidence") or 100), "gc" : "", "cb" 
: "cross_domain", "from" : "weibo", From 44d7964c09b06c7771cb4bf87ec8ac8f84452d92 Mon Sep 17 00:00:00 2001 From: thatfuckingbird <18678-thatfuckingbird@users.noreply.gitgud.io> Date: Wed, 1 Nov 2023 15:44:28 +0100 Subject: [PATCH 090/344] [twitter] recognize fixupx.com URLs --- gallery_dl/extractor/twitter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index cc8b8f62..4766ae59 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -15,7 +15,8 @@ import itertools import json import re -BASE_PATTERN = r"(?:https?://)?(?:www\.|mobile\.)?(?:(?:[fv]x)?twitter|x)\.com" +BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?" + r"(?:(?:[fv]x)?twitter|(?:fixup)?x)\.com") class TwitterExtractor(Extractor): From 72b18d701f7bf4b3190ad21a67780e532befb41a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 2 Nov 2023 15:23:28 +0100 Subject: [PATCH 091/344] represent util.NONE as 'null' in JSON output was '"None"' before --- gallery_dl/postprocessor/metadata.py | 3 ++- gallery_dl/util.py | 10 ++++++++-- test/test_util.py | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index 5004bed6..93dd9a19 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -206,7 +206,8 @@ class MetadataPP(PostProcessor): sort_keys=options.get("sort", False), separators=options.get("separators"), indent=options.get("indent", indent), - check_circular=False, default=str, + check_circular=False, + default=util.json_default, ) diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 62e7b4aa..6255d49e 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -223,8 +223,14 @@ def datetime_to_timestamp_string(dt): return "" +def json_default(obj): + if isinstance(obj, CustomNone): + return None + return str(obj) + + json_loads = 
json._default_decoder.decode -json_dumps = json.JSONEncoder(default=str).encode +json_dumps = json.JSONEncoder(default=json_default).encode def dump_json(obj, fp=sys.stdout, ensure_ascii=True, indent=4): @@ -233,7 +239,7 @@ def dump_json(obj, fp=sys.stdout, ensure_ascii=True, indent=4): obj, fp, ensure_ascii=ensure_ascii, indent=indent, - default=str, + default=json_default, sort_keys=True, ) fp.write("\n") diff --git a/test/test_util.py b/test/test_util.py index 0813a0bc..780f4751 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -750,6 +750,7 @@ def hash(value): self.assertIs(obj(), obj) self.assertIs(obj(1, "a"), obj) self.assertIs(obj(foo="bar"), obj) + self.assertEqual(util.json_dumps(obj), "null") i = 0 for _ in obj: From 91e20eb59b852f56197233e3d4e1a1650e98492f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 2 Nov 2023 15:25:01 +0100 Subject: [PATCH 092/344] [fantia] simplify 'tags' to a list of strings (#4752) --- gallery_dl/extractor/fantia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index 4a67695f..6218f198 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -108,7 +108,7 @@ class FantiaExtractor(Extractor): "fanclub_user_name": resp["fanclub"]["user"]["name"], "fanclub_name": resp["fanclub"]["name"], "fanclub_url": self.root+"/fanclubs/"+str(resp["fanclub"]["id"]), - "tags": resp["tags"], + "tags": [t["name"] for t in resp["tags"]], "_data": resp, } From fc8f86bf24d6216d796ac410821b2cedb680ce57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 2 Nov 2023 15:29:44 +0100 Subject: [PATCH 093/344] [hitomi] recognize 'imageset' gallery URLs (#4756) --- gallery_dl/extractor/hitomi.py | 2 +- test/results/hitomi.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index bc49ca31..88f57087 100644 --- 
a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -21,7 +21,7 @@ class HitomiGalleryExtractor(GalleryExtractor): category = "hitomi" root = "https://hitomi.la" pattern = (r"(?:https?://)?hitomi\.la" - r"/(?:manga|doujinshi|cg|gamecg|galleries|reader)" + r"/(?:manga|doujinshi|cg|gamecg|imageset|galleries|reader)" r"/(?:[^/?#]+-)?(\d+)") example = "https://hitomi.la/manga/TITLE-867789.html" diff --git a/test/results/hitomi.py b/test/results/hitomi.py index 10e94135..9039525c 100644 --- a/test/results/hitomi.py +++ b/test/results/hitomi.py @@ -92,6 +92,13 @@ __tests__ = ( "#class" : hitomi.HitomiGalleryExtractor, }, +{ + "#url" : "https://hitomi.la/imageset/867789.html", + "#comment" : "/imageset/ gallery (#4756)", + "#category": ("", "hitomi", "gallery"), + "#class" : hitomi.HitomiGalleryExtractor, +}, + { "#url" : "https://hitomi.la/reader/867789.html", "#category": ("", "hitomi", "gallery"), From cdf77e326f5a961c4560d249a4ca6312ca1bc105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 2 Nov 2023 15:32:48 +0100 Subject: [PATCH 094/344] [twitter] add test for fixupx.com --- test/results/twitter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/results/twitter.py b/test/results/twitter.py index 6f9efbba..4e228424 100644 --- a/test/results/twitter.py +++ b/test/results/twitter.py @@ -54,6 +54,12 @@ __tests__ = ( "#class" : twitter.TwitterUserExtractor, }, +{ + "#url" : "https://fixupx.com/supernaturepics", + "#category": ("", "twitter", "user"), + "#class" : twitter.TwitterUserExtractor, +}, + { "#url" : "https://x.com/supernaturepics", "#category": ("", "twitter", "user"), From 43d0c49d7ec3605a125f99719887d3a5fdb3c276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 2 Nov 2023 15:46:01 +0100 Subject: [PATCH 095/344] [exhentai] fix original image URLs (#4754) --- gallery_dl/extractor/exhentai.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 268385ef..549b8b2b 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -179,7 +179,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data.update(image) if self.limits: self._check_limits(data) - if "/fullimg.php" in url: + if "/fullimg" in url: data["_http_validate"] = _validate_response else: data["_http_validate"] = None @@ -275,11 +275,11 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.key_next = extr("'", "'") iurl = extr('= 0: origurl, pos = text.rextract(i6, '"', '"', pos) url = text.unescape(origurl) @@ -337,7 +337,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["image_token"] = imgkey self._check_509(imgurl, data) - yield url, text.nameext_from_url(imgurl, data) + yield url, text.nameext_from_url(url, data) request["imgkey"] = nextkey From caf31e751c1caee6e0cb18f8d0c2c043d96fc26c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 2 Nov 2023 15:53:23 +0100 Subject: [PATCH 096/344] [kemonoparty] limit 'title' length (#4741) --- gallery_dl/extractor/kemonoparty.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 631ba266..cba62110 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -24,7 +24,7 @@ class KemonopartyExtractor(Extractor): category = "kemonoparty" root = "https://kemono.party" directory_fmt = ("{category}", "{service}", "{user}") - filename_fmt = "{id}_{title}_{num:>02}_{filename[:180]}.{extension}" + filename_fmt = "{id}_{title[:180]}_{num:>02}_{filename[:180]}.{extension}" archive_fmt = "{service}_{user}_{id}_{num}" cookies_domain = ".kemono.party" From dd14adccf661d1c2c08054318ad522c980445e38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 3 Nov 2023 23:39:58 +0100 Subject: [PATCH 097/344] [pixiv] allow cookies for 
non-OAuth URLs (#4760) --- gallery_dl/extractor/pixiv.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 18a3ceb6..411d1912 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -517,6 +517,7 @@ class PixivPixivisionExtractor(PixivExtractor): directory_fmt = ("{category}", "pixivision", "{pixivision_id} {pixivision_title}") archive_fmt = "V{pixivision_id}_{id}{suffix}.{extension}" + cookies_domain = ".pixiv.net" pattern = r"(?:https?://)?(?:www\.)?pixivision\.net/(?:en/)?a/(\d+)" example = "https://www.pixivision.net/en/a/12345" @@ -549,6 +550,9 @@ class PixivSeriesExtractor(PixivExtractor): directory_fmt = ("{category}", "{user[id]} {user[account]}", "{series[id]} {series[title]}") filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}" + cookies_domain = ".pixiv.net" + browser = "firefox" + tls12 = False pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)" example = "https://www.pixiv.net/user/12345/series/12345" From 3984a49abfc1059933d5ef3dc84283608740ebb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 3 Nov 2023 23:44:47 +0100 Subject: [PATCH 098/344] [nijie] set 1-2s delay between requests to avoid 429 errors --- gallery_dl/extractor/nijie.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index b902404c..76c5404e 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -19,6 +19,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): directory_fmt = ("{category}", "{user_id}") filename_fmt = "{image_id}_p{num}.{extension}" archive_fmt = "{image_id}_{num}" + request_interval = (1.0, 2.0) def __init__(self, match): BaseExtractor.__init__(self, match) From 007c433677ecac4597d2b798bcc9bc7026108749 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 4 Nov 2023 00:17:41 +0100 Subject: [PATCH 099/344] [patreon] support 
'id:' in place of a user name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://patreon.com/id:12345 … and remove 'campaign-id' config option --- docs/configuration.rst | 14 -------------- gallery_dl/extractor/patreon.py | 8 ++++---- test/results/patreon.py | 16 ++++++++++++++-- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 23cc8f5b..25b0ad9c 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2440,20 +2440,6 @@ Description Note: This requires 1 additional HTTP request per post. -extractor.patreon.campaign-id ------------------------------ -Type - ``string`` -Default - ``"auto"`` -Description - Alternative way of specifying the ``campaign_id`` value of a creator - in case the automatic extraction method no longer functions. - - Another way of specifying this value is using a ``c`` or ``campaign_id`` - URL query parameter, e,g, ``https://www.patreon.com/NAME?c=12345``. 
- - extractor.patreon.files ----------------------- Type diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 351c5e3c..6aef9cbe 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -296,9 +296,8 @@ class PatreonCreatorExtractor(PatreonExtractor): return self._pagination(url) def _get_campaign_id(self, query): - campaign_id = self.config("campaign-id") - if campaign_id and campaign_id != "auto": - return str(campaign_id) + if self.creator.startswith("id:"): + return self.creator[3:] campaign_id = query.get("c") or query.get("campaign_id") if campaign_id: @@ -316,7 +315,8 @@ class PatreonCreatorExtractor(PatreonExtractor): data = self._extract_bootstrap(page) return data["campaign"]["data"]["id"] except (KeyError, ValueError) as exc: - self.log.debug(data) + if data: + self.log.debug(data) raise exception.StopExtraction( "Unable to extract campaign ID (%s: %s)", exc.__class__.__name__, exc) diff --git a/test/results/patreon.py b/test/results/patreon.py index adad12e3..d4557173 100644 --- a/test/results/patreon.py +++ b/test/results/patreon.py @@ -14,8 +14,8 @@ __tests__ = ( "#url" : "https://www.patreon.com/koveliana", "#category": ("", "patreon", "creator"), "#class" : patreon.PatreonCreatorExtractor, - "#range" : "1-25", - "#count" : ">= 25", + "#range" : "1-15", + "#count" : 15, "attachments" : list, "comment_count": int, @@ -58,6 +58,18 @@ __tests__ = ( "#class" : patreon.PatreonCreatorExtractor, }, +{ + "#url" : "https://www.patreon.com/user?c=369707", + "#category": ("", "patreon", "creator"), + "#class" : patreon.PatreonCreatorExtractor, +}, + +{ + "#url" : "https://www.patreon.com/id:369707", + "#category": ("", "patreon", "creator"), + "#class" : patreon.PatreonCreatorExtractor, +}, + { "#url" : "https://www.patreon.com/home", "#category": ("", "patreon", "user"), From f4e61fd1d5ed50c0db025b84cea838ff0dfc5bef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 4 Nov 2023 
00:31:29 +0100 Subject: [PATCH 100/344] reword 'reddit.client-id' instructions (#4749) --- docs/configuration.rst | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 25b0ad9c..0ea2eaed 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -5279,9 +5279,14 @@ How To * login and visit the `apps `__ section of your account's preferences * click the "are you a developer? create an app..." button - * fill out the form, choose "installed app", preferably set - "http://localhost:6414/" as "redirect uri" and finally click - "create app" + * fill out the form: + + * choose a name + * select "installed app" + * set ``http://localhost:6414/`` as "redirect uri" + * solve the "I'm not a rebot" reCATCHA if needed + * click "create app" + * copy the client id (third line, under your application's name and "installed app") and put it in your configuration file as ``"client-id"`` From 69b931b9bb79d161633f9028963c3b6e2bf8293d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 4 Nov 2023 17:06:46 +0100 Subject: [PATCH 101/344] [exhentai] provide fallback URLs (#1021, #4745) --- gallery_dl/extractor/exhentai.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 549b8b2b..5185fe31 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -275,15 +275,19 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.key_next = extr("'", "'") iurl = extr(' Date: Sat, 4 Nov 2023 17:30:27 +0100 Subject: [PATCH 102/344] [exhentai] try to avoid 'DH_KEY_TOO_SMALL' errors (#1021, #4593) --- gallery_dl/extractor/common.py | 2 ++ gallery_dl/extractor/exhentai.py | 1 + 2 files changed, 3 insertions(+) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 0d67df77..3bec4248 100644 --- a/gallery_dl/extractor/common.py +++ 
b/gallery_dl/extractor/common.py @@ -35,6 +35,7 @@ class Extractor(): root = "" cookies_domain = "" referer = True + ciphers = None tls12 = True browser = None request_interval = 0.0 @@ -305,6 +306,7 @@ class Extractor(): headers["User-Agent"] = useragent headers["Accept"] = "*/*" headers["Accept-Language"] = "en-US,en;q=0.5" + ssl_ciphers = self.ciphers if BROTLI: headers["Accept-Encoding"] = "gzip, deflate, br" diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 5185fe31..182910cd 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -27,6 +27,7 @@ class ExhentaiExtractor(Extractor): cookies_names = ("ipb_member_id", "ipb_pass_hash") root = "https://exhentai.org" request_interval = 5.0 + ciphers = "DEFAULT:!DH" LIMIT = False From 6402f2950f2eac1cd0f85fa14ab83aea8b26d548 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 4 Nov 2023 17:33:14 +0100 Subject: [PATCH 103/344] [pp:metadata] ignore non-string tag values (#4764) --- gallery_dl/postprocessor/metadata.py | 2 +- test/test_postprocessor.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index 93dd9a19..18d00e13 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -189,7 +189,7 @@ class MetadataPP(PostProcessor): tags = [] extend = tags.extend for tagdict in taglists: - extend([x for x in tagdict.values() if x is not None]) + extend([x for x in tagdict.values() if isinstance(x, str)]) tags.sort() fp.write("\n".join(tags) + "\n") diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index b64df882..fb1d739e 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -365,8 +365,8 @@ class MetadataTest(BasePostprocessorTest): self._create( {"mode": "tags"}, {"tags": [ - {"g": "foobar1", "m": "foobar2"}, - {"g": None, "m": "foobarbaz"} + {"g": 
"foobar1", "m": "foobar2", "u": True}, + {"g": None, "m": "foobarbaz", "u": [3, 4]}, ]}, ) with patch("builtins.open", mock_open()) as m: From 807ddde7e102f424ee3845f14450bf5f0c35fc79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 4 Nov 2023 21:58:52 +0100 Subject: [PATCH 104/344] release version 1.26.2 --- CHANGELOG.md | 39 +++++++++++++++++++++++++++++++++++++++ README.rst | 4 ++-- gallery_dl/version.py | 2 +- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 34607f2a..ad34930f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,44 @@ # Changelog +## 1.26.2 - 2023-11-04 +### Extractors +#### Additions +- [4archive] add `thread` and `board` extractors ([#1262](https://github.com/mikf/gallery-dl/issues/1262), [#2418](https://github.com/mikf/gallery-dl/issues/2418), [#4400](https://github.com/mikf/gallery-dl/issues/4400), [#4710](https://github.com/mikf/gallery-dl/issues/4710), [#4714](https://github.com/mikf/gallery-dl/issues/4714)) +- [hitomi] recognize `imageset` gallery URLs ([#4756](https://github.com/mikf/gallery-dl/issues/4756)) +- [kemonoparty] add `revision_index` metadata field ([#4727](https://github.com/mikf/gallery-dl/issues/4727)) +- [misskey] support `misskey.design` ([#4713](https://github.com/mikf/gallery-dl/issues/4713)) +- [reddit] support Reddit Mobile share links ([#4693](https://github.com/mikf/gallery-dl/issues/4693)) +- [sankaku] support `/posts/` tag search URLs ([#4740](https://github.com/mikf/gallery-dl/issues/4740)) +- [twitter] recognize `fixupx.com` URLs ([#4755](https://github.com/mikf/gallery-dl/issues/4755)) +#### Fixes +- [exhentai] update to site layout changes ([#4730](https://github.com/mikf/gallery-dl/issues/4730), [#4754](https://github.com/mikf/gallery-dl/issues/4754)) +- [exhentai] provide fallback URLs ([#1021](https://github.com/mikf/gallery-dl/issues/1021), [#4745](https://github.com/mikf/gallery-dl/issues/4745)) +- [exhentai] disable `DH` 
ciphers to avoid `DH_KEY_TOO_SMALL` errors ([#1021](https://github.com/mikf/gallery-dl/issues/1021), [#4593](https://github.com/mikf/gallery-dl/issues/4593)) +- [idolcomplex] disable sending Referer headers ([#4726](https://github.com/mikf/gallery-dl/issues/4726)) +- [instagram] update API headers +- [kemonoparty] fix parsing of non-standard `date` values ([#4676](https://github.com/mikf/gallery-dl/issues/4676)) +- [patreon] fix `campaign_id` extraction ([#4699](https://github.com/mikf/gallery-dl/issues/4699), [#4715](https://github.com/mikf/gallery-dl/issues/4715), [#4736](https://github.com/mikf/gallery-dl/issues/4736), [#4738](https://github.com/mikf/gallery-dl/issues/4738)) +- [pixiv] load cookies for non-OAuth URLs ([#4760](https://github.com/mikf/gallery-dl/issues/4760)) +- [twitter] fix avatars without `date` information ([#4696](https://github.com/mikf/gallery-dl/issues/4696)) +- [twitter] restore truncated retweet texts ([#3430](https://github.com/mikf/gallery-dl/issues/3430), [#4690](https://github.com/mikf/gallery-dl/issues/4690)) +- [weibo] fix Sina Visitor requests +#### Improvements +- [behance] unescape embed URLs ([#4742](https://github.com/mikf/gallery-dl/issues/4742)) +- [fantia] simplify `tags` to a list of strings ([#4752](https://github.com/mikf/gallery-dl/issues/4752)) +- [kemonoparty] limit `title` length ([#4741](https://github.com/mikf/gallery-dl/issues/4741)) +- [nijie] set 1-2s delay between requests to avoid 429 errors +- [patreon] provide ways to manually specify a user's campaign_id + - `https://www.patreon.com/id:12345` + - `https://www.patreon.com/USER?c=12345` + - `https://www.patreon.com/USER?campaign_id=12345` +- [twitter] cache `user_by_…` results ([#4719](https://github.com/mikf/gallery-dl/issues/4719)) +### Post Processors +#### Fixes +- [metadata] ignore non-string tag values ([#4764](https://github.com/mikf/gallery-dl/issues/4764)) +### Miscellaneous +#### Fixes +- prevent crash when `stdout.line_buffering` is not defined 
([#642](https://github.com/mikf/gallery-dl/issues/642)) + ## 1.26.1 - 2023-10-21 ### Extractors #### Additions diff --git a/README.rst b/README.rst index 207b68ec..9c1b3388 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 29f1d055..5050174e 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.26.2-dev" +__version__ = "1.26.2" From 5e58d2b45557b866874733ffaa4a74328612e701 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 10 Nov 2023 14:49:29 +0100 Subject: [PATCH 105/344] [instagram] fix exception on empty 'video_versions' (#4795) --- gallery_dl/extractor/instagram.py | 5 +++-- gallery_dl/version.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index b0789be1..8ec6741d 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -217,9 +217,10 @@ class InstagramExtractor(Extractor): data["post_shortcode"]) continue - if "video_versions" in item: + video_versions = item.get("video_versions") + if video_versions: video = max( - item["video_versions"], + video_versions, key=lambda x: (x["width"], x["height"], x["type"]), ) media = video diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 5050174e..dd816f06 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.26.2" +__version__ = "1.26.3-dev" From e8b5e59a08de296b53299bc7d5b9a69f52d63251 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 10 Nov 2023 19:35:29 +0100 Subject: [PATCH 106/344] [weibo] detect redirects to login page (#4773) --- gallery_dl/extractor/weibo.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index ed05e1f2..7413b5a0 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -41,9 +41,14 @@ class WeiboExtractor(Extractor): def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) - if response.history and "passport.weibo.com" in response.url: - self._sina_visitor_system(response) - response = Extractor.request(self, url, **kwargs) + if response.history: + if "login.sina.com" in response.url: + raise exception.StopExtraction( + "HTTP redirect to login page (%s)", + response.url.partition("?")[0]) + if "passport.weibo.com" in response.url: + self._sina_visitor_system(response) + response = Extractor.request(self, url, **kwargs) return response From 7a0f145cbec9152558cff1db9be354eb87a2291d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 10 Nov 2023 23:40:33 +0100 Subject: [PATCH 107/344] [twitter] ignore promoted Tweets (#4790, #3894) add 'ads' option in case someone actually wants to download promoted content for whatever reason --- docs/configuration.rst | 10 ++++++++++ gallery_dl/extractor/twitter.py | 19 ++++++++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 0ea2eaed..1f6ad0c1 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3109,6 +3109,16 @@ Description See `Filters `__ for details. +extractor.twitter.ads +--------------------- +Type + ``bool`` +Default + ``false`` +Description + Fetch media from promoted Tweets. 
+ + extractor.twitter.cards ----------------------- Type diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 4766ae59..ca1e9067 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -43,6 +43,7 @@ class TwitterExtractor(Extractor): self.quoted = self.config("quoted", False) self.videos = self.config("videos", True) self.cards = self.config("cards", False) + self.ads = self.config("ads", False) self.cards_blacklist = self.config("cards-blacklist") self.syndication = self.config("syndication") @@ -1034,7 +1035,7 @@ class TwitterAPI(): "focalTweetId": tweet_id, "referrer": "profile", "with_rux_injections": False, - "includePromotedContent": True, + "includePromotedContent": False, "withCommunity": True, "withQuickPromoteEligibilityTweetFields": True, "withBirdwatchNotes": True, @@ -1049,7 +1050,7 @@ class TwitterAPI(): variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, - "includePromotedContent": True, + "includePromotedContent": False, "withQuickPromoteEligibilityTweetFields": True, "withVoice": True, "withV2Timeline": True, @@ -1061,7 +1062,7 @@ class TwitterAPI(): variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, - "includePromotedContent": True, + "includePromotedContent": False, "withCommunity": True, "withVoice": True, "withV2Timeline": True, @@ -1498,13 +1499,21 @@ class TwitterAPI(): for entry in tweets: try: - tweet = ((entry.get("content") or entry["item"]) - ["itemContent"]["tweet_results"]["result"]) + item = ((entry.get("content") or entry["item"]) + ["itemContent"]) + if "promotedMetadata" in item and not extr.ads: + extr.log.debug( + "Skipping %s (ad)", + (entry.get("entryId") or "").rpartition("-")[2]) + continue + + tweet = item["tweet_results"]["result"] if "tombstone" in tweet: tweet = self._process_tombstone( entry, tweet["tombstone"]) if not tweet: continue + if "tweet" in tweet: tweet = tweet["tweet"] legacy = 
tweet["legacy"] From 4288cea94a0dce84e514098a1236b05012750a69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 11 Nov 2023 00:34:49 +0100 Subject: [PATCH 108/344] [mastodon] fix reblogs (#4580) --- gallery_dl/extractor/mastodon.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 3c2b03ee..c5fe8407 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -45,6 +45,9 @@ class MastodonExtractor(BaseExtractor): attachments = status["media_attachments"] del status["media_attachments"] + if status["reblog"]: + attachments.extend(status["reblog"]["media_attachments"]) + status["instance"] = self.instance acct = status["account"]["acct"] status["instance_remote"] = \ @@ -113,7 +116,10 @@ class MastodonUserExtractor(MastodonExtractor): return api.account_statuses( api.account_id_by_username(self.item), - only_media=not self.config("text-posts", False), + only_media=( + not self.reblogs and + not self.config("text-posts", False) + ), exclude_replies=not self.replies, ) From 3f591d5a4edd997c2c09bcf80a35f7a6ac083c38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 11 Nov 2023 21:24:07 +0100 Subject: [PATCH 109/344] [mastodon] update test results --- test/results/mastodonsocial.py | 52 +++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/test/results/mastodonsocial.py b/test/results/mastodonsocial.py index 2ad0dc20..d97d8976 100644 --- a/test/results/mastodonsocial.py +++ b/test/results/mastodonsocial.py @@ -18,14 +18,28 @@ __tests__ = ( }, { - "#url" : "https://mastodon.social/@yoru_nine@pawoo.net", + "#url" : "https://mastodon.social/@ponapalt@ukadon.shillest.net", "#category": ("mastodon", "mastodon.social", "user"), "#class" : mastodon.MastodonUserExtractor, - "#pattern" : r"https://mastodon\.social/media_proxy/\d+/original", + "#pattern" : 
r"https://files\.mastodon\.social/cache/media_attachments/files/.+/original/\w{16}\.\w+$", "#range" : "1-10", "#count" : 10, }, +{ + "#url" : "https://mastodon.social/@gallerydl", + "#comment" : "reblogged/'boosted' posts (#4580)", + "#category": ("mastodon", "mastodon.social", "user"), + "#class" : mastodon.MastodonUserExtractor, + "#options" : {"reblogs": True}, + "#archive" : False, + "#urls": ( + "https://files.mastodon.social/media_attachments/files/111/330/852/486/713/967/original/2c25ade55a9d1af2.jpg", + "https://files.mastodon.social/media_attachments/files/111/331/603/082/304/823/original/e12cde371c88c1b0.png", + "https://files.mastodon.social/media_attachments/files/111/331/603/082/304/823/original/e12cde371c88c1b0.png", + ), +}, + { "#url" : "https://mastodon.social/@id:10843", "#category": ("mastodon", "mastodon.social", "user"), @@ -63,11 +77,41 @@ __tests__ = ( }, { - "#url" : "https://mastodon.social/users/0x4f/following", + "#url" : "https://mastodon.social/@gallerydl/following", "#category": ("mastodon", "mastodon.social", "following"), "#class" : mastodon.MastodonFollowingExtractor, - "#count" : ">= 20", "#extractor": False, + "#urls" : ( + "https://mastodon.social/@0x4f", + "https://mastodon.social/@RustyBertrand", + "https://mastodon.social/@christianselig", + "https://saturation.social/@clive", + "https://mastodon.social/@sjvn", + ), + + "acct" : str, + "avatar" : r"re:https://files.mastodon.social/.+\.\w+$", + "avatar_static" : r"re:https://files.mastodon.social/.+\.\w+$", + "bot" : False, + "created_at" : r"re:\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z", + "discoverable" : True, + "display_name" : str, + "emojis" : [], + "fields" : list, + "followers_count": int, + "following_count": int, + "group" : False, + "header" : str, + "header_static" : str, + "id" : r"re:\d+", + "last_status_at": r"re:\d{4}-\d{2}-\d{2}", + "locked" : False, + "note" : str, + "statuses_count": int, + "uri" : str, + "url" : str, + "username" : str, + }, { From 
0435c6e603b3b44b9a98fe5f9969fe95d9abdf45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 12 Nov 2023 22:47:40 +0100 Subject: [PATCH 110/344] =?UTF-8?q?[exhentai]=20handle=20'Downloading=20?= =?UTF-8?q?=E2=80=A6=20requires=20GP'=20errors=20(#4576,=20#4763)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- gallery_dl/extractor/exhentai.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 182910cd..db074b0f 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -171,6 +171,14 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): # declared inside 'items()' to be able to access 'data' if not response.history and response.headers.get( "content-type", "").startswith("text/html"): + page = response.text + self.log.warning("'%s'", page) + + if " requires GP" in page: + self.log.info("Falling back to non-original downloads") + self.original = False + return data["_url_1280"] + self._report_limits(data) return True @@ -296,6 +304,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["num"] = self.image_num data["image_token"] = self.key_start = extr('var startkey="', '";') + data["_url_1280"] = iurl self.key_show = extr('var showkey="', '";') self._check_509(iurl, data) @@ -345,6 +354,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["num"] = request["page"] data["image_token"] = imgkey + data["_url_1280"] = imgurl self._check_509(imgurl, data) yield url, text.nameext_from_url(url, data) From 2e4bf54644d49e56d10cb592b7860892dc3e7c29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 13 Nov 2023 00:27:22 +0100 Subject: [PATCH 111/344] [hentaifoundry] check for and update expired sessions (#4694) --- gallery_dl/extractor/hentaifoundry.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git 
a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 8ba23c2a..c75c90d4 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -133,9 +133,25 @@ class HentaifoundryExtractor(Extractor): return text.nameext_from_url(data["src"], data) - def _init_site_filters(self): + def _request_check(self, url, **kwargs): + self.request = self._request_original + + # check for Enter button / front page + # and update PHPSESSID and content filters if necessary + response = self.request(url, **kwargs) + content = response.content + if len(content) < 5000 and \ + b'
Date: Mon, 13 Nov 2023 16:24:54 +0100 Subject: [PATCH 112/344] [oauth] warn when cache is enabled but not writeable (#4771) --- gallery_dl/extractor/oauth.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 45313c51..d1f135d8 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -11,7 +11,7 @@ from .common import Extractor, Message from .. import text, oauth, util, config, exception from ..output import stdout_write -from ..cache import cache +from ..cache import cache, memcache import urllib.parse import binascii import hashlib @@ -31,6 +31,9 @@ class OAuthBase(Extractor): def _init(self): self.cache = config.get(("extractor", self.category), "cache", True) + if self.cache and cache is memcache: + self.log.warning("cache file is not writeable") + self.cache = False def oauth_config(self, key, default=None): value = config.interpolate(("extractor", self.subcategory), key) From 4700051562060ac1e7ceccaae1d3dfa0832713f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 14 Nov 2023 20:38:11 +0100 Subject: [PATCH 113/344] rework and extend input file processing (#4732) - add 2 command-line options to modify input file contents - -I/--input-file-comment - -x/--input-file-delete - implement InputManager class - move code from util.py to __init__.py (mainly to avoid import cycles) --- docs/options.md | 12 +- gallery_dl/__init__.py | 276 ++++++++++++++++++++++++++++++++++------ gallery_dl/exception.py | 10 ++ gallery_dl/option.py | 44 +++++-- gallery_dl/util.py | 85 ------------- 5 files changed, 286 insertions(+), 141 deletions(-) diff --git a/docs/options.md b/docs/options.md index 2486cbfe..548b5868 100644 --- a/docs/options.md +++ b/docs/options.md @@ -6,8 +6,6 @@ ## General Options: -h, --help Print this help message and exit --version Print program version and exit - -i, --input-file FILE Download URLs found in FILE ('-' for 
stdin). - More than one --input-file can be specified -f, --filename FORMAT Filename format string for downloaded files ('/O' for "original" filenames) -d, --destination PATH Target location for file downloads @@ -19,6 +17,16 @@ --clear-cache MODULE Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything) +## Input Options: + -i, --input-file FILE Download URLs found in FILE ('-' for stdin). + More than one --input-file can be specified + -I, --input-file-comment FILE + Download URLs found in FILE. Comment them out + after they were downloaded successfully. + -x, --input-file-delete FILE + Download URLs found in FILE. Delete them after + they were downloaded successfully. + ## Output Options: -q, --quiet Activate quiet mode -v, --verbose Print various debugging information diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index d3a0f588..1d64fefc 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -18,19 +18,6 @@ __email__ = "mike_faehrmann@web.de" __version__ = version.__version__ -def progress(urls, pformat): - """Wrapper around urls to output a simple progress indicator""" - if pformat is True: - pformat = "[{current}/{total}] {url}\n" - else: - pformat += "\n" - - pinfo = {"total": len(urls)} - for pinfo["current"], pinfo["url"] in enumerate(urls, 1): - output.stderr_write(pformat.format_map(pinfo)) - yield pinfo["url"] - - def main(): try: parser = option.build_parser() @@ -224,7 +211,7 @@ def main(): return config.initialize() else: - if not args.urls and not args.inputfiles: + if not args.urls and not args.input_files: parser.error( "The following arguments are required: URL\n" "Use 'gallery-dl --help' to get a list of all options.") @@ -238,22 +225,6 @@ def main(): else: jobtype = args.jobtype or job.DownloadJob - urls = args.urls - if args.inputfiles: - for inputfile in args.inputfiles: - try: - if inputfile == "-": - if sys.stdin: - urls += util.parse_inputfile(sys.stdin, log) - else: - log.warning( - 
"input file: stdin is not readable") - else: - with open(inputfile, encoding="utf-8") as file: - urls += util.parse_inputfile(file, log) - except OSError as exc: - log.warning("input file: %s", exc) - # unsupported file logging handler handler = output.setup_logging_handler( "unsupportedfile", fmt="{message}") @@ -263,25 +234,44 @@ def main(): ulog.propagate = False job.Job.ulog = ulog + # collect input URLs + input_manager = InputManager() + input_manager.log = input_log = logging.getLogger("inputfile") + input_manager.add_list(args.urls) + + if args.input_files: + for input_file, action in args.input_files: + try: + path = util.expand_path(input_file) + input_manager.add_file(path, action) + except Exception as exc: + input_log.error(exc) + return getattr(exc, "code", 128) + pformat = config.get(("output",), "progress", True) - if pformat and len(urls) > 1 and args.loglevel < logging.ERROR: - urls = progress(urls, pformat) - else: - urls = iter(urls) + if pformat and len(input_manager.urls) > 1 and \ + args.loglevel < logging.ERROR: + input_manager.progress(pformat) + # process input URLs retval = 0 - url = next(urls, None) - - while url is not None: + for url in input_manager: try: log.debug("Starting %s for '%s'", jobtype.__name__, url) - if isinstance(url, util.ExtendedUrl): + + if isinstance(url, ExtendedUrl): for opts in url.gconfig: config.set(*opts) with config.apply(url.lconfig): - retval |= jobtype(url.value).run() + status = jobtype(url.value).run() + else: + status = jobtype(url).run() + + if status: + retval |= status else: - retval |= jobtype(url).run() + input_manager.success() + except exception.TerminateExtraction: pass except exception.RestartExtraction: @@ -291,8 +281,7 @@ def main(): log.error("Unsupported URL '%s'", url) retval |= 64 - url = next(urls, None) - + input_manager.next() return retval except KeyboardInterrupt: @@ -304,3 +293,206 @@ def main(): if exc.errno != errno.EPIPE: raise return 1 + + +class InputManager(): + + def 
__init__(self): + self.urls = [] + self.files = () + self._index = 0 + self._current = None + self._pformat = None + + def add_url(self, url): + self.urls.append(url) + + def add_list(self, urls): + self.urls += urls + + def add_file(self, path, action=None): + """Process an input file. + + Lines starting with '#' and empty lines will be ignored. + Lines starting with '-' will be interpreted as a key-value pair + separated by an '='. where + 'key' is a dot-separated option name and + 'value' is a JSON-parsable string. + These configuration options will be applied + while processing the next URL only. + Lines starting with '-G' are the same as above, except these options + will be applied for *all* following URLs, i.e. they are Global. + Everything else will be used as a potential URL. + + Example input file: + + # settings global options + -G base-directory = "/tmp/" + -G skip = false + + # setting local options for the next URL + -filename="spaces_are_optional.jpg" + -skip = true + + https://example.org/ + + # next URL uses default filename and 'skip' is false. 
+ https://example.com/index.htm # comment1 + https://example.com/404.htm # comment2 + """ + if path == "-" and not action: + try: + lines = sys.stdin.readlines() + except Exception: + raise exception.InputFileError("stdin is not readable") + path = None + else: + try: + with open(path, encoding="utf-8") as fp: + lines = fp.readlines() + except Exception as exc: + raise exception.InputFileError(str(exc)) + + if self.files: + self.files[path] = lines + else: + self.files = {path: lines} + + if action == "c": + action = self._action_comment + elif action == "d": + action = self._action_delete + else: + action = None + + gconf = [] + lconf = [] + indicies = [] + strip_comment = None + append = self.urls.append + + for n, line in enumerate(lines): + line = line.strip() + + if not line or line[0] == "#": + # empty line or comment + continue + + elif line[0] == "-": + # config spec + if len(line) >= 2 and line[1] == "G": + conf = gconf + line = line[2:] + else: + conf = lconf + line = line[1:] + if action: + indicies.append(n) + + key, sep, value = line.partition("=") + if not sep: + raise exception.InputFileError( + "Invalid KEY=VALUE pair '%s' on line %s in %s", + line, n+1, path) + + try: + value = util.json_loads(value.strip()) + except ValueError as exc: + self.log.debug("%s: %s", exc.__class__.__name__, exc) + raise exception.InputFileError( + "Unable to parse '%s' on line %s in %s", + value, n+1, path) + + key = key.strip().split(".") + conf.append((key[:-1], key[-1], value)) + + else: + # url + if " #" in line or "\t#" in line: + if strip_comment is None: + import re + strip_comment = re.compile(r"\s+#.*").sub + line = strip_comment("", line) + if gconf or lconf: + url = ExtendedUrl(line, gconf, lconf) + gconf = [] + lconf = [] + else: + url = line + + if action: + indicies.append(n) + append((url, path, action, indicies)) + indicies = [] + else: + append(url) + + def progress(self, pformat=True): + if pformat is True: + pformat = "[{current}/{total}] {url}\n" + 
else: + pformat += "\n" + self._pformat = pformat.format_map + + def next(self): + self._index += 1 + + def success(self): + if self._current: + url, path, action, indicies = self._current + lines = self.files[path] + action(lines, indicies) + try: + with open(path, "w", encoding="utf-8") as fp: + fp.writelines(lines) + except Exception as exc: + self.log.warning( + "Unable to update '%s' (%s: %s)", + path, exc.__class__.__name__, exc) + + @staticmethod + def _action_comment(lines, indicies): + for i in indicies: + lines[i] = "# " + lines[i] + + @staticmethod + def _action_delete(lines, indicies): + for i in indicies: + lines[i] = "" + + def __iter__(self): + self._index = 0 + return self + + def __next__(self): + try: + item = self.urls[self._index] + except IndexError: + raise StopIteration + + if isinstance(item, tuple): + self._current = item + item = item[0] + else: + self._current = None + + if self._pformat: + output.stderr_write(self._pformat({ + "total" : len(self.urls), + "current": self._index + 1, + "url" : item, + })) + return item + + +class ExtendedUrl(): + """URL with attached config key-value pairs""" + __slots__ = ("value", "gconfig", "lconfig") + + def __init__(self, url, gconf, lconf): + self.value = url + self.gconfig = gconf + self.lconfig = lconf + + def __str__(self): + return self.value diff --git a/gallery_dl/exception.py b/gallery_dl/exception.py index ef190f26..ee183fcc 100644 --- a/gallery_dl/exception.py +++ b/gallery_dl/exception.py @@ -21,6 +21,7 @@ Exception | +-- FilenameFormatError | +-- DirectoryFormatError +-- FilterError + +-- InputFileError +-- NoExtractorError +-- StopExtraction +-- TerminateExtraction @@ -99,6 +100,15 @@ class FilterError(GalleryDLException): code = 32 +class InputFileError(GalleryDLException): + """Error when parsing input file""" + code = 32 + + def __init__(self, message, *args): + GalleryDLException.__init__( + self, message % args if args else message) + + class NoExtractorError(GalleryDLException): 
"""No extractor can handle the given URL""" code = 64 diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 1982b71d..2c15eecd 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -59,6 +59,12 @@ class OptionAction(argparse.Action): namespace.options_pp[key] = value +class InputfileAction(argparse.Action): + """Process input files""" + def __call__(self, parser, namespace, value, option_string=None): + namespace.input_files.append((value, self.const)) + + class Formatter(argparse.HelpFormatter): """Custom HelpFormatter class to customize help output""" def __init__(self, prog): @@ -100,12 +106,6 @@ def build_parser(): action="version", version=version.__version__, help="Print program version and exit", ) - general.add_argument( - "-i", "--input-file", - dest="inputfiles", metavar="FILE", action="append", - help=("Download URLs found in FILE ('-' for stdin). " - "More than one --input-file can be specified"), - ) general.add_argument( "-f", "--filename", dest="filename", metavar="FORMAT", @@ -149,6 +149,32 @@ def build_parser(): "(ALL to delete everything)", ) + input = parser.add_argument_group("Input Options") + input.add_argument( + "urls", + metavar="URL", nargs="*", + help=argparse.SUPPRESS, + ) + input.add_argument( + "-i", "--input-file", + dest="input_files", metavar="FILE", action=InputfileAction, const=None, + default=[], + help=("Download URLs found in FILE ('-' for stdin). " + "More than one --input-file can be specified"), + ) + input.add_argument( + "-I", "--input-file-comment", + dest="input_files", metavar="FILE", action=InputfileAction, const="c", + help=("Download URLs found in FILE. " + "Comment them out after they were downloaded successfully."), + ) + input.add_argument( + "-x", "--input-file-delete", + dest="input_files", metavar="FILE", action=InputfileAction, const="d", + help=("Download URLs found in FILE. 
" + "Delete them after they were downloaded successfully."), + ) + output = parser.add_argument_group("Output Options") output.add_argument( "-q", "--quiet", @@ -534,10 +560,4 @@ def build_parser(): help="Additional '=' post processor options", ) - parser.add_argument( - "urls", - metavar="URL", nargs="*", - help=argparse.SUPPRESS, - ) - return parser diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 6255d49e..62aa12da 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -487,82 +487,6 @@ CODES = { } -def parse_inputfile(file, log): - """Filter and process strings from an input file. - - Lines starting with '#' and empty lines will be ignored. - Lines starting with '-' will be interpreted as a key-value pair separated - by an '='. where 'key' is a dot-separated option name and 'value' is a - JSON-parsable value. These configuration options will be applied while - processing the next URL. - Lines starting with '-G' are the same as above, except these options will - be applied for *all* following URLs, i.e. they are Global. - Everything else will be used as a potential URL. - - Example input file: - - # settings global options - -G base-directory = "/tmp/" - -G skip = false - - # setting local options for the next URL - -filename="spaces_are_optional.jpg" - -skip = true - - https://example.org/ - - # next URL uses default filename and 'skip' is false. 
- https://example.com/index.htm # comment1 - https://example.com/404.htm # comment2 - """ - gconf = [] - lconf = [] - strip_comment = None - - for line in file: - line = line.strip() - - if not line or line[0] == "#": - # empty line or comment - continue - - elif line[0] == "-": - # config spec - if len(line) >= 2 and line[1] == "G": - conf = gconf - line = line[2:] - else: - conf = lconf - line = line[1:] - - key, sep, value = line.partition("=") - if not sep: - log.warning("input file: invalid = pair: %s", line) - continue - - try: - value = json_loads(value.strip()) - except ValueError as exc: - log.warning("input file: unable to parse '%s': %s", value, exc) - continue - - key = key.strip().split(".") - conf.append((key[:-1], key[-1], value)) - - else: - # url - if " #" in line or "\t#" in line: - if strip_comment is None: - strip_comment = re.compile(r"\s+#.*").sub - line = strip_comment("", line) - if gconf or lconf: - yield ExtendedUrl(line, gconf, lconf) - gconf = [] - lconf = [] - else: - yield line - - class CustomNone(): """None-style type that supports more operations than regular None""" __slots__ = () @@ -873,15 +797,6 @@ class FilterPredicate(): raise exception.FilterError(exc) -class ExtendedUrl(): - """URL with attached config key-value pairs""" - def __init__(self, url, gconf, lconf): - self.value, self.gconfig, self.lconfig = url, gconf, lconf - - def __str__(self): - return self.value - - class DownloadArchive(): def __init__(self, path, format_string, pragma=None, From 51e377e612148f0d8b949625a70414ed68873da3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 14 Nov 2023 23:10:31 +0100 Subject: [PATCH 114/344] add '--cbz' command-line option --- docs/options.md | 1 + gallery_dl/option.py | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/docs/options.md b/docs/options.md index 548b5868..41daa721 100644 --- a/docs/options.md +++ b/docs/options.md @@ -119,6 +119,7 @@ ## Post-processing Options: --zip Store 
downloaded files in a ZIP archive + --cbz Store downloaded files in a CBZ archive --ugoira-conv Convert Pixiv Ugoira to WebM (requires FFmpeg) --ugoira-conv-lossless Convert Pixiv Ugoira to WebM in VP9 lossless mode diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 2c15eecd..dd1258f9 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -468,6 +468,15 @@ def build_parser(): action="append_const", const="zip", help="Store downloaded files in a ZIP archive", ) + postprocessor.add_argument( + "--cbz", + dest="postprocessors", + action="append_const", const={ + "name" : "zip", + "extension": "cbz", + }, + help="Store downloaded files in a CBZ archive", + ) postprocessor.add_argument( "--ugoira-conv", dest="postprocessors", action="append_const", const={ From c6ad9bcd9b77957250d4e5a55fe720bc094d02df Mon Sep 17 00:00:00 2001 From: jsouthgb Date: Wed, 15 Nov 2023 07:08:52 -0500 Subject: [PATCH 115/344] [erome] add "count" for albums --- gallery_dl/extractor/erome.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 2aed678f..6e77d2f2 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -44,11 +44,13 @@ class EromeExtractor(Extractor): pos = page.index('
', '') data = { "album_id" : album_id, "title" : text.unescape(title), "user" : text.unquote(user), "_http_headers": {"Referer": url}, + "count" : count, } yield Message.Directory, data From 387c8b095005007f43fea6d2226e93bfcc627e96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 15 Nov 2023 14:59:03 +0100 Subject: [PATCH 116/344] reword some (internal) option text --- gallery_dl/option.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/gallery_dl/option.py b/gallery_dl/option.py index dd1258f9..4bef4f3e 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -44,23 +44,23 @@ class DeprecatedConfigConstAction(argparse.Action): namespace.options.append(((), self.dest, self.const)) -class ParseAction(argparse.Action): - """Parse = options and set them as config values""" +class ConfigParseAction(argparse.Action): + """Parse KEY=VALUE config options""" def __call__(self, parser, namespace, values, option_string=None): key, value = _parse_option(values) key = key.split(".") # splitting an empty string becomes [""] namespace.options.append((key[:-1], key[-1], value)) -class OptionAction(argparse.Action): - """Parse = options for """ +class PPParseAction(argparse.Action): + """Parse KEY=VALUE post processor options""" def __call__(self, parser, namespace, values, option_string=None): key, value = _parse_option(values) namespace.options_pp[key] = value class InputfileAction(argparse.Action): - """Process input files""" + """Collect input files""" def __call__(self, parser, namespace, value, option_string=None): namespace.input_files.append((value, self.const)) @@ -334,7 +334,8 @@ def build_parser(): configuration = parser.add_argument_group("Configuration Options") configuration.add_argument( "-o", "--option", - dest="options", metavar="KEY=VALUE", action=ParseAction, default=[], + dest="options", metavar="KEY=VALUE", + action=ConfigParseAction, default=[], help=("Additional options. 
" "Example: -o browser=firefox") , ) @@ -565,8 +566,9 @@ def build_parser(): ) postprocessor.add_argument( "-O", "--postprocessor-option", - dest="options_pp", metavar="OPT", action=OptionAction, default={}, - help="Additional '=' post processor options", + dest="options_pp", metavar="KEY=VALUE", + action=PPParseAction, default={}, + help="Additional post processor options", ) return parser From 97357e65ee1b9e5f14c37289f605041f941ced12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 15 Nov 2023 15:01:02 +0100 Subject: [PATCH 117/344] replace '--mtime-from-date' with a more generic '--mtime' --mtime-from-date -> --mtime date for the same effect as before (--mtime-from-date also still works, but --help now lists only --mtime) --- docs/options.md | 10 +++++----- gallery_dl/option.py | 23 +++++++++++++++++++---- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/docs/options.md b/docs/options.md index 41daa721..270a9f0b 100644 --- a/docs/options.md +++ b/docs/options.md @@ -128,8 +128,9 @@ --write-metadata Write metadata to separate JSON files --write-info-json Write gallery metadata to a info.json file --write-tags Write image tags to separate text files - --mtime-from-date Set file modification times according to 'date' - metadata + --mtime FORMAT Set file modification times according to + metadata selected by FORMAT. Examples: 'date' or + 'status[date]' --exec CMD Execute CMD for each downloaded file. Supported replacement fields are {} or {_path}, {_directory}, {_filename}. Example: --exec @@ -138,6 +139,5 @@ successfully. 
Example: --exec-after "cd {_directory} && convert * ../doc.pdf" -P, --postprocessor NAME Activate the specified post processor - -O, --postprocessor-option OPT - Additional '=' post processor - options + -O, --postprocessor-option KEY=VALUE + Additional post processor options diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 4bef4f3e..aa42400d 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -65,6 +65,15 @@ class InputfileAction(argparse.Action): namespace.input_files.append((value, self.const)) +class MtimeAction(argparse.Action): + """Configure mtime post processor""" + def __call__(self, parser, namespace, value, option_string=None): + namespace.postprocessors.append({ + "name": "mtime", + "value": "{" + (self.const or value) + "}", + }) + + class Formatter(argparse.HelpFormatter): """Custom HelpFormatter class to customize help output""" def __init__(self, prog): @@ -466,7 +475,7 @@ def build_parser(): postprocessor.add_argument( "--zip", dest="postprocessors", - action="append_const", const="zip", + action="append_const", const="zip", default=[], help="Store downloaded files in a ZIP archive", ) postprocessor.add_argument( @@ -535,11 +544,17 @@ def build_parser(): action="append_const", const={"name": "metadata", "mode": "tags"}, help="Write image tags to separate text files", ) + postprocessor.add_argument( + "--mtime", + dest="postprocessors", metavar="FORMAT", action=MtimeAction, + help=("Set file modification times according to metadata " + "selected by FORMAT. 
Examples: 'date' or 'status[date]'"), + ) postprocessor.add_argument( "--mtime-from-date", - dest="postprocessors", - action="append_const", const="mtime", - help="Set file modification times according to 'date' metadata", + dest="postprocessors", nargs=0, action=MtimeAction, + const="date|status[date]", + help=argparse.SUPPRESS, ) postprocessor.add_argument( "--exec", From 168331d147715a5a306bd7adf043382ad6442e44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 16 Nov 2023 17:15:17 +0100 Subject: [PATCH 118/344] replace '--ugoira-conv' etc with a general '--ugoira' update --ugoira webm to use the same FFmpeg args as Danbooru --ugoira-conv -> --ugoira vp8 --ugoira-conv-lossless -> --ugoira vp9-lossless --ugoira-conv-copy -> --ugoira copy (--ugoira-conv and co still work as before, but --help now lists only --ugoira) --- docs/options.md | 8 ++-- gallery_dl/option.py | 98 +++++++++++++++++++++++++++++++++----------- 2 files changed, 76 insertions(+), 30 deletions(-) diff --git a/docs/options.md b/docs/options.md index 270a9f0b..f70e766d 100644 --- a/docs/options.md +++ b/docs/options.md @@ -120,11 +120,9 @@ ## Post-processing Options: --zip Store downloaded files in a ZIP archive --cbz Store downloaded files in a CBZ archive - --ugoira-conv Convert Pixiv Ugoira to WebM (requires FFmpeg) - --ugoira-conv-lossless Convert Pixiv Ugoira to WebM in VP9 lossless - mode - --ugoira-conv-copy Convert Pixiv Ugoira to MKV without re-encoding - any frames + --ugoira FORMAT Convert Pixiv Ugoira to FORMAT using FFmpeg. + Supported formats are 'webm', 'mp4', 'gif', + 'vp8', 'vp9', 'vp9-lossless', 'copy'. 
--write-metadata Write metadata to separate JSON files --write-info-json Write gallery metadata to a info.json file --write-tags Write image tags to separate text files diff --git a/gallery_dl/option.py b/gallery_dl/option.py index aa42400d..209452fe 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -66,7 +66,7 @@ class InputfileAction(argparse.Action): class MtimeAction(argparse.Action): - """Configure mtime post processor""" + """Configure mtime post processors""" def __call__(self, parser, namespace, value, option_string=None): namespace.postprocessors.append({ "name": "mtime", @@ -74,6 +74,64 @@ class MtimeAction(argparse.Action): }) +class UgoiraAction(argparse.Action): + """Configure ugoira post processors""" + def __call__(self, parser, namespace, value, option_string=None): + if self.const: + value = self.const + else: + value = value.strip().lower() + + if value in ("webm", "vp9"): + pp = { + "extension" : "webm", + "ffmpeg-args" : ("-c:v", "libvpx-vp9", + "-crf", "12", + "-b:v", "0", "-an"), + } + elif value == "vp9-lossless": + pp = { + "extension" : "webm", + "ffmpeg-args" : ("-c:v", "libvpx-vp9", + "-lossless", "1", + "-pix_fmt", "yuv420p", "-an"), + } + elif value == "vp8": + pp = { + "extension" : "webm", + "ffmpeg-args" : ("-c:v", "libvpx", + "-crf", "4", + "-b:v", "5000k", "-an"), + } + elif value == "mp4": + pp = { + "extension" : "mp4", + "ffmpeg-args" : ("-c:v", "libx264", "-an", "-b:v", "5M"), + "libx264-prevent-odd": True, + } + elif value == "gif": + pp = { + "extension" : "gif", + "ffmpeg-args" : ("-filter_complex", "[0:v] split [a][b];" + "[a] palettegen [p];[b][p] paletteuse"), + "repeat-last-frame": False, + } + elif value in ("mkv", "copy"): + pp = { + "extension" : "mkv", + "ffmpeg-args" : ("-c:v", "copy"), + "repeat-last-frame": False, + } + else: + parser.error("Unsupported Ugoira format '{}'".format(value)) + + pp["name"] = "ugoira" + pp["whitelist"] = ("pixiv", "danbooru") + + namespace.options.append(((), "ugoira", 
True)) + namespace.postprocessors.append(pp) + + class Formatter(argparse.HelpFormatter): """Custom HelpFormatter class to customize help output""" def __init__(self, prog): @@ -487,38 +545,28 @@ def build_parser(): }, help="Store downloaded files in a CBZ archive", ) + postprocessor.add_argument( + "--ugoira", + dest="postprocessors", metavar="FORMAT", action=UgoiraAction, + help=("Convert Pixiv Ugoira to FORMAT using FFmpeg. " + "Supported formats are 'webm', 'mp4', 'gif', " + "'vp8', 'vp9', 'vp9-lossless', 'copy'."), + ) postprocessor.add_argument( "--ugoira-conv", - dest="postprocessors", action="append_const", const={ - "name" : "ugoira", - "ffmpeg-args" : ("-c:v", "libvpx", "-crf", "4", "-b:v", "5000k"), - "ffmpeg-twopass": True, - "whitelist" : ("pixiv", "danbooru"), - }, - help="Convert Pixiv Ugoira to WebM (requires FFmpeg)", + dest="postprocessors", nargs=0, action=UgoiraAction, const="vp8", + help=argparse.SUPPRESS, ) postprocessor.add_argument( "--ugoira-conv-lossless", - dest="postprocessors", action="append_const", const={ - "name" : "ugoira", - "ffmpeg-args" : ("-c:v", "libvpx-vp9", "-lossless", "1", - "-pix_fmt", "yuv420p"), - "ffmpeg-twopass": False, - "whitelist" : ("pixiv", "danbooru"), - }, - help="Convert Pixiv Ugoira to WebM in VP9 lossless mode", + dest="postprocessors", nargs=0, action=UgoiraAction, + const="vp9-lossless", + help=argparse.SUPPRESS, ) postprocessor.add_argument( "--ugoira-conv-copy", - dest="postprocessors", action="append_const", const={ - "name" : "ugoira", - "extension" : "mkv", - "ffmpeg-args" : ("-c:v", "copy"), - "ffmpeg-twopass" : False, - "repeat-last-frame": False, - "whitelist" : ("pixiv", "danbooru"), - }, - help="Convert Pixiv Ugoira to MKV without re-encoding any frames", + dest="postprocessors", nargs=0, action=UgoiraAction, const="copy", + help=argparse.SUPPRESS, ) postprocessor.add_argument( "--write-metadata", From 8bf161e5742144a260763d7a0cf3a7847b0dbfbf Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 16 Nov 2023 17:37:15 +0100 Subject: [PATCH 119/344] reorder post processing options shown by --help --- docs/options.md | 24 +++++------ gallery_dl/option.py | 96 ++++++++++++++++++++++---------------------- 2 files changed, 60 insertions(+), 60 deletions(-) diff --git a/docs/options.md b/docs/options.md index f70e766d..6d22062b 100644 --- a/docs/options.md +++ b/docs/options.md @@ -118,24 +118,24 @@ and other delegated URLs ## Post-processing Options: + -P, --postprocessor NAME Activate the specified post processor + -O, --postprocessor-option KEY=VALUE + Additional post processor options + --write-metadata Write metadata to separate JSON files + --write-info-json Write gallery metadata to a info.json file + --write-tags Write image tags to separate text files --zip Store downloaded files in a ZIP archive --cbz Store downloaded files in a CBZ archive + --mtime NAME Set file modification times according to + metadata selected by NAME. Examples: 'date' or + 'status[date]' --ugoira FORMAT Convert Pixiv Ugoira to FORMAT using FFmpeg. Supported formats are 'webm', 'mp4', 'gif', 'vp8', 'vp9', 'vp9-lossless', 'copy'. - --write-metadata Write metadata to separate JSON files - --write-info-json Write gallery metadata to a info.json file - --write-tags Write image tags to separate text files - --mtime FORMAT Set file modification times according to - metadata selected by FORMAT. Examples: 'date' or - 'status[date]' --exec CMD Execute CMD for each downloaded file. Supported replacement fields are {} or {_path}, {_directory}, {_filename}. Example: --exec "convert {} {}.png && rm {}" - --exec-after CMD Execute CMD after all files were downloaded - successfully. Example: --exec-after "cd - {_directory} && convert * ../doc.pdf" - -P, --postprocessor NAME Activate the specified post processor - -O, --postprocessor-option KEY=VALUE - Additional post processor options + --exec-after CMD Execute CMD after all files were downloaded. 
+ Example: --exec-after "cd {_directory} && + convert * ../doc.pdf" diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 209452fe..255d9f29 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -531,42 +531,15 @@ def build_parser(): } postprocessor = parser.add_argument_group("Post-processing Options") postprocessor.add_argument( - "--zip", - dest="postprocessors", - action="append_const", const="zip", default=[], - help="Store downloaded files in a ZIP archive", - ) - postprocessor.add_argument( - "--cbz", - dest="postprocessors", - action="append_const", const={ - "name" : "zip", - "extension": "cbz", - }, - help="Store downloaded files in a CBZ archive", - ) - postprocessor.add_argument( - "--ugoira", - dest="postprocessors", metavar="FORMAT", action=UgoiraAction, - help=("Convert Pixiv Ugoira to FORMAT using FFmpeg. " - "Supported formats are 'webm', 'mp4', 'gif', " - "'vp8', 'vp9', 'vp9-lossless', 'copy'."), - ) - postprocessor.add_argument( - "--ugoira-conv", - dest="postprocessors", nargs=0, action=UgoiraAction, const="vp8", - help=argparse.SUPPRESS, - ) - postprocessor.add_argument( - "--ugoira-conv-lossless", - dest="postprocessors", nargs=0, action=UgoiraAction, - const="vp9-lossless", - help=argparse.SUPPRESS, + "-P", "--postprocessor", + dest="postprocessors", metavar="NAME", action="append", default=[], + help="Activate the specified post processor", ) postprocessor.add_argument( - "--ugoira-conv-copy", - dest="postprocessors", nargs=0, action=UgoiraAction, const="copy", - help=argparse.SUPPRESS, + "-O", "--postprocessor-option", + dest="options_pp", metavar="KEY=VALUE", + action=PPParseAction, default={}, + help="Additional post processor options", ) postprocessor.add_argument( "--write-metadata", @@ -592,11 +565,26 @@ def build_parser(): action="append_const", const={"name": "metadata", "mode": "tags"}, help="Write image tags to separate text files", ) + postprocessor.add_argument( + "--zip", + dest="postprocessors", + 
action="append_const", const="zip", + help="Store downloaded files in a ZIP archive", + ) + postprocessor.add_argument( + "--cbz", + dest="postprocessors", + action="append_const", const={ + "name" : "zip", + "extension": "cbz", + }, + help="Store downloaded files in a CBZ archive", + ) postprocessor.add_argument( "--mtime", - dest="postprocessors", metavar="FORMAT", action=MtimeAction, + dest="postprocessors", metavar="NAME", action=MtimeAction, help=("Set file modification times according to metadata " - "selected by FORMAT. Examples: 'date' or 'status[date]'"), + "selected by NAME. Examples: 'date' or 'status[date]'"), ) postprocessor.add_argument( "--mtime-from-date", @@ -604,6 +592,29 @@ def build_parser(): const="date|status[date]", help=argparse.SUPPRESS, ) + postprocessor.add_argument( + "--ugoira", + dest="postprocessors", metavar="FORMAT", action=UgoiraAction, + help=("Convert Pixiv Ugoira to FORMAT using FFmpeg. " + "Supported formats are 'webm', 'mp4', 'gif', " + "'vp8', 'vp9', 'vp9-lossless', 'copy'."), + ) + postprocessor.add_argument( + "--ugoira-conv", + dest="postprocessors", nargs=0, action=UgoiraAction, const="vp8", + help=argparse.SUPPRESS, + ) + postprocessor.add_argument( + "--ugoira-conv-lossless", + dest="postprocessors", nargs=0, action=UgoiraAction, + const="vp9-lossless", + help=argparse.SUPPRESS, + ) + postprocessor.add_argument( + "--ugoira-conv-copy", + dest="postprocessors", nargs=0, action=UgoiraAction, const="copy", + help=argparse.SUPPRESS, + ) postprocessor.add_argument( "--exec", dest="postprocessors", metavar="CMD", @@ -618,20 +629,9 @@ def build_parser(): dest="postprocessors", metavar="CMD", action=AppendCommandAction, const={ "name": "exec", "event": "finalize"}, - help=("Execute CMD after all files were downloaded successfully. " + help=("Execute CMD after all files were downloaded. 
" "Example: --exec-after \"cd {_directory} " "&& convert * ../doc.pdf\""), ) - postprocessor.add_argument( - "-P", "--postprocessor", - dest="postprocessors", metavar="NAME", action="append", - help="Activate the specified post processor", - ) - postprocessor.add_argument( - "-O", "--postprocessor-option", - dest="options_pp", metavar="KEY=VALUE", - action=PPParseAction, default={}, - help="Additional post processor options", - ) return parser From 6c040afe0fbf02d5443a42fc79824d3435482b6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 17 Nov 2023 15:35:52 +0100 Subject: [PATCH 120/344] [tests] install yt-dlp from PyPI for Python 3.7 https://github.com/yt-dlp/yt-dlp/commit/f4b95acafcd69a50040730dfdf732e797278fdcc --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 46539068..2c6dfd93 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -40,7 +40,7 @@ jobs: 3.4|3.5) # don't install yt-dlp ;; - 3.6) + 3.6|3.7) # install from PyPI pip install yt-dlp ;; From ea78f67860b559915d27feb5c93a53c271b9b642 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 17 Nov 2023 15:56:00 +0100 Subject: [PATCH 121/344] [downloader:http] skip files not passing filesize-min/-max (#4821) instead of failing the download --- gallery_dl/downloader/http.py | 6 ++++-- test/test_downloader.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 30ac0016..f493947e 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -200,13 +200,15 @@ class HttpDownloader(DownloaderBase): self.log.warning( "File size smaller than allowed minimum (%s < %s)", size, self.minsize) - return False + pathfmt.temppath = "" + return True if self.maxsize and size > self.maxsize: self.release_conn(response) self.log.warning( 
"File size larger than allowed maximum (%s > %s)", size, self.maxsize) - return False + pathfmt.temppath = "" + return True build_path = False diff --git a/test/test_downloader.py b/test/test_downloader.py index 840e0780..f10465e5 100644 --- a/test/test_downloader.py +++ b/test/test_downloader.py @@ -214,7 +214,8 @@ class TestHTTPDownloader(TestDownloaderBase): self.downloader.minsize = 100 with self.assertLogs(self.downloader.log, "WARNING"): success = self.downloader.download(url, pathfmt) - self.assertFalse(success) + self.assertTrue(success) + self.assertEqual(pathfmt.temppath, "") def test_http_filesize_max(self): url = self.address + "/jpg" @@ -222,7 +223,8 @@ class TestHTTPDownloader(TestDownloaderBase): self.downloader.maxsize = 100 with self.assertLogs(self.downloader.log, "WARNING"): success = self.downloader.download(url, pathfmt) - self.assertFalse(success) + self.assertTrue(success) + self.assertEqual(pathfmt.temppath, "") class TestTextDownloader(TestDownloaderBase): From 6a753d9ff3605657ecb6794d1f9812b38610882e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 17 Nov 2023 22:07:01 +0100 Subject: [PATCH 122/344] [behance] support 'text' modules (#4799) --- gallery_dl/extractor/behance.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index a92918e9..6909785d 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -97,7 +97,8 @@ class BehanceGalleryExtractor(BehanceExtractor): yield Message.Directory, data for data["num"], (url, module) in enumerate(imgs, 1): data["module"] = module - data["extension"] = text.ext_from_url(url) + data["extension"] = (module.get("extension") or + text.ext_from_url(url)) yield Message.Url, url, data def get_gallery_data(self): @@ -171,8 +172,13 @@ class BehanceGalleryExtractor(BehanceExtractor): embed = module.get("originalEmbed") or module.get("fluidEmbed") if embed: embed = 
text.unescape(text.extr(embed, 'src="', '"')) + module["extension"] = "mp4" append(("ytdl:" + embed, module)) + elif mtype == "TextModule": + module["extension"] = "txt" + append(("text:" + module["text"], module)) + return result From 07cb584231d881525ed860d7ac478accae50c53f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 17 Nov 2023 22:47:57 +0100 Subject: [PATCH 123/344] [behance] add 'modules' option (#4799) --- docs/configuration.rst | 13 +++++++++++++ gallery_dl/extractor/behance.py | 27 +++++++++++++++++++++------ test/results/behance.py | 9 +++++++++ 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 1f6ad0c1..c22bd8c2 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1110,6 +1110,19 @@ Description The maximum possible value appears to be ``1920``. +extractor.behance.modules +------------------------- +Type + ``list`` of ``strings`` +Default + ``["image", "video", "mediacollection", "embed"]`` +Description + Selects which gallery modules to download from. + + Supported module types are + ``image``, ``video``, ``mediacollection``, ``embed``, ``text``. 
+ + extractor.blogger.videos ------------------------ Type diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index 6909785d..ad0caf94 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -89,6 +89,17 @@ class BehanceGalleryExtractor(BehanceExtractor): BehanceExtractor.__init__(self, match) self.gallery_id = match.group(1) + def _init(self): + BehanceExtractor._init(self) + + modules = self.config("modules") + if modules: + if isinstance(modules, str): + modules = modules.split(",") + self.modules = set(modules) + else: + self.modules = {"image", "video", "mediacollection", "embed"} + def items(self): data = self.get_gallery_data() imgs = self.get_images(data) @@ -134,13 +145,17 @@ class BehanceGalleryExtractor(BehanceExtractor): append = result.append for module in data["modules"]: - mtype = module["__typename"] + mtype = module["__typename"][:-6].lower() + + if mtype not in self.modules: + self.log.debug("Skipping '%s' module", mtype) + continue - if mtype == "ImageModule": + if mtype == "image": url = module["imageSizes"]["size_original"]["url"] append((url, module)) - elif mtype == "VideoModule": + elif mtype == "video": try: renditions = module["videoData"]["renditions"] except Exception: @@ -159,7 +174,7 @@ class BehanceGalleryExtractor(BehanceExtractor): append((url, module)) - elif mtype == "MediaCollectionModule": + elif mtype == "mediacollection": for component in module["components"]: for size in component["imageSizes"].values(): if size: @@ -168,14 +183,14 @@ class BehanceGalleryExtractor(BehanceExtractor): append(("/".join(parts), module)) break - elif mtype == "EmbedModule": + elif mtype == "embed": embed = module.get("originalEmbed") or module.get("fluidEmbed") if embed: embed = text.unescape(text.extr(embed, 'src="', '"')) module["extension"] = "mp4" append(("ytdl:" + embed, module)) - elif mtype == "TextModule": + elif mtype == "text": module["extension"] = "txt" append(("text:" + 
module["text"], module)) diff --git a/test/results/behance.py b/test/results/behance.py index 2a23b3ed..abc1cb9a 100644 --- a/test/results/behance.py +++ b/test/results/behance.py @@ -63,6 +63,15 @@ __tests__ = ( "#count" : 3, }, +{ + "#url" : "https://www.behance.net/gallery/89270715/Moevir", + "#comment" : "'text' modules (#4799)", + "#category": ("", "behance", "gallery"), + "#class" : behance.BehanceGalleryExtractor, + "#options" : {"modules": "text"}, + "#urls" : """text:
Make Shift
https://www.moevir.com/News/make-shif
Moevir Magazine November Issue 2019
Photography by Caesar Lima @caephoto 
Model: Bee @phamhuongbee 
Makeup by Monica Alvarez @monicaalvarezmakeup 
Styling by Jessica Boal @jessicaboal 
Hair by James Gilbert @brandnewjames
Shot at Vila Sophia
""", +}, + { "#url" : "https://www.behance.net/gallery/177464639/Kimori", "#comment" : "mature content (#4417)", From b714df5a16d15723bd9d90343f0b7a93fd47a34f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 18 Nov 2023 01:16:49 +0100 Subject: [PATCH 124/344] disable 'downloader.progress' when using -q/--quiet (#4810) it didn't produce any output since output.mode is set to to "null", but it caused some unnecessary function calls --- gallery_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 1d64fefc..287faf18 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -115,6 +115,7 @@ def main(): output.configure_logging(args.loglevel) if args.loglevel >= logging.ERROR: config.set(("output",), "mode", "null") + config.set(("downloader",), "progress", None) elif args.loglevel <= logging.DEBUG: import platform import requests From 286d0cb098a24916ecccf5a24961bf4847073dca Mon Sep 17 00:00:00 2001 From: jsouthgb Date: Fri, 17 Nov 2023 19:34:34 -0500 Subject: [PATCH 125/344] [tmohentai] add support --- docs/supportedsites.md | 6 +++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/tmohentai.py | 78 +++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 gallery_dl/extractor/tmohentai.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a15566df..94cef0f7 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -829,6 +829,12 @@ Consider all sites to be NSFW unless otherwise known.
Galleries
Tmohentaihttps://tmohentai.com/Galleries
Toyhouse https://toyhou.se/Collections, Galleries, User Profiles
Bloggerhttps://www.blogger.com/Blogs, Labels, Posts, Search Results
Bunkr https://bunkrr.su/
Blogger Instances
Blogspothttps://www.blogger.com/Blogs, Labels, Posts, Search Results
MIC MIC IDOLhttps://www.micmicidol.club/Blogs, Labels, Posts, Search Results
Chevereto Instances
TmohentaiTMOHentai https://tmohentai.com/ Galleries All Pins, Created Pins, Pins, pin.it Links, related Pins, Search Results, Sections, User Profiles Cookies
pixeldrainhttps://pixeldrain.com/Albums, Files
Pixhost https://pixhost.to/
Inkbunny https://inkbunny.net/Favorites, Followed Users, Pools, Posts, Search Results, User ProfilesFavorites, Followed Users, Pools, Posts, Search Results, Unread Submissions, User Profiles Supported
Posts, Tag Searches
Rule34Hentaihttps://rule34hentai.net/Posts, Tag Searches
szurubooru Instances
Bbw-chanhttps://bbw-chan.nl/https://bbw-chan.link/ Boards, Threads
", ">").split(" // ") - post["width"], _, height = dimensions.partition("x") + dimensions, size, ext = extr("Info", "<").split(" // ") post["size"] = text.parse_bytes(size[:-1]) + post["width"], _, height = dimensions.partition("x") post["height"], _, duration = height.partition(", ") post["duration"] = text.parse_float(duration[:-1]) + post["filename"] = "{} - {}".format(post_id, post["tags"]) + post["extension"] = ext return post @@ -112,6 +114,7 @@ class PahealTagExtractor(PahealExtractor): tags, data, date = data.split("\n") dimensions, size, ext = data.split(" // ") + tags = text.unescape(tags) width, _, height = dimensions.partition("x") height, _, duration = height.partition(", ") @@ -119,9 +122,11 @@ class PahealTagExtractor(PahealExtractor): "id": pid, "md5": md5, "file_url": url, "width": width, "height": height, "duration": text.parse_float(duration[:-1]), - "tags": text.unescape(tags), + "tags": tags, "size": text.parse_bytes(size[:-1]), "date": text.parse_datetime(date, "%B %d, %Y; %H:%M"), + "filename" : "{} - {}".format(pid, tags), + "extension": ext, } def _extract_data_ex(self, post): diff --git a/test/results/paheal.py b/test/results/paheal.py index 833f3f84..1772593b 100644 --- a/test/results/paheal.py +++ b/test/results/paheal.py @@ -12,8 +12,21 @@ __tests__ = ( "#url" : "https://rule34.paheal.net/post/list/Ayane_Suzuki/1", "#category": ("shimmie2", "paheal", "tag"), "#class" : paheal.PahealTagExtractor, - "#pattern" : r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20", - "#count" : ">= 15", + "#pattern" : "https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20|https://r34i\.paheal-cdn\.net/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}$", + "#count" : range(70, 200), + + "date" : "type:datetime", + "extension": r"re:jpg|png", + "filename" : r"re:\d+ - \w+", + "duration" : float, + "height" : int, + "id" : int, + "md5" : r"re:[0-9a-f]{32}", + "search_tags": "Ayane_Suzuki", + "size" : int, + "tags" : str, + "width" : int, + }, { @@ -42,12 +55,12 @@ __tests__ 
= ( "#url" : "https://rule34.paheal.net/post/view/481609", "#category": ("shimmie2", "paheal", "post"), "#class" : paheal.PahealPostExtractor, - "#pattern" : r"https://tulip\.paheal\.net/_images/bbdc1c33410c2cdce7556c7990be26b7/481609%20-.+\.jpg", + "#urls" : "https://r34i.paheal-cdn.net/bb/dc/bbdc1c33410c2cdce7556c7990be26b7", "#sha1_content": "7b924bcf150b352ac75c9d281d061e174c851a11", "date" : "dt:2010-06-17 15:40:23", "extension": "jpg", - "file_url" : r"re:https://tulip.paheal.net/_images/bbdc1c33410c", + "file_url" : "https://r34i.paheal-cdn.net/bb/dc/bbdc1c33410c2cdce7556c7990be26b7", "filename" : "481609 - Ayumu_Kasuga Azumanga_Daioh inanimate Vuvuzela", "height" : 660, "id" : 481609, @@ -79,7 +92,7 @@ __tests__ = ( "#comment" : "video", "#category": ("shimmie2", "paheal", "post"), "#class" : paheal.PahealPostExtractor, - "#pattern" : r"https://[\w]+\.paheal\.net/_images/7629fc0ff77e32637dde5bf4f992b2cb/3864982%20-%20animated%20Metal_Gear%20Metal_Gear_Solid_V%20Quiet%20Vg_erotica%20webm\.webm", + "#urls" : "https://r34i.paheal-cdn.net/76/29/7629fc0ff77e32637dde5bf4f992b2cb", "date" : "dt:2020-09-06 01:59:03", "duration" : 30.0, From 74c225f94e967dbdc0cc919a541c82d28dffcb16 Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Tue, 26 Dec 2023 22:33:33 -0500 Subject: [PATCH 236/344] [bato] add support --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/bato.py | 113 +++++++++++++++++++++++++++++++ scripts/supportedsites.py | 1 + test/results/bato.py | 65 ++++++++++++++++++ 5 files changed, 186 insertions(+) create mode 100644 gallery_dl/extractor/bato.py create mode 100644 test/results/bato.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8e4c59a1..6040cd47 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -97,6 +97,12 @@ Consider all listed sites to potentially be NSFW. 
Albums, Artwork Listings, Challenges, Followed Users, individual Images, Likes, Search Results, User Profiles
Batohttps://bato.toChapters, Manga
BBC https://bbc.co.uk/Posts, User Profiles
Poringahttp://www.poringa.net/Posts Images, Search Results, User Profiles
Porn Image https://porn-images-xxx.com/
DeviantArt https://www.deviantart.com/Collections, Deviations, Favorites, Folders, Followed Users, Galleries, Gallery Searches, Journals, Popular Images, Scraps, Search Results, Sta.sh, Status Updates, Tag Searches, User Profiles, WatchesAvatars, Backgrounds, Collections, Deviations, Favorites, Folders, Followed Users, Galleries, Gallery Searches, Journals, Popular Images, Scraps, Search Results, Sta.sh, Status Updates, Tag Searches, User Profiles, Watches OAuth
JPG Fishhttps://jpg2.su/https://jpg4.su/ Albums, individual Images, User Profiles
Posts, Tag Searches
Snootbooruhttps://snootbooru.com/Posts, Tag Searches
URL ShortenersAlbums
Nudecollecthttps://nudecollect.com/Albums, individual Images
Patreon https://www.patreon.com/individual Images, Tag Searches Supported
Zzuphttps://zzup.com/Galleries
かべうち https://kabe-uchiroom.com/
Batohttps://bato.tohttps://bato.to/ Chapters, Manga