From 7dd79eee936b1b30312303251e5597febf298159 Mon Sep 17 00:00:00 2001 From: Luc Ritchie Date: Mon, 24 Apr 2023 00:01:51 -0400 Subject: [PATCH 001/154] save cookies to tempfile, then rename avoids wiping the cookies file if the disk is full --- gallery_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 9b010c59..18cf0e3a 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -431,8 +431,9 @@ class Extractor(): return try: - with open(path, "w") as fp: + with open(path + ".tmp", "w") as fp: util.cookiestxt_store(fp, self.cookies) + os.replace(path + ".tmp", path) except OSError as exc: self.log.warning("cookies: %s", exc) From f8b037ed40806fb16225727c6a4e25fabf1c6331 Mon Sep 17 00:00:00 2001 From: termvacycurtocs <161976827+termvacycurtocs@users.noreply.github.com> Date: Sat, 2 Mar 2024 22:15:45 +0100 Subject: [PATCH 002/154] [Imagefap] Add folder metadata [Imagefap] Add "folder" metadata when downloading a folder or user profile. No additional request is made to the server. Use for example with the following configuration : "parent-metadata": true "directory":["{category}", "{uploader}", "{folder}", "{gallery_id} {title}"] --- gallery_dl/extractor/imagefap.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 3bdcfdfd..e1be8c9d 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -161,17 +161,19 @@ class ImagefapFolderExtractor(ImagefapExtractor): self.user = user or profile def items(self): - for gallery_id, name in self.galleries(self.folder_id): + for gallery_id, name, folder in self.galleries(self.folder_id): url = "{}/gallery/{}".format(self.root, gallery_id) data = { "gallery_id": gallery_id, "title" : text.unescape(name), + "folder" : text.unescape(folder), "_extractor": ImagefapGalleryExtractor, } yield Message.Queue, url, data def galleries(self, folder_id): """Yield gallery IDs and titles of a folder""" + folder="Uncategorized" if folder_id == "-1": if self._id: url = "{}/usergallery.php?userid={}&folderid=-1".format( @@ -187,11 +189,14 @@ class ImagefapFolderExtractor(ImagefapExtractor): extr = text.extract_from(self.request(url, params=params).text) cnt = 0 + if folder_id != -1 and params["page"] == 0: + folder = extr('class\'blk_galleries\'>','') + while True: gid = extr('", "<") + yield gid, extr("", "<"), folder cnt += 1 if cnt < 20: From 0cbc910905b1cfa993b6263a8013e6cae09590e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 4 Mar 2024 22:31:35 +0100 Subject: [PATCH 003/154] [deviantart:avatar] fix 'index' for avatars without '?' 
(#5276) --- gallery_dl/extractor/deviantart.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 08961614..fb163085 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -585,7 +585,9 @@ class DeviantartAvatarExtractor(DeviantartExtractor): return () icon = user["usericon"] - index = icon.rpartition("?")[2] + _, sep, index = icon.rpartition("?") + if not sep: + index = "0" formats = self.config("formats") if not formats: From a767832332e21e55610c75f2aeb14ff8092424d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 4 Mar 2024 23:02:47 +0100 Subject: [PATCH 004/154] [deviantart:avatar] ignore default avatars (#5276) --- gallery_dl/extractor/deviantart.py | 10 +++++++++- test/results/deviantart.py | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index fb163085..9cbb21c2 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -18,12 +18,12 @@ import binascii import time import re - BASE_PATTERN = ( r"(?:https?://)?(?:" r"(?:www\.)?(?:fx)?deviantart\.com/(?!watch/)([\w-]+)|" r"(?!www\.)([\w-]+)\.(?:fx)?deviantart\.com)" ) +DEFAULT_AVATAR = "https://a.deviantart.net/avatars/default.gif" class DeviantartExtractor(Extractor): @@ -177,6 +177,10 @@ class DeviantartExtractor(Extractor): for comment in deviation["comments"]: user = comment["user"] name = user["username"].lower() + if user["usericon"] == DEFAULT_AVATAR: + self.log.debug( + "Skipping avatar of '%s' (default)", name) + continue _user_details.update(name, user) url = "{}/{}/avatar/".format(self.root, name) @@ -585,6 +589,10 @@ class DeviantartAvatarExtractor(DeviantartExtractor): return () icon = user["usericon"] + if icon == DEFAULT_AVATAR: + self.log.debug("Skipping avatar of '%s' (default)", name) + return () + _, sep, index = icon.rpartition("?") if not sep: index = "0" diff --git a/test/results/deviantart.py b/test/results/deviantart.py index 599ca9ff..2ffc485a 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -252,6 +252,14 @@ __tests__ = ( ), }, +{ + "#url" : "https://deviantart.com/h3813067/avatar", + "#comment" : "default avatar (#5276)", + "#category": ("", "deviantart", "avatar"), + "#class" : deviantart.DeviantartAvatarExtractor, + "#count" : 0, +}, + { "#url" : "https://deviantart.com/gdldev/banner", "#category": ("", "deviantart", "background"), From 461b55da4bb89b7d382424fec4db916e76982702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 5 Mar 2024 00:00:36 +0100 Subject: [PATCH 005/154] [docs] deploy pages in both repositories mikf/galleryx-dl and gdl-org/docs --- .github/workflows/pages.yml | 82 ++++++++++++++++++++++++++++ .github/workflows/pages_dispatch.yml | 23 -------- 2 files changed, 82 insertions(+), 23 deletions(-) create mode 100644 .github/workflows/pages.yml delete mode 100644 .github/workflows/pages_dispatch.yml diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml new file mode 100644 index 00000000..e4ac57b6 --- /dev/null +++ b/.github/workflows/pages.yml @@ -0,0 +1,82 @@ +name: GitHub Pages + +on: + workflow_dispatch: + push: + branches: + - master + paths: + - docs/** + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: pages + cancel-in-progress: false + +jobs: + dispatch: + + runs-on: ubuntu-latest + + steps: 
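+      # ask the GitHub REST API to run the pages.yml workflow in gdl-org/docs as well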
+ - name: Dispatch to gdl-org/docs + run: > + curl -L + -X POST + -H "Accept: application/vnd.github+json" + -H "Authorization: Bearer ${{ secrets.DISPATCH_TOKEN }}" + -H "X-GitHub-Api-Version: 2022-11-28" + https://api.github.com/repos/gdl-org/docs/actions/workflows/pages.yml/dispatches + -d '{"ref":"master"}' + + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/configure-pages@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install Docutils + run: pip install docutils pygments + + - name: Update Links + working-directory: ./docs/ + run: sed --in-place 's/\.\(rst\|md\)\b/.html/g' -- *.md *.rst + + - name: reStructuredText to HTML + working-directory: ./docs/ + run: | + while read -r RST + do + python -m docutils --writer=html --output="${RST%.rst}.html" -- "$RST" + done < <(find . -type f -name "*.rst") + + - uses: actions/jekyll-build-pages@v1 + with: + source: ./docs/ + destination: ./_site/ + + - uses: actions/upload-pages-artifact@v3 + + deploy: + + runs-on: ubuntu-latest + + needs: build + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + + steps: + - uses: actions/deploy-pages@v4 + id: deployment diff --git a/.github/workflows/pages_dispatch.yml b/.github/workflows/pages_dispatch.yml deleted file mode 100644 index 835c1400..00000000 --- a/.github/workflows/pages_dispatch.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Dispatch GitHub Pages Build - -on: - workflow_dispatch: - push: - branches: - - "master" - paths: - - "docs/**" - -jobs: - dispatch: - runs-on: ubuntu-latest - steps: - - name: dispatch - run: > - curl -L - -X POST - -H "Accept: application/vnd.github+json" - -H "Authorization: Bearer ${{ secrets.DISPATCH_TOKEN }}" - -H "X-GitHub-Api-Version: 2022-11-28" - https://api.github.com/repos/gdl-org/docs/actions/workflows/pages.yml/dispatches - -d '{"ref":"master"}' From 96af12cec2b90da6220e84a4cbcde9f58ff07e34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 5 Mar 2024 02:57:30 +0100 Subject: [PATCH 006/154] [docker] update Dockerfile - add LANG=C.UTF-8 (better utf-8 support or so I heard) - restructure RUN commands to be more easily extendable and have simpler git diffs - swap 'apk' and 'pip' since 'apk' results are more likely to be stable and therefore cacheable - disable Python bytecode caching with -B - reduces image size by 10Mb - bytecode for explicitly installed packages (gallery-dl, yt-dlp, dependencies) is still cached - add some (hopefully) helpful command-line arguments to pip --- Dockerfile | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 77e97cd9..26c0d850 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,19 @@ FROM python:alpine -RUN python3 -m pip install --no-cache-dir -U pip && \ - python3 -m pip install --no-cache-dir -U gallery-dl yt-dlp -RUN apk update && \ - apk add --no-cache ffmpeg && \ - rm -rf /var/cache/apk/* +ENV LANG=C.UTF-8 + +RUN : \ + && apk --no-interactive update \ + && apk --no-cache --no-interactive add ffmpeg \ + && rm -rf /var/cache/apk \ + && : + +RUN : \ + && python3 -B -m pip --no-cache-dir --no-input --disable-pip-version-check install -U \ + pip \ + && python3 -B -m pip --no-cache-dir --no-input --disable-pip-version-check install -U \ + gallery-dl \ + yt-dlp \ + && rm -rf /root/.cache/pip \ + && : + ENTRYPOINT [ "gallery-dl" ] From 36fc510d3acaf4b17e083bc9045f310b014fc790 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 5 Mar 2024 03:15:20 +0100 Subject: [PATCH 007/154] [docker] update workflow - run on every push to master - build images from GitHub source instead of PyPI package - build arm64 image (#5227) - include more tags - build date as 'YYYYMMDD' - 'dev' for most recent build from master - 'latest' for most recent release build --- .github/workflows/docker.yml | 60 ++++++++++++++++++++---------------- Dockerfile | 2 +- 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 1abff80c..46e67a4e 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -1,33 +1,47 @@ -name: docker +name: Docker Images on: workflow_dispatch: push: + branches: + - master tags: - v[0-9]+.[0-9]+.[0-9]+ + permissions: packages: write +concurrency: + group: docker + cancel-in-progress: false + jobs: - docker: + build: runs-on: ubuntu-latest + # on release commits, run only for tag event + if: ${{ ! startsWith( github.event.head_commit.message , 'release version ' ) || startsWith( github.ref , 'refs/tags/v' ) }} + steps: - uses: actions/checkout@v4 - # https://github.com/docker/setup-buildx-action - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - # https://github.com/docker/login-action - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 + - uses: docker/metadata-action@v5 + id: metadata with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GHCR_TOKEN }} + images: | + mikf123/gallery-dl + ghcr.io/mikf/gallery-dl + tags: | + type=ref,event=tag + type=raw,value=dev + type=sha,format=long,prefix= + type=raw,priority=500,value={{date 'YYYYMMDD'}} + + - uses: docker/setup-qemu-action@v3 + + - uses: docker/setup-buildx-action@v3 - name: Login to DockerHub uses: docker/login-action@v3 @@ -35,23 +49,17 @@ jobs: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - # https://github.com/docker/metadata-action - - name: Generate Docker tags - uses: docker/metadata-action@v5 - id: metadata + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 with: - images: | - mikf123/gallery-dl - ghcr.io/mikf/gallery-dl - tags: | - type=sha,format=long,prefix= - type=ref,event=tag + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GHCR_TOKEN }} - # https://github.com/docker/build-push-action - - name: Build image - uses: docker/build-push-action@v5 + - uses: docker/build-push-action@v5 with: + context: . 
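+          # build from the checked-out repository source instead of the PyPI package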
push: true tags: ${{ steps.metadata.outputs.tags }} labels: ${{ steps.metadata.outputs.labels }} - platforms: linux/amd64 + platforms: linux/amd64,linux/arm64 diff --git a/Dockerfile b/Dockerfile index 26c0d850..81bd8c12 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,7 @@ RUN : \ && python3 -B -m pip --no-cache-dir --no-input --disable-pip-version-check install -U \ pip \ && python3 -B -m pip --no-cache-dir --no-input --disable-pip-version-check install -U \ - gallery-dl \ + https://github.com/mikf/gallery-dl/archive/refs/heads/master.tar.gz \ yt-dlp \ && rm -rf /root/.cache/pip \ && : From 7b28418f69e8d66f53463b950679967a501268c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 5 Mar 2024 22:30:29 +0100 Subject: [PATCH 008/154] [naver] recognize '.naver' URLs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://blog.naver.com/PostView.naver?… --- gallery_dl/extractor/naver.py | 11 ++++++----- test/results/naver.py | 12 ++++++++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/naver.py b/gallery_dl/extractor/naver.py index 55faf9e7..a376c9ab 100644 --- a/gallery_dl/extractor/naver.py +++ b/gallery_dl/extractor/naver.py @@ -26,7 +26,8 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): "{post[date]:%Y-%m-%d} {post[title]}") archive_fmt = "{blog[id]}_{post[num]}_{num}" pattern = (r"(?:https?://)?blog\.naver\.com/" - r"(?:PostView\.nhn\?blogId=(\w+)&logNo=(\d+)|(\w+)/(\d+)/?$)") + r"(?:PostView\.n(?:aver|hn)\?blogId=(\w+)&logNo=(\d+)|" + r"(\w+)/(\d+)/?$)") example = "https://blog.naver.com/BLOGID/12345" def __init__(self, match): @@ -73,7 +74,8 @@ class NaverBlogExtractor(NaverBase, Extractor): subcategory = "blog" categorytransfer = True pattern = (r"(?:https?://)?blog\.naver\.com/" - r"(?:PostList.nhn\?(?:[^&#]+&)*blogId=([^&#]+)|(\w+)/?$)") + r"(?:PostList\.n(?:aver|hn)\?(?:[^&#]+&)*blogId=([^&#]+)|" + r"(\w+)/?$)") example = "https://blog.naver.com/BLOGID" def __init__(self, match): @@ -81,12 +83,11 @@ class NaverBlogExtractor(NaverBase, Extractor): self.blog_id = match.group(1) or match.group(2) def items(self): - # fetch first post number url = "{}/PostList.nhn?blogId={}".format(self.root, self.blog_id) - post_num = text.extract( + post_num = text.extr( self.request(url).text, 'gnFirstLogNo = "', '"', - )[0] + ) # setup params for API calls url = "{}/PostViewBottomTitleListAsync.nhn".format(self.root) diff --git a/test/results/naver.py b/test/results/naver.py index 81d18efd..560bf82a 100644 --- a/test/results/naver.py +++ b/test/results/naver.py @@ -24,6 +24,12 @@ __tests__ = ( "#sha1_metadata": "a6e23d19afbee86b37d6e7ad934650c379d2cb1e", }, +{ + "#url" : "https://blog.naver.com/PostView.naver?blogId=rlfqjxm0&logNo=221430673006", + "#category": ("", "naver", "post"), + "#class" : naver.NaverPostExtractor, +}, + { "#url" : "https://blog.naver.com/gukjung", "#category": ("", "naver", "blog"), @@ -42,4 +48,10 @@ __tests__ = ( "#count" : 12, }, +{ + "#url" : "https://blog.naver.com/PostList.naver?blogId=gukjung", + "#category": ("", "naver", "blog"), + "#class" : naver.NaverBlogExtractor, +}, + ) From f64fb8f239744c912ad4b15c3accf9e3bc6b7018 Mon Sep 17 00:00:00 2001 From: Johann Hong <57867081+986569200-johann-Hong@users.noreply.github.com> Date: Mon, 29 Jan 2024 00:23:09 +0900 Subject: [PATCH 009/154] [naver] EUC-KR encoding issue in old image URLs Fix Around October 2010, the image server URL format and file name encoding changed from EUC-KR to UTF-8. 
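A minimal sketch of the detection idea (the helper name is illustrative,
not the exact patch code): percent-decode as UTF-8 first, and fall back
to EUC-KR when replacement characters show up.

    from urllib.parse import unquote

    def decode_image_url(url):
        # decoding an old EUC-KR percent-encoded URL as UTF-8 yields
        # U+FFFD replacement characters; retry with EUC-KR in that case
        if "\ufffd" in unquote(url):
            return unquote(url, encoding="EUC-KR")
        return url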
Modified to detect old URL format and decode image URLs into EUC-KR - (lint with flake8) Customize conditions Wrap lines smaller than 79 characters - (lint with flake8) Customize conditions (2nd try) - One import per line - Indent on consecutive lines - (lint with flake8) Customize conditions (3rd try) - E128 continuation line under-indented for visual indent - E123 closing bracket does not match indentation of opening bracket's line - Update naver.py Check encoding for all image URLs --- gallery_dl/extractor/naver.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/naver.py b/gallery_dl/extractor/naver.py index 55faf9e7..25801c7e 100644 --- a/gallery_dl/extractor/naver.py +++ b/gallery_dl/extractor/naver.py @@ -10,6 +10,7 @@ from .common import GalleryExtractor, Extractor, Message from .. import text +from urllib.parse import unquote class NaverBase(): @@ -63,7 +64,13 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): def images(self, page): return [ - (url.replace("://post", "://blog", 1).partition("?")[0], None) + (unquote(url, encoding="EUC-KR") + .replace("://post", "://blog", 1) + .partition("?")[0], None) + if "\ufffd" in unquote(url) + else + (url.replace("://post", "://blog", 1) + .partition("?")[0], None) for url in text.extract_iter(page, 'data-lazy-src="', '"') ] From a8d3efbb99815d1b0a33d760c53fa46d8d7d4924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 5 Mar 2024 23:18:20 +0100 Subject: [PATCH 010/154] [naver] simplify code + add test --- gallery_dl/extractor/naver.py | 18 +++++++----------- test/results/naver.py | 27 +++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/gallery_dl/extractor/naver.py b/gallery_dl/extractor/naver.py index 25801c7e..5062cb26 100644 --- a/gallery_dl/extractor/naver.py +++ b/gallery_dl/extractor/naver.py @@ -10,7 +10,6 @@ from .common import GalleryExtractor, Extractor, Message from .. 
import text -from urllib.parse import unquote class NaverBase(): @@ -63,16 +62,13 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): return data def images(self, page): - return [ - (unquote(url, encoding="EUC-KR") - .replace("://post", "://blog", 1) - .partition("?")[0], None) - if "\ufffd" in unquote(url) - else - (url.replace("://post", "://blog", 1) - .partition("?")[0], None) - for url in text.extract_iter(page, 'data-lazy-src="', '"') - ] + results = [] + for url in text.extract_iter(page, 'data-lazy-src="', '"'): + url = url.replace("://post", "://blog", 1).partition("?")[0] + if "\ufffd" in text.unquote(url): + url = text.unquote(url, encoding="EUC-KR") + results.append((url, None)) + return results class NaverBlogExtractor(NaverBase, Extractor): diff --git a/test/results/naver.py b/test/results/naver.py index 81d18efd..9a8f92ec 100644 --- a/test/results/naver.py +++ b/test/results/naver.py @@ -24,6 +24,33 @@ __tests__ = ( "#sha1_metadata": "a6e23d19afbee86b37d6e7ad934650c379d2cb1e", }, +{ + "#url" : "https://blog.naver.com/PostView.nhn?blogId=rlfqjxm0&logNo=70161391809", + "#comment" : "filenames in EUC-KR encoding (#5126)", + "#category": ("", "naver", "post"), + "#class" : naver.NaverPostExtractor, + "#urls": ( + "https://blogfiles.pstatic.net/20130305_23/ping9303_1362411028002Dpz9z_PNG/1_사본.png", + "https://blogfiles.pstatic.net/20130305_46/rlfqjxm0_1362473322580x33zi_PNG/오마갓합작.png", + ), + + "blog": { + "id" : "rlfqjxm0", + "num" : 43030507, + "user": "에나", + }, + "post": { + "date" : "dt:2013-03-05 17:48:00", + "description": "&nbsp;◈ &nbsp; &nbsp; PROMOTER&nbsp;:핑수 ˚ 아담 EDITOR:핑수 &nbsp; 넵:이크:핑수...", + "num" : 70161391809, + "title" : "[공유] { 합작}  OH, MY GOD! ~ 아 또 무슨 종말을 한다 그래~" + }, + "count" : 2, + "num" : range(1, 2), + "filename" : r"re:1_사본|오마갓합작", + "extension": "png", +}, + { "#url" : "https://blog.naver.com/gukjung", "#category": ("", "naver", "blog"), From f296067797583ba90f4b5294f28966e04bbb6bff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 6 Mar 2024 00:46:19 +0100 Subject: [PATCH 011/154] [naver] unescape post 'title' and 'description' --- gallery_dl/extractor/naver.py | 6 ++++-- test/results/naver.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/naver.py b/gallery_dl/extractor/naver.py index 1c85d361..d3150e6d 100644 --- a/gallery_dl/extractor/naver.py +++ b/gallery_dl/extractor/naver.py @@ -47,8 +47,10 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): extr = text.extract_from(page) data = { "post": { - "title" : extr('"og:title" content="', '"'), - "description": extr('"og:description" content="', '"'), + "title" : text.unescape(extr( + '"og:title" content="', '"')), + "description": text.unescape(extr( + '"og:description" content="', '"')).replace(" ", " "), "num" : text.parse_int(self.post_id), }, "blog": { diff --git a/test/results/naver.py b/test/results/naver.py index 0dc0e368..a763a5b5 100644 --- a/test/results/naver.py +++ b/test/results/naver.py @@ -41,9 +41,9 @@ __tests__ = ( }, "post": { "date" : "dt:2013-03-05 17:48:00", - "description": "&nbsp;◈ &nbsp; &nbsp; PROMOTER&nbsp;:핑수 ˚ 아담 EDITOR:핑수 &nbsp; 넵:이크:핑수...", + "description": " ◈ PROMOTER :핑수 ˚ 아담 EDITOR:핑수 넵:이크:핑수...", "num" : 70161391809, - "title" : "[공유] { 합작}  OH, MY GOD! ~ 아 또 무슨 종말을 한다 그래~" + "title" : "[공유] { 합작}  OH, MY GOD! 
~ 아 또 무슨 종말을 한다 그래~", }, "count" : 2, "num" : range(1, 2), From 24873c27240a53859224b6ad0a56e3438c47aaf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 6 Mar 2024 01:27:45 +0100 Subject: [PATCH 012/154] [warosu] fix crash for threads with deleted posts (#5289) --- gallery_dl/extractor/warosu.py | 14 +++++++++----- test/results/warosu.py | 12 ++++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index 3bb635d6..11f0c18f 100644 --- a/gallery_dl/extractor/warosu.py +++ b/gallery_dl/extractor/warosu.py @@ -64,8 +64,7 @@ class WarosuThreadExtractor(Extractor): def parse(self, post): """Build post object by extracting data from an HTML post""" data = self._extract_post(post) - if " File:" in post: - self._extract_image(post, data) + if " File:" in post and self._extract_image(post, data): part = data["image"].rpartition("/")[2] data["tim"], _, data["extension"] = part.partition(".") data["ext"] = "." + data["extension"] @@ -91,6 +90,11 @@ class WarosuThreadExtractor(Extractor): "", "<").rstrip().rpartition(".")[0]) extr("
", "") - data["image"] = url = extr("
") - if url[0] == "/": - data["image"] = self.root + url + url = extr("") + if url: + if url[0] == "/": + data["image"] = self.root + url + else: + data["image"] = url + return True + return False diff --git a/test/results/warosu.py b/test/results/warosu.py index efc7f832..fd095183 100644 --- a/test/results/warosu.py +++ b/test/results/warosu.py @@ -54,6 +54,18 @@ __tests__ = ( "w" : 450, }, +{ + "#url" : "https://warosu.org/jp/thread/45886210", + "#comment" : "deleted post (#5289)", + "#category": ("", "warosu", "thread"), + "#class" : warosu.WarosuThreadExtractor, + "#count" : "> 150", + + "board" : "jp", + "board_name": "Otaku Culture", + "title" : "/07/th Expansion Thread", +}, + { "#url" : "https://warosu.org/ic/thread/4604652", "#category": ("", "warosu", "thread"), From 296f20e6309a70601878e90e4365cf686b869198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 6 Mar 2024 01:28:47 +0100 Subject: [PATCH 013/154] [warosu] fix 'board_name' metadata --- gallery_dl/extractor/warosu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index 11f0c18f..e91f45f8 100644 --- a/gallery_dl/extractor/warosu.py +++ b/gallery_dl/extractor/warosu.py @@ -50,7 +50,7 @@ class WarosuThreadExtractor(Extractor): title = text.unescape(text.extr(page, "class=filetitle>", "<")) return { "board" : self.board, - "board_name": boardname.rpartition(" - ")[2], + "board_name": boardname.split(" - ")[1], "thread" : self.thread, "title" : title, } From 9fd851cda157aae9185273dccb9c3a74c71334fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 6 Mar 2024 02:11:28 +0100 Subject: [PATCH 014/154] [docs] fix typo: rebot -> robot (#5262) https://github.com/mikf/gallery-dl/issues/5262#issuecomment-1979884897 --- docs/configuration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 37f12f13..b9070f3a 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -5784,7 +5784,7 @@ How To * choose a name * select "installed app" * set ``http://localhost:6414/`` as "redirect uri" - * solve the "I'm not a rebot" reCATCHA if needed + * solve the "I'm not a robot" reCATCHA if needed * click "create app" * copy the client id (third line, under your application's name and From db507e30c7431d4ed7e23c153a044ce1751c2847 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 6 Mar 2024 02:26:52 +0100 Subject: [PATCH 015/154] [pixiv] fix novel text extraction (#5285) change to '/webview/v2/novel' since '/v1/novel/text' does not work anymore --- gallery_dl/extractor/pixiv.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index b9821f23..862a7db2 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -650,7 +650,7 @@ class PixivNovelExtractor(PixivExtractor): yield Message.Directory, novel try: - content = self.api.novel_text(novel["id"])["novel_text"] + content = self.api.novel_webview(novel["id"])["text"] except Exception: self.log.warning("Unable to download novel %s", novel["id"]) continue @@ -663,7 +663,7 @@ class PixivNovelExtractor(PixivExtractor): illusts = {} for marker in text.extract_iter(content, "[", "]"): - if marker.startswith("[jumpuri:If you would like to "): + if marker.startswith("uploadedimage:"): desktop = True elif marker.startswith("pixivimage:"): 
illusts[marker[11:].partition("-")[0]] = None @@ -918,6 +918,15 @@ class PixivAppAPI(): params = {"novel_id": novel_id} return self._call("/v1/novel/text", params) + def novel_webview(self, novel_id): + params = {"id": novel_id, "viewer_version": "20221031_ai"} + return self._call( + "/webview/v2/novel", params, self._novel_webview_parse) + + def _novel_webview_parse(self, response): + return util.json_loads(text.extr( + response.text, "novel: ", ",\n")) + def search_illust(self, word, sort=None, target=None, duration=None, date_start=None, date_end=None): params = {"word": word, "search_target": target, @@ -962,13 +971,17 @@ class PixivAppAPI(): params = {"illust_id": illust_id} return self._call("/v1/ugoira/metadata", params)["ugoira_metadata"] - def _call(self, endpoint, params=None): + def _call(self, endpoint, params=None, parse=None): url = "https://app-api.pixiv.net" + endpoint while True: self.login() response = self.extractor.request(url, params=params, fatal=False) - data = response.json() + + if parse: + data = parse(response) + else: + data = response.json() if "error" not in data: return data From a8027745e318c54b5ba321f32d097c3f7e0a8314 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 6 Mar 2024 14:00:24 +0100 Subject: [PATCH 016/154] [downloader:http] add MIME type and signature for .mov files (#5287) --- gallery_dl/downloader/http.py | 2 ++ test/test_downloader.py | 1 + 2 files changed, 3 insertions(+) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index f1d2c4a8..0ff5dd9a 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -399,6 +399,7 @@ MIME_TYPES = { "video/webm": "webm", "video/ogg" : "ogg", "video/mp4" : "mp4", + "video/quicktime": "mov", "audio/wav" : "wav", "audio/x-wav": "wav", @@ -441,6 +442,7 @@ SIGNATURE_CHECKS = { "psd" : lambda s: s[0:4] == b"8BPS", "mp4" : lambda s: (s[4:8] == b"ftyp" and s[8:11] in ( b"mp4", b"avc", b"iso", b"M4V")), + "mov" : lambda s: s[4:12] == b"ftypqt ", "webm": lambda s: s[0:4] == b"\x1A\x45\xDF\xA3", "ogg" : lambda s: s[0:4] == b"OggS", "wav" : lambda s: (s[0:4] == b"RIFF" and diff --git a/test/test_downloader.py b/test/test_downloader.py index 126fa182..8027af50 100644 --- a/test/test_downloader.py +++ b/test/test_downloader.py @@ -304,6 +304,7 @@ SAMPLES = { ("mp4" , b"????ftypavc1"), ("mp4" , b"????ftypiso3"), ("mp4" , b"????ftypM4V"), + ("mov" , b"????ftypqt "), ("webm", b"\x1A\x45\xDF\xA3"), ("ogg" , b"OggS"), ("wav" , b"RIFF????WAVE"), From 0676a9d6ecbfcc8c9bd586e1e8912f7daf2364b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 6 Mar 2024 15:55:00 +0100 Subject: [PATCH 017/154] [weibo] fix 'livephoto' filename extensions (#5287) --- gallery_dl/extractor/weibo.py | 2 +- test/results/weibo.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 5b451489..ab90c0ba 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -118,7 +118,7 @@ class WeiboExtractor(Extractor): append(pic["largest"].copy()) file = {"url": pic["video"]} - file["filehame"], _, file["extension"] = \ + file["filename"], _, file["extension"] = \ pic["video"].rpartition("%2F")[2].rpartition(".") append(file) diff --git a/test/results/weibo.py b/test/results/weibo.py index 8534c10b..992d2c93 100644 --- a/test/results/weibo.py +++ b/test/results/weibo.py @@ -209,8 +209,10 @@ __tests__ = ( "#comment" : "type == livephoto (#2146)", "#category": 
("", "weibo", "status"), "#class" : weibo.WeiboStatusExtractor, - "#pattern" : r"https://video\.weibo\.com/media/play\?livephoto=https%3A%2F%2Fus.sinaimg.cn%2F\w+\.mov", + "#pattern" : r"https://video\.weibo\.com/media/play\?livephoto=https%3A%2F%2Fus\.sinaimg\.cn%2F\w+\.mov", "#range" : "2,4,6", + + "extension": "mov", }, { @@ -219,6 +221,8 @@ __tests__ = ( "#category": ("", "weibo", "status"), "#class" : weibo.WeiboStatusExtractor, "#urls" : "https://wx4.sinaimg.cn/large/68d80d22gy1h2ryfa8k0kg208w06o7wh.gif", + + "extension": "gif", }, { From ace16f00f5c4ef136e86d0012a21c7c195472f8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 6 Mar 2024 19:09:59 +0100 Subject: [PATCH 018/154] [weibo] fix retweets (#2825, #3874, #5263) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - handle 快转 retweets - disable 'retweets' by default - skip all retweet media when 'retweets' are disabled - extract all retweet media when 'retweets' is set to "original" --- docs/configuration.rst | 2 +- gallery_dl/extractor/weibo.py | 26 ++++++++++++++++++-------- test/results/weibo.py | 26 ++++++++++++++++++++++++-- 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index b9070f3a..df049c8b 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -4039,7 +4039,7 @@ extractor.weibo.retweets Type ``bool`` Default - ``true`` + ``false`` Description Fetch media from retweeted posts. diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index ab90c0ba..1049ba7b 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -30,9 +30,9 @@ class WeiboExtractor(Extractor): self._prefix, self.user = match.groups() def _init(self): - self.retweets = self.config("retweets", True) - self.videos = self.config("videos", True) self.livephoto = self.config("livephoto", True) + self.retweets = self.config("retweets", False) + self.videos = self.config("videos", True) self.gifs = self.config("gifs", True) self.gifs_video = (self.gifs == "video") @@ -59,15 +59,25 @@ class WeiboExtractor(Extractor): for status in self.statuses(): - files = [] - if self.retweets and "retweeted_status" in status: + if "ori_mid" in status and not self.retweets: + self.log.debug("Skipping %s (快转 retweet)", status["id"]) + continue + + if "retweeted_status" in status: + if not self.retweets: + self.log.debug("Skipping %s (retweet)", status["id"]) + continue + + # videos of the original post are in status + # images of the original post are in status["retweeted_status"] + files = [] + self._extract_status(status, files) + self._extract_status(status["retweeted_status"], files) + if original_retweets: status = status["retweeted_status"] - self._extract_status(status, files) - else: - self._extract_status(status, files) - self._extract_status(status["retweeted_status"], files) else: + files = [] self._extract_status(status, files) status["date"] = text.parse_datetime( diff --git a/test/results/weibo.py b/test/results/weibo.py index 992d2c93..8f46da3f 100644 --- a/test/results/weibo.py +++ b/test/results/weibo.py @@ -80,11 +80,11 @@ __tests__ = ( "#category": ("", "weibo", "home"), "#class" : weibo.WeiboHomeExtractor, "#range" : "1-30", - "#count" : 30, + "#count" : 0, }, { - "#url" : "https://weibo.com/1758989602?tabtype=feed", + "#url" : "https://weibo.com/2553930725?tabtype=feed", "#category": ("", "weibo", "feed"), "#class" : weibo.WeiboFeedExtractor, "#range" : "1-30", @@ -194,6 
+194,28 @@ __tests__ = ( "#class" : weibo.WeiboStatusExtractor, }, +{ + "#url" : "https://weibo.cn/detail/4600272267522211", + "#comment" : "retweet", + "#category": ("", "weibo", "status"), + "#class" : weibo.WeiboStatusExtractor, + "#count" : 0, +}, + +{ + "#url" : "https://weibo.cn/detail/4600272267522211", + "#comment" : "retweet", + "#category": ("", "weibo", "status"), + "#class" : weibo.WeiboStatusExtractor, + "#options" : {"retweets": True}, + "#count" : 2, + + "status": { + "id" : 4600272267522211, + "retweeted_status": {"id": 4600167083287033}, + }, +}, + { "#url" : "https://m.weibo.cn/detail/4600272267522211", "#comment" : "original retweets (#1542)", From 6d9e3c0eb1416f2dbae3719afd54e20f58af30c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 6 Mar 2024 22:43:01 +0100 Subject: [PATCH 019/154] [skeb] add extractor for followed users (#5290) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit needs 'Authorization' header from browser session -o headers.Authorization="Bearer ey…" --- docs/supportedsites.md | 2 +- gallery_dl/extractor/skeb.py | 59 ++++++++++++++++++++++++------------ scripts/supportedsites.py | 4 +++ test/results/skeb.py | 8 +++++ 4 files changed, 53 insertions(+), 20 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 312cdc23..1bbfa12e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -790,7 +790,7 @@ Consider all listed sites to potentially be NSFW. Skeb https://skeb.jp/ - Followed Users, Posts, Search Results, User Profiles + Followed Creators, Followed Users, Posts, Search Results, User Profiles diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index b9b4b3c4..38a2d166 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -26,10 +26,10 @@ class SkebExtractor(Extractor): def _init(self): self.thumbnails = self.config("thumbnails", False) self.article = self.config("article", False) - self.headers = { - "Accept" : "application/json, text/plain, */*", - "Authorization": "Bearer null", - } + self.headers = {"Accept": "application/json, text/plain, */*"} + + if "Authorization" not in self.session.headers: + self.headers["Authorization"] = "Bearer null" def request(self, url, **kwargs): while True: @@ -55,6 +55,12 @@ class SkebExtractor(Extractor): url = file["file_url"] yield Message.Url, url, text.nameext_from_url(url, post) + def _items_users(self): + base = self.root + "/@" + for user in self.users(): + user["_extractor"] = SkebUserExtractor + yield Message.Queue, base + user["screen_name"], user + def posts(self): """Return post number""" @@ -83,6 +89,20 @@ class SkebExtractor(Extractor): return params["offset"] += 30 + def _pagination_users(self, endpoint, params): + url = "{}/api{}".format(self.root, endpoint) + params["offset"] = 0 + params["limit"] = 90 + + while True: + data = self.request( + url, params=params, headers=self.headers).json() + yield from data + + if len(data) < params["limit"]: + return + params["offset"] += params["limit"] + def _get_post_data(self, user_name, post_num): url = "{}/api/users/{}/works/{}".format( self.root, user_name, post_num) @@ -256,22 +276,23 @@ class SkebFollowingExtractor(SkebExtractor): pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/following_creators" example = "https://skeb.jp/@USER/following_creators" - def items(self): - for user in self.users(): - url = "{}/@{}".format(self.root, user["screen_name"]) - user["_extractor"] = SkebUserExtractor - yield 
Message.Queue, url, user + items = SkebExtractor._items_users def users(self): - url = "{}/api/users/{}/following_creators".format( - self.root, self.user_name) - params = {"sort": "date", "offset": 0, "limit": 90} + endpoint = "/users/{}/following_creators".format(self.user_name) + params = {"sort": "date"} + return self._pagination_users(endpoint, params) - while True: - data = self.request( - url, params=params, headers=self.headers).json() - yield from data - if len(data) < params["limit"]: - return - params["offset"] += params["limit"] +class SkebFollowingUsersExtractor(SkebExtractor): + """Extractor for your followed users""" + subcategory = "following-users" + pattern = r"(?:https?://)?skeb\.jp/following_users()" + example = "https://skeb.jp/following_users" + + items = SkebExtractor._items_users + + def users(self): + endpoint = "/following_users" + params = {} + return self._pagination_users(endpoint, params) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index ecdc121d..47d7ef92 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -273,6 +273,10 @@ SUBCATEGORY_MAP = { "sexcom": { "pins": "User Pins", }, + "skeb": { + "following" : "Followed Creators", + "following-users": "Followed Users", + }, "smugmug": { "path": "Images from Users and Folders", }, diff --git a/test/results/skeb.py b/test/results/skeb.py index a05ed63e..4aa8691d 100644 --- a/test/results/skeb.py +++ b/test/results/skeb.py @@ -82,4 +82,12 @@ __tests__ = ( "#class" : skeb.SkebFollowingExtractor, }, +{ + "#url" : "https://skeb.jp/following_users", + "#category": ("", "skeb", "following-users"), + "#class" : skeb.SkebFollowingUsersExtractor, + "#pattern" : skeb.SkebUserExtractor.pattern, + "#auth" : True, +}, + ) From 790c0ffb8db169b0a7e29bda8455dd09b2624d35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 6 Mar 2024 22:56:57 +0100 Subject: [PATCH 020/154] [lensdump] recognize direct image links (#5293 --- gallery_dl/extractor/lensdump.py | 2 +- test/results/lensdump.py | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index d4ccf33b..8ca9d88e 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -104,7 +104,7 @@ class LensdumpImageExtractor(LensdumpBase, Extractor): filename_fmt = "{category}_{id}{title:?_//}.{extension}" directory_fmt = ("{category}",) archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/i/(\w+)" + pattern = r"(?:https?://)?(?:lensdump\.com|\w\.l3n\.co)/i/(\w+)" example = "https://lensdump.com/i/ID" def __init__(self, match): diff --git a/test/results/lensdump.py b/test/results/lensdump.py index 73f1fbdf..a300227b 100644 --- a/test/results/lensdump.py +++ b/test/results/lensdump.py @@ -32,7 +32,7 @@ __tests__ = ( "#url" : "https://lensdump.com/i/tyoAyM", "#category": ("", "lensdump", "image"), "#class" : lensdump.LensdumpImageExtractor, - "#pattern" : r"https://c\.l3n\.co/i/tyoAyM\.webp", + "#urls" : "https://c.l3n.co/i/tyoAyM.webp", "#sha1_content": "1aa749ed2c0cf679ec8e1df60068edaf3875de46", "date" : "dt:2022-08-01 08:24:28", @@ -45,4 +45,20 @@ __tests__ = ( "width" : 620, }, +{ + "#url" : "https://c.l3n.co/i/tyoAyM.webp", + "#category": ("", "lensdump", "image"), + "#class" : lensdump.LensdumpImageExtractor, + "#urls" : "https://c.l3n.co/i/tyoAyM.webp", + + "date" : "dt:2022-08-01 08:24:28", + "extension": "webp", + "filename" : "tyoAyM", + "height" : 400, + "id" : "tyoAyM", + "title" : "MYOBI 
clovis bookcaseset", + "url" : "https://c.l3n.co/i/tyoAyM.webp", + "width" : 620, +}, + ) From 40c0553523bb28790de0e6a07a978a42e2be88c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 7 Mar 2024 00:52:50 +0100 Subject: [PATCH 021/154] [twitter] add 'quotes' extractor (#5262) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://github.com/mikf/gallery-dl/issues/5262#issuecomment-1981571924 It's implemented as a search for 'quoted_tweet_id:…' on Twitter. --- docs/supportedsites.md | 2 +- gallery_dl/extractor/twitter.py | 16 ++++++++++++++-- test/results/twitter.py | 10 +++++++++- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1bbfa12e..b004d7dc 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -898,7 +898,7 @@ Consider all listed sites to potentially be NSFW. Twitter https://twitter.com/ - Avatars, Backgrounds, Bookmarks, Communities, Events, Followed Users, Hashtags, individual Images, Likes, Lists, List Members, Media Timelines, Search Results, Timelines, Tweets, User Profiles + Avatars, Backgrounds, Bookmarks, Communities, Events, Followed Users, Hashtags, individual Images, Likes, Lists, List Members, Media Timelines, Quotes, Search Results, Timelines, Tweets, User Profiles Supported diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index ad5bfc62..e6bf9b0f 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -731,9 +731,9 @@ class TwitterEventExtractor(TwitterExtractor): class TwitterTweetExtractor(TwitterExtractor): - """Extractor for images from individual tweets""" + """Extractor for individual tweets""" subcategory = "tweet" - pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)" + pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)/?$" example = "https://twitter.com/USER/status/12345" def __init__(self, match): @@ -810,6 +810,18 @@ class TwitterTweetExtractor(TwitterExtractor): return itertools.chain(buffer, tweets) +class TwitterQuotesExtractor(TwitterExtractor): + """Extractor for quotes of a Tweet""" + subcategory = "quotes" + pattern = BASE_PATTERN + r"/(?:[^/?#]+|i/web)/status/(\d+)/quotes" + example = "https://twitter.com/USER/status/12345/quotes" + + def items(self): + url = "{}/search?q=quoted_tweet_id:{}".format(self.root, self.user) + data = {"_extractor": TwitterSearchExtractor} + yield Message.Queue, url, data + + class TwitterAvatarExtractor(TwitterExtractor): subcategory = "avatar" filename_fmt = "avatar {date}.{extension}" diff --git a/test/results/twitter.py b/test/results/twitter.py index 5150a11a..f7fd8dba 100644 --- a/test/results/twitter.py +++ b/test/results/twitter.py @@ -218,7 +218,7 @@ __tests__ = ( "#category": ("", "twitter", "hashtag"), "#class" : twitter.TwitterHashtagExtractor, "#pattern" : twitter.TwitterSearchExtractor.pattern, - "#sha1_url": "3571c3a53b7647ea35517041fdc17f77ec5b2cb9", + "#urls" : "https://twitter.com/search?q=%23nature", }, { @@ -537,6 +537,14 @@ The Washington Post writes, "Three weeks after the toxic train derailment in Ohi "The analysis by Texas A&M University seems to contradict statements by state and federal regulators that air near the crash site is completely safe, despite residents complaining about rashes, breathing problems and other health effects." 
Your reaction.""", }, +{ + "#url" : "https://twitter.com/playpokemon/status/1263832915173048321/quotes", + "#category": ("", "twitter", "quotes"), + "#class" : twitter.TwitterQuotesExtractor, + "#pattern" : twitter.TwitterSearchExtractor.pattern, + "#urls" : "https://twitter.com/search?q=quoted_tweet_id:1263832915173048321", +}, + { "#url" : "https://twitter.com/supernaturepics/photo", "#category": ("", "twitter", "avatar"), From 052811b57f262025dcc87e6833e325ccb85bd04d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 7 Mar 2024 00:59:02 +0100 Subject: [PATCH 022/154] [docs] fix another typo (#5262) https://github.com/mikf/gallery-dl/issues/5262#issuecomment-1980094380 This on was on the same line as the previous one ... (9fd851cd) --- docs/configuration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index df049c8b..6408a75d 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -5784,7 +5784,7 @@ How To * choose a name * select "installed app" * set ``http://localhost:6414/`` as "redirect uri" - * solve the "I'm not a robot" reCATCHA if needed + * solve the "I'm not a robot" reCAPTCHA if needed * click "create app" * copy the client id (third line, under your application's name and From 05331f9cf1c0090cfdd025f803ab44b477884882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 7 Mar 2024 01:29:19 +0100 Subject: [PATCH 023/154] [imagefap] flake8, cleanup, tests --- gallery_dl/extractor/imagefap.py | 14 ++++++++------ test/results/imagefap.py | 8 +++++++- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index e1be8c9d..32fe3715 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -173,8 +173,8 @@ class ImagefapFolderExtractor(ImagefapExtractor): def galleries(self, folder_id): """Yield gallery IDs and titles of a folder""" - folder="Uncategorized" if folder_id == "-1": + folder_name = "Uncategorized" if self._id: url = "{}/usergallery.php?userid={}&folderid=-1".format( self.root, self.user) @@ -182,26 +182,28 @@ class ImagefapFolderExtractor(ImagefapExtractor): url = "{}/profile/{}/galleries?folderid=-1".format( self.root, self.user) else: + folder_name = None url = "{}/organizer/{}/".format(self.root, folder_id) params = {"page": 0} + extr = text.extract_from(self.request(url, params=params).text) + if not folder_name: + folder_name = extr("class'blk_galleries'>", "") + while True: - extr = text.extract_from(self.request(url, params=params).text) cnt = 0 - if folder_id != -1 and params["page"] == 0: - folder = extr('class\'blk_galleries\'>','') - while True: gid = extr('", "<"), folder + yield gid, extr("", "<"), folder_name cnt += 1 if cnt < 20: break params["page"] += 1 + extr = text.extract_from(self.request(url, params=params).text) class ImagefapUserExtractor(ImagefapExtractor): diff --git a/test/results/imagefap.py b/test/results/imagefap.py index bec94011..7ac1631f 100644 --- a/test/results/imagefap.py +++ b/test/results/imagefap.py @@ -124,7 +124,11 @@ __tests__ = ( "#url" : "https://www.imagefap.com/usergallery.php?userid=1981976&folderid=409758", "#category": ("", "imagefap", "folder"), "#class" : imagefap.ImagefapFolderExtractor, - "#sha1_url": "37822523e6e4a56feb9dea35653760c86b44ff89", + "#urls" : "https://www.imagefap.com/gallery/7876223", + + "folder" : "Softcore", + "gallery_id": "7876223", + "title" : "Kelsi Monroe in 
lingerie", }, { @@ -140,6 +144,8 @@ __tests__ = ( "#class" : imagefap.ImagefapFolderExtractor, "#pattern" : imagefap.ImagefapGalleryExtractor.pattern, "#range" : "1-40", + + "folder": "Uncategorized", }, { From 146459056c21bab1742a51688960332d7b3632d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 7 Mar 2024 15:58:01 +0100 Subject: [PATCH 024/154] [reddit] provide 'fallback_url' as video fallback (#5296) --- gallery_dl/extractor/reddit.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 2ef0f9fb..9ba36941 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -191,6 +191,8 @@ class RedditExtractor(Extractor): try: if "reddit_video_preview" in post["preview"]: video = post["preview"]["reddit_video_preview"] + if "fallback_url" in video: + yield video["fallback_url"] if "dash_url" in video: yield "ytdl:" + video["dash_url"] if "hls_url" in video: From 3c979e1f0500529c40f4718319a140c3e73ffb1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 8 Mar 2024 23:13:53 +0100 Subject: [PATCH 025/154] [workflows] release standalone executables in gdl-org/builds needs some form of release notes, probably just git commits since last stable release --- .github/workflows/executables.yml | 60 ++++++++++++++++++++++++++----- .github/workflows/pages.yml | 2 +- scripts/pyinstaller.py | 42 +++++++++++++++------- 3 files changed, 83 insertions(+), 21 deletions(-) diff --git a/.github/workflows/executables.yml b/.github/workflows/executables.yml index 9d49e875..b3433d44 100644 --- a/.github/workflows/executables.yml +++ b/.github/workflows/executables.yml @@ -1,10 +1,15 @@ -name: executables +name: Executables on: workflow_dispatch: push: branches: - master + tags-ignore: + - "*" + +env: + DATE_FORMAT: "%Y.%m.%d" jobs: build: @@ -31,19 +36,58 @@ jobs: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} ${{ matrix.architecture }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} architecture: ${{ matrix.architecture }} + - name: Date + run: echo "DATE=$(date '+${{ env.DATE_FORMAT }}')" >> "$GITHUB_ENV" + + - name: Update Version + # use Python since its behavior is consistent across operating systems + shell: python + run: | + import re + path = "./application/version.py" + with open(path) as fp: + content = fp.read() + content = re.sub( + r'\b(__version__ = "[^"]+)', + r"\1:${{ env.DATE }}", + content) + with open(path, "w") as fp: + fp.write(content) + - name: Build executable run: | pip install requests requests[socks] yt-dlp pyyaml ${{ matrix.python-packages }} pyinstaller - python scripts/pyinstaller.py + python ./scripts/pyinstaller.py --os '${{ matrix.os }}' --arch '${{ matrix.architecture }}' + + - uses: actions/upload-artifact@v4 + with: + name: executable-${{ matrix.os }}-${{ matrix.architecture }}-${{ matrix.python-version }} + path: dist/* + retention-days: 1 + compression-level: 0 + + release: + + needs: build + runs-on: ubuntu-latest + + steps: + - uses: actions/download-artifact@v4 + + - name: Date + run: echo "DATE=$(date '+${{ env.DATE_FORMAT }}')" >> "$GITHUB_ENV" - - name: Upload executable - uses: actions/upload-artifact@v3 + - uses: ncipollo/release-action@v1 with: - name: gallery-dl-${{ matrix.os }}-${{ matrix.architecture }}-${{ matrix.python-version }} - path: | - dist + owner: gdl-org + repo: builds + tag: ${{ env.DATE }} + artifacts: 
"executable-*/*" + allowUpdates: true + makeLatest: true + token: ${{ secrets.REPO_TOKEN }} diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index e4ac57b6..9ddb05ea 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -28,7 +28,7 @@ jobs: curl -L -X POST -H "Accept: application/vnd.github+json" - -H "Authorization: Bearer ${{ secrets.DISPATCH_TOKEN }}" + -H "Authorization: Bearer ${{ secrets.REPO_TOKEN }}" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/gdl-org/docs/actions/workflows/pages.yml/dispatches -d '{"ref":"master"}' diff --git a/scripts/pyinstaller.py b/scripts/pyinstaller.py index f30501b4..5a81a1b5 100755 --- a/scripts/pyinstaller.py +++ b/scripts/pyinstaller.py @@ -4,16 +4,34 @@ """Build a standalone executable using PyInstaller""" import PyInstaller.__main__ +import argparse import util -import os - -PyInstaller.__main__.run([ - "--onefile", - "--console", - "--name", "gallery-dl." + ("exe" if os.name == "nt" else "bin"), - "--additional-hooks-dir", util.path("scripts"), - "--distpath", util.path("dist"), - "--workpath", util.path("build"), - "--specpath", util.path("build"), - util.path("gallery_dl", "__main__.py"), -]) +import sys + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-o", "--os") + parser.add_argument("-a", "--arch") + args = parser.parse_args() + + name = "gallery-dl" + if args.os: + name = "{}_{}".format(name, args.os.partition("-")[0].lower()) + if args.arch == "x86": + name += "_x86" + + PyInstaller.__main__.run([ + "--onefile", + "--console", + "--name", name, + "--additional-hooks-dir", util.path("scripts"), + "--distpath", util.path("dist"), + "--workpath", util.path("build"), + "--specpath", util.path("build"), + util.path("gallery_dl", "__main__.py"), + ]) + + +if __name__ == "__main__": + sys.exit(main()) From a01d334febc0f550475b94c983478bd8db18e5b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 9 Mar 2024 23:32:31 +0100 Subject: [PATCH 026/154] [workflows] update to actions/setup-python@v5 --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 18abb567..18a30c72 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -26,7 +26,7 @@ jobs: if [[ "$(find ./gallery_dl -type f -not -perm 644)" ]]; then exit 1; fi - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} From ac4e29f70a9a6ef023639576d7f93c45acec9ec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 9 Mar 2024 23:33:58 +0100 Subject: [PATCH 027/154] [lensdump] support more direct link formats (#5293) --- gallery_dl/extractor/lensdump.py | 2 +- test/results/lensdump.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index 8ca9d88e..12e8860c 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -104,7 +104,7 @@ class LensdumpImageExtractor(LensdumpBase, Extractor): filename_fmt = "{category}_{id}{title:?_//}.{extension}" directory_fmt = ("{category}",) archive_fmt = "{id}" - pattern = r"(?:https?://)?(?:lensdump\.com|\w\.l3n\.co)/i/(\w+)" + pattern = r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)/i/(\w+)" example = "https://lensdump.com/i/ID" def __init__(self, match): diff 
--git a/test/results/lensdump.py b/test/results/lensdump.py index a300227b..b6a030ee 100644 --- a/test/results/lensdump.py +++ b/test/results/lensdump.py @@ -61,4 +61,16 @@ __tests__ = ( "width" : 620, }, +{ + "#url" : "https://i.lensdump.com/i/tyoAyM", + "#category": ("", "lensdump", "image"), + "#class" : lensdump.LensdumpImageExtractor, +}, + +{ + "#url" : "https://i3.lensdump.com/i/tyoAyM", + "#category": ("", "lensdump", "image"), + "#class" : lensdump.LensdumpImageExtractor, +}, + ) From 5842e4928da7ad65ecaf1f387b6ed245d5fa0d4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 9 Mar 2024 23:35:23 +0100 Subject: [PATCH 028/154] [imgur] fail downloads when redirected to 'removed.png' (#5308) --- gallery_dl/extractor/imgur.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 8884d3ee..86b1edd4 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -39,10 +39,15 @@ class ImgurExtractor(Extractor): image["url"] = url = "https://i.imgur.com/{}.{}".format( image["id"], image["ext"]) image["date"] = text.parse_datetime(image["created_at"]) + image["_http_validate"] = self._validate text.nameext_from_url(url, image) return url + def _validate(self, response): + return (not response.history or + not response.url.endswith("/removed.png")) + def _items_queue(self, items): album_ex = ImgurAlbumExtractor image_ex = ImgurImageExtractor From 71cf08af4b56c1ce39f9fdc63e23a7246e4c8986 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 9 Mar 2024 23:47:35 +0100 Subject: [PATCH 029/154] [workflows] update version.py path --- .github/workflows/executables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/executables.yml b/.github/workflows/executables.yml index b3433d44..7a303ce2 100644 --- a/.github/workflows/executables.yml +++ b/.github/workflows/executables.yml @@ -49,7 +49,7 @@ jobs: shell: python run: | import re - path = "./application/version.py" + path = "./gallery_dl/version.py" with open(path) as fp: content = fp.read() content = re.sub( From 4565de3c8efd0b4fd40c2e8379fb75ac0a5a5c27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 10 Mar 2024 20:50:55 +0100 Subject: [PATCH 030/154] [reddit] provide 'gif' and 'mp4' variant fallbacks (#5315) --- gallery_dl/extractor/reddit.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 9ba36941..e099c7ed 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -202,6 +202,12 @@ class RedditExtractor(Extractor): try: for image in post["preview"]["images"]: + variants = image.get("variants") + if variants: + if "gif" in variants: + yield variants["gif"]["source"]["url"] + if "mp4" in variants: + yield variants["mp4"]["source"]["url"] yield image["source"]["url"] except Exception as exc: self.log.debug("%s: %s", exc.__class__.__name__, exc) From 4ccdba8ccbab4bcab6e1fea5ce036bbf49aeb8b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 11 Mar 2024 21:35:50 +0100 Subject: [PATCH 031/154] [scripts] update release.py build Linux binary in ubuntu22.04 VM --- scripts/pyinstaller.py | 3 +++ scripts/release.sh | 42 +++++++++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/scripts/pyinstaller.py b/scripts/pyinstaller.py index 5a81a1b5..ee22ecad 100755 --- a/scripts/pyinstaller.py +++ 
b/scripts/pyinstaller.py @@ -13,6 +13,7 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("-o", "--os") parser.add_argument("-a", "--arch") + parser.add_argument("-e", "--extension") args = parser.parse_args() name = "gallery-dl" @@ -20,6 +21,8 @@ def main(): name = "{}_{}".format(name, args.os.partition("-")[0].lower()) if args.arch == "x86": name += "_x86" + if args.extension: + name = "{}.{}".format(name, args.extension.lower()) PyInstaller.__main__.run([ "--onefile", diff --git a/scripts/release.sh b/scripts/release.sh index 8b84b980..09127b59 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -53,31 +53,43 @@ build-linux() { cd "${ROOTDIR}" echo Building Linux executable - VENV_PATH="/tmp/venv" - VENV_PYTHON="${VENV_PATH}/bin/python" - - rm -rf "${VENV_PATH}" - python -m virtualenv "${VENV_PATH}" - - $VENV_PYTHON -m pip install requests requests[socks] yt-dlp pyyaml secretstorage pyinstaller - $VENV_PYTHON ./scripts/pyinstaller.py + build-vm 'ubuntu22.04' 'gallery-dl.bin' } build-windows() { - cd "${ROOTDIR}/dist" + cd "${ROOTDIR}" echo Building Windows executable - # remove old executable - rm -f "gallery-dl.exe" + build-vm 'windows7_x86_sp1' 'gallery-dl.exe' +} + +build-vm() { + VMNAME="$1" + BINNAME="$2" + TMPPATH="/tmp/gallery-dl/dist/$BINNAME" - # build windows exe in vm - ln -fs "${ROOTDIR}" /tmp/ - vmstart "windows7_x86_sp1" & + # launch VM + vmstart "$VMNAME" & disown - while [ ! -e "gallery-dl.exe" ] ; do + + # copy source files + mkdir -p /tmp/gallery-dl + cp -a -t /tmp/gallery-dl -- \ + ./gallery_dl ./scripts ./data ./setup.py ./README.rst + + # remove old executable + rm -f "./dist/$BINNAME" + + # wait for new executable + while [ ! -e "$TMPPATH" ] ; do sleep 5 done sleep 2 + + # move + mv "$TMPPATH" "./dist/$BINNAME" + + rm -r /tmp/gallery-dl } sign() { From 106dfdb4c3dd1f4e2dc22a4c6fa161e6877a9ea1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 11 Mar 2024 21:34:00 +0100 Subject: [PATCH 032/154] cleanup sleep-request retry delay code more lines but easier to read I'd say --- gallery_dl/extractor/common.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index cf0f8c90..d14e13ae 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -203,9 +203,15 @@ class Extractor(): self.log.debug("%s (%s/%s)", msg, tries, retries+1) if tries > retries: break - self.sleep( - max(tries, self._interval()) if self._interval else tries, - "retry") + + if self._interval: + seconds = self._interval() + if seconds < tries: + seconds = tries + else: + seconds = tries + + self.sleep(seconds, "retry") tries += 1 raise exception.HttpError(msg, response) From 6601e78b2fe80f2ab309c7bbcdacf223e383cfb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 11 Mar 2024 21:52:23 +0100 Subject: [PATCH 033/154] [docker] remove setuptools and wheel __pycache__ directories reduces image size by ~2MB --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 81bd8c12..30759122 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,8 @@ RUN : \ https://github.com/mikf/gallery-dl/archive/refs/heads/master.tar.gz \ yt-dlp \ && rm -rf /root/.cache/pip \ + && find /usr/local/lib/python3.*/site-packages/setuptools -name __pycache__ -exec rm -rf {} + \ + && find /usr/local/lib/python3.*/site-packages/wheel -name __pycache__ -exec rm -rf {} + \ && : ENTRYPOINT [ "gallery-dl" ] From 
7ac6274fe00f25edb2f36d2856cfce53118f3c10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 11 Mar 2024 22:16:15 +0100 Subject: [PATCH 034/154] [docker] change date tags format from YYYYMMDD to YYYY.MM.DD to use the same format as in executables --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 46e67a4e..043940b6 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -37,7 +37,7 @@ jobs: type=ref,event=tag type=raw,value=dev type=sha,format=long,prefix= - type=raw,priority=500,value={{date 'YYYYMMDD'}} + type=raw,priority=500,value={{date 'YYYY.MM.DD'}} - uses: docker/setup-qemu-action@v3 From 416f1b8cc14b433bd8333d6f00d1e388d854e2e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 11 Mar 2024 23:28:20 +0100 Subject: [PATCH 035/154] [pages] customize layout to remove unnecessary links - "docs" header - "Improve this page" https://github.com/pages-themes/primer/blob/master/_layouts/default.html --- docs/_layouts/default.html | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 docs/_layouts/default.html diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html new file mode 100644 index 00000000..955164a3 --- /dev/null +++ b/docs/_layouts/default.html @@ -0,0 +1,21 @@ + + + + + + + +{% seo %} + + + + +
+ + {{ content }} + +
+
+
+
+
From 225d849139f1c2f42c5f5652b1fbef4f9d3b8d16 Mon Sep 17 00:00:00 2001
From: blankie 
Date: Tue, 12 Mar 2024 11:44:25 +1100
Subject: [PATCH 036/154] [mastodon] fix handling null 'moved' account field

---
 gallery_dl/extractor/mastodon.py |  6 +++++-
 test/results/mastodon.py         | 11 +++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 68b41961..030d7d1a 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -70,7 +70,11 @@ class MastodonExtractor(BaseExtractor):

     def _check_moved(self, account):
         self._check_moved = None
-        if "moved" in account:
+        # Certain fediverse software (such as Iceshrimp and Sharkey)
+        # sets the account's "moved" field to null instead of omitting
+        # it outright. To handle this, check whether the "moved" value
+        # is truthy instead of only whether the key exists.
+        if account.get("moved"):
             self.log.warning("Account '%s' moved to '%s'",
                              account["acct"], account["moved"]["acct"])

diff --git a/test/results/mastodon.py b/test/results/mastodon.py
index cf881968..b6cb3464 100644
--- a/test/results/mastodon.py
+++ b/test/results/mastodon.py
@@ -18,4 +18,15 @@ __tests__ = (
     "instance_remote": None,
 },

+{
+    "#url"     : "mastodon:https://wanderingwires.net/@quarc/9qppkxzyd1ee3i9p",
+    "#comment" : "null moved account",
+    "#category": ("mastodon", "wanderingwires.net", "status"),
+    "#class"   : mastodon.MastodonStatusExtractor,
+    "#urls"    : "https://s3.wanderingwires.net/null/4377e826-72ab-4659-885c-fa12945eb207.png",
+
+    "instance": "wanderingwires.net",
+    "instance_remote": None,
+},
+
 )

From c210c7a4ed8356e97394c750dede4bbd37128ecc Mon Sep 17 00:00:00 2001
From: Martin Contento <67421+tinnet@users.noreply.github.com>
Date: Tue, 12 Mar 2024 15:12:07 +0000
Subject: [PATCH 037/154] [instagram] default posts like_count to zero

I don't know when/why this happens, and for account-privacy reasons I
can't add an example here, but I had this error out and defaulting to
`0` feels like a safe fix.
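
A minimal sketch of the failure this guards against (the dict and the
values are illustrative only, not taken from a real Instagram API
response):

    post = {"pk": 1}            # response with no "like_count" field
    post.get("like_count")      # -> None, breaks numeric handling later
    post.get("like_count", 0)   # -> 0, a safe integer default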
--- gallery_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index ddc11318..6a18db0f 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -165,7 +165,7 @@ class InstagramExtractor(Extractor): data = { "post_id" : post["pk"], "post_shortcode": post["code"], - "likes": post.get("like_count"), + "likes": post.get("like_count", 0), "pinned": post.get("timeline_pinned_user_ids", ()), "date": text.parse_timestamp(post.get("taken_at")), } From dde822e69a3f9de155bc708beb06af22d2ff143d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 12 Mar 2024 18:52:03 +0100 Subject: [PATCH 038/154] [instagram] change 'posts are private' exception to a warning (#5322) --- gallery_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 6a18db0f..9c2b1de2 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -736,7 +736,7 @@ class InstagramRestAPI(): not user["followed_by_viewer"]: name = user["username"] s = "" if name.endswith("s") else "s" - raise exception.StopExtraction("%s'%s posts are private", name, s) + self.extractor.log.warning("%s'%s posts are private", name, s) self.extractor._assign_user(user) return user["id"] From 108abab537ce6466f5055e6a8f66c4abc9e0fc23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 13 Mar 2024 14:46:03 +0100 Subject: [PATCH 039/154] [twitter] add 'protected' metadata field (#5327) for 'author' and 'user' --- gallery_dl/extractor/twitter.py | 1 + test/results/twitter.py | 53 +++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index e6bf9b0f..e7b02496 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -380,6 +380,7 @@ class TwitterExtractor(Extractor): "date" : text.parse_datetime( uget("created_at"), "%a %b %d %H:%M:%S %z %Y"), "verified" : uget("verified", False), + "protected" : uget("protected", False), "profile_banner" : uget("profile_banner_url", ""), "profile_image" : uget( "profile_image_url_https", "").replace("_normal.", "."), diff --git a/test/results/twitter.py b/test/results/twitter.py index f7fd8dba..c94963d9 100644 --- a/test/results/twitter.py +++ b/test/results/twitter.py @@ -72,6 +72,59 @@ __tests__ = ( "#class" : twitter.TwitterTimelineExtractor, "#range" : "1-40", "#sha1_url": "c570ac1aae38ed1463be726cc46f31cac3d82a40", + + "author": { + "date" : "dt:2015-01-12 10:25:22", + "description" : "The very best nature pictures.", + "favourites_count": int, + "followers_count" : int, + "friends_count" : int, + "listed_count" : int, + "media_count" : int, + "statuses_count" : int, + "id" : 2976459548, + "location" : "Earth", + "name" : "supernaturepics", + "nick" : "Nature Pictures", + "profile_banner" : "https://pbs.twimg.com/profile_banners/2976459548/1421058583", + "profile_image" : "https://pbs.twimg.com/profile_images/554585280938659841/FLVAlX18.jpeg", + "protected" : False, + "verified" : False, + }, + "user": { + "date" : "dt:2015-01-12 10:25:22", + "description" : "The very best nature pictures.", + "favourites_count": int, + "followers_count" : int, + "friends_count" : int, + "listed_count" : int, + "media_count" : int, + "statuses_count" : int, + "id" : 2976459548, + "location" : "Earth", 
+ "name" : "supernaturepics", + "nick" : "Nature Pictures", + "profile_banner" : "https://pbs.twimg.com/profile_banners/2976459548/1421058583", + "profile_image" : "https://pbs.twimg.com/profile_images/554585280938659841/FLVAlX18.jpeg", + "protected" : False, + "verified" : False, + }, + "tweet_id" : range(400000000000000000, 800000000000000000), + "conversation_id": range(400000000000000000, 800000000000000000), + "quote_id" : 0, + "reply_id" : 0, + "retweet_id" : 0, + "count" : range(1, 4), + "num" : range(1, 4), + "favorite_count" : int, + "quote_count" : int, + "reply_count" : int, + "retweet_count" : int, + "content" : str, + "lang" : str, + "date" : "type:datetime", + "sensitive" : False, + "source" : "nature_pics", }, { From d53db6e11a0efe030160f0d1b354494bc307effe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 13 Mar 2024 22:34:25 +0100 Subject: [PATCH 040/154] [twitter] handle "account is temporarily locked" errors (#5300) - display proper error message - add 'locked' option --- docs/configuration.rst | 13 +++++++++ gallery_dl/extractor/twitter.py | 51 +++++++++++++++++++++++---------- 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 6408a75d..4f4722ec 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3771,6 +3771,19 @@ Description * ``"wait"``: Wait until rate limit reset +extractor.twitter.locked +------------------------ +Type + ``string`` +Default + ``"abort"`` +Description + Selects how to handle "account is temporarily locked" errors. + + * ``"abort"``: Raise an error and stop extraction + * ``"wait"``: Wait until the account is unlocked and retry + + extractor.twitter.replies ------------------------- Type diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index e7b02496..87feeba9 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -895,6 +895,7 @@ class TwitterAPI(): def __init__(self, extractor): self.extractor = extractor + self.log = extractor.log self.root = "https://twitter.com/i/api" self._nsfw_warning = True @@ -1257,7 +1258,7 @@ class TwitterAPI(): @cache(maxage=3600) def _guest_token(self): endpoint = "/1.1/guest/activate.json" - self.extractor.log.info("Requesting guest token") + self.log.info("Requesting guest token") return str(self._call( endpoint, None, "POST", False, "https://api.twitter.com", )["guest_token"]) @@ -1287,17 +1288,35 @@ class TwitterAPI(): if response.status_code < 400: data = response.json() - if not data.get("errors") or not any( - (e.get("message") or "").lower().startswith("timeout") - for e in data["errors"]): - return data # success or non-timeout errors - msg = data["errors"][0].get("message") or "Unspecified" - self.extractor.log.debug("Internal Twitter error: '%s'", msg) + errors = data.get("errors") + if not errors: + return data - if self.headers["x-twitter-auth-type"]: - self.extractor.log.debug("Retrying API request") - continue # retry + retry = False + for error in errors: + msg = error.get("message") or "Unspecified" + self.log.debug("API error: '%s'", msg) + + if "this account is temporarily locked" in msg: + msg = "Account temporarily locked" + if self.extractor.config("locked") != "wait": + raise exception.AuthorizationError(msg) + self.log.warning("%s. 
Press ENTER to retry.", msg) + try: + input() + except (EOFError, OSError): + pass + retry = True + + elif msg.lower().startswith("timeout"): + retry = True + + if not retry: + return data + elif self.headers["x-twitter-auth-type"]: + self.log.debug("Retrying API request") + continue # fall through to "Login Required" response.status_code = 404 @@ -1387,7 +1406,7 @@ class TwitterAPI(): try: tweet = tweets[tweet_id] except KeyError: - self.extractor.log.debug("Skipping %s (deleted)", tweet_id) + self.log.debug("Skipping %s (deleted)", tweet_id) continue if "retweeted_status_id_str" in tweet: @@ -1619,8 +1638,10 @@ class TwitterAPI(): variables["cursor"] = cursor def _pagination_users(self, endpoint, variables, path=None): - params = {"variables": None, - "features" : self._json_dumps(self.features_pagination)} + params = { + "variables": None, + "features" : self._json_dumps(self.features_pagination), + } while True: cursor = entry = None @@ -1664,9 +1685,9 @@ class TwitterAPI(): if text.startswith("Age-restricted"): if self._nsfw_warning: self._nsfw_warning = False - self.extractor.log.warning('"%s"', text) + self.log.warning('"%s"', text) - self.extractor.log.debug("Skipping %s (\"%s\")", tweet_id, text) + self.log.debug("Skipping %s ('%s')", tweet_id, text) @cache(maxage=365*86400, keyarg=1) From 5158cbb4c11ec360c803ef04472ba1993640155b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 13 Mar 2024 22:36:38 +0100 Subject: [PATCH 041/154] [weibo] rework pagination logic (#4168) don't automatically stop when receiving an empty status list shouldn't improve 'tabtype=feed' results, but at least 'tabtype=album' ones and others using cursors won't end prematurely --- gallery_dl/extractor/weibo.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 1049ba7b..83b1642e 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -186,23 +186,34 @@ class WeiboExtractor(Extractor): data = data["data"] statuses = data["list"] - if not statuses: - return yield from statuses - if "next_cursor" in data: # videos, newvideo - if data["next_cursor"] == -1: + # videos, newvideo + cursor = data.get("next_cursor") + if cursor: + if cursor == -1: return - params["cursor"] = data["next_cursor"] - elif "page" in params: # home, article - params["page"] += 1 - elif data["since_id"]: # album + params["cursor"] = cursor + continue + + # album + since_id = data.get("since_id") + if since_id: params["sinceid"] = data["since_id"] - else: # feed, last album page - try: - params["since_id"] = statuses[-1]["id"] - 1 - except KeyError: + continue + + # home, article + if "page" in params: + if not statuses: return + params["page"] += 1 + continue + + # feed, last album page + try: + params["since_id"] = statuses[-1]["id"] - 1 + except LookupError: + return def _sina_visitor_system(self, response): self.log.info("Sina Visitor System") From ab1fc470b7d90c58c152fbe6e74e02c1c3b02946 Mon Sep 17 00:00:00 2001 From: teslaedison Date: Thu, 14 Mar 2024 16:07:02 +0800 Subject: [PATCH 042/154] chore: fix some typos Signed-off-by: teslaedison --- docs/configuration.rst | 4 ++-- snap/local/launchers/gallery-dl-launch | 2 +- snap/snapcraft.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 6408a75d..7dd7ede1 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3286,7 +3286,7 @@ 
Examples * ``["jpeg", "webp"]`` Description Only include assets that are in the specified file types. ``all`` can be - used to specifiy all file types. Valid values are: + used to specify all file types. Valid values are: * Grids: ``png``, ``jpeg``, ``jpg``, ``webp`` * Heroes: ``png``, ``jpeg``, ``jpg``, ``webp`` @@ -3326,7 +3326,7 @@ Examples * ``["fr", "it"]`` Description Only include assets that are in the specified languages. ``all`` can be - used to specifiy all languages. Valid values are `ISO 639-1 `__ + used to specify all languages. Valid values are `ISO 639-1 `__ language codes. diff --git a/snap/local/launchers/gallery-dl-launch b/snap/local/launchers/gallery-dl-launch index 908f303f..a4047c4d 100755 --- a/snap/local/launchers/gallery-dl-launch +++ b/snap/local/launchers/gallery-dl-launch @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# This is the maintainence launcher for the snap, make necessary runtime environment changes to make the snap work here. You may also insert security confinement/deprecation/obsoletion notice of the snap here. +# This is the maintenance launcher for the snap, make necessary runtime environment changes to make the snap work here. You may also insert security confinement/deprecation/obsoletion notice of the snap here. set \ -o errexit \ diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml index e08bd49f..c7bcb452 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -37,7 +37,7 @@ plugs: # Network access network: - # For network service for recieving OAuth callback tokens + # For network service for receiving OAuth callback tokens network-bind: # Configuration access From 76683c5f5cfc9a9e8887eeb525c3df10d15a1e64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 15 Mar 2024 18:08:56 +0100 Subject: [PATCH 043/154] [deviantart:stash] fix 'index' metadata (#5335) --- gallery_dl/extractor/deviantart.py | 2 +- test/results/deviantart.py | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 9cbb21c2..a22417a8 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -699,7 +699,7 @@ class DeviantartStashExtractor(DeviantartExtractor): if uuid: deviation = self.api.deviation(uuid) deviation["index"] = text.parse_int(text.extr( - page, 'gmi-deviationid="', '"')) + page, '\\"deviationId\\":', ',')) yield deviation return diff --git a/test/results/deviantart.py b/test/results/deviantart.py index 2ffc485a..1bb5ecdd 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -364,9 +364,28 @@ __tests__ = ( "#url" : "https://sta.sh/022c83odnaxc", "#category": ("", "deviantart", "stash"), "#class" : deviantart.DeviantartStashExtractor, - "#pattern" : r"https://wixmp-[^.]+\.wixmp\.com/f/.+/.+\.png\?token=.+", + "#pattern" : r"https://wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/940f2d05-c5eb-4917-8192-7eb6a2d508c6/dcvdmbc-e506cdcf-3208-4c20-85ab-0bfa8a7bcb16.png\?token=ey.+", "#count" : 1, "#sha1_content": "057eb2f2861f6c8a96876b13cca1a4b7a408c11f", + + "content": { + "filename": "01_by_justatest235723_dcvdmbc.png", + "filesize": 380, + "width" : 128, + "height" : 128, + "src" : r"re:https://wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/940f2d05-c5eb-4917-8192-7eb6a2d508c6/dcvdmbc-e506cdcf-3208-4c20-85ab-0bfa8a7bcb16.png\?token=ey.+", + }, + "da_category" : "Uncategorized", + "date" : "dt:2018-12-26 14:49:27", + "deviationid" : "A4A6AD52-8857-46EE-ABFE-86D49D4FF9D0", + 
"download_filesize": 380, + "extension" : "png", + "filename" : "01_by_justatest235723-dcvdmbc", + "index" : 778297656, + "index_base36" : "cvdmbc", + "published_time": 1545835767, + "title" : "01", + "url" : "https://www.deviantart.com/stash/022c83odnaxc", }, { From 5716430c35a52300889b54a6422f4018b8142dce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 15 Mar 2024 18:14:55 +0100 Subject: [PATCH 044/154] =?UTF-8?q?[deviantart:stash]=20recognize=20'devia?= =?UTF-8?q?ntart.com/stash/=E2=80=A6'=20URLs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- gallery_dl/extractor/deviantart.py | 3 ++- test/results/deviantart.py | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index a22417a8..bb74929c 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -678,7 +678,8 @@ class DeviantartStashExtractor(DeviantartExtractor): """Extractor for sta.sh-ed deviations""" subcategory = "stash" archive_fmt = "{index}.{extension}" - pattern = r"(?:https?://)?sta\.sh/([a-z0-9]+)" + pattern = (r"(?:https?://)?(?:(?:www\.)?deviantart\.com/stash|sta\.sh)" + r"/([a-z0-9]+)") example = "https://sta.sh/abcde" skip = Extractor.skip diff --git a/test/results/deviantart.py b/test/results/deviantart.py index 1bb5ecdd..a9727334 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -361,7 +361,7 @@ __tests__ = ( }, { - "#url" : "https://sta.sh/022c83odnaxc", + "#url" : "https://www.deviantart.com/stash/022c83odnaxc", "#category": ("", "deviantart", "stash"), "#class" : deviantart.DeviantartStashExtractor, "#pattern" : r"https://wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/940f2d05-c5eb-4917-8192-7eb6a2d508c6/dcvdmbc-e506cdcf-3208-4c20-85ab-0bfa8a7bcb16.png\?token=ey.+", @@ -388,6 +388,12 @@ __tests__ = ( "url" : "https://www.deviantart.com/stash/022c83odnaxc", }, +{ + "#url" : "https://sta.sh/022c83odnaxc", + "#category": ("", "deviantart", "stash"), + "#class" : deviantart.DeviantartStashExtractor, +}, + { "#url" : "https://sta.sh/21jf51j7pzl2", "#comment" : "multiple stash items", From ddee5ae7c4e6a0a45644efb1065b97efb5cc0c54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 15 Mar 2024 20:34:54 +0100 Subject: [PATCH 045/154] [gofile] fix extraction --- gallery_dl/extractor/gofile.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index 289f91cb..f0eb4e9c 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -41,9 +41,13 @@ class GofileFolderExtractor(Extractor): folder = self._get_content(self.content_id, password) yield Message.Directory, folder + try: + contents = folder.pop("children") + except KeyError: + raise exception.AuthorizationError("Password required") + num = 0 - contents = folder.pop("contents") - for content_id in folder["childs"]: + for content_id in folder["childrenIds"]: content = contents[content_id] content["folder"] = folder @@ -67,31 +71,32 @@ class GofileFolderExtractor(Extractor): @memcache() def _create_account(self): self.log.debug("Creating temporary account") - return self._api_request("createAccount")["token"] + return self._api_request("accounts", method="POST")["token"] @cache(maxage=86400) def _get_website_token(self): self.log.debug("Fetching website token") page = self.request(self.root + 
"/dist/js/alljs.js").text - return text.extr(page, 'fetchData.wt = "', '"') + return text.extr(page, 'wt: "', '"') def _get_content(self, content_id, password=None): + headers = {"Authorization": "Bearer " + self.api_token} + params = {"wt": self.website_token} if password is not None: - password = hashlib.sha256(password.encode()).hexdigest() - return self._api_request("getContent", { - "contentId" : content_id, - "token" : self.api_token, - "wt" : self.website_token, - "password" : password, - }) - - def _api_request(self, endpoint, params=None): + params["password"] = hashlib.sha256(password.encode()).hexdigest() + return self._api_request("contents/" + content_id, params, headers) + + def _api_request(self, endpoint, params=None, headers=None, method="GET"): response = self.request( - "https://api.gofile.io/" + endpoint, params=params).json() + "https://api.gofile.io/" + endpoint, + method=method, params=params, headers=headers, + ).json() if response["status"] != "ok": if response["status"] == "error-notFound": raise exception.NotFoundError("content") + if response["status"] == "error-passwordRequired": + raise exception.AuthorizationError("Password required") raise exception.StopExtraction( "%s failed (Status: %s)", endpoint, response["status"]) From 1418c0ce38871db5c29de99926812c3132646c81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 15 Mar 2024 22:27:13 +0100 Subject: [PATCH 046/154] [kemonoparty] add 'revision_count' metadata field (#5334) --- gallery_dl/extractor/kemonoparty.py | 7 +++++-- test/results/kemonoparty.py | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 06dc861e..13897b49 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -232,6 +232,7 @@ class KemonopartyExtractor(Extractor): except exception.HttpError: post["revision_hash"] = self._revision_hash(post) post["revision_index"] = 1 + post["revision_count"] = 1 return (post,) revs.insert(0, post) @@ -247,9 +248,10 @@ class KemonopartyExtractor(Extractor): uniq.append(rev) revs = uniq - idx = len(revs) + cnt = idx = len(revs) for rev in revs: rev["revision_index"] = idx + rev["revision_count"] = cnt idx -= 1 return revs @@ -257,10 +259,11 @@ class KemonopartyExtractor(Extractor): def _revisions_all(self, url): revs = self.request(url + "/revisions").json() - idx = len(revs) + cnt = idx = len(revs) for rev in revs: rev["revision_hash"] = self._revision_hash(rev) rev["revision_index"] = idx + rev["revision_count"] = cnt idx -= 1 return revs diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index 16e1b78f..1528f55f 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -195,6 +195,7 @@ __tests__ = ( "hash" : "88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86", "revision_id" : 142470, "revision_index": 2, + "revision_count": 9, "revision_hash" : "e0e93281495e151b11636c156e52bfe9234c2a40", }, @@ -210,6 +211,7 @@ __tests__ = ( "hash" : "88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86", "revision_id" : 0, "revision_index": 1, + "revision_count": 1, "revision_hash" : "e0e93281495e151b11636c156e52bfe9234c2a40", }, @@ -224,6 +226,7 @@ __tests__ = ( "revision_id": range(134996, 3052965), "revision_index": range(1, 9), + "revision_count": 9, "revision_hash": "e0e93281495e151b11636c156e52bfe9234c2a40", }, From 03a9ce9832c12c03ed5221f04a470d9e9d6d9b0c Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Fri, 15 Mar 2024 23:04:10 +0100
Subject: [PATCH 047/154] [kemonoparty] add 'order-revisions' option (#5334)

---
 docs/configuration.rst              | 16 ++++++++++++++++
 gallery_dl/extractor/kemonoparty.py |  8 ++++++++
 2 files changed, 24 insertions(+)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 65a076eb..29578019 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -2346,6 +2346,22 @@ Description
     Note: This requires 1 additional HTTP request per post.
 
 
+extractor.kemonoparty.order-revisions
+-------------------------------------
+Type
+    ``string``
+Default
+    ``"desc"``
+Description
+    Controls the order in which
+    `revisions `__
+    are returned.
+
+    * ``"asc"``: Ascending order (oldest first)
+    * ``"desc"``: Descending order (newest first)
+    * ``"reverse"``: Same as ``"asc"``
+
+
 extractor.khinsider.format
 --------------------------
 Type
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 13897b49..de2a9b6c 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -40,6 +40,8 @@ class KemonopartyExtractor(Extractor):
     def _init(self):
         self.revisions = self.config("revisions")
         if self.revisions:
+            order = self.config("order-revisions")
+            self.revisions_reverse = order[0] in ("r", "a") if order else False
             self.revisions_unique = (self.revisions == "unique")
         self._prepare_ddosguard_cookies()
         self._find_inline = re.compile(
             r'src="(?:https?://(?:kemono|coomer)\.(?:su|party))?(/inline/[^"]+'
@@ -254,6 +256,9 @@
             rev["revision_count"] = cnt
             idx -= 1
 
+        if self.revisions_reverse:
+            revs.reverse()
+
         return revs
 
     def _revisions_all(self, url):
@@ -266,6 +271,9 @@
             rev["revision_count"] = cnt
             idx -= 1
 
+        if self.revisions_reverse:
+            revs.reverse()
+
         return revs
 
     def _revision_hash(self, revision):

From 99c53f7fa8bc47953f8e0cb130d23d8940d97188 Mon Sep 17 00:00:00 2001
From: Herp 
Date: Thu, 14 Mar 2024 23:37:16 -0400
Subject: [PATCH 048/154] Fix imagefap extractor

---
 gallery_dl/extractor/imagefap.py |  2 +-
 test/results/imagefap.py         | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py
index 32fe3715..85446c01 100644
--- a/gallery_dl/extractor/imagefap.py
+++ b/gallery_dl/extractor/imagefap.py
@@ -194,7 +194,7 @@ class ImagefapFolderExtractor(ImagefapExtractor):
         cnt = 0
 
         while True:
-            gid = extr('
", "<"), folder_name diff --git a/test/results/imagefap.py b/test/results/imagefap.py index 7ac1631f..8cdaf9b1 100644 --- a/test/results/imagefap.py +++ b/test/results/imagefap.py @@ -120,6 +120,16 @@ __tests__ = ( "#sha1_url": "37822523e6e4a56feb9dea35653760c86b44ff89", }, +{ + "#url" : "https://www.imagefap.com/organizer/613950/Grace-Stout", + "#category": ("", "imagefap", "folder"), + "#class" : imagefap.ImagefapFolderExtractor, + "#pattern" : imagefap.ImagefapGalleryExtractor.pattern, + "#count" : 31, + + "title": r"re:Grace Stout .+", +}, + { "#url" : "https://www.imagefap.com/usergallery.php?userid=1981976&folderid=409758", "#category": ("", "imagefap", "folder"), From 8e694d85c4e61950811508b040559b0918c06d80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 16 Mar 2024 02:02:02 +0100 Subject: [PATCH 049/154] [twitter] add 'birdwatch' metadata field (#5317) should probably get a better name, but this is what it's called internally by Twitter --- gallery_dl/extractor/twitter.py | 2 ++ test/results/twitter.py | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 87feeba9..a5bd9840 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -340,6 +340,8 @@ class TwitterExtractor(Extractor): txt, _, tco = content.rpartition(" ") tdata["content"] = txt if tco.startswith("https://t.co/") else content + if "birdwatch_pivot" in tweet: + tdata["birdwatch"] = tweet["birdwatch_pivot"]["subtitle"]["text"] if "in_reply_to_screen_name" in legacy: tdata["reply_to"] = legacy["in_reply_to_screen_name"] if "quoted_by" in legacy: diff --git a/test/results/twitter.py b/test/results/twitter.py index c94963d9..cc41d9ca 100644 --- a/test/results/twitter.py +++ b/test/results/twitter.py @@ -590,6 +590,17 @@ The Washington Post writes, "Three weeks after the toxic train derailment in Ohi "The analysis by Texas A&M University seems to contradict statements by state and federal regulators that air near the crash site is completely safe, despite residents complaining about rashes, breathing problems and other health effects." Your reaction.""", }, +{ + "#url" : "https://twitter.com/KrisKobach1787/status/1765935595702919299", + "#comment" : "'birdwatch' note (#5317)", + "#category": ("", "twitter", "tweet"), + "#class" : twitter.TwitterTweetExtractor, + "#options" : {"text-tweets": True}, + + "birdwatch": "In addition to the known harm of lead exposure, especially to children, Mr. Kobach is incorrect when he states the mandate is unfunded. In fact, the BIPARTISAN Infrastructure Law Joe Biden signed into law in Nov 2021 provides $15B toward lead service line replacement projects. epa.gov/ground-water-a…", + "content" : "Biden wants to replace lead pipes. He failed to mention that the unfunded mandate sets an almost impossible timeline, will cost billions, infringe on the rights of the States and their residents – all for benefits that may be entirely speculative. 
#sotu https://ag.ks.gov/media-center/news-releases/2024/02/09/kobach-leads-coalition-demanding-biden-drop-unnecessary-epa-rule", +}, + { "#url" : "https://twitter.com/playpokemon/status/1263832915173048321/quotes", "#category": ("", "twitter", "quotes"), From 26bc2d55f465b6b416f2c0401d10d24508496332 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 17 Mar 2024 14:04:21 +0100 Subject: [PATCH 050/154] [hiperdex] update URL patterns & fix 'manga' metadata (#5340) --- gallery_dl/extractor/hiperdex.py | 12 ++++++------ test/results/hiperdex.py | 8 +++++++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index 20491b56..aadce6ca 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -25,7 +25,7 @@ class HiperdexBase(): @memcache(keyarg=1) def manga_data(self, manga, page=None): if not page: - url = "{}/manga/{}/".format(self.root, manga) + url = "{}/mangas/{}/".format(self.root, manga) page = self.request(url).text extr = text.extract_from(page) @@ -33,7 +33,7 @@ class HiperdexBase(): "url" : text.unescape(extr( 'property="og:url" content="', '"')), "manga" : text.unescape(extr( - '"headline": "', '"')), + ' property="name" title="', '"')), "score" : text.parse_float(extr( 'id="averagerate">', '<')), "author" : text.remove_html(extr( @@ -68,8 +68,8 @@ class HiperdexBase(): class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): """Extractor for manga chapters from hiperdex.com""" - pattern = BASE_PATTERN + r"(/manga/([^/?#]+)/([^/?#]+))" - example = "https://hiperdex.com/manga/MANGA/CHAPTER/" + pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+)/([^/?#]+))" + example = "https://hiperdex.com/mangas/MANGA/CHAPTER/" def __init__(self, match): root, path, self.manga, self.chapter = match.groups() @@ -90,8 +90,8 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): """Extractor for manga from hiperdex.com""" chapterclass = HiperdexChapterExtractor - pattern = BASE_PATTERN + r"(/manga/([^/?#]+))/?$" - example = "https://hiperdex.com/manga/MANGA/" + pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+))/?$" + example = "https://hiperdex.com/mangas/MANGA/" def __init__(self, match): root, path, self.manga = match.groups() diff --git a/test/results/hiperdex.py b/test/results/hiperdex.py index 33948f22..bb421142 100644 --- a/test/results/hiperdex.py +++ b/test/results/hiperdex.py @@ -9,7 +9,7 @@ from gallery_dl.extractor import hiperdex __tests__ = ( { - "#url" : "https://hiperdex.com/manga/domestic-na-kanojo/154-5/", + "#url" : "https://hiperdex.com/mangas/domestic-na-kanojo/154-5/", "#category": ("", "hiperdex", "chapter"), "#class" : hiperdex.HiperdexChapterExtractor, "#pattern" : r"https://(1st)?hiperdex\d?.(com|net|info)/wp-content/uploads/WP-manga/data/manga_\w+/[0-9a-f]{32}/\d+\.webp", @@ -27,6 +27,12 @@ __tests__ = ( "type" : "Manga", }, +{ + "#url" : "https://hiperdex.com/manga/domestic-na-kanojo/154-5/", + "#category": ("", "hiperdex", "chapter"), + "#class" : hiperdex.HiperdexChapterExtractor, +}, + { "#url" : "https://1sthiperdex.com/manga/domestic-na-kanojo/154-5/", "#category": ("", "hiperdex", "chapter"), From 5c1f5861b64adc8ae3a077de7ce5b056d3157cfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 18 Mar 2024 00:01:27 +0100 Subject: [PATCH 051/154] [flickr] add 'contexts' option (#5324) --- docs/configuration.rst | 20 ++++++++++++++++++-- 
gallery_dl/extractor/flickr.py | 13 +++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 29578019..7ab9e1c2 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1872,6 +1872,20 @@ Description from `linking your Flickr account to gallery-dl `__. +extractor.flickr.contexts +------------------------- +Type + ``bool`` +Default + ``false`` +Description + For each photo, return the albums and pools it belongs to + as ``set`` and ``pool`` metadata. + + Note: This requires 1 additional API call per photo. + See `flickr.photos.getAllContexts `__ for details. + + extractor.flickr.exif --------------------- Type @@ -1879,9 +1893,11 @@ Type Default ``false`` Description - Fetch `exif` and `camera` metadata for each photo. + For each photo, return its EXIF/TIFF/GPS tags + as ``exif`` and ``camera`` metadata. Note: This requires 1 additional API call per photo. + See `flickr.photos.getExif `__ for details. extractor.flickr.metadata @@ -1901,7 +1917,7 @@ Description It is possible to specify a custom list of metadata includes. See `the extras parameter `__ - in `Flickr API docs `__ + in `Flickr's API docs `__ for possible field names. diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index f7dc3cc2..c94a110a 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -77,6 +77,8 @@ class FlickrImageExtractor(FlickrExtractor): photo = self.api.photos_getInfo(self.item_id) if self.api.exif: photo.update(self.api.photos_getExif(self.item_id)) + if self.api.contexts: + photo.update(self.api.photos_getAllContexts(self.item_id)) if photo["media"] == "video" and self.api.videos: self.api._extract_video(photo) @@ -268,6 +270,8 @@ class FlickrAPI(oauth.OAuth1API): self.exif = extractor.config("exif", False) self.videos = extractor.config("videos", True) + self.contexts = extractor.config("contexts", False) + self.maxsize = extractor.config("size-max") if isinstance(self.maxsize, str): for fmt, fmtname, fmtwidth in self.FORMATS: @@ -311,6 +315,13 @@ class FlickrAPI(oauth.OAuth1API): params = {"user_id": user_id} return self._pagination("people.getPhotos", params) + def photos_getAllContexts(self, photo_id): + """Returns all visible sets and pools the photo belongs to.""" + params = {"photo_id": photo_id} + data = self._call("photos.getAllContexts", params) + del data["stat"] + return data + def photos_getExif(self, photo_id): """Retrieves a list of EXIF/TIFF/GPS tags for a given photo.""" params = {"photo_id": photo_id} @@ -444,6 +455,8 @@ class FlickrAPI(oauth.OAuth1API): if self.exif: photo.update(self.photos_getExif(photo["id"])) + if self.contexts: + photo.update(self.photos_getAllContexts(photo["id"])) photo["id"] = text.parse_int(photo["id"]) if "owner" in photo: From 718c870430da391bc9f7bc0ca1244ee736b860d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 18 Mar 2024 04:25:07 +0100 Subject: [PATCH 052/154] [tests] show full path for nested values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'user.name' instead of just 'name' when testing for "user": { … , "name": "…", … } --- test/test_results.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/test/test_results.py b/test/test_results.py index bceb2710..05946182 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -214,44 +214,46 @@ class TestExtractorResults(unittest.TestCase): for 
kwdict in tjob.kwdict_list: self._test_kwdict(kwdict, metadata) - def _test_kwdict(self, kwdict, tests): + def _test_kwdict(self, kwdict, tests, parent=None): for key, test in tests.items(): if key.startswith("?"): key = key[1:] if key not in kwdict: continue - self.assertIn(key, kwdict, msg=key) + path = "{}.{}".format(parent, key) if parent else key + self.assertIn(key, kwdict, msg=path) value = kwdict[key] if isinstance(test, dict): - self._test_kwdict(value, test) + self._test_kwdict(value, test, path) elif isinstance(test, type): - self.assertIsInstance(value, test, msg=key) + self.assertIsInstance(value, test, msg=path) elif isinstance(test, range): - self.assertRange(value, test, msg=key) + self.assertRange(value, test, msg=path) elif isinstance(test, list): subtest = False for idx, item in enumerate(test): if isinstance(item, dict): subtest = True - self._test_kwdict(value[idx], item) + subpath = "{}[{}]".format(path, idx) + self._test_kwdict(value[idx], item, subpath) if not subtest: - self.assertEqual(test, value, msg=key) + self.assertEqual(test, value, msg=path) elif isinstance(test, str): if test.startswith("re:"): - self.assertRegex(value, test[3:], msg=key) + self.assertRegex(value, test[3:], msg=path) elif test.startswith("dt:"): - self.assertIsInstance(value, datetime.datetime, msg=key) - self.assertEqual(test[3:], str(value), msg=key) + self.assertIsInstance(value, datetime.datetime, msg=path) + self.assertEqual(test[3:], str(value), msg=path) elif test.startswith("type:"): - self.assertEqual(test[5:], type(value).__name__, msg=key) + self.assertEqual(test[5:], type(value).__name__, msg=path) elif test.startswith("len:"): - self.assertIsInstance(value, (list, tuple), msg=key) - self.assertEqual(int(test[4:]), len(value), msg=key) + self.assertIsInstance(value, (list, tuple), msg=path) + self.assertEqual(int(test[4:]), len(value), msg=path) else: - self.assertEqual(test, value, msg=key) + self.assertEqual(test, value, msg=path) else: - self.assertEqual(test, value, msg=key) + self.assertEqual(test, value, msg=path) class ResultJob(job.DownloadJob): From da6ba60331bb378ab318b672739b9cf4a2cdb364 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 18 Mar 2024 04:32:45 +0100 Subject: [PATCH 053/154] [bluesky] add 'instance' metadata field (#4438) --- gallery_dl/extractor/bluesky.py | 2 ++ test/results/bluesky.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index f57651c0..84c31878 100644 --- a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -40,6 +40,7 @@ class BlueskyExtractor(Extractor): self.api = BlueskyAPI(self) self._user = self._user_did = None + self.instance = self.root.partition("://")[2] def items(self): for post in self.posts(): @@ -81,6 +82,7 @@ class BlueskyExtractor(Extractor): if self._metadata_user: post["user"] = self._user or post["author"] + post["instance"] = self.instance post["post_id"] = pid post["count"] = len(images) post["date"] = text.parse_datetime( diff --git a/test/results/bluesky.py b/test/results/bluesky.py index 84b99aa3..1bba4638 100644 --- a/test/results/bluesky.py +++ b/test/results/bluesky.py @@ -133,6 +133,7 @@ __tests__ = ( "filename" : "bafkreidypzoaybmfj5h7pnpiyct6ng5yae6ydp4czrm72ocg7ev6vbirri", "height" : 630, "indexedAt" : "2023-12-22T18:58:32.715Z", + "instance" : "bsky.app", "labels" : [], "likeCount" : int, "num" : 1, @@ -153,7 +154,7 @@ __tests__ = ( "followersCount": int, "followsCount" : 
int, "handle" : "bsky.app", - "indexedAt" : "2023-12-22T18:54:12.339Z", + "indexedAt" : "2024-01-20T05:04:41.904Z", "labels" : [], "postsCount" : int, }, From ae116812ccdf9e698b584322e5448266aed22e62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 18 Mar 2024 16:35:09 +0100 Subject: [PATCH 054/154] [vipergirls] add 'like' option (#4166) --- docs/configuration.rst | 13 ++++++++++ gallery_dl/extractor/vipergirls.py | 41 +++++++++++++++++++++++------- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 7ab9e1c2..9fa5667c 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3954,6 +3954,19 @@ Description ``"raw"``, ``"full"``, ``"regular"``, ``"small"``, and ``"thumb"``. +extractor.vipergirls.like +------------------------- +Type + ``bool`` +Default + ``false`` +Description + Automatically `like` posts after downloading their images. + + Note: Requires `login `__ + or `cookies `__ + + extractor.vsco.videos --------------------- Type diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py index 5374f1ce..62386ce2 100644 --- a/gallery_dl/extractor/vipergirls.py +++ b/gallery_dl/extractor/vipergirls.py @@ -28,15 +28,32 @@ class VipergirlsExtractor(Extractor): def items(self): self.login() + posts = self.posts() - for post in self.posts(): + like = self.config("like") + if like: + user_hash = posts[0].get("hash") + if len(user_hash) < 16: + self.log.warning("Login required to like posts") + like = False + + posts = posts.iter("post") + if self.page: + util.advance(posts, (text.parse_int(self.page[5:]) - 1) * 15) + + for post in posts: data = post.attrib data["thread_id"] = self.thread_id yield Message.Directory, data + + image = None for image in post: yield Message.Queue, image.attrib["main_url"], data + if image is not None and like: + self.like(post, user_hash) + def login(self): if self.cookies_check(self.cookies_names): return @@ -64,6 +81,17 @@ class VipergirlsExtractor(Extractor): return {cookie.name: cookie.value for cookie in response.cookies} + def like(self, post, user_hash): + url = self.root + "/post_thanks.php" + params = { + "do" : "post_thanks_add", + "p" : post.get("id"), + "securitytoken": user_hash, + } + + with self.request(url, params=params, allow_redirects=False): + pass + class VipergirlsThreadExtractor(VipergirlsExtractor): """Extractor for vipergirls threads""" @@ -77,12 +105,7 @@ class VipergirlsThreadExtractor(VipergirlsExtractor): def posts(self): url = "{}/vr.php?t={}".format(self.root, self.thread_id) - root = ElementTree.fromstring(self.request(url).text) - posts = root.iter("post") - - if self.page: - util.advance(posts, (text.parse_int(self.page[5:]) - 1) * 15) - return posts + return ElementTree.fromstring(self.request(url).text) class VipergirlsPostExtractor(VipergirlsExtractor): @@ -95,8 +118,8 @@ class VipergirlsPostExtractor(VipergirlsExtractor): def __init__(self, match): VipergirlsExtractor.__init__(self, match) self.thread_id, self.post_id = match.groups() + self.page = 0 def posts(self): url = "{}/vr.php?p={}".format(self.root, self.post_id) - root = ElementTree.fromstring(self.request(url).text) - return root.iter("post") + return ElementTree.fromstring(self.request(url).text) From e1c51c0dfbe282bb039ae43448249778074dba0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 18 Mar 2024 16:53:02 +0100 Subject: [PATCH 055/154] [vipergirls] add 'domain' option (#4166) --- docs/configuration.rst | 12 
++++++++++++
 gallery_dl/extractor/vipergirls.py |  5 +++++
 2 files changed, 17 insertions(+)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 9fa5667c..9c3782c3 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -3954,6 +3954,18 @@ Description
     ``"raw"``, ``"full"``, ``"regular"``, ``"small"``, and ``"thumb"``.
 
 
+extractor.vipergirls.domain
+---------------------------
+Type
+    ``string``
+Default
+    ``"vipergirls.to"``
+Description
+    Specifies the domain used by ``vipergirls`` extractors.
+
+    For example ``"viper.click"`` if the main domain is blocked or to
+    bypass Cloudflare.
+
+
 extractor.vipergirls.like
 -------------------------
 Type
diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py
index 62386ce2..6dfb23cc 100644
--- a/gallery_dl/extractor/vipergirls.py
+++ b/gallery_dl/extractor/vipergirls.py
@@ -26,6 +26,11 @@ class VipergirlsExtractor(Extractor):
     cookies_domain = ".vipergirls.to"
     cookies_names = ("vg_userid", "vg_password")
 
+    def _init(self):
+        domain = self.config("domain")
+        if domain:
+            self.root = text.ensure_http_scheme(domain)
+
     def items(self):
         self.login()
         posts = self.posts()

From 0d69af94d5191e682441332f3c17d63fdc4eb402 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Mon, 18 Mar 2024 20:40:02 +0100
Subject: [PATCH 056/154] [gelbooru] detect returned favorites order (#5220)

---
 gallery_dl/extractor/gelbooru.py | 51 +++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 83f13922..0050a62d 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -32,6 +32,9 @@ class GelbooruBase():
         url = self.root + "/index.php?page=dapi&q=index&json=1"
         data = self.request(url, params=params).json()
 
+        if not key:
+            return data
+
         try:
             posts = data[key]
         except KeyError:
@@ -167,13 +170,54 @@ class GelbooruFavoriteExtractor(GelbooruBase,
         params = {
             "s"    : "favorite",
             "id"   : self.favorite_id,
-            "limit": "1",
+            "limit": "2",
         }
 
-        count = self._api_request(params, "@attributes", True)[0]["count"]
+        data = self._api_request(params, None, True)
+
+        count = data["@attributes"]["count"]
         if count <= self.offset:
-            return
+            return ()
+
+        favs = data["favorite"]
+        try:
+            order = 1 if favs[0]["id"] < favs[1]["id"] else -1
+        except LookupError:
+            order = 0
+
+        if order > 0:
+            self.log.debug("API yields favorites in ascending order")
+            self.log.debug("Returning them in reverse")
+            return self._pagination_reverse(params, count)
+
+        self.log.debug("API yields favorites in descending order")
+        return self._pagination(params, count)
+
+    def _pagination(self, params, count):
+        if self.offset:
+            pnum, skip = divmod(self.offset, self.per_page)
+        else:
+            pnum = skip = 0
+
+        params["pid"] = pnum
+        params["limit"] = self.per_page
+
+        while True:
+            favs = self._api_request(params, "favorite", True)
+
+            if not favs:
+                return
+
+            if skip:
+                favs = favs[skip:]
+                skip = 0
+
+            for fav in favs:
+                yield from self._api_request({"id": fav["favorite"]})
+
+            params["pid"] += 1
+
+    def _pagination_reverse(self, params, count):
         pnum, last = divmod(count-1, self.per_page)
         if self.offset > last:
             # page number change
@@ -182,7 +226,6 @@
             pnum -= diff + 1
             skip = self.offset
 
-        # paginate over them in reverse
         params["pid"] = pnum
         params["limit"] = self.per_page

From 6d93295fea1fd6e4d2f948abcc3c9d288bdacb41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Mon, 18 Mar 2024 20:46:11 +0100
Subject: [PATCH 057/154] [gelbooru] add 'date_favorited' metadata field

---
 gallery_dl/extractor/gelbooru.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 0050a62d..e54a1fae 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -213,7 +213,9 @@ class GelbooruFavoriteExtractor(GelbooruBase,
                 skip = 0
 
             for fav in favs:
-                yield from self._api_request({"id": fav["favorite"]})
+                for post in self._api_request({"id": fav["favorite"]}):
+                    post["date_favorited"] = text.parse_timestamp(fav["added"])
+                    yield post
 
             params["pid"] += 1
 
@@ -238,7 +240,9 @@
                 skip = 0
 
             for fav in favs:
-                yield from self._api_request({"id": fav["favorite"]})
+                for post in self._api_request({"id": fav["favorite"]}):
+                    post["date_favorited"] = text.parse_timestamp(fav["added"])
+                    yield post
 
             params["pid"] -= 1
             if params["pid"] < 0:

From eb673a7204c9c70eb7d74905c79bfd7b2eaacea8 Mon Sep 17 00:00:00 2001
From: wankio <31354933+wankio@users.noreply.github.com>
Date: Tue, 19 Mar 2024 02:48:53 +0700
Subject: [PATCH 058/154] Update fapello.py

get the full-size image instead of the resized one

---
 gallery_dl/extractor/fapello.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery_dl/extractor/fapello.py b/gallery_dl/extractor/fapello.py
index aff8e616..403dc929 100644
--- a/gallery_dl/extractor/fapello.py
+++ b/gallery_dl/extractor/fapello.py
@@ -42,7 +42,7 @@ class FapelloPostExtractor(Extractor):
             "type"     : "video" if 'type="video' in page else "photo",
             "thumbnail": text.extr(page, 'poster="', '"'),
         }
-        url = text.extr(page, 'src="', '"')
+        url = text.extr(page, 'src="', '"').replace(".md", "")
 
         yield Message.Directory, data
         yield Message.Url, url, text.nameext_from_url(url, data)

From 56f2d5a5f265930de8007565668e99463f66c12d Mon Sep 17 00:00:00 2001
From: wankio <31354933+wankio@users.noreply.github.com>
Date: Thu, 21 Mar 2024 00:04:21 +0700
Subject: [PATCH 059/154] fapello.py: full-size images

Removing ".md" and ".th" from the image URL makes it download the
full-size versions of images.

---
 gallery_dl/extractor/fapello.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gallery_dl/extractor/fapello.py b/gallery_dl/extractor/fapello.py
index 403dc929..838ae7b6 100644
--- a/gallery_dl/extractor/fapello.py
+++ b/gallery_dl/extractor/fapello.py
@@ -42,7 +42,8 @@ class FapelloPostExtractor(Extractor):
             "type"     : "video" if 'type="video' in page else "photo",
             "thumbnail": text.extr(page, 'poster="', '"'),
         }
-        url = text.extr(page, 'src="', '"').replace(".md", "")
+        url = text.extr(page, 'src="', '"').replace(
+            ".md", "").replace(".th", "")
 
         yield Message.Directory, data
         yield Message.Url, url, text.nameext_from_url(url, data)

From ddb2edfd32058024df405cbb2678b99839a4dfda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Thu, 21 Mar 2024 19:19:33 +0100
Subject: [PATCH 060/154] [formatter] fix local DST datetime offsets for ':O'

'O' used to compute the *current* local UTC offset once and apply it
to every 'datetime' object. This resulted in a wrong offset whenever
the current offset includes DST and the target 'datetime' does not,
or vice versa.

'O' now determines the correct local UTC offset, respecting DST, for
each individual 'datetime'.
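
A rough sketch of the difference, assuming a local timezone with DST
such as Europe/Berlin (UTC+1 in winter, UTC+2 in summer); the variable
names are illustrative only:

    import time, datetime

    dt = datetime.datetime(2010, 6, 1)  # naive UTC datetime during DST

    # before: one offset for all datetimes, taken from the current time
    is_dst = time.daylight and time.localtime().tm_isdst > 0
    offset = -(time.altzone if is_dst else time.timezone)

    # after: the offset for this datetime's own timestamp, DST included
    ts = dt.replace(tzinfo=datetime.timezone.utc).timestamp()
    offset = time.localtime(ts).tm_gmtoff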
--- gallery_dl/formatter.py | 12 ++++++------ test/test_formatter.py | 28 ++++++++++++++-------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py index 6098fc61..b83cf21c 100644 --- a/gallery_dl/formatter.py +++ b/gallery_dl/formatter.py @@ -375,18 +375,18 @@ def _parse_offset(format_spec, default): fmt = _build_format_func(format_spec, default) if not offset or offset == "local": - is_dst = time.daylight and time.localtime().tm_isdst > 0 - offset = -(time.altzone if is_dst else time.timezone) + def off(dt): + local = time.localtime(util.datetime_to_timestamp(dt)) + return fmt(dt + datetime.timedelta(0, local.tm_gmtoff)) else: hours, _, minutes = offset.partition(":") offset = 3600 * int(hours) if minutes: offset += 60 * (int(minutes) if offset > 0 else -int(minutes)) + offset = datetime.timedelta(0, offset) - offset = datetime.timedelta(seconds=offset) - - def off(obj): - return fmt(obj + offset) + def off(obj): + return fmt(obj + offset) return off diff --git a/test/test_formatter.py b/test/test_formatter.py index 5ed94ec4..26adbdee 100644 --- a/test/test_formatter.py +++ b/test/test_formatter.py @@ -31,8 +31,9 @@ class TestFormatter(unittest.TestCase): "h": "
<p>foo </p> &amp; bar <p> </p>
", "u": "'< / >'", "t": 1262304000, - "dt": datetime.datetime(2010, 1, 1), "ds": "2010-01-01T01:00:00+0100", + "dt": datetime.datetime(2010, 1, 1), + "dt_dst": datetime.datetime(2010, 6, 1), "name": "Name", "title1": "Title", "title2": "", @@ -236,19 +237,18 @@ class TestFormatter(unittest.TestCase): self._run_test("{ds:D%Y-%m-%dT%H:%M:%S%z/O1}", "2010-01-01 01:00:00") self._run_test("{t!d:O2}", "2010-01-01 02:00:00") - orig_daylight = time.daylight - orig_timezone = time.timezone - orig_altzone = time.altzone - try: - time.daylight = False - time.timezone = -3600 - self._run_test("{dt:O}", "2010-01-01 01:00:00") - time.timezone = 7200 - self._run_test("{dt:Olocal}", "2009-12-31 22:00:00") - finally: - time.daylight = orig_daylight - time.timezone = orig_timezone - time.altzone = orig_altzone + def test_offset_local(self): + ts = self.kwdict["dt"].replace(tzinfo=datetime.UTC).timestamp() + offset = time.localtime(ts).tm_gmtoff + dt = self.kwdict["dt"] + datetime.timedelta(seconds=offset) + self._run_test("{dt:O}", str(dt)) + self._run_test("{dt:Olocal}", str(dt)) + + ts = self.kwdict["dt_dst"].replace(tzinfo=datetime.UTC).timestamp() + offset = time.localtime(ts).tm_gmtoff + dt = self.kwdict["dt_dst"] + datetime.timedelta(seconds=offset) + self._run_test("{dt_dst:O}", str(dt)) + self._run_test("{dt_dst:Olocal}", str(dt)) def test_sort(self): self._run_test("{l:S}" , "['a', 'b', 'c']") From 1b34d5ac4097861e9bcadfd18394441f5d624529 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Mar 2024 00:45:09 +0100 Subject: [PATCH 061/154] [subscribestar] fix 'date' metadata --- gallery_dl/extractor/subscribestar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 31fb891a..d4adfed9 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -175,7 +175,7 @@ class SubscribestarPostExtractor(SubscribestarExtractor): "author_id" : text.parse_int(extr('data-user-id="', '"')), "author_nick": text.unescape(extr('alt="', '"')), "date" : self._parse_datetime(extr( - 'class="section-subtitle">', '<')), + '', '<')), "content" : (extr( '
")[2]), From 77ab015df20201dde138eb7d845103dd20e9acea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Mar 2024 01:38:25 +0100 Subject: [PATCH 062/154] [idolcomplex] support new pool URLs --- gallery_dl/extractor/idolcomplex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index c249a3e6..35f36794 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -222,8 +222,8 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor): subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool}") archive_fmt = "p_{pool}_{id}" - pattern = BASE_PATTERN + r"/pools?/show/(\d+)" - example = "https://idol.sankakucomplex.com/pools/show/12345" + pattern = BASE_PATTERN + r"/pools?/(?:show/)?(\w+)" + example = "https://idol.sankakucomplex.com/pools/0123456789abcdef" per_page = 24 def __init__(self, match): From 32262a048ba4fb0bcc5f21b93830228c81209e96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Mar 2024 01:43:05 +0100 Subject: [PATCH 063/154] [idolcomplex] fix metadata extraction - replace legacy 'id' vales with alphanumeric ones, since the former are no longer available - approximate 'vote_average', since the real value is no longer available - fix 'vote_count' --- gallery_dl/extractor/idolcomplex.py | 11 +++++------ test/results/idolcomplex.py | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index 35f36794..dfd9a317 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -101,9 +101,8 @@ class IdolcomplexExtractor(SankakuExtractor): page = self.request(url, retries=10).text extr = text.extract_from(page) - pid_alnum = extr('/posts/', '"') - vavg = extr('itemprop="ratingValue">', "<") - vcnt = extr('itemprop="reviewCount">', "<") + vavg = extr('id="rating"', "") + vcnt = extr('>Votes:', "<") pid = extr(">Post ID:", "<") created = extr(' title="', '"') @@ -120,10 +119,10 @@ class IdolcomplexExtractor(SankakuExtractor): rating = extr(">Rating:", " Date: Fri, 22 Mar 2024 02:10:45 +0100 Subject: [PATCH 064/154] [bunkr] remove 'description' metadata album descriptions are no longer available on album pages and the previous code erroneously returned just '0' --- gallery_dl/extractor/bunkr.py | 1 - test/results/bunkr.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 1a0e47d7..a0933474 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -54,7 +54,6 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): "album_id" : self.album_id, "album_name" : text.unescape(info[0]), "album_size" : size[1:-1], - "description": text.unescape(info[2]) if len(info) > 2 else "", "count" : len(urls), } diff --git a/test/results/bunkr.py b/test/results/bunkr.py index b1a605fb..95cdd3fa 100644 --- a/test/results/bunkr.py +++ b/test/results/bunkr.py @@ -13,13 +13,12 @@ __tests__ = ( "#category": ("lolisafe", "bunkr", "album"), "#class" : bunkr.BunkrAlbumExtractor, "#urls" : "https://i-burger.bunkr.ru/test-テスト-\"&>-QjgneIQv.png", - "#sha1_content": "f38b54b17cd7462e687b58d83f00fca88b1b105a", + "#sha1_content": "961b25d85b5f5bd18cbe3e847ac55925f14d0286", "album_id" : "Lktg9Keq", "album_name" : "test テスト \"&>", "album_size" : "182 B", "count" : 1, - "description": "", "extension" : 
"png", "file" : "https://i-burger.bunkr.ru/test-テスト-\"&>-QjgneIQv.png", "filename" : "test-テスト-\"&>-QjgneIQv", @@ -43,7 +42,6 @@ __tests__ = ( "album_name" : "test2", "album_size" : "561.6 KB", "count" : 2, - "description": "", "filename" : r"re:video-gLn1hgpw|image-sZrQUeOx", "id" : r"re:gLn1hgpw|sZrQUeOx", "name" : r"re:video|image", From a650fd3177b253facb74eed9c0a6ec5b7afc78b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Mar 2024 02:53:54 +0100 Subject: [PATCH 065/154] [deviantart] improve 'index' extraction for stash files (#5335) --- gallery_dl/extractor/deviantart.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index bb74929c..dc5a2f83 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -213,7 +213,9 @@ class DeviantartExtractor(Extractor): """Adjust the contents of a Deviation-object""" if "index" not in deviation: try: - if deviation["url"].startswith("https://sta.sh"): + if deviation["url"].startswith(( + "https://www.deviantart.com/stash/", "https://sta.sh", + )): filename = deviation["content"]["src"].split("/")[5] deviation["index_base36"] = filename.partition("-")[0][1:] deviation["index"] = id_from_base36( From fe9171508f085f9d7b40018d0f41589b22c2a449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Mar 2024 03:10:12 +0100 Subject: [PATCH 066/154] [kemonoparty] fix exception for '/revision/' URLs caused by 03a9ce98 --- gallery_dl/extractor/kemonoparty.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index de2a9b6c..9c77b7a7 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -40,9 +40,10 @@ class KemonopartyExtractor(Extractor): def _init(self): self.revisions = self.config("revisions") if self.revisions: - order = self.config("order-revisions") - self.revisions_reverse = order[0] in ("r", "a") if order else False self.revisions_unique = (self.revisions == "unique") + order = self.config("order-revisions") + self.revisions_reverse = order[0] in ("r", "a") if order else False + self._prepare_ddosguard_cookies() self._find_inline = re.compile( r'src="(?:https?://(?:kemono|coomer)\.(?:su|party))?(/inline/[^"]+' From 9f73fac5efef6af5eb5cbedcac997f54b7995211 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Mar 2024 17:53:06 +0100 Subject: [PATCH 067/154] [steamgriddb] raise proper exception for deleted assets --- gallery_dl/extractor/steamgriddb.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py index 9d46fd6b..85828247 100644 --- a/gallery_dl/extractor/steamgriddb.py +++ b/gallery_dl/extractor/steamgriddb.py @@ -163,6 +163,9 @@ class SteamgriddbAssetExtractor(SteamgriddbExtractor): def assets(self): endpoint = "/api/public/asset/" + self.asset_type + "/" + self.asset_id asset = self._call(endpoint)["asset"] + if asset is None: + raise exception.NotFoundError("asset ({}:{})".format( + self.asset_type, self.asset_id)) return (asset,) From 7a7dc442a0b54eb485d752809249174b04ac1629 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Mar 2024 17:57:04 +0100 Subject: [PATCH 068/154] [tests] update extractor results --- test/results/8chan.py | 2 +- test/results/blogspot.py | 8 ++++---- test/results/deviantart.py | 4 ++-- 
test/results/hentaifoundry.py | 5 +++-- test/results/hitomi.py | 5 ++--- test/results/kemonoparty.py | 5 ++--- test/results/mastodonsocial.py | 2 +- test/results/misskeydesign.py | 2 +- test/results/myhentaigallery.py | 2 +- test/results/naverwebtoon.py | 2 +- test/results/omgmiamiswimwear.py | 4 ++-- test/results/pornpics.py | 1 + test/results/raddle.py | 5 ++--- test/results/reddit.py | 1 + test/results/steamgriddb.py | 12 +++--------- test/results/tumblr.py | 2 +- test/results/twibooru.py | 2 +- test/results/twitter.py | 2 +- test/results/unsplash.py | 19 ++----------------- test/results/vsco.py | 6 ++---- test/results/xbooru.py | 2 +- test/results/xhamster.py | 4 ++-- 22 files changed, 37 insertions(+), 60 deletions(-) diff --git a/test/results/8chan.py b/test/results/8chan.py index f7be8148..7bdbdecc 100644 --- a/test/results/8chan.py +++ b/test/results/8chan.py @@ -73,7 +73,7 @@ __tests__ = ( "#category": ("", "8chan", "board"), "#class" : _8chan._8chanBoardExtractor, "#pattern" : _8chan._8chanThreadExtractor.pattern, - "#count" : 27, + "#count" : range(24, 28), }, { diff --git a/test/results/blogspot.py b/test/results/blogspot.py index 75ecff92..3c7beadc 100644 --- a/test/results/blogspot.py +++ b/test/results/blogspot.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://julianbphotography.blogspot.com/2010/12/moon-rise.html", "#category": ("blogger", "blogspot", "post"), "#class" : blogger.BloggerPostExtractor, - "#urls" : "https://3.bp.blogspot.com/-zlJddJtJOUo/Tt4WooTPNtI/AAAAAAAABG8/dGT2cGp2E7Y/s0/Icy-Moonrise---For-Web.jpg", + "#urls" : "https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjH9WkPvLJq2moxKtyt3ieJZWSDFQwOi3PHRdlHVHEQHRwy-d86Jg6HWSMhxaa6EgvlXq-zDMmKM4kIPn27eJ9Hepk2X9e9HQhqwMfrT8RYTnFe65uexw7KSk5FdWHxRVp5crz3p_qph3Bj/s0/Icy-Moonrise---For-Web.jpg", "blog": { "date" : "dt:2010-11-21 18:19:42", @@ -43,7 +43,7 @@ __tests__ = ( "extension": "jpg", "filename" : "Icy-Moonrise---For-Web", "num" : 1, - "url" : "https://3.bp.blogspot.com/-zlJddJtJOUo/Tt4WooTPNtI/AAAAAAAABG8/dGT2cGp2E7Y/s0/Icy-Moonrise---For-Web.jpg", + "url" : "https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjH9WkPvLJq2moxKtyt3ieJZWSDFQwOi3PHRdlHVHEQHRwy-d86Jg6HWSMhxaa6EgvlXq-zDMmKM4kIPn27eJ9Hepk2X9e9HQhqwMfrT8RYTnFe65uexw7KSk5FdWHxRVp5crz3p_qph3Bj/s0/Icy-Moonrise---For-Web.jpg", }, { @@ -59,7 +59,7 @@ __tests__ = ( "#comment" : "new image domain (#2204)", "#category": ("blogger", "blogspot", "post"), "#class" : blogger.BloggerPostExtractor, - "#pattern" : "https://blogger.googleusercontent.com/img/a/.+=s0$", + "#pattern" : r"https://blogger\.googleusercontent\.com/img/.+=s0$", "#count" : 8, }, @@ -67,7 +67,7 @@ __tests__ = ( "#url" : "https://julianbphotography.blogspot.com/", "#category": ("blogger", "blogspot", "blog"), "#class" : blogger.BloggerBlogExtractor, - "#pattern" : r"https://\d\.bp\.blogspot\.com/.*/s0/[^.]+\.jpg", + "#pattern" : r"https://blogger\.googleusercontent\.com/img/.+/s0/", "#range" : "1-25", "#count" : 25, }, diff --git a/test/results/deviantart.py b/test/results/deviantart.py index a9727334..525206df 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -308,7 +308,7 @@ __tests__ = ( "target" : dict, "thumbs" : list, "title" : "Banner", - "url" : "https://sta.sh/0198jippkeys", + "url" : "https://www.deviantart.com/stash/0198jippkeys", "username" : "gdldev", }, @@ -589,7 +589,7 @@ __tests__ = ( "index" : int, "index_base36": r"re:^[0-9a-z]+$", - "url" : r"re:^https://sta.sh", + "url" : r"re:^https://www.deviantart.com/stash/\w+", }, { diff --git 
a/test/results/hentaifoundry.py b/test/results/hentaifoundry.py index 35c66f8a..386ec3cd 100644 --- a/test/results/hentaifoundry.py +++ b/test/results/hentaifoundry.py @@ -29,10 +29,11 @@ __tests__ = ( }, { - "#url" : "https://www.hentai-foundry.com/pictures/user/Evulchibi/scraps", + "#url" : "https://www.hentai-foundry.com/pictures/user/Ethevian/scraps", "#category": ("", "hentaifoundry", "scraps"), "#class" : hentaifoundry.HentaifoundryScrapsExtractor, - "#sha1_url": "7cd9c6ec6258c4ab8c44991f7731be82337492a7", + "#pattern" : r"https://pictures\.hentai-foundry\.com/e/Ethevian/.+", + "#count" : ">= 10", }, { diff --git a/test/results/hitomi.py b/test/results/hitomi.py index 9039525c..b2426b0b 100644 --- a/test/results/hitomi.py +++ b/test/results/hitomi.py @@ -5,6 +5,7 @@ # published by the Free Software Foundation. from gallery_dl.extractor import hitomi +from gallery_dl import exception __tests__ = ( @@ -47,9 +48,7 @@ __tests__ = ( "#comment" : "gallery with 'broken' redirect", "#category": ("", "hitomi", "gallery"), "#class" : hitomi.HitomiGalleryExtractor, - "#options" : {"format": "original"}, - "#pattern" : r"https://[a-c]b\.hitomi\.la/images/\d+/\d+/[0-9a-f]{64}\.jpg", - "#count" : 10, + "#exception": exception.NotFoundError, }, { diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index 1528f55f..b855e6cc 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -89,11 +89,10 @@ __tests__ = ( }, { - "#url" : "https://kemono.party/gumroad/user/trylsc/post/IURjT", - "#comment" : "kemono.party -> data.kemono.party", + "#url" : "https://kemono.su/gumroad/user/3101696181060/post/tOWyf", "#category": ("", "kemonoparty", "gumroad"), "#class" : kemonoparty.KemonopartyPostExtractor, - "#pattern" : r"https://kemono\.party/data/(a4/7b/a47bfe938d8c1682eef06e885927484cd8df1b.+\.jpg|c6/04/c6048f5067fd9dbfa7a8be565ac194efdfb6e4.+\.zip)", + "#urls" : "https://kemono.su/data/6f/13/6f1394b19516396ea520254350662c254bbea30c1e111fd4b0f042c61c426d07.zip", }, { diff --git a/test/results/mastodonsocial.py b/test/results/mastodonsocial.py index ffd46a41..8c22bcf3 100644 --- a/test/results/mastodonsocial.py +++ b/test/results/mastodonsocial.py @@ -82,9 +82,9 @@ __tests__ = ( "#class" : mastodon.MastodonFollowingExtractor, "#extractor": False, "#urls" : ( + "https://mastodon.ie/@RustyBertrand", "https://ravenation.club/@soundwarrior20", "https://mastodon.social/@0x4f", - "https://mastodon.social/@RustyBertrand", "https://mastodon.social/@christianselig", "https://saturation.social/@clive", "https://mastodon.social/@sjvn", diff --git a/test/results/misskeydesign.py b/test/results/misskeydesign.py index f12be9ff..3560597b 100644 --- a/test/results/misskeydesign.py +++ b/test/results/misskeydesign.py @@ -21,7 +21,7 @@ __tests__ = ( "#url" : "https://misskey.design/@blooddj@pawoo.net", "#category": ("misskey", "misskey.design", "user"), "#class" : misskey.MisskeyUserExtractor, - "#count" : 7, + "#count" : "> 30", }, { diff --git a/test/results/myhentaigallery.py b/test/results/myhentaigallery.py index 0fa4e558..a90e067a 100644 --- a/test/results/myhentaigallery.py +++ b/test/results/myhentaigallery.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://myhentaigallery.com/g/16247", "#category": ("", "myhentaigallery", "gallery"), "#class" : myhentaigallery.MyhentaigalleryGalleryExtractor, - "#pattern" : r"https://images\.myhentaicomics\.com/m\w\w/images/[^/]+/original/\d+\.jpg", + "#pattern" : r"https://(cdn|images)\.myhentaicomics\.com/m\w\w/images/[^/]+/original/\d+\.jpg", 
"artist" : list, "count" : 11, diff --git a/test/results/naverwebtoon.py b/test/results/naverwebtoon.py index 179f85b4..36b67086 100644 --- a/test/results/naverwebtoon.py +++ b/test/results/naverwebtoon.py @@ -109,7 +109,7 @@ __tests__ = ( "#category": ("", "naverwebtoon", "comic"), "#class" : naverwebtoon.NaverwebtoonComicExtractor, "#pattern" : naverwebtoon.NaverwebtoonEpisodeExtractor.pattern, - "#count" : 25, + "#count" : 24, }, { diff --git a/test/results/omgmiamiswimwear.py b/test/results/omgmiamiswimwear.py index 27f4d54d..e92228da 100644 --- a/test/results/omgmiamiswimwear.py +++ b/test/results/omgmiamiswimwear.py @@ -15,11 +15,11 @@ __tests__ = ( }, { - "#url" : "https://www.omgmiamiswimwear.com/products/la-medusa-maxi-dress", + "#url" : "https://www.omgmiamiswimwear.com/products/snatch-me-waist-belt", "#category": ("shopify", "omgmiamiswimwear", "product"), "#class" : shopify.ShopifyProductExtractor, "#pattern" : r"https://cdn\.shopify\.com/s/files/1/1819/6171/", - "#count" : 5, + "#count" : 3, }, ) diff --git a/test/results/pornpics.py b/test/results/pornpics.py index 39ad6d28..6caf5dd1 100644 --- a/test/results/pornpics.py +++ b/test/results/pornpics.py @@ -17,6 +17,7 @@ __tests__ = ( "categories": [ "Outdoor", "MILF", + "Boots", "Amateur", "Sexy", ], diff --git a/test/results/raddle.py b/test/results/raddle.py index 24710e94..6b3ccde8 100644 --- a/test/results/raddle.py +++ b/test/results/raddle.py @@ -68,9 +68,8 @@ __tests__ = ( "#comment" : "Image post", "#category": ("postmill", "raddle", "post"), "#class" : postmill.PostmillPostExtractor, - "#sha1_url" : "48663f767ea258fcd545ab5aa0e734f98f434388", "#sha1_content": "431e938082c2b59c44888a83cfc711cd1f0e910a", - "#count" : 1, + "#urls" : "https://uploads-cdn.raddle.me/submission_images/30f4cf7d235d40c1daebf6dc2e58bef2a80bec2b5b2dab10f2021ea8e3f29e11.png", }, { @@ -79,7 +78,7 @@ __tests__ = ( "#category": ("postmill", "raddle", "post"), "#class" : postmill.PostmillPostExtractor, "#options" : {"save-link-post-body": True}, - "#pattern" : r"^(text:[\s\S]+|https://raddle\.me/submission_images/[0-9a-f]+\.png)$", + "#pattern" : r"^(text:[\s\S]+|https://(uploads-cdn\.)?raddle\.me/submission_images/[0-9a-f]+\.png)$", "#count" : 2, }, diff --git a/test/results/reddit.py b/test/results/reddit.py index e5cd1c5e..bd0f9fd7 100644 --- a/test/results/reddit.py +++ b/test/results/reddit.py @@ -46,6 +46,7 @@ __tests__ = ( "#class" : reddit.RedditHomeExtractor, "#range" : "1-20", "#count" : ">= 20", + "#archive" : False, }, { diff --git a/test/results/steamgriddb.py b/test/results/steamgriddb.py index 06c1c22b..8cb39d17 100644 --- a/test/results/steamgriddb.py +++ b/test/results/steamgriddb.py @@ -5,22 +5,16 @@ # published by the Free Software Foundation. 
from gallery_dl.extractor import steamgriddb +from gallery_dl import exception __tests__ = ( { "#url" : "https://www.steamgriddb.com/grid/368023", + "#comment" : "deleted", "#category": ("", "steamgriddb", "asset"), "#class" : steamgriddb.SteamgriddbAssetExtractor, - "#urls" : ("https://cdn2.steamgriddb.com/grid/" - "82fee171d62c044898d99ba0fddeb203.png"), - "#count" : 1, - "#sha1_content": "0bffaccae6f35f9fab529684a5b158d1cec4186b", - - "game": { - "id" : 5259324, - "name": "Helltaker", - }, + "#exception": exception.NotFoundError, }, { diff --git a/test/results/tumblr.py b/test/results/tumblr.py index 12374e4a..67896590 100644 --- a/test/results/tumblr.py +++ b/test/results/tumblr.py @@ -103,7 +103,7 @@ __tests__ = ( "date-max" : "2015-04-25T00:00:00", "date-min" : "2015-04-01T00:00:00", }, - "#count" : 193, + "#count" : 192, }, { diff --git a/test/results/twibooru.py b/test/results/twibooru.py index a3aec152..ff0919d6 100644 --- a/test/results/twibooru.py +++ b/test/results/twibooru.py @@ -44,7 +44,7 @@ __tests__ = ( "tag_ids" : list, "tags" : list, "thumbnails_generated": True, - "updated_at" : "2023-12-25T06:58:33.986Z", + "updated_at" : str, "upvotes" : int, "view_url" : "https://cdn.twibooru.org/img/2020/7/8/1/full.png", "width" : 576, diff --git a/test/results/twitter.py b/test/results/twitter.py index cc41d9ca..2d43ebf8 100644 --- a/test/results/twitter.py +++ b/test/results/twitter.py @@ -575,7 +575,7 @@ You’ll be able to receive four Galarian form Pokémon with Hidden Abilities, p "#category": ("", "twitter", "tweet"), "#class" : twitter.TwitterTweetExtractor, "#options" : {"cards": True}, - "#pattern" : r"https://pbs.twimg.com/card_img/174\d+/[\w-]+\?format=(jpg|png)&name=orig$", + "#pattern" : r"https://pbs.twimg.com/card_img/17\d+/[\w-]+\?format=(jpg|png)&name=orig$", "#range" : "1,3", }, diff --git a/test/results/unsplash.py b/test/results/unsplash.py index 0050bdce..8c529743 100644 --- a/test/results/unsplash.py +++ b/test/results/unsplash.py @@ -57,22 +57,7 @@ __tests__ = ( "slug" : "red-wooden-cross-on-gray-concrete-pathway-between-green-trees-during-daytime-kaoHI0iHJPM", "sponsorship": None, "subcategory": "image", - "tags" : [ - "japan", - "hakone", - "神奈川県 日本", - "torii", - "shrine", - "traditional", - "sunrise", - "hakone shrine", - "wallpaper", - "grey", - "arbour", - "outdoors", - "garden", - "gate", - ], + "tags" : list, "tags_preview": list, "topic_submissions": {}, "topics" : [], @@ -114,7 +99,7 @@ __tests__ = ( "total_photos" : 86, "total_promoted_photos": 24, "twitter_username" : None, - "updated_at" : "2023-11-27T07:10:52Z", + "updated_at" : str, "username" : "_______life_" }, "views": range(2000000, 10000000), diff --git a/test/results/vsco.py b/test/results/vsco.py index ed42f1fa..6fa9eb69 100644 --- a/test/results/vsco.py +++ b/test/results/vsco.py @@ -47,13 +47,11 @@ __tests__ = ( }, { - "#url" : "https://vsco.co/vscotest39/spaces", + "#url" : "https://vsco.co/missuri/spaces", "#category": ("", "vsco", "spaces"), "#class" : vsco.VscoSpacesExtractor, "#urls" : ( - "https://vsco.co/spaces/62991a535a9ee215340fa2b0", - "https://vsco.co/spaces/62b35bfb54f97cbfbd5c1e62", - "https://vsco.co/spaces/629674a6875ebddb8f1320c1", + "https://vsco.co/spaces/62e4934e6920440801d19f05", ), }, diff --git a/test/results/xbooru.py b/test/results/xbooru.py index 016958c9..784cd723 100644 --- a/test/results/xbooru.py +++ b/test/results/xbooru.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://xbooru.com/index.php?page=post&s=list&tags=konoyan", "#category": ("gelbooru_v02", 
"xbooru", "tag"), "#class" : gelbooru_v02.GelbooruV02TagExtractor, - "#count" : 24, + "#count" : 25, }, { diff --git a/test/results/xhamster.py b/test/results/xhamster.py index 44675c50..72634e18 100644 --- a/test/results/xhamster.py +++ b/test/results/xhamster.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://xhamster.com/photos/gallery/take-me-to-the-carwash-at-digitaldesire-15860946", "#category": ("", "xhamster", "gallery"), "#class" : xhamster.XhamsterGalleryExtractor, - "#pattern" : r"https://ic-ph-ah\.xhcdn\.com/a/\w+/webp/000/\d+/\d+/\d+_1000\.jpg$", + "#pattern" : r"https://ic-ph-\w+\.xhcdn\.com/a/\w+/webp/000/\d+/\d+/\d+_1000\.jpg$", "#count" : 19, "comments": int, @@ -58,7 +58,7 @@ __tests__ = ( "#url" : "https://jp.xhamster2.com/photos/gallery/take-me-to-the-carwash-at-digitaldesire-15860946", "#category": ("", "xhamster", "gallery"), "#class" : xhamster.XhamsterGalleryExtractor, - "#pattern" : r"https://ic-ph-ah\.xhcdn\.com/a/\w+/webp/000/\d+/\d+/\d+_1000\.jpg$", + "#pattern" : r"https://ic-ph-\w+\.xhcdn\.com/a/\w+/webp/000/\d+/\d+/\d+_1000\.jpg$", "#count" : 19, }, From 4b6f47e571f41cce02dcf1709a1efdf86b4a9d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Mar 2024 18:00:20 +0100 Subject: [PATCH 069/154] [pornhub:gif] extract 'viewkey' and 'timestamp' metadata (#4463) https://github.com/mikf/gallery-dl/issues/4463#issuecomment-2014550302 --- gallery_dl/extractor/pornhub.py | 3 +++ test/results/pornhub.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py index 7ff40a37..c7283fcd 100644 --- a/gallery_dl/extractor/pornhub.py +++ b/gallery_dl/extractor/pornhub.py @@ -143,6 +143,9 @@ class PornhubGifExtractor(PornhubExtractor): "url" : extr('"contentUrl": "', '"'), "date" : text.parse_datetime( extr('"uploadDate": "', '"'), "%Y-%m-%d"), + "viewkey" : extr('From this video: ' + '', '<'), "user" : text.remove_html(extr("Created by:", "
")), } diff --git a/test/results/pornhub.py b/test/results/pornhub.py index e2aa9818..53e7df88 100644 --- a/test/results/pornhub.py +++ b/test/results/pornhub.py @@ -62,9 +62,11 @@ __tests__ = ( "hardcore sex", "babes 18 year", ], + "timestamp": "5:07", "title" : "Intense sloppy blowjob of Danika Mori", "url" : "https://el.phncdn.com/pics/gifs/043/726/891/43726891a.webm", "user" : "Danika Mori", + "viewkey" : "64367c8c78a4a", }, { From 55e8fdad297b45d637b5fc0c155d6ba69675a83c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 22 Mar 2024 18:16:24 +0100 Subject: [PATCH 070/154] [tests] use 'datetime.timezone.utc' instead of 'datetime.UTC' 'datetime.UTC' was added in Python 3.11 and is not defined in older versions. --- test/test_formatter.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_formatter.py b/test/test_formatter.py index 26adbdee..89cb1aad 100644 --- a/test/test_formatter.py +++ b/test/test_formatter.py @@ -238,13 +238,15 @@ class TestFormatter(unittest.TestCase): self._run_test("{t!d:O2}", "2010-01-01 02:00:00") def test_offset_local(self): - ts = self.kwdict["dt"].replace(tzinfo=datetime.UTC).timestamp() + ts = self.kwdict["dt"].replace( + tzinfo=datetime.timezone.utc).timestamp() offset = time.localtime(ts).tm_gmtoff dt = self.kwdict["dt"] + datetime.timedelta(seconds=offset) self._run_test("{dt:O}", str(dt)) self._run_test("{dt:Olocal}", str(dt)) - ts = self.kwdict["dt_dst"].replace(tzinfo=datetime.UTC).timestamp() + ts = self.kwdict["dt_dst"].replace( + tzinfo=datetime.timezone.utc).timestamp() offset = time.localtime(ts).tm_gmtoff dt = self.kwdict["dt_dst"] + datetime.timedelta(seconds=offset) self._run_test("{dt_dst:O}", str(dt)) From 31e7ca73b602564e4ed0c1f451fdcebc31214741 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 23 Mar 2024 13:30:09 +0100 Subject: [PATCH 071/154] [gelbooru] add 'order-posts' option for favorites (#5220) --- docs/configuration.rst | 14 ++++++++++++++ gallery_dl/extractor/gelbooru.py | 29 +++++++++++++++++------------ test/results/gelbooru.py | 24 ++++++++++++++++++++++-- 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 9c3782c3..51408d3c 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2017,6 +2017,20 @@ Description page. +extractor.gelbooru.favorite.order-posts +--------------------------------------- +Type + ``string`` +Default + ``"desc"`` +Description + Controls the order in which favorited posts are returned. 
+ + * ``"asc"``: Ascending favorite date order (oldest first) + * ``"desc"``: Descending favorite date order (newest first) + * ``"reverse"``: Same as ``"asc"`` + + extractor.generic.enabled ------------------------- Type diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index e54a1fae..2459a61f 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -172,26 +172,31 @@ class GelbooruFavoriteExtractor(GelbooruBase, "id" : self.favorite_id, "limit": "2", } - data = self._api_request(params, None, True) count = data["@attributes"]["count"] - if count <= self.offset: - return () + self.log.debug("API reports %s favorite entries", count) favs = data["favorite"] try: order = 1 if favs[0]["id"] < favs[1]["id"] else -1 - except LookupError: - order = 0 + except LookupError as exc: + self.log.debug( + "Error when determining API favorite order (%s: %s)", + exc.__class__.__name__, exc) + order = -1 + else: + self.log.debug("API yields favorites in %sscending order", + "a" if order > 0 else "de") - if order > 0: - self.log.debug("API yields favorites in ascending order") + order_favs = self.config("order-posts") + if order_favs and order_favs[0] in ("r", "a"): self.log.debug("Returning them in reverse") - return self._pagination_reverse(params, count) + order = -order - self.log.debug("API yields favorites in descending order") - return self._pagination(params, count) + if order < 0: + return self._pagination(params, count) + return self._pagination_reverse(params, count) def _pagination(self, params, count): if self.offset: @@ -203,7 +208,7 @@ class GelbooruFavoriteExtractor(GelbooruBase, params["limit"] = self.per_page while True: - favs = self._api_request(params, "favorite", True) + favs = self._api_request(params, "favorite") if not favs: return @@ -232,7 +237,7 @@ class GelbooruFavoriteExtractor(GelbooruBase, params["limit"] = self.per_page while True: - favs = self._api_request(params, "favorite", True) + favs = self._api_request(params, "favorite") favs.reverse() if skip: diff --git a/test/results/gelbooru.py b/test/results/gelbooru.py index b2f99ed1..6302d56f 100644 --- a/test/results/gelbooru.py +++ b/test/results/gelbooru.py @@ -47,10 +47,30 @@ __tests__ = ( }, { - "#url" : "https://gelbooru.com/index.php?page=favorites&s=view&id=279415", + "#url" : "https://gelbooru.com/index.php?page=favorites&s=view&id=1435674", "#category": ("booru", "gelbooru", "favorite"), "#class" : gelbooru.GelbooruFavoriteExtractor, - "#count" : 3, + "#urls" : ( + "https://img3.gelbooru.com/images/5d/30/5d30fc056ed8668616b3c440df9bac89.jpg", + "https://img3.gelbooru.com/images/4c/2d/4c2da867ed643acdadd8105177dcdaf0.png", + "https://img3.gelbooru.com/images/c8/26/c826f3cb90d9aaca8d0632a96bf4abe8.jpg", + "https://img3.gelbooru.com/images/c1/fe/c1fe59c0bc8ce955dd353544b1015d0c.jpg", + "https://img3.gelbooru.com/images/e6/6d/e66d8883c184f5d3b2591dfcdf0d007c.jpg", + ), +}, + +{ + "#url" : "https://gelbooru.com/index.php?page=favorites&s=view&id=1435674", + "#category": ("booru", "gelbooru", "favorite"), + "#class" : gelbooru.GelbooruFavoriteExtractor, + "#options" : {"order-posts": "reverse"}, + "#urls" : ( + "https://img3.gelbooru.com/images/e6/6d/e66d8883c184f5d3b2591dfcdf0d007c.jpg", + "https://img3.gelbooru.com/images/c1/fe/c1fe59c0bc8ce955dd353544b1015d0c.jpg", + "https://img3.gelbooru.com/images/c8/26/c826f3cb90d9aaca8d0632a96bf4abe8.jpg", + "https://img3.gelbooru.com/images/4c/2d/4c2da867ed643acdadd8105177dcdaf0.png", + 
"https://img3.gelbooru.com/images/5d/30/5d30fc056ed8668616b3c440df9bac89.jpg", + ), }, { From 925123e0077b239a47f1db5608eb75183f37a080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 23 Mar 2024 15:50:24 +0100 Subject: [PATCH 072/154] [deviantart] handle CloudFront blocks in general (#5363) This was already done for non-OAuth requests (#655) but CF is now blocking OAuth API requests as well. --- gallery_dl/extractor/deviantart.py | 31 +++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index dc5a2f83..ca8acaa5 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -84,6 +84,16 @@ class DeviantartExtractor(Extractor): else: self.commit_journal = None + def request(self, url, **kwargs): + if "fatal" not in kwargs: + kwargs["fatal"] = False + while True: + response = Extractor.request(self, url, **kwargs) + if response.status_code != 403 or \ + b"Request blocked." not in response.content: + return response + self.wait(seconds=300, reason="CloudFront block") + def skip(self, num): self.offset += num return num @@ -462,18 +472,12 @@ class DeviantartExtractor(Extractor): def _limited_request(self, url, **kwargs): """Limits HTTP requests to one every 2 seconds""" - kwargs["fatal"] = None diff = time.time() - DeviantartExtractor._last_request if diff < 2.0: self.sleep(2.0 - diff, "request") - - while True: - response = self.request(url, **kwargs) - if response.status_code != 403 or \ - b"Request blocked." not in response.content: - DeviantartExtractor._last_request = time.time() - return response - self.wait(seconds=180) + response = self.request(url, **kwargs) + DeviantartExtractor._last_request = time.time() + return response def _fetch_premium(self, deviation): try: @@ -1418,9 +1422,14 @@ class DeviantartOAuthAPI(): self.authenticate(None if public else self.refresh_token_key) kwargs["headers"] = self.headers response = self.extractor.request(url, **kwargs) - data = response.json() - status = response.status_code + try: + data = response.json() + except ValueError: + self.log.error("Unable to parse API response") + data = {} + + status = response.status_code if 200 <= status < 400: if self.delay > self.delay_min: self.delay -= 1 From c3bafd6a2b8becd03126745e66e84bb2416df4c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 23 Mar 2024 17:41:29 +0100 Subject: [PATCH 073/154] release version 1.26.9 --- CHANGELOG.md | 84 +++++++++++++++++++++++++++++++++++++++++++ README.rst | 4 +-- gallery_dl/version.py | 2 +- 3 files changed, 87 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f938ab94..8cdcf642 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,89 @@ # Changelog +## 1.26.9 - 2024-03-23 +### Extractors +#### Additions +- [artstation] support video clips ([#2566](https://github.com/mikf/gallery-dl/issues/2566), [#3309](https://github.com/mikf/gallery-dl/issues/3309), [#3911](https://github.com/mikf/gallery-dl/issues/3911)) +- [artstation] support collections ([#146](https://github.com/mikf/gallery-dl/issues/146)) +- [deviantart] recognize `deviantart.com/stash/…` URLs +- [idolcomplex] support new pool URLs +- [lensdump] recognize direct image links ([#5293](https://github.com/mikf/gallery-dl/issues/5293)) +- [skeb] add extractor for followed users ([#5290](https://github.com/mikf/gallery-dl/issues/5290)) +- [twitter] add `quotes` extractor 
([#5262](https://github.com/mikf/gallery-dl/issues/5262)) +- [wikimedia] support `azurlane.koumakan.jp` ([#5256](https://github.com/mikf/gallery-dl/issues/5256)) +- [xvideos] support `/channels/` URLs ([#5244](https://github.com/mikf/gallery-dl/issues/5244)) +#### Fixes +- [artstation] fix handling usernames with dashes in domain names ([#5224](https://github.com/mikf/gallery-dl/issues/5224)) +- [bluesky] fix not spawning child extractors for followed users ([#5246](https://github.com/mikf/gallery-dl/issues/5246)) +- [deviantart] handle CloudFront blocks ([#5363](https://github.com/mikf/gallery-dl/issues/5363)) +- [deviantart:avatar] fix `index` for URLs without `?` ([#5276](https://github.com/mikf/gallery-dl/issues/5276)) +- [deviantart:stash] fix `index` values ([#5335](https://github.com/mikf/gallery-dl/issues/5335)) +- [gofile] fix extraction +- [hiperdex] update URL patterns & fix `manga` metadata ([#5340](https://github.com/mikf/gallery-dl/issues/5340)) +- [idolcomplex] fix metadata extraction +- [imagefap] fix folder extraction ([#5333](https://github.com/mikf/gallery-dl/issues/5333)) +- [instagram] make accessing `like_count` non-fatal ([#5218](https://github.com/mikf/gallery-dl/issues/5218)) +- [mastodon] fix handling null `moved` account field ([#5321](https://github.com/mikf/gallery-dl/issues/5321)) +- [naver] fix EUC-KR encoding issue in old image URLs ([#5126](https://github.com/mikf/gallery-dl/issues/5126)) +- [nijie] increase default delay between requests ([#5221](https://github.com/mikf/gallery-dl/issues/5221)) +- [nitter] ignore invalid Tweets ([#5253](https://github.com/mikf/gallery-dl/issues/5253)) +- [pixiv:novel] fix text extraction ([#5285](https://github.com/mikf/gallery-dl/issues/5285), [#5309](https://github.com/mikf/gallery-dl/issues/5309)) +- [skeb] retry 429 responses containing a `request_key` cookie ([#5210](https://github.com/mikf/gallery-dl/issues/5210)) +- [warosu] fix crash for threads with deleted posts ([#5289](https://github.com/mikf/gallery-dl/issues/5289)) +- [weibo] fix retweets ([#2825](https://github.com/mikf/gallery-dl/issues/2825), [#3874](https://github.com/mikf/gallery-dl/issues/3874), [#5263](https://github.com/mikf/gallery-dl/issues/5263)) +- [weibo] fix `livephoto` filename extensions ([#5287](https://github.com/mikf/gallery-dl/issues/5287)) +- [xvideos] fix galleries with more than 500 images ([#5244](https://github.com/mikf/gallery-dl/issues/5244)) +#### Improvements +- [bluesky] improve API error messages +- [bluesky] handle posts with different `embed` structure +- [deviantart:avatar] ignore default avatars ([#5276](https://github.com/mikf/gallery-dl/issues/5276)) +- [fapello] download full-sized images ([#5349](https://github.com/mikf/gallery-dl/issues/5349)) +- [gelbooru:favorite] automatically detect returned post order ([#5220](https://github.com/mikf/gallery-dl/issues/5220)) +- [imgur] fail downloads when redirected to `removed.png` ([#5308](https://github.com/mikf/gallery-dl/issues/5308)) +- [instagram] raise proper error for missing `reels_media` ([#5257](https://github.com/mikf/gallery-dl/issues/5257)) +- [instagram] change `posts are private` exception to a warning ([#5322](https://github.com/mikf/gallery-dl/issues/5322)) +- [reddit] improve preview fallback formats ([#5296](https://github.com/mikf/gallery-dl/issues/5296), [#5315](https://github.com/mikf/gallery-dl/issues/5315)) +- [steamgriddb] raise exception for deleted assets +- [twitter] handle "account is temporarily locked" errors 
([#5300](https://github.com/mikf/gallery-dl/issues/5300)) +- [weibo] rework pagination logic ([#4168](https://github.com/mikf/gallery-dl/issues/4168)) +- [zerochan] fetch more posts by using the API ([#3669](https://github.com/mikf/gallery-dl/issues/3669)) +#### Metadata +- [bluesky] add `instance` metadata field ([#4438](https://github.com/mikf/gallery-dl/issues/4438)) +- [gelbooru:favorite] add `date_favorited` metadata field +- [imagefap] extract `folder` metadata ([#5270](https://github.com/mikf/gallery-dl/issues/5270)) +- [instagram] default `likes` to `0` ([#5323](https://github.com/mikf/gallery-dl/issues/5323)) +- [kemonoparty] add `revision_count` metadata field ([#5334](https://github.com/mikf/gallery-dl/issues/5334)) +- [naver] unescape post `title` and `description` +- [pornhub:gif] extract `viewkey` and `timestamp` metadata ([#4463](https://github.com/mikf/gallery-dl/issues/4463)) +- [redgifs] make `date` available for directories ([#5262](https://github.com/mikf/gallery-dl/issues/5262)) +- [subscribestar] fix `date` metadata +- [twitter] add `birdwatch` metadata field ([#5317](https://github.com/mikf/gallery-dl/issues/5317)) +- [twitter] add `protected` metadata field ([#5327](https://github.com/mikf/gallery-dl/issues/5327)) +- [warosu] fix `board_name` metadata +#### Options +- [bluesky] add `reposts` option ([#4438](https://github.com/mikf/gallery-dl/issues/4438), [#5248](https://github.com/mikf/gallery-dl/issues/5248)) +- [deviantart] add `comments-avatars` option ([#4995](https://github.com/mikf/gallery-dl/issues/4995)) +- [deviantart] extend `metadata` option ([#5175](https://github.com/mikf/gallery-dl/issues/5175)) +- [flickr] add `contexts` option ([#5324](https://github.com/mikf/gallery-dl/issues/5324)) +- [gelbooru:favorite] add `order-posts` option ([#5220](https://github.com/mikf/gallery-dl/issues/5220)) +- [kemonoparty] add `order-revisions` option ([#5334](https://github.com/mikf/gallery-dl/issues/5334)) +- [vipergirls] add `like` option ([#4166](https://github.com/mikf/gallery-dl/issues/4166)) +- [vipergirls] add `domain` option ([#4166](https://github.com/mikf/gallery-dl/issues/4166)) +### Downloaders +- [http] add MIME type and signature for `.mov` files ([#5287](https://github.com/mikf/gallery-dl/issues/5287)) +### Docker +- build images from source instead of PyPI package +- build `linux/arm64` images ([#5227](https://github.com/mikf/gallery-dl/issues/5227)) +- build images on every push to master + - tag images as `YYYY.MM.DD` + - tag the most recent build from master as `dev` + - tag the most recent release build as `latest` +- reduce image size ([#5097](https://github.com/mikf/gallery-dl/issues/5097)) +### Miscellaneous +- [formatter] fix local DST datetime offsets for `:O` +- build Linux executable on Ubuntu 22.04 LTS ([#4184](https://github.com/mikf/gallery-dl/issues/4184)) +- automatically create directories for logging files ([#5249](https://github.com/mikf/gallery-dl/issues/5249)) + ## 1.26.8 - 2024-02-17 ### Extractors #### Additions diff --git a/README.rst b/README.rst index 6f6aa025..9d017abb 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/gallery_dl/version.py b/gallery_dl/version.py index b48cd1e5..d438ba4d 100644 --- a/gallery_dl/version.py +++ 
b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.26.9-dev" +__version__ = "1.26.9" From 15a4bc25846304ea30ba8be3c3a374c8326d816c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 24 Mar 2024 02:21:38 +0100 Subject: [PATCH 074/154] [kemonoparty] fix KeyError for empty files (#5368) --- gallery_dl/extractor/kemonoparty.py | 2 +- gallery_dl/version.py | 2 +- test/results/kemonoparty.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 9c77b7a7..f82bb33d 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -156,7 +156,7 @@ class KemonopartyExtractor(Extractor): def _file(self, post): file = post["file"] - if not file: + if not file or "path" not in file: return () file["type"] = "file" return (file,) diff --git a/gallery_dl/version.py b/gallery_dl/version.py index d438ba4d..ab8df335 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.26.9" +__version__ = "1.27.0-dev" diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index b855e6cc..7cc62d34 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -248,6 +248,16 @@ __tests__ = ( "published": "2022-07-29T21:12:11.483000", }, +{ + "#url" : "https://kemono.su/gumroad/user/3267960360326/post/jwwag", + "#comment" : "empty 'file' with no 'path' (#5368)", + "#category": ("", "kemonoparty", "gumroad"), + "#class" : kemonoparty.KemonopartyPostExtractor, + "#count" : 8, + + "type" : "attachment", +}, + { "#url" : "https://kemono.su/discord/server/488668827274444803#608504710906904576", "#category": ("", "kemonoparty", "discord"), From 423599ce9547ebfe580cfca7590f5067c93528c8 Mon Sep 17 00:00:00 2001 From: fireattack Date: Mon, 25 Mar 2024 11:45:04 +0800 Subject: [PATCH 075/154] [twitter] fix pattern for single tweet (#5371) - Add optional slash - Update tests to include some non-standard tweet URLs --- gallery_dl/extractor/twitter.py | 2 +- test/results/twitter.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index a5bd9840..97387df7 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -736,7 +736,7 @@ class TwitterEventExtractor(TwitterExtractor): class TwitterTweetExtractor(TwitterExtractor): """Extractor for individual tweets""" subcategory = "tweet" - pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)/?$" + pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)/?(?:$|[?#])" example = "https://twitter.com/USER/status/12345" def __init__(self, match): diff --git a/test/results/twitter.py b/test/results/twitter.py index 2d43ebf8..4ceb63b9 100644 --- a/test/results/twitter.py +++ b/test/results/twitter.py @@ -316,7 +316,7 @@ __tests__ = ( }, { - "#url" : "https://twitter.com/perrypumas/status/1065692031626829824", + "#url" : "https://twitter.com/perrypumas/status/1065692031626829824?s=20", "#comment" : "video", "#category": ("", "twitter", "tweet"), "#class" : twitter.TwitterTweetExtractor, @@ -324,7 +324,7 @@ __tests__ = ( }, { - "#url" : "https://twitter.com/playpokemon/status/1263832915173048321", + "#url" : 
"https://twitter.com/playpokemon/status/1263832915173048321/", "#comment" : "content with emoji, newlines, hashtags (#338)", "#category": ("", "twitter", "tweet"), "#class" : twitter.TwitterTweetExtractor, From 72ac2c750d7b42d5d4d33ca49988da67848cec22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 26 Mar 2024 02:27:36 +0100 Subject: [PATCH 076/154] [kemonoparty:favorite] support 'sort' and 'order' query params (#5375) --- gallery_dl/extractor/kemonoparty.py | 16 +++++++++++- test/results/kemonoparty.py | 40 ++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index f82bb33d..f7f5ea9d 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -494,7 +494,8 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): def __init__(self, match): KemonopartyExtractor.__init__(self, match) - self.favorites = (text.parse_query(match.group(3)).get("type") or + self.params = text.parse_query(match.group(3)) + self.favorites = (self.params.get("type") or self.config("favorites") or "artist") @@ -502,9 +503,17 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): self._prepare_ddosguard_cookies() self.login() + sort = self.params.get("sort") + order = self.params.get("order") or "desc" + if self.favorites == "artist": users = self.request( self.root + "/api/v1/account/favorites?type=artist").json() + + if not sort: + sort = "updated" + users.sort(key=lambda x: x[sort], reverse=(order == "desc")) + for user in users: user["_extractor"] = KemonopartyUserExtractor url = "{}/{}/user/{}".format( @@ -514,6 +523,11 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): elif self.favorites == "post": posts = self.request( self.root + "/api/v1/account/favorites?type=post").json() + + if not sort: + sort = "faved_seq" + posts.sort(key=lambda x: x[sort], reverse=(order == "desc")) + for post in posts: post["_extractor"] = KemonopartyPostExtractor url = "{}/{}/user/{}/post/{}".format( diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index 7cc62d34..7b8d1301 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -352,8 +352,24 @@ __tests__ = ( "#class" : kemonoparty.KemonopartyFavoriteExtractor, "#pattern" : kemonoparty.KemonopartyUserExtractor.pattern, "#auth" : True, - "#count" : 3, - "#sha1_url": "902c656c8002a3257ef9e255cb69bca1937373d4", + "#urls" : ( + "https://kemono.su/patreon/user/881792", + "https://kemono.su/fanbox/user/6993449", + "https://kemono.su/subscribestar/user/alcorart", + ), +}, + +{ + "#url" : "https://kemono.su/favorites?type=artist&sort=faved_seq&order=asc", + "#category": ("", "kemonoparty", "favorite"), + "#class" : kemonoparty.KemonopartyFavoriteExtractor, + "#pattern" : kemonoparty.KemonopartyUserExtractor.pattern, + "#auth" : True, + "#urls" : ( + "https://kemono.su/fanbox/user/6993449", + "https://kemono.su/patreon/user/881792", + "https://kemono.su/subscribestar/user/alcorart", + ), }, { @@ -362,8 +378,24 @@ __tests__ = ( "#class" : kemonoparty.KemonopartyFavoriteExtractor, "#pattern" : kemonoparty.KemonopartyPostExtractor.pattern, "#auth" : True, - "#count" : 3, - "#sha1_url": "4be8e84cb384a907a8e7997baaf6287b451783b5", + "#urls" : ( + "https://kemono.su/subscribestar/user/alcorart/post/184329", + "https://kemono.su/fanbox/user/6993449/post/23913", + "https://kemono.su/patreon/user/881792/post/4769638", + ), +}, + +{ + "#url" : 
"https://kemono.su/favorites?type=post&sort=published&order=asc", + "#category": ("", "kemonoparty", "favorite"), + "#class" : kemonoparty.KemonopartyFavoriteExtractor, + "#pattern" : kemonoparty.KemonopartyPostExtractor.pattern, + "#auth" : True, + "#urls" : ( + "https://kemono.su/patreon/user/881792/post/4769638", + "https://kemono.su/fanbox/user/6993449/post/23913", + "https://kemono.su/subscribestar/user/alcorart/post/184329", + ), }, ) From 9cce4616271715093f1a885ebfb64be6dabd2abf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 26 Mar 2024 15:20:14 +0100 Subject: [PATCH 077/154] [kemonoparty] add 'announcements' option (#5262) https://github.com/mikf/gallery-dl/issues/5262#issuecomment-2015919188 --- docs/configuration.rst | 10 ++++++++++ gallery_dl/extractor/kemonoparty.py | 30 +++++++++++++++++------------ test/results/kemonoparty.py | 13 +++++++++++++ 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 51408d3c..3052385f 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2317,6 +2317,16 @@ Description Extract a user's direct messages as ``dms`` metadata. +extractor.kemonoparty.announcements +----------------------------------- +Type + ``bool`` +Default + ``false`` +Description + Extract a user's announcements as ``announcements`` metadata. + + extractor.kemonoparty.favorites ------------------------------- Type diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index f7f5ea9d..bb0b03af 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -57,7 +57,7 @@ class KemonopartyExtractor(Extractor): generators = self._build_file_generators(self.config("files")) duplicates = self.config("duplicates") comments = self.config("comments") - username = dms = None + username = dms = announcements = None # prevent files from being sent with gzip compression headers = {"Accept-Encoding": "identity"} @@ -68,6 +68,8 @@ class KemonopartyExtractor(Extractor): '"): - footer = text.extr(dm, "") - dms.append({ + cards = [] + for card in text.extract_iter(page, ""): + footer = text.extr(card, "") + cards.append({ "body": text.unescape(text.extr( - dm, "
", "
", " 19: diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index 7b8d1301..4c370089 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -135,6 +135,19 @@ __tests__ = ( }], }, +{ + "#url" : "https://kemono.su/patreon/user/3161935/post/68231671", + "#comment" : "announcements", + "#category": ("", "kemonoparty", "patreon"), + "#class" : kemonoparty.KemonopartyPostExtractor, + "#options" : {"announcements": True}, + + "announcements": [{ + "body": "
Thank you so much for the support!
This Patreon is more of a tip jar for supporting what I make. I have to clarify that there are no exclusive Patreon animations because all are released for the public. You will get earlier access to WIPs. Direct downloads to my works are also available for $5 and $10 Tiers.
", + "date": "2023-02", + }], +}, + { "#url" : "https://kemono.su/patreon/user/19623797/post/29035449", "#comment" : "invalid file (#3510)", From ef0c90414c1077e42ae17ccec96eb4925d924c55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 26 Mar 2024 15:33:26 +0100 Subject: [PATCH 078/154] [wikimedia] suppress exception for entries without 'imageinfo' (#5384) --- gallery_dl/extractor/wikimedia.py | 6 +++++- test/results/azurlanewiki.py | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index ac00682d..c15c8302 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -69,7 +69,11 @@ class WikimediaExtractor(BaseExtractor): def items(self): for info in self._pagination(self.params): - image = info["imageinfo"][0] + try: + image = info["imageinfo"][0] + except LookupError: + self.log.debug("Missing 'imageinfo' for %s", info) + continue image["metadata"] = { m["name"]: m["value"] diff --git a/test/results/azurlanewiki.py b/test/results/azurlanewiki.py index 83f103b1..17673420 100644 --- a/test/results/azurlanewiki.py +++ b/test/results/azurlanewiki.py @@ -14,4 +14,12 @@ __tests__ = ( "#class" : wikimedia.WikimediaArticleExtractor, }, +{ + "#url" : "https://azurlane.koumakan.jp/wiki/Louisville/Gallery", + "#comment" : "entries with missing 'imageinfo' (#5384)", + "#category": ("wikimedia", "azurlanewiki", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, + "#count" : "> 10", +}, + ) From a1e64bac73453b821859227dd880f76a2b37c108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 29 Mar 2024 02:04:11 +0100 Subject: [PATCH 079/154] [docs] update defaults of 'sleep-request', 'browser', 'tls12' --- docs/configuration.rst | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 3052385f..6b6049f5 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -363,7 +363,23 @@ extractor.*.sleep-request Type |Duration|_ Default - ``0`` + * ``"0.5-1.5"`` + ``[Danbooru]``, ``[E621]``, ``[foolfuuka]``, ``itaku``, + ``newgrounds``, ``[philomena]``, ``pixiv:novel``, ``plurk``, + ``poipiku`` , ``pornpics``, ``soundgasm``, ``urlgalleries``, + ``vk``, ``zerochan`` + * ``"1.0-2.0"`` + ``flickr``, ``weibo``, ``[wikimedia]`` + * ``"2.0-4.0"`` + ``behance``, ``imagefap``, ``[Nijie]`` + * ``"3.0-6.0"`` + ``exhentai``, ``idolcomplex``, ``[reactor]``, ``readcomiconline`` + * ``"6.0-6.1"`` + ``twibooru`` + * ``"6.0-12.0"`` + ``instagram`` + * ``0`` + otherwise Description Minimal time interval in seconds between each HTTP request during data extraction. @@ -382,6 +398,7 @@ Description Specifying username and password is required for * ``nijie`` + * ``horne`` and optional for @@ -389,8 +406,12 @@ Description * ``aryion`` * ``atfbooru`` (*) * ``bluesky`` + * ``booruvar`` (*) + * ``coomerparty`` * ``danbooru`` (*) + * ``deviantart`` * ``e621`` (*) + * ``e6ai`` (*) * ``e926`` (*) * ``exhentai`` * ``idolcomplex`` @@ -401,7 +422,6 @@ Description * ``mangoxo`` * ``pillowfort`` * ``sankaku`` - * ``seisoparty`` * ``subscribestar`` * ``tapas`` * ``tsumino`` @@ -417,7 +437,7 @@ Description the API key found in your user profile, not the actual account password. 
Note: Leave the ``password`` value empty or undefined - to get prompted for a passeword when performing a login + to be prompted for a passeword when performing a login (see `getpass() `__). @@ -557,8 +577,8 @@ extractor.*.browser Type ``string`` Default - * ``"firefox"`` for ``patreon``, ``mangapark``, and ``mangasee`` - * ``null`` everywhere else + * ``"firefox"``: ``artstation``, ``mangasee``, ``patreon``, ``pixiv:series``, ``twitter`` + * ``null``: otherwise Example * ``"chrome:macos"`` Description @@ -633,8 +653,8 @@ extractor.*.tls12 Type ``bool`` Default - * ``true`` - * ``false`` for ``patreon``, ``pixiv:series`` + * ``false``: ``patreon``, ``pixiv:series`` + * ``true``: otherwise Description Allow selecting TLS 1.2 cipher suites. From c7edeb871bb6887aa8e1b67dc056d06adfde9c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 29 Mar 2024 02:11:04 +0100 Subject: [PATCH 080/154] [docs] complete Authentication info in supportedsites.md --- docs/supportedsites.md | 6 +++--- scripts/supportedsites.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b004d7dc..e5665688 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1103,7 +1103,7 @@ Consider all listed sites to potentially be NSFW. Booruvar https://booru.borvar.art/ Pools, Popular Images, Posts, Tag Searches - + Supported @@ -1125,7 +1125,7 @@ Consider all listed sites to potentially be NSFW. e6AI https://e6ai.net/ Favorites, Pools, Popular Images, Posts, Tag Searches - + Supported @@ -1331,7 +1331,7 @@ Consider all listed sites to potentially be NSFW. Furbooru https://furbooru.org/ Galleries, Posts, Search Results - + API Key diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 47d7ef92..9748f00a 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -354,11 +354,13 @@ AUTH_MAP = { "atfbooru" : "Supported", "baraag" : _OAUTH, "bluesky" : "Supported", + "booruvar" : "Supported", "coomerparty" : "Supported", "danbooru" : "Supported", "derpibooru" : _APIKEY_DB, "deviantart" : _OAUTH, "e621" : "Supported", + "e6ai" : "Supported", "e926" : "Supported", "e-hentai" : "Supported", "exhentai" : "Supported", @@ -366,6 +368,7 @@ AUTH_MAP = { "fantia" : _COOKIES, "flickr" : _OAUTH, "furaffinity" : _COOKIES, + "furbooru" : "API Key", "horne" : "Required", "idolcomplex" : "Supported", "imgbb" : "Supported", @@ -386,7 +389,6 @@ AUTH_MAP = { "reddit" : _OAUTH, "sankaku" : "Supported", "seiga" : _COOKIES, - "seisoparty" : "Supported", "smugmug" : _OAUTH, "subscribestar" : "Supported", "tapas" : "Supported", From bdbc8be5f3d3dbae2f8195c71cd6f5c1f077c25a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 30 Mar 2024 18:13:38 +0100 Subject: [PATCH 081/154] [twitter] prevent crash when extracting 'birdwatch' metadata (#5403) --- gallery_dl/extractor/twitter.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 97387df7..4a817714 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -341,7 +341,12 @@ class TwitterExtractor(Extractor): tdata["content"] = txt if tco.startswith("https://t.co/") else content if "birdwatch_pivot" in tweet: - tdata["birdwatch"] = tweet["birdwatch_pivot"]["subtitle"]["text"] + try: + tdata["birdwatch"] = \ + tweet["birdwatch_pivot"]["subtitle"]["text"] + except KeyError: + self.log.debug("Unable to extract 'birdwatch' note from 
%s", + tweet["birdwatch_pivot"]) if "in_reply_to_screen_name" in legacy: tdata["reply_to"] = legacy["in_reply_to_screen_name"] if "quoted_by" in legacy: From bdca01e6e60ce238bfe8a44cf997734c662c9bec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 30 Mar 2024 23:32:31 +0100 Subject: [PATCH 082/154] [workflows] build complete docs Pages only on gdl-org/docs deploy only docs/oauth-redirect.html on mikf.github.io/gallery-dl --- .github/workflows/pages.yml | 44 ++++++++----------------------------- 1 file changed, 9 insertions(+), 35 deletions(-) diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 9ddb05ea..e0335a55 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -33,50 +33,24 @@ jobs: https://api.github.com/repos/gdl-org/docs/actions/workflows/pages.yml/dispatches -d '{"ref":"master"}' - build: + deploy: runs-on: ubuntu-latest + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: - uses: actions/checkout@v4 - - uses: actions/configure-pages@v4 - - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - - name: Install Docutils - run: pip install docutils pygments - - - name: Update Links - working-directory: ./docs/ - run: sed --in-place 's/\.\(rst\|md\)\b/.html/g' -- *.md *.rst - - - name: reStructuredText to HTML - working-directory: ./docs/ + - name: Copy static files run: | - while read -r RST - do - python -m docutils --writer=html --output="${RST%.rst}.html" -- "$RST" - done < <(find . -type f -name "*.rst") - - - uses: actions/jekyll-build-pages@v1 - with: - source: ./docs/ - destination: ./_site/ + mkdir --parents -- ./_site + cp --archive --target-directory=./_site -- \ + ./docs/oauth-redirect.html - uses: actions/upload-pages-artifact@v3 - - deploy: - - runs-on: ubuntu-latest - - needs: build - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - - steps: - uses: actions/deploy-pages@v4 id: deployment From cf36c576d00b0306d10e10439a88f34d69089793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 31 Mar 2024 03:52:29 +0200 Subject: [PATCH 083/154] [docs] document 'actions' (#4543) or at least attempt to --- docs/configuration.rst | 59 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index 6b6049f5..faa6472d 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -856,6 +856,65 @@ Description for available ``PRAGMA`` statements and further details. +extractor.*.actions +------------------- +Type + * ``object`` (`pattern` -> `action`) + * ``list`` of ``lists`` with 2 ``strings`` as elements +Example + .. code:: json + + { + "error" : "status |= 1", + "warning:(?i)unable to .+": "exit 127", + "info:Logging in as .+" : "level = debug" + } + + .. code:: json + + [ + ["error" , "status |= 1" ], + ["warning:(?i)unable to .+", "exit 127" ], + ["info:Logging in as .+" , "level = debug"] + ] + +Description + Perform an ``action`` when logging a message matched by ``pattern``. + + ``pattern`` is parsed as severity level (``debug``, ``info``, ``warning``, ``error``, or integer value) + followed by an optional `Python Regular Expression `__ + separated by a colon ``:``. + Using ``*`` as `level` or leaving it empty + matches logging messages of all levels + (e.g. ``*:`` or ``:``). + + ``action`` is parsed as action type + followed by (optional) arguments. 
+ + Supported Action Types: + + ``status``: + | Modify job exit status. + | Expected syntax is `` `` (e.g. ``= 100``). + + Supported operators are + ``=`` (assignment), + ``&`` (bitwise AND), + ``|`` (bitwise OR), + ``^`` (bitwise XOR). + ``level``: + | Modify severity level of the current logging message. + | Can be one of ``debug``, ``info``, ``warning``, ``error`` or an integer value. + ``print`` + Write argument to stdout. + ``restart``: + Restart the current extractor run. + ``wait``: + Stop execution until Enter is pressed. + ``exit``: + Exit the program with the given argument as exit status. + + extractor.*.postprocessors -------------------------- Type From 0c178846734e6149b41e82502da1c038ecfd17e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 31 Mar 2024 23:25:05 +0200 Subject: [PATCH 084/154] store 'match' and 'groups' in Extractor objects --- gallery_dl/extractor/common.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index d14e13ae..2500fec9 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -45,6 +45,8 @@ class Extractor(): def __init__(self, match): self.log = logging.getLogger(self.category) self.url = match.string + self.match = match + self.groups = match.groups() self._cfgpath = ("extractor", self.category, self.subcategory) self._parentdir = "" @@ -599,7 +601,7 @@ class GalleryExtractor(Extractor): def __init__(self, match, url=None): Extractor.__init__(self, match) - self.gallery_url = self.root + match.group(1) if url is None else url + self.gallery_url = self.root + self.groups[0] if url is None else url def items(self): self.login() @@ -674,7 +676,7 @@ class MangaExtractor(Extractor): def __init__(self, match, url=None): Extractor.__init__(self, match) - self.manga_url = url or self.root + match.group(1) + self.manga_url = self.root + self.groups[0] if url is None else url if self.config("chapter-reverse", False): self.reverse = not self.reverse @@ -736,17 +738,17 @@ class BaseExtractor(Extractor): instances = () def __init__(self, match): - if not self.category: - self._init_category(match) Extractor.__init__(self, match) + if not self.category: + self._init_category() - def _init_category(self, match): - for index, group in enumerate(match.groups()): + def _init_category(self): + for index, group in enumerate(self.groups): if group is not None: if index: self.category, self.root, info = self.instances[index-1] if not self.root: - self.root = text.root_from_url(match.group(0)) + self.root = text.root_from_url(self.match.group(0)) self.config_instance = info.get else: self.root = group From 64948f2c09a9a0ce8e7d6de82a561e38cacdcd0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 1 Apr 2024 22:31:25 +0200 Subject: [PATCH 085/154] [foolfuuka] improve 'board' pattern & support pages (#5408) --- gallery_dl/extractor/foolfuuka.py | 24 ++++++++++++++---------- test/results/desuarchive.py | 20 ++++++++++++++++++++ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 715abcb7..85dd8969 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -117,8 +117,8 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): def __init__(self, match): FoolfuukaExtractor.__init__(self, match) - self.board = match.group(match.lastindex-1) - self.thread = match.group(match.lastindex) + 
self.board = self.groups[-2] + self.thread = self.groups[-1] self.data = None def metadata(self): @@ -140,20 +140,22 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): class FoolfuukaBoardExtractor(FoolfuukaExtractor): """Base extractor for FoolFuuka based boards/archives""" subcategory = "board" - pattern = BASE_PATTERN + r"/([^/?#]+)/\d*$" + pattern = BASE_PATTERN + r"/([^/?#]+)(?:/(?:page/)?(\d*))?$" example = "https://archived.moe/a/" def __init__(self, match): FoolfuukaExtractor.__init__(self, match) - self.board = match.group(match.lastindex) + self.board = self.groups[-2] + self.page = self.groups[-1] def items(self): index_base = "{}/_/api/chan/index/?board={}&page=".format( self.root, self.board) thread_base = "{}/{}/thread/".format(self.root, self.board) - for page in itertools.count(1): - with self.request(index_base + format(page)) as response: + page = self.page + for pnum in itertools.count(text.parse_int(page, 1)): + with self.request(index_base + format(pnum)) as response: try: threads = response.json() except ValueError: @@ -167,6 +169,9 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor): thread["_extractor"] = FoolfuukaThreadExtractor yield Message.Queue, thread["url"], thread + if page: + return + class FoolfuukaSearchExtractor(FoolfuukaExtractor): """Base extractor for search results on FoolFuuka based boards/archives""" @@ -179,17 +184,16 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): def __init__(self, match): FoolfuukaExtractor.__init__(self, match) self.params = params = {} - args = match.group(match.lastindex).split("/") - key = None - for arg in args: + key = None + for arg in self.groups[-1].split("/"): if key: params[key] = text.unescape(arg) key = None else: key = arg - board = match.group(match.lastindex-1) + board = self.groups[-2] if board != "_": params["boards"] = board diff --git a/test/results/desuarchive.py b/test/results/desuarchive.py index 8e180652..43cd9689 100644 --- a/test/results/desuarchive.py +++ b/test/results/desuarchive.py @@ -15,12 +15,32 @@ __tests__ = ( "#sha1_url": "e7d624aded15a069194e38dc731ec23217a422fb", }, +{ + "#url" : "https://desuarchive.org/a", + "#category": ("foolfuuka", "desuarchive", "board"), + "#class" : foolfuuka.FoolfuukaBoardExtractor, +}, + { "#url" : "https://desuarchive.org/a/", "#category": ("foolfuuka", "desuarchive", "board"), "#class" : foolfuuka.FoolfuukaBoardExtractor, }, +{ + "#url" : "https://desuarchive.org/a/2", + "#category": ("foolfuuka", "desuarchive", "board"), + "#class" : foolfuuka.FoolfuukaBoardExtractor, +}, + +{ + "#url" : "https://desuarchive.org/a/page/2", + "#category": ("foolfuuka", "desuarchive", "board"), + "#class" : foolfuuka.FoolfuukaBoardExtractor, + "#pattern" : foolfuuka.FoolfuukaThreadExtractor.pattern, + "#count" : 10, +}, + { "#url" : "https://desuarchive.org/_/search/text/test/", "#category": ("foolfuuka", "desuarchive", "search"), From 095e5ded6f968d84be70f66a29a35237f8a9cb6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 1 Apr 2024 23:35:42 +0200 Subject: [PATCH 086/154] [reddit] support comment embeds (#5366) --- gallery_dl/extractor/reddit.py | 39 +++++++++++++++++++++++++++++++--- test/results/reddit.py | 12 +++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index e099c7ed..a3a455a9 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -74,8 +74,8 @@ class RedditExtractor(Extractor): yield Message.Url, url, submission 
elif "gallery_data" in media: - for submission["num"], url in enumerate( - self._extract_gallery(media), 1): + for url in self._extract_gallery(media): + submission["num"] += 1 text.nameext_from_url(url, submission) yield Message.Url, url, submission @@ -99,7 +99,10 @@ class RedditExtractor(Extractor): urls.append((url, submission)) for comment in comments: html = comment["body_html"] or "" - if ' href="' in html: + href = (' href="' in html) + media = ("media_metadata" in comment) + + if media or href: comment["date"] = text.parse_timestamp( comment["created_utc"]) if submission: @@ -107,6 +110,14 @@ class RedditExtractor(Extractor): data["comment"] = comment else: data = comment + + if media: + for embed in self._extract_embed(comment): + submission["num"] += 1 + text.nameext_from_url(embed, submission) + yield Message.Url, embed, submission + + if href: for url in text.extract_iter(html, ' href="', '"'): urls.append((url, data)) @@ -118,6 +129,7 @@ class RedditExtractor(Extractor): if url.startswith(( "https://www.reddit.com/message/compose", "https://reddit.com/message/compose", + "https://preview.redd.it/", )): continue @@ -172,6 +184,27 @@ class RedditExtractor(Extractor): submission["id"], item["media_id"]) self.log.debug(src) + def _extract_embed(self, submission): + meta = submission["media_metadata"] + if not meta: + return + + for mid, data in meta.items(): + if data["status"] != "valid" or "s" not in data: + self.log.warning( + "embed %s: skipping item %s (status: %s)", + submission["id"], mid, data.get("status")) + continue + src = data["s"] + url = src.get("u") or src.get("gif") or src.get("mp4") + if url: + yield url.partition("?")[0].replace("/preview.", "/i.", 1) + else: + self.log.error( + "embed %s: unable to fetch download URL for item %s", + submission["id"], mid) + self.log.debug(src) + def _extract_video_ytdl(self, submission): return "https://www.reddit.com" + submission["permalink"] diff --git a/test/results/reddit.py b/test/results/reddit.py index bd0f9fd7..55623337 100644 --- a/test/results/reddit.py +++ b/test/results/reddit.py @@ -168,6 +168,18 @@ __tests__ = ( "#count" : 0, }, +{ + "#url" : "https://www.reddit.com/r/RobloxArt/comments/15ko0qu/", + "#comment" : "comment embeds (#5366)", + "#category": ("", "reddit", "submission"), + "#class" : reddit.RedditSubmissionExtractor, + "#options" : {"comments": 10}, + "#urls" : ( + "https://i.redd.it/ppt5yciyipgb1.jpg", + "https://i.redd.it/u0ojzd69kpgb1.png", + ), +}, + { "#url" : "https://www.reddit.com/user/TheSpiritTree/comments/srilyf/", "#comment" : "user page submission (#2301)", From 24d792b7355ab031431b156233a15699d8d55c1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 3 Apr 2024 19:32:24 +0200 Subject: [PATCH 087/154] [build] add minimal pyproject.toml --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..fed528d4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" From 4103eb99181997340db1dac4c4d15a94caa0d9cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 3 Apr 2024 19:33:14 +0200 Subject: [PATCH 088/154] [build] generate sdist and wheel packages using 'build' module --- scripts/release.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/release.sh b/scripts/release.sh index 09127b59..5911b9a1 100755 --- 
a/scripts/release.sh +++ b/scripts/release.sh @@ -44,9 +44,9 @@ update-dev() { build-python() { cd "${ROOTDIR}" - echo Building bdist_wheel and sdist + echo Building sdist and wheel - python setup.py bdist_wheel sdist + python -m build } build-linux() { From 28a795ca613f189e454a340b2b2226a668335ac8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 3 Apr 2024 23:25:36 +0200 Subject: [PATCH 089/154] [build] include only the latest CHANGELOG entries The CHANGELOG is now at a size where it takes up roughly 50kB or 10% of an sdist or wheel package. --- scripts/release.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/release.sh b/scripts/release.sh index 5911b9a1..3685500b 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -112,6 +112,14 @@ changelog() { -e "s*\([( ]\)#\([0-9]\+\)*\1[#\2](https://github.com/mikf/gallery-dl/issues/\2)*g" \ -e "s*^## \w\+\$*## ${NEWVERSION} - $(date +%Y-%m-%d)*" \ "${CHANGELOG}" + + mv "${CHANGELOG}" "${CHANGELOG}.orig" + + # - remove all but the latest entries + sed -n \ + -e '/^## /,/^$/ { /^$/q; p }' \ + "${CHANGELOG}.orig" \ + > "${CHANGELOG}" } supportedsites() { @@ -129,6 +137,7 @@ upload-git() { cd "${ROOTDIR}" echo Pushing changes to github + mv "${CHANGELOG}.orig" "${CHANGELOG}" || true git add "gallery_dl/version.py" "${README}" "${CHANGELOG}" git commit -S -m "release version ${NEWVERSION}" git tag -s -m "version ${NEWVERSION}" "v${NEWVERSION}" From ef8f02c3d9431046dccff0bd85b9bbab93cf30c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 4 Apr 2024 20:51:54 +0200 Subject: [PATCH 090/154] [oauth] use Extractor.request() for HTTP requests (#5433) Enables using proxies and general network options. --- gallery_dl/extractor/oauth.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 8c8a5a99..55715757 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -110,7 +110,7 @@ class OAuthBase(Extractor): # get a request token params = {"oauth_callback": self.redirect_uri} - data = self.session.get(request_token_url, params=params).text + data = self.request(request_token_url, params=params).text data = text.parse_query(data) self.session.auth.token_secret = data["oauth_token_secret"] @@ -120,7 +120,7 @@ class OAuthBase(Extractor): data = self.open(authorize_url, params) # exchange the request token for an access token - data = self.session.get(access_token_url, params=data).text + data = self.request(access_token_url, params=data).text data = text.parse_query(data) token = data["oauth_token"] token_secret = data["oauth_token_secret"] @@ -189,7 +189,8 @@ class OAuthBase(Extractor): data["client_id"] = client_id data["client_secret"] = client_secret - data = self.session.post(token_url, data=data, auth=auth).json() + data = self.request( + token_url, method="POST", data=data, auth=auth).json() # check token response if "error" in data: @@ -386,7 +387,7 @@ class OAuthMastodon(OAuthBase): "redirect_uris": self.redirect_uri, "scopes": "read", } - data = self.session.post(url, data=data).json() + data = self.request(url, method="POST", data=data).json() if "client_id" not in data or "client_secret" not in data: raise exception.StopExtraction( @@ -441,7 +442,8 @@ class OAuthPixiv(OAuthBase): "redirect_uri" : "https://app-api.pixiv.net" "/web/v1/users/auth/pixiv/callback", } - data = self.session.post(url, headers=headers, data=data).json() + data = self.request( + url, 
method="POST", headers=headers, data=data).json() if "error" in data: stdout_write("\n{}\n".format(data)) From 86a97d8e275827b9764111990fbb131a5cfcea6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 5 Apr 2024 00:25:23 +0200 Subject: [PATCH 091/154] [kemonoparty] fix crash on posts with missing datetime info (#5422) --- gallery_dl/extractor/kemonoparty.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index bb0b03af..b0c24de7 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -82,7 +82,7 @@ class KemonopartyExtractor(Extractor): self.root, post["service"], post["user"], post["id"]) post["_http_headers"] = headers post["date"] = self._parse_datetime( - post["published"] or post["added"]) + post.get("published") or post.get("added") or "") if username: post["username"] = username From 9a8403917af4fca2cb4d34c08ea6c7e4dfe2400a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 6 Apr 2024 03:24:51 +0200 Subject: [PATCH 092/154] restore LD_LIBRARY_PATH for PyInstaller builds (#5421) --- gallery_dl/cookies.py | 2 +- gallery_dl/postprocessor/exec.py | 5 ++--- gallery_dl/postprocessor/ugoira.py | 2 +- gallery_dl/util.py | 29 ++++++++++++++++++++++++++++- test/test_postprocessor.py | 8 ++++---- 5 files changed, 36 insertions(+), 10 deletions(-) diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 478abb63..092a9415 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -857,7 +857,7 @@ class DatabaseConnection(): def Popen_communicate(*args): - proc = subprocess.Popen( + proc = util.Popen( args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) try: stdout, stderr = proc.communicate() diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py index e7ed2f69..7d2be2b9 100644 --- a/gallery_dl/postprocessor/exec.py +++ b/gallery_dl/postprocessor/exec.py @@ -10,7 +10,6 @@ from .common import PostProcessor from .. 
import util, formatter -import subprocess import os import re @@ -80,14 +79,14 @@ class ExecPP(PostProcessor): def _exec(self, args, shell): self.log.debug("Running '%s'", args) - retcode = subprocess.Popen(args, shell=shell).wait() + retcode = util.Popen(args, shell=shell).wait() if retcode: self.log.warning("'%s' returned with non-zero exit status (%d)", args, retcode) def _exec_async(self, args, shell): self.log.debug("Running '%s'", args) - subprocess.Popen(args, shell=shell) + util.Popen(args, shell=shell) def _replace(self, match): name = match.group(1) diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index b713c6f3..5fbc1a55 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -171,7 +171,7 @@ class UgoiraPP(PostProcessor): def _exec(self, args): self.log.debug(args) out = None if self.output else subprocess.DEVNULL - retcode = subprocess.Popen(args, stdout=out, stderr=out).wait() + retcode = util.Popen(args, stdout=out, stderr=out).wait() if retcode: print() self.log.error("Non-zero exit status when running %s (%s)", diff --git a/gallery_dl/util.py b/gallery_dl/util.py index bc9418f5..0e6f04a9 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -339,7 +339,7 @@ def extract_headers(response): @functools.lru_cache(maxsize=None) def git_head(): try: - out, err = subprocess.Popen( + out, err = Popen( ("git", "rev-parse", "--short", "HEAD"), stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -579,6 +579,33 @@ GLOBALS = { } +if EXECUTABLE and hasattr(sys, "_MEIPASS"): + # https://github.com/pyinstaller/pyinstaller/blob/develop/doc + # /runtime-information.rst#ld_library_path--libpath-considerations + _popen_env = os.environ.copy() + + orig = _popen_env.get("LD_LIBRARY_PATH_ORIG") + if orig is None: + _popen_env.pop("LD_LIBRARY_PATH", None) + else: + _popen_env["LD_LIBRARY_PATH"] = orig + + orig = _popen_env.get("DYLD_LIBRARY_PATH_ORIG") + if orig is None: + _popen_env.pop("DYLD_LIBRARY_PATH", None) + else: + _popen_env["DYLD_LIBRARY_PATH"] = orig + + del orig + + class Popen(subprocess.Popen): + def __init__(self, args, **kwargs): + kwargs["env"] = _popen_env + subprocess.Popen.__init__(self, args, **kwargs) +else: + Popen = subprocess.Popen + + def compile_expression(expr, name="", globals=None): code_object = compile(expr, name, "eval") return functools.partial(eval, code_object, globals or GLOBALS) diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index 0ee7cdb2..6e76f07a 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -172,7 +172,7 @@ class ExecTest(BasePostprocessorTest): "command": "echo {} {_path} {_directory} {_filename} && rm {};", }) - with patch("subprocess.Popen") as p: + with patch("gallery_dl.util.Popen") as p: i = Mock() i.wait.return_value = 0 p.return_value = i @@ -192,7 +192,7 @@ class ExecTest(BasePostprocessorTest): "\fE _directory.upper()"], }) - with patch("subprocess.Popen") as p: + with patch("gallery_dl.util.Popen") as p: i = Mock() i.wait.return_value = 0 p.return_value = i @@ -212,7 +212,7 @@ class ExecTest(BasePostprocessorTest): "command": "echo {}", }) - with patch("subprocess.Popen") as p: + with patch("gallery_dl.util.Popen") as p: i = Mock() i.wait.return_value = 123 p.return_value = i @@ -230,7 +230,7 @@ class ExecTest(BasePostprocessorTest): "command": "echo {}", }) - with patch("subprocess.Popen") as p: + with patch("gallery_dl.util.Popen") as p: i = Mock() p.return_value = i self._trigger(("after",)) From 
40bd145637d2361eb324f7e8b85a605dec69b798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 6 Apr 2024 03:43:12 +0200 Subject: [PATCH 093/154] remove 'contextlib' imports --- gallery_dl/cookies.py | 6 ++++-- test/test_job.py | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 092a9415..71a45f00 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -10,7 +10,6 @@ # https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/cookies.py import binascii -import contextlib import ctypes import logging import os @@ -682,7 +681,8 @@ def _get_gnome_keyring_password(browser_keyring_name): # lists all keys and presumably searches for its key in the list. # It appears that we must do the same. # https://github.com/jaraco/keyring/issues/556 - with contextlib.closing(secretstorage.dbus_init()) as con: + con = secretstorage.dbus_init() + try: col = secretstorage.get_default_collection(con) label = browser_keyring_name + " Safe Storage" for item in col.get_all_items(): @@ -691,6 +691,8 @@ def _get_gnome_keyring_password(browser_keyring_name): else: _log_error("Failed to read from GNOME keyring") return b"" + finally: + con.close() def _get_linux_keyring_password(browser_keyring_name, keyring): diff --git a/test/test_job.py b/test/test_job.py index 141b1b2e..3e6f85be 100644 --- a/test/test_job.py +++ b/test/test_job.py @@ -13,7 +13,6 @@ import unittest from unittest.mock import patch import io -import contextlib sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from gallery_dl import job, config, text # noqa E402 @@ -32,8 +31,13 @@ class TestJob(unittest.TestCase): jobinstance = extr_or_job with io.StringIO() as buffer: - with contextlib.redirect_stdout(buffer): + stdout = sys.stdout + sys.stdout = buffer + try: jobinstance.run() + finally: + sys.stdout = stdout + return buffer.getvalue() From 3346a377b318bfd1b4435bd38892b05d627a2a3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 6 Apr 2024 17:00:48 +0200 Subject: [PATCH 094/154] [pp:ugoira] log errors for general exceptions --- gallery_dl/postprocessor/ugoira.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index 5fbc1a55..c63a3d94 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -155,7 +155,9 @@ class UgoiraPP(PostProcessor): self.log.error("Unable to invoke FFmpeg (%s: %s)", exc.__class__.__name__, exc) pathfmt.realpath = pathfmt.temppath - except Exception: + except Exception as exc: + print() + self.log.error("%s: %s", exc.__class__.__name__, exc) pathfmt.realpath = pathfmt.temppath else: if self.mtime: From 647a87d17c4018d13c2e3d8db405392aed340f05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 6 Apr 2024 17:56:21 +0200 Subject: [PATCH 095/154] [twitter] match '/photo/' Tweet URLs (#5443) fixes regression introduced in 40c05535 --- gallery_dl/extractor/twitter.py | 3 ++- test/results/twitter.py | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 4a817714..9f3e3918 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -741,7 +741,8 @@ class TwitterEventExtractor(TwitterExtractor): class TwitterTweetExtractor(TwitterExtractor): """Extractor for individual tweets""" subcategory = "tweet" - pattern = 
BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)/?(?:$|[?#])" + pattern = (BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)" + r"/?(?:$|\?|#|photo/)") example = "https://twitter.com/USER/status/12345" def __init__(self, match): diff --git a/test/results/twitter.py b/test/results/twitter.py index 4ceb63b9..ac119a33 100644 --- a/test/results/twitter.py +++ b/test/results/twitter.py @@ -526,6 +526,13 @@ You’ll be able to receive four Galarian form Pokémon with Hidden Abilities, p "#count" : 5, }, +{ + "#url" : "https://twitter.com/supernaturepics/status/604341487988576256/photo/1", + "#comment" : "/photo/ URL (#5443)", + "#category": ("", "twitter", "tweet"), + "#class" : twitter.TwitterTweetExtractor, +}, + { "#url" : "https://twitter.com/morino_ya/status/1392763691599237121", "#comment" : "retweet with missing media entities (#1555)", From 0e730ba980071b011f91dc9ab0fe8902cab1c440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 7 Apr 2024 02:31:21 +0200 Subject: [PATCH 096/154] [pp:mtime] do not overwrite '_mtime' for None values (#5439) --- gallery_dl/postprocessor/mtime.py | 3 +++ test/test_postprocessor.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/gallery_dl/postprocessor/mtime.py b/gallery_dl/postprocessor/mtime.py index ea61b7b4..6ded1e29 100644 --- a/gallery_dl/postprocessor/mtime.py +++ b/gallery_dl/postprocessor/mtime.py @@ -33,6 +33,9 @@ class MtimePP(PostProcessor): def run(self, pathfmt): mtime = self._get(pathfmt.kwdict) + if mtime is None: + return + pathfmt.kwdict["_mtime"] = ( util.datetime_to_timestamp(mtime) if isinstance(mtime, datetime) else diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index 6e76f07a..d509052c 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -573,6 +573,16 @@ class MtimeTest(BasePostprocessorTest): self._trigger() self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800) + def test_mtime_none(self): + self._create(None, {"date": None}) + self._trigger() + self.assertNotIn("_mtime", self.pathfmt.kwdict) + + def test_mtime_undefined(self): + self._create(None, {}) + self._trigger() + self.assertNotIn("_mtime", self.pathfmt.kwdict) + def test_mtime_key(self): self._create({"key": "foo"}, {"foo": 315532800}) self._trigger() From 40c1a8e47106222a34d8a2145ba865571613162d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 9 Apr 2024 17:35:05 +0200 Subject: [PATCH 097/154] [wikimedia] fix exception for files with empty 'metadata' --- gallery_dl/extractor/wikimedia.py | 4 ++-- test/results/fandom.py | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index c15c8302..4976c078 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -77,10 +77,10 @@ class WikimediaExtractor(BaseExtractor): image["metadata"] = { m["name"]: m["value"] - for m in image["metadata"]} + for m in image["metadata"] or ()} image["commonmetadata"] = { m["name"]: m["value"] - for m in image["commonmetadata"]} + for m in image["commonmetadata"] or ()} filename = image["canonicaltitle"] image["filename"], _, image["extension"] = \ diff --git a/test/results/fandom.py b/test/results/fandom.py index 40d82e93..c876a64c 100644 --- a/test/results/fandom.py +++ b/test/results/fandom.py @@ -83,6 +83,15 @@ __tests__ = ( "width" : 728, }, +{ + "#url" : "https://hearthstone.fandom.com/wiki/Flame_Juggler", + "#comment" : "empty 'metadata'", + "#category": 
("wikimedia", "fandom-hearthstone", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, + + "metadata" : {}, +}, + { "#url" : "https://projectsekai.fandom.com/wiki/Project_SEKAI_Wiki", "#category": ("wikimedia", "fandom-projectsekai", "article"), From b57051719fac671db5df4059377f7c4a00280547 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 9 Apr 2024 19:24:01 +0200 Subject: [PATCH 098/154] [wikimedia] support wiki.gg wikis --- docs/supportedsites.md | 6 ++++++ gallery_dl/extractor/wikimedia.py | 10 +++++++--- scripts/supportedsites.py | 1 + test/results/wikigg.py | 24 ++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 test/results/wikigg.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e5665688..9b2a4bc5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1499,6 +1499,12 @@ Consider all listed sites to potentially be NSFW. Articles + + wiki.gg + https://www.wiki.gg/ + Articles + + Super Mario Wiki https://www.mariowiki.com/ diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 4976c078..9370cfb5 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -27,9 +27,9 @@ class WikimediaExtractor(BaseExtractor): if self.category == "wikimedia": self.category = self.root.split(".")[-2] - elif self.category == "fandom": - self.category = \ - "fandom-" + self.root.partition(".")[0].rpartition("/")[2] + elif self.category in ("fandom", "wikigg"): + self.category = "{}-{}".format( + self.category, self.root.partition(".")[0].rpartition("/")[2]) if path.startswith("wiki/"): path = path[5:] @@ -152,6 +152,10 @@ BASE_PATTERN = WikimediaExtractor.update({ "root": None, "pattern": r"[\w-]+\.fandom\.com", }, + "wikigg": { + "root": None, + "pattern": r"\w+\.wiki\.gg", + }, "mariowiki": { "root": "https://www.mariowiki.com", "pattern": r"(?:www\.)?mariowiki\.com", diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 9748f00a..04b421c4 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -143,6 +143,7 @@ CATEGORY_MAP = { "webmshare" : "webmshare", "webtoons" : "Webtoon", "wikiart" : "WikiArt.org", + "wikigg" : "wiki.gg", "wikimediacommons": "Wikimedia Commons", "xbunkr" : "xBunkr", "xhamster" : "xHamster", diff --git a/test/results/wikigg.py b/test/results/wikigg.py new file mode 100644 index 00000000..ffc5cb70 --- /dev/null +++ b/test/results/wikigg.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wiki.gg/wiki/Title", + "#comment" : "for scripts/supportedsites.py", + "#category": ("wikimedia", "wikigg-www", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://hearthstone.wiki.gg/wiki/Flame_Juggler", + "#category": ("wikimedia", "wikigg-hearthstone", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +) From 35d4a706aec4b232c04390ee2060730309eea836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 11 Apr 2024 22:27:49 +0200 Subject: [PATCH 099/154] [pixiv:novel] add 'covers' option (#5373) --- docs/configuration.rst | 16 +++++++++++++--- gallery_dl/extractor/pixiv.py | 14 ++++++++++++++ test/results/pixiv.py | 9 ++++++--- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index faa6472d..524bf357 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2964,14 +2964,24 @@ Description `gppt `__. -extractor.pixiv.embeds ----------------------- +extractor.pixiv.novel.covers +---------------------------- +Type + ``bool`` +Default + ``false`` +Description + Download cover images. + + +extractor.pixiv.novel.embeds +---------------------------- Type ``bool`` Default ``false`` Description - Download images embedded in novels. + Download embedded images. extractor.pixiv.novel.full-series diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 862a7db2..ab7c18e0 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -619,6 +619,7 @@ class PixivNovelExtractor(PixivExtractor): meta_user = self.config("metadata") meta_bookmark = self.config("metadata-bookmark") embeds = self.config("embeds") + covers = self.config("covers") if embeds: headers = { @@ -658,6 +659,19 @@ class PixivNovelExtractor(PixivExtractor): novel["extension"] = "txt" yield Message.Url, "text:" + content, novel + if covers: + path = novel["image_urls"]["large"].partition("/img/")[2] + url = ("https://i.pximg.net/novel-cover-original/img/" + + path.rpartition(".")[0].replace("_master1200", "")) + novel["date_url"] = self._date_from_url(url) + novel["num"] += 1 + novel["suffix"] = "_p{:02}".format(novel["num"]) + novel["_fallback"] = (url + ".png",) + url_jpg = url + ".jpg" + text.nameext_from_url(url_jpg, novel) + yield Message.Url, url_jpg, novel + del novel["_fallback"] + if embeds: desktop = False illusts = {} diff --git a/test/results/pixiv.py b/test/results/pixiv.py index 87a69513..038ddb5d 100644 --- a/test/results/pixiv.py +++ b/test/results/pixiv.py @@ -459,11 +459,14 @@ __tests__ = ( { "#url" : "https://www.pixiv.net/novel/show.php?id=16422450", - "#comment" : "embeds", + "#comment" : "embeds // covers (#5373)", "#category": ("", "pixiv", "novel"), "#class" : pixiv.PixivNovelExtractor, - "#options" : {"embeds": True}, - "#count" : 3, + "#options" : { + "embeds": True, + "covers": True, + }, + "#count" : 4, }, { From e02d2ff45dd835021d8070ebde8d01aa1e20ea5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 11 Apr 2024 23:41:50 +0200 Subject: [PATCH 100/154] [tapas] add 'creator' extractor (#5306) --- docs/supportedsites.md | 2 +- gallery_dl/extractor/tapas.py | 15 +++++++++++++++ test/results/tapas.py | 13 +++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9b2a4bc5..8805aebe 100644 --- a/docs/supportedsites.md +++ 
b/docs/supportedsites.md
@@ -838,7 +838,7 @@ Consider all listed sites to potentially be NSFW.
     <td>Tapas</td>
     <td>https://tapas.io/</td>
-    <td>Episodes, Series</td>
+    <td>Creators, Episodes, Series</td>
     <td>Supported</td>
diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py
index 0a9df20c..167953d2 100644
--- a/gallery_dl/extractor/tapas.py
+++ b/gallery_dl/extractor/tapas.py
@@ -151,3 +151,18 @@ class TapasEpisodeExtractor(TapasExtractor):

     def episode_ids(self):
         return (self.episode_id,)
+
+
+class TapasCreatorExtractor(TapasExtractor):
+    subcategory = "creator"
+    pattern = BASE_PATTERN + r"/(?!series|episode)([^/?#]+)"
+    example = "https://tapas.io/CREATOR"
+
+    def items(self):
+        url = "{}/{}/series".format(self.root, self.groups[0])
+        page = self.request(url).text
+        page = text.extr(page, '<ul class="content-list-wrap', "</ul>")
+
+        data = {"_extractor": TapasSeriesExtractor}
+        for path in text.extract_iter(page, ' href="', '"'):
+            yield Message.Queue, self.root + path, data
diff --git a/test/results/tapas.py b/test/results/tapas.py
index 1278d9f8..d4289383 100644
--- a/test/results/tapas.py
+++ b/test/results/tapas.py
@@ -73,4 +73,17 @@ __tests__ = (
 },

+{
+    "#url" : "https://tapas.io/SANG123/series",
+    "#comment" : "#5306",
+    "#category": ("", "tapas", "creator"),
+    "#class" : tapas.TapasCreatorExtractor,
+    "#urls" : (
+        "https://tapas.io/series/the-return-of-the-disaster-class-hero-novel",
+        "https://tapas.io/series/the-return-of-the-disaster-class-hero",
+        "https://tapas.io/series/tomb-raider-king",
+        "https://tapas.io/series/tomb-raider-king-novel",
+    ),
+},
+
 )

From 85bbb594839e83a407fdd6f9a66489355c5aafa6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Fri, 12 Apr 2024 23:01:22 +0200
Subject: [PATCH 101/154] [twitter] implement 'relogin' option (#5445)

---
 docs/configuration.rst          | 13 +++++
 gallery_dl/extractor/twitter.py | 85 +++++++++++++++++++--------------
 2 files changed, 62 insertions(+), 36 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 524bf357..695f2392 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -3916,6 +3916,19 @@ Description
     * ``"wait"``: Wait until rate limit reset

+extractor.twitter.relogin
+-------------------------
+Type
+    ``bool``
+Default
+    ``true``
+Description
+    | When receiving a "Could not authenticate you" error while logged in with
+      `username & password <extractor.twitter.username & password_>`__,
+    | refresh the current login session and
+      try to continue from where it left off.
+
 extractor.twitter.locked
 ------------------------
 Type
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 9f3e3918..f0baa91b 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1294,42 +1294,62 @@ class TwitterAPI():
         if csrf_token:
             self.headers["x-csrf-token"] = csrf_token

-        if response.status_code < 400:
+        try:
             data = response.json()
+        except ValueError:
+            data = {"errors": ({"message": response.text},)}
+
+        errors = data.get("errors")
+        if not errors:
+            return data
+
+        retry = False
+        for error in errors:
+            msg = error.get("message") or "Unspecified"
+            self.log.debug("API error: '%s'", msg)
+
+            if "this account is temporarily locked" in msg:
+                msg = "Account temporarily locked"
+                if self.extractor.config("locked") != "wait":
+                    raise exception.AuthorizationError(msg)
+                self.log.warning("%s.
Press ENTER to retry.", msg) - try: - input() - except (EOFError, OSError): - pass - retry = True + _login_impl.invalidate(username) + self.extractor.cookies_update( + _login_impl(self.extractor, username, password)) + self.__init__(self.extractor) + retry = True - elif msg.lower().startswith("timeout"): - retry = True + elif msg.lower().startswith("timeout"): + retry = True - if not retry: - return data - elif self.headers["x-twitter-auth-type"]: + if retry: + if self.headers["x-twitter-auth-type"]: self.log.debug("Retrying API request") continue + else: + # fall through to "Login Required" + response.status_code = 404 - # fall through to "Login Required" - response.status_code = 404 - - if response.status_code == 429: + if response.status_code < 400: + return data + elif response.status_code in (403, 404) and \ + not self.headers["x-twitter-auth-type"]: + raise exception.AuthorizationError("Login required") + elif response.status_code == 429: # rate limit exceeded if self.extractor.config("ratelimit") == "abort": raise exception.StopExtraction("Rate limit exceeded") @@ -1339,18 +1359,11 @@ class TwitterAPI(): self.extractor.wait(until=until, seconds=seconds) continue - if response.status_code in (403, 404) and \ - not self.headers["x-twitter-auth-type"]: - raise exception.AuthorizationError("Login required") - # error try: - data = response.json() - errors = ", ".join(e["message"] for e in data["errors"]) - except ValueError: - errors = response.text + errors = ", ".join(e["message"] for e in errors) except Exception: - errors = data.get("errors", "") + pass raise exception.StopExtraction( "%s %s (%s)", response.status_code, response.reason, errors) From 141a93c8fdef4b0dd7b350ee982f942add40b83d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 13 Apr 2024 02:18:44 +0200 Subject: [PATCH 102/154] [docs] update docs/configuration links (#5059, #5369, #5423) --- README.rst | 8 ++++---- docs/supportedsites.md | 6 +++--- gallery_dl/extractor/deviantart.py | 5 ++--- gallery_dl/extractor/reddit.py | 10 +++++----- gallery_dl/extractor/tumblr.py | 6 +++--- scripts/supportedsites.py | 12 ++++++------ 6 files changed, 23 insertions(+), 24 deletions(-) diff --git a/README.rst b/README.rst index 9d017abb..366db3cb 100644 --- a/README.rst +++ b/README.rst @@ -7,8 +7,8 @@ to download image galleries and collections from several image hosting sites (see `Supported Sites `__). It is a cross-platform tool -with many `configuration options `__ -and powerful `filenaming capabilities `__. +with many `configuration options `__ +and powerful `filenaming capabilities `__. |pypi| |build| @@ -234,7 +234,7 @@ Documentation ------------- A list of all available configuration options and their descriptions -can be found in ``__. +can be found at ``__. | For a default configuration file with available options set to their default values, see ``__. @@ -330,7 +330,7 @@ CAPTCHA or similar, or has not been implemented yet, you can use the cookies from a browser login session and input them into *gallery-dl*. This can be done via the -`cookies `__ +`cookies `__ option in your configuration file by specifying - | the path to a Mozilla/Netscape format cookies.txt file exported by a browser addon diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8805aebe..dc24a29e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -947,7 +947,7 @@ Consider all listed sites to potentially be NSFW. 
Wallhaven https://wallhaven.cc/ Collections, individual Images, Search Results, User Profiles - API Key + API Key Wallpaper Cave @@ -965,7 +965,7 @@ Consider all listed sites to potentially be NSFW. Weasyl https://www.weasyl.com/ Favorites, Folders, Journals, Submissions - API Key + API Key webmshare @@ -1319,7 +1319,7 @@ Consider all listed sites to potentially be NSFW. Derpibooru https://derpibooru.org/ Galleries, Posts, Search Results - API Key + API Key Ponybooru diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index ca8acaa5..993885ab 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1457,9 +1457,8 @@ class DeviantartOAuthAPI(): self.log.info( "Register your own OAuth application and use its " "credentials to prevent this error: " - "https://github.com/mikf/gallery-dl/blob/master/do" - "cs/configuration.rst#extractordeviantartclient-id" - "--client-secret") + "https://gdl-org.github.io/docs/configuration.html" + "#extractor-deviantart-client-id-client-secret") else: if log: self.log.error(msg) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index a3a455a9..ce602f6c 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -487,14 +487,14 @@ class RedditAPI(): remaining = response.headers.get("x-ratelimit-remaining") if remaining and float(remaining) < 2: - if self._warn_429: - self._warn_429 = False + self.log.warning("API rate limit exceeded") + if self._warn_429 and self.client_id == self.CLIENT_ID: self.log.info( "Register your own OAuth application and use its " "credentials to prevent this error: " - "https://github.com/mikf/gallery-dl/blob/master" - "/docs/configuration.rst" - "#extractorredditclient-id--user-agent") + "https://gdl-org.github.io/docs/configuration.html" + "#extractor-reddit-client-id-user-agent") + self._warn_429 = False self.extractor.wait( seconds=response.headers["x-ratelimit-reset"]) continue diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index fee0145d..c34910f8 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -447,9 +447,9 @@ class TumblrAPI(oauth.OAuth1API): if api_key == self.API_KEY: self.log.info( "Register your own OAuth application and use its " - "credentials to prevent this error: https://githu" - "b.com/mikf/gallery-dl/blob/master/docs/configurat" - "ion.rst#extractortumblrapi-key--api-secret") + "credentials to prevent this error: " + "https://gdl-org.github.io/docs/configuration.html" + "#extractor-tumblr-api-key-api-secret") if self.extractor.config("ratelimit") == "wait": self.extractor.wait(seconds=reset) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 04b421c4..f3535f1d 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -342,12 +342,12 @@ URL_MAP = { _OAUTH = 'OAuth' _COOKIES = 'Cookies' -_APIKEY_DB = \ - 'API Key' -_APIKEY_WH = \ - 'API Key' -_APIKEY_WY = \ - 'API Key' +_APIKEY_DB = ('API Key') +_APIKEY_WH = ('API Key') +_APIKEY_WY = ('API Key') AUTH_MAP = { "aibooru" : "Supported", From 6dac43ad60c977cfa39eb402601c99fa8457b951 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 13 Apr 2024 17:53:04 +0200 Subject: [PATCH 103/154] [docs] replace AnchorJS with custom script use it in rendered .rst documents as well as in .md ones --- docs/_layouts/default.html | 5 ++--- docs/links.js | 44 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) 
create mode 100644 docs/links.js diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html index 955164a3..8658aefa 100644 --- a/docs/_layouts/default.html +++ b/docs/_layouts/default.html @@ -1,5 +1,5 @@ - + @@ -8,6 +8,7 @@ {% seo %} +
    @@ -15,7 +16,5 @@ {{ content }}
    - - diff --git a/docs/links.js b/docs/links.js new file mode 100644 index 00000000..487907b9 --- /dev/null +++ b/docs/links.js @@ -0,0 +1,44 @@ +"use strict"; + + +function add_header_links() +{ + let style = document.createElement("style"); + style.id = "headerlinks" + document.head.appendChild(style); + style.sheet.insertRule( + "a.headerlink {" + + " visibility: hidden;" + + " text-decoration: none;" + + " font-size: 0.8em;" + + " padding: 0 4px 0 4px;" + + "}"); + style.sheet.insertRule( + ":hover > a.headerlink {" + + " visibility: visible;" + + "}"); + + let headers = document.querySelectorAll("h2, h3, h4, h5, h6"); + for (let i = 0, len = headers.length; i < len; ++i) + { + let header = headers[i]; + + let id = header.id || header.parentNode.id; + if (!id) + continue; + + let link = document.createElement("a"); + link.href = "#" + id; + link.className = "headerlink"; + link.textContent = "¶"; + + header.appendChild(link); + } +} + + +if (document.readyState !== "loading") { + add_header_links(); +} else { + document.addEventListener("DOMContentLoaded", add_header_links); +} From 5227bb6b1d62ecef5b281592b0d001e7f9c101e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 13 Apr 2024 18:51:40 +0200 Subject: [PATCH 104/154] [text] catch general Exceptions --- gallery_dl/text.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/gallery_dl/text.py b/gallery_dl/text.py index b7b5211d..92581875 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -73,7 +73,7 @@ def filename_from_url(url): """Extract the last part of an URL to use as a filename""" try: return url.partition("?")[0].rpartition("/")[2] - except (TypeError, AttributeError): + except Exception: return "" @@ -122,7 +122,7 @@ def extract(txt, begin, end, pos=0): first = txt.index(begin, pos) + len(begin) last = txt.index(end, first) return txt[first:last], last+len(end) - except (ValueError, TypeError, AttributeError): + except Exception: return None, pos @@ -131,7 +131,7 @@ def extr(txt, begin, end, default=""): try: first = txt.index(begin) + len(begin) return txt[first:txt.index(end, first)] - except (ValueError, TypeError, AttributeError): + except Exception: return default @@ -141,7 +141,7 @@ def rextract(txt, begin, end, pos=-1): first = txt.rindex(begin, 0, pos) last = txt.index(end, first + lbeg) return txt[first + lbeg:last], first - except (ValueError, TypeError, AttributeError): + except Exception: return None, pos @@ -167,7 +167,7 @@ def extract_iter(txt, begin, end, pos=0): last = index(end, first) pos = last + lend yield txt[first:last] - except (ValueError, TypeError, AttributeError): + except Exception: return @@ -180,7 +180,7 @@ def extract_from(txt, pos=0, default=""): last = index(end, first) pos = last + len(end) return txt[first:last] - except (ValueError, TypeError, AttributeError): + except Exception: return default return extr @@ -200,7 +200,7 @@ def parse_bytes(value, default=0, suffixes="bkmgtp"): """Convert a bytes-amount ("500k", "2.5M", ...) 
to int""" try: last = value[-1].lower() - except (TypeError, LookupError): + except Exception: return default if last in suffixes: @@ -221,7 +221,7 @@ def parse_int(value, default=0): return default try: return int(value) - except (ValueError, TypeError): + except Exception: return default @@ -231,7 +231,7 @@ def parse_float(value, default=0.0): return default try: return float(value) - except (ValueError, TypeError): + except Exception: return default @@ -242,7 +242,7 @@ def parse_query(qs): for key, value in urllib.parse.parse_qsl(qs): if key not in result: result[key] = value - except AttributeError: + except Exception: pass return result @@ -251,7 +251,7 @@ def parse_timestamp(ts, default=None): """Create a datetime object from a unix timestamp""" try: return datetime.datetime.utcfromtimestamp(int(ts)) - except (TypeError, ValueError, OverflowError): + except Exception: return default From 63ac06643fe12b28e89c0d261799b4ec31ef258a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 13 Apr 2024 18:59:18 +0200 Subject: [PATCH 105/154] compute tempfile path only once --- gallery_dl/extractor/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 18cf0e3a..30eae7f0 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -430,10 +430,11 @@ class Extractor(): if not path: return + path_tmp = path + ".tmp" try: - with open(path + ".tmp", "w") as fp: + with open(path_tmp, "w") as fp: util.cookiestxt_store(fp, self.cookies) - os.replace(path + ".tmp", path) + os.replace(path_tmp, path) except OSError as exc: self.log.warning("cookies: %s", exc) From a3f580254a555e5d1bf1793f921954b398cc92f4 Mon Sep 17 00:00:00 2001 From: Aidan Harris Date: Sun, 14 Apr 2024 08:55:50 +0000 Subject: [PATCH 106/154] Add warnings flag This commit adds a warnings flag It can be combined with -q / --quiet to display warnings. The intent is to provide a silent option that still surfaces warning and error messages so that they are visible in logs. 
--- gallery_dl/__init__.py | 2 +- gallery_dl/option.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 19ea77b2..fb3c5cb7 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -113,7 +113,7 @@ def main(): # loglevels output.configure_logging(args.loglevel) - if args.loglevel >= logging.ERROR: + if args.loglevel >= logging.WARNING: config.set(("output",), "mode", "null") config.set(("downloader",), "progress", None) elif args.loglevel <= logging.DEBUG: diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 72a602f2..9825cd23 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -255,6 +255,12 @@ def build_parser(): action="store_const", const=logging.DEBUG, help="Print various debugging information", ) + output.add_argument( + "-w", "--warning", + dest="loglevel", + action="store_const", const=logging.WARNING, + help="Print warnings", + ) output.add_argument( "-g", "--get-urls", dest="list_urls", action="count", From a2affdcef3bef31b46f85649af16149348a64f23 Mon Sep 17 00:00:00 2001 From: Aidan Harris Date: Sun, 14 Apr 2024 13:15:07 +0000 Subject: [PATCH 107/154] re-order verbose and warning options --- gallery_dl/option.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 9825cd23..9ede0622 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -250,16 +250,16 @@ def build_parser(): help="Activate quiet mode", ) output.add_argument( - "-v", "--verbose", + "-w", "--warning", dest="loglevel", - action="store_const", const=logging.DEBUG, - help="Print various debugging information", + action="store_const", const=logging.WARNING, + help="Print only warnings and errors", ) output.add_argument( - "-w", "--warning", + "-v", "--verbose", dest="loglevel", - action="store_const", const=logging.WARNING, - help="Print warnings", + action="store_const", const=logging.DEBUG, + help="Print various debugging information", ) output.add_argument( "-g", "--get-urls", From 257e9fb435ee3f20c0fe87ced03a9c9e69231226 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 15 Apr 2024 17:55:19 +0200 Subject: [PATCH 108/154] [gelbooru] improve pagination logic for meta tags (#5478) similar to 494acabd38fefdc8c1b482b584695aaa788f9112 --- gallery_dl/extractor/gelbooru.py | 35 +++++++++++++++++++++++++++----- test/results/gelbooru.py | 16 +++++++++++++++ 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index 2459a61f..37c776e6 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -51,19 +51,44 @@ class GelbooruBase(): params["pid"] = self.page_start params["limit"] = self.per_page limit = self.per_page // 2 + pid = False + + if "tags" in params: + tags = params["tags"].split() + op = "<" + id = False + + for tag in tags: + if tag.startswith("sort:"): + if tag == "sort:id:asc": + op = ">" + elif tag == "sort:id" or tag.startswith("sort:id:"): + op = "<" + else: + pid = True + elif tag.startswith("id:"): + id = True + + if not pid: + if id: + tag = "id:" + op + tags = [t for t in tags if not t.startswith(tag)] + tags = "{} id:{}".format(" ".join(tags), op) while True: posts = self._api_request(params) - for post in posts: - yield post + yield from posts if len(posts) < limit: return - if "pid" in params: - del params["pid"] - params["tags"] = "{} id:<{}".format(self.tags, post["id"]) + if pid: 
+ params["pid"] += 1 + else: + if "pid" in params: + del params["pid"] + params["tags"] = tags + str(posts[-1]["id"]) def _pagination_html(self, params): url = self.root + "/index.php" diff --git a/test/results/gelbooru.py b/test/results/gelbooru.py index 6302d56f..3f09ea69 100644 --- a/test/results/gelbooru.py +++ b/test/results/gelbooru.py @@ -39,6 +39,22 @@ __tests__ = ( "#sha1_url": "845a61aa1f90fb4ced841e8b7e62098be2e967bf", }, +{ + "#url" : "https://gelbooru.com/index.php?page=post&s=list&tags=id:>=67800+id:<=68000", + "#comment" : "meta tags (#5478)", + "#category": ("booru", "gelbooru", "tag"), + "#class" : gelbooru.GelbooruTagExtractor, + "#count" : 187, +}, + +{ + "#url" : "https://gelbooru.com/index.php?page=post&s=list&tags=id:>=67800+id:<=68000+sort:id:asc", + "#comment" : "meta + sort tags (#5478)", + "#category": ("booru", "gelbooru", "tag"), + "#class" : gelbooru.GelbooruTagExtractor, + "#count" : 187, +}, + { "#url" : "https://gelbooru.com/index.php?page=pool&s=show&id=761", "#category": ("booru", "gelbooru", "pool"), From b38a91735581ae1d7df6117d048378ac61374089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 16 Apr 2024 00:02:48 +0200 Subject: [PATCH 109/154] [common] add Extractor.input() method --- gallery_dl/extractor/common.py | 10 ++++++++++ gallery_dl/extractor/readcomiconline.py | 5 +---- gallery_dl/extractor/twitter.py | 7 ++----- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 25729eb7..7982ca7d 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -14,6 +14,7 @@ import ssl import time import netrc import queue +import getpass import logging import datetime import requests @@ -250,6 +251,15 @@ class Extractor(): seconds, reason) time.sleep(seconds) + def input(self, prompt, echo=True): + if echo: + try: + return input(prompt) + except (EOFError, OSError): + return None + else: + return getpass.getpass(prompt) + def _get_auth_info(self): """Return authentication information as (username, password) tuple""" username = self.config("username") diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 35698605..115de9a2 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -35,10 +35,7 @@ class ReadcomiconlineBase(): self.log.warning( "Redirect to \n%s\nVisit this URL in your browser, solve " "the CAPTCHA, and press ENTER to continue", response.url) - try: - input() - except (EOFError, OSError): - pass + self.input() else: raise exception.StopExtraction( "Redirect to \n%s\nVisit this URL in your browser and " diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index f0baa91b..42cf2fa8 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1312,11 +1312,8 @@ class TwitterAPI(): msg = "Account temporarily locked" if self.extractor.config("locked") != "wait": raise exception.AuthorizationError(msg) - self.log.warning("%s. 
Press ENTER to retry.", msg) - try: - input() - except (EOFError, OSError): - pass + self.log.warning(msg) + self.extractor.input("Press ENTER to retry.") retry = True elif "Could not authenticate you" in msg: From 9e5d65fbf3a83958d98ad28dadea89a2a14c4fb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 16 Apr 2024 01:25:04 +0200 Subject: [PATCH 110/154] [twitter] improve username & password login procedure (#5445) - handle more subtasks - support 2FA - support email verification codes --- gallery_dl/extractor/twitter.py | 133 +++++++++++++++++--------------- 1 file changed, 69 insertions(+), 64 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 42cf2fa8..bcd6cba0 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1710,23 +1710,24 @@ class TwitterAPI(): @cache(maxage=365*86400, keyarg=1) def _login_impl(extr, username, password): - - import re import random - if re.fullmatch(r"[\w.%+-]+@[\w.-]+\.\w{2,}", username): - extr.log.warning( - "Login with email is no longer possible. " - "You need to provide your username or phone number instead.") + def process(data, params=None): + response = extr.request( + url, params=params, headers=headers, json=data, + method="POST", fatal=None) - def process(response): try: data = response.json() except ValueError: data = {"errors": ({"message": "Invalid response"},)} else: if response.status_code < 400: - return data["flow_token"] + try: + return (data["flow_token"], + data["subtasks"][0]["subtask_id"]) + except LookupError: + pass errors = [] for error in data.get("errors") or (): @@ -1735,9 +1736,13 @@ def _login_impl(extr, username, password): extr.log.debug(response.text) raise exception.AuthenticationError(", ".join(errors)) - extr.cookies.clear() + cookies = extr.cookies + cookies.clear() api = TwitterAPI(extr) api._authenticate_guest() + + url = "https://api.twitter.com/1.1/onboarding/task.json" + params = {"flow_name": "login"} headers = api.headers extr.log.info("Logging in as %s", username) @@ -1794,31 +1799,18 @@ def _login_impl(extr, username, password): "web_modal": 1, }, } - url = "https://api.twitter.com/1.1/onboarding/task.json?flow_name=login" - response = extr.request(url, method="POST", headers=headers, json=data) - data = { - "flow_token": process(response), - "subtask_inputs": [ - { - "subtask_id": "LoginJsInstrumentationSubtask", + flow_token, subtask = process(data, params) + while not cookies.get("auth_token"): + if subtask == "LoginJsInstrumentationSubtask": + data = { "js_instrumentation": { "response": "{}", "link": "next_link", }, - }, - ], - } - url = "https://api.twitter.com/1.1/onboarding/task.json" - response = extr.request( - url, method="POST", headers=headers, json=data, fatal=None) - - # username - data = { - "flow_token": process(response), - "subtask_inputs": [ - { - "subtask_id": "LoginEnterUserIdentifierSSO", + } + elif subtask == "LoginEnterUserIdentifierSSO": + data = { "settings_list": { "setting_responses": [ { @@ -1830,48 +1822,61 @@ def _login_impl(extr, username, password): ], "link": "next_link", }, - }, - ], - } - # url = "https://api.twitter.com/1.1/onboarding/task.json" - extr.sleep(random.uniform(2.0, 4.0), "login (username)") - response = extr.request( - url, method="POST", headers=headers, json=data, fatal=None) - - # password - data = { - "flow_token": process(response), - "subtask_inputs": [ - { - "subtask_id": "LoginEnterPassword", + } + elif subtask == "LoginEnterPassword": + data = { 
"enter_password": { "password": password, "link": "next_link", }, - }, - ], - } - # url = "https://api.twitter.com/1.1/onboarding/task.json" - extr.sleep(random.uniform(2.0, 4.0), "login (password)") - response = extr.request( - url, method="POST", headers=headers, json=data, fatal=None) - - # account duplication check ? - data = { - "flow_token": process(response), - "subtask_inputs": [ - { - "subtask_id": "AccountDuplicationCheck", + } + elif subtask == "LoginEnterAlternateIdentifierSubtask": + alt = extr.input( + "Alternate Identifier (username, email, phone number): ") + data = { + "enter_text": { + "text": alt, + "link": "next_link", + }, + } + elif subtask == "LoginTwoFactorAuthChallenge": + data = { + "enter_text": { + "text": extr.input("2FA Token: "), + "link": "next_link", + }, + } + elif subtask == "LoginAcid": + data = { + "enter_text": { + "text": extr.input("Email Verification Code: "), + "link": "next_link", + }, + } + elif subtask == "AccountDuplicationCheck": + data = { "check_logged_in_account": { "link": "AccountDuplicationCheck_false", }, - }, - ], - } - # url = "https://api.twitter.com/1.1/onboarding/task.json" - response = extr.request( - url, method="POST", headers=headers, json=data, fatal=None) - process(response) + } + elif subtask == "ArkoseLogin": + raise exception.AuthenticationError("Login requires CAPTCHA") + elif subtask == "DenyLoginSubtask": + raise exception.AuthenticationError("Login rejected as suspicious") + elif subtask == "ArkoseLogin": + raise exception.AuthenticationError("No auth token cookie") + else: + raise exception.StopExtraction("Unrecognized subtask %s", subtask) + + inputs = {"subtask_id": subtask} + inputs.update(data) + data = { + "flow_token": flow_token, + "subtask_inputs": [inputs], + } + + extr.sleep(random.uniform(1.0, 3.0), "login ({})".format(subtask)) + flow_token, subtask = process(data) return { cookie.name: cookie.value From 68f4208251b72c43c608a5dd2e69dc7932c5fcae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 16 Apr 2024 17:51:14 +0200 Subject: [PATCH 111/154] [common] update Extractor.wait() message format --- gallery_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 7982ca7d..90f798c9 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -220,7 +220,7 @@ class Extractor(): raise exception.HttpError(msg, response) def wait(self, seconds=None, until=None, adjust=1.0, - reason="rate limit reset"): + reason="rate limit"): now = time.time() if seconds: @@ -243,7 +243,7 @@ class Extractor(): if reason: t = datetime.datetime.fromtimestamp(until).time() isotime = "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second) - self.log.info("Waiting until %s for %s.", isotime, reason) + self.log.info("Waiting until %s (%s)", isotime, reason) time.sleep(seconds) def sleep(self, seconds, reason): From 923c6f32141b113d870d8dddfdd1ac543b55a4fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 16 Apr 2024 18:37:59 +0200 Subject: [PATCH 112/154] [common] simplify 'status_code' check in Extractor.request() --- gallery_dl/extractor/common.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 90f798c9..7d9367c7 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -177,9 +177,10 @@ class Extractor(): code = response.status_code if 
self._write_pages:
                 self._dump_response(response)
 
-            if 200 <= code < 400 or fatal is None and \
-                    (400 <= code < 500) or not fatal and \
-                    (400 <= code < 429 or 431 <= code < 500):
+            if (
+                code < 400 or
+                code < 500 and (not fatal and code != 429 or fatal is None)
+            ):
                 if encoding:
                     response.encoding = encoding
                 return response

From 566472f080c675d25a3c0785ce0884029a7cd3a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Tue, 16 Apr 2024 18:41:28 +0200
Subject: [PATCH 113/154] [common] add 'sleep-429' option (#5160)

---
 docs/configuration.rst         | 13 ++++++++++++-
 gallery_dl/extractor/common.py | 24 +++++++++++++++++-------
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 695f2392..a013f6c8 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -358,13 +358,24 @@ Description
     i.e. before starting a new extractor.
 
 
+extractor.*.sleep-429
+---------------------
+Type
+    |Duration|_
+Default
+    ``60``
+Description
+    Number of seconds to sleep when receiving a `429 Too Many Requests`
+    response before `retrying <extractor.*.retries_>`__ the request.
+
+
 extractor.*.sleep-request
 -------------------------
 Type
     |Duration|_
 Default
     * ``"0.5-1.5"``
-        ``[Danbooru]``, ``[E621]``, ``[foolfuuka]``, ``itaku``,
+        ``[Danbooru]``, ``[E621]``, ``[foolfuuka]:search``, ``itaku``,
         ``newgrounds``, ``[philomena]``, ``pixiv:novel``, ``plurk``,
         ``poipiku`` , ``pornpics``, ``soundgasm``, ``urlgalleries``,
         ``vk``, ``zerochan``
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 7d9367c7..fe68f5ac 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -198,7 +198,10 @@ class Extractor():
                     if b'name="captcha-bypass"' in content:
                         self.log.warning("Cloudflare CAPTCHA")
                         break
-                if code not in retry_codes and code < 500:
+
+                if code == 429 and self._interval_429:
+                    pass
+                elif code not in retry_codes and code < 500:
                     break
 
             finally:
@@ -208,14 +211,18 @@ class Extractor():
             if tries > retries:
                 break
 
+            seconds = tries
             if self._interval:
-                seconds = self._interval()
-                if seconds < tries:
-                    seconds = tries
+                s = self._interval()
+                if seconds < s:
+                    seconds = s
+
+            if code == 429 and self._interval_429:
+                s = self._interval_429()
+                if seconds < s:
+                    seconds = s
+                self.wait(seconds=seconds, reason="429 Too Many Requests")
             else:
-                seconds = tries
-
-            self.sleep(seconds, "retry")
+                self.sleep(seconds, "retry")
             tries += 1
 
         raise exception.HttpError(msg, response)
@@ -293,6 +300,9 @@ class Extractor():
             self.config("sleep-request", self.request_interval),
             self.request_interval_min,
         )
+        self._interval_429 = util.build_duration_func(
+            self.config("sleep-429", 60),
+        )
 
         if self._retries < 0:
             self._retries = float("inf")

From a5071c9ca05426b9646e4c750083855a878a089c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Thu, 18 Apr 2024 15:42:53 +0200
Subject: [PATCH 114/154] [common] fix NameError in Extractor.request()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

… when accessing 'code' after a requests exception was raised.
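
A minimal sketch of the failure mode (simplified names; not the actual
request() body):

```
try:
    response = session.request(method, url)
except requests.exceptions.ConnectionError as exc:
    msg = exc
    code = 0   # the fix: bind 'code' on the exception path as well
else:
    code = response.status_code

# without 'code = 0' above, this check raised NameError
# whenever the request itself failed
if code == 429 and interval_429:
    ...
```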
Caused by the changes in 566472f080c675d25a3c0785ce0884029a7cd3a5 --- gallery_dl/extractor/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index fe68f5ac..25578536 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -171,6 +171,7 @@ class Extractor(): requests.exceptions.ChunkedEncodingError, requests.exceptions.ContentDecodingError) as exc: msg = exc + code = 0 except (requests.exceptions.RequestException) as exc: raise exception.HttpError(exc) else: From a7d8cbab0ea9989c8be8d61a101976b0e8acd405 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 18 Apr 2024 15:45:36 +0200 Subject: [PATCH 115/154] [common] show full URL in Extractor.request() error messages --- gallery_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 25578536..aab27779 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -188,7 +188,8 @@ class Extractor(): if notfound and code == 404: raise exception.NotFoundError(notfound) - msg = "'{} {}' for '{}'".format(code, response.reason, url) + msg = "'{} {}' for '{}'".format( + code, response.reason, response.url) server = response.headers.get("Server") if server and server.startswith("cloudflare") and \ code in (403, 503): From bffadf35b7c2889552f39965c395af422741262f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 19 Apr 2024 16:08:31 +0200 Subject: [PATCH 116/154] [hotleak] download files with 404 status code (#5395) --- gallery_dl/downloader/http.py | 4 +++- gallery_dl/extractor/hotleak.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 0ff5dd9a..657e3e78 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -98,6 +98,8 @@ class HttpDownloader(DownloaderBase): metadata = self.metadata kwdict = pathfmt.kwdict + expected_status = kwdict.get( + "_http_expected_status", ()) adjust_extension = kwdict.get( "_http_adjust_extension", self.adjust_extension) @@ -151,7 +153,7 @@ class HttpDownloader(DownloaderBase): # check response code = response.status_code - if code == 200: # OK + if code == 200 or code in expected_status: # OK offset = 0 size = response.headers.get("Content-Length") elif code == 206: # Partial Content diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py index 6d3184d9..a2b51be2 100644 --- a/gallery_dl/extractor/hotleak.py +++ b/gallery_dl/extractor/hotleak.py @@ -23,6 +23,7 @@ class HotleakExtractor(Extractor): def items(self): for post in self.posts(): + post["_http_expected_status"] = (404,) yield Message.Directory, post yield Message.Url, post["url"], post From c9d3b5e5d990cc1f54c277544b43782d2f90904b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 19 Apr 2024 16:41:31 +0200 Subject: [PATCH 117/154] [pixiv] change 'sanity_level' debug message to a warning (#5180) --- gallery_dl/extractor/pixiv.py | 5 +++-- test/results/pixiv.py | 8 ++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index ab7c18e0..d732894a 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -104,8 +104,9 @@ class PixivExtractor(Extractor): elif work["page_count"] == 1: url = meta_single_page["original_image_url"] if url == 
url_sanity: - self.log.debug("Skipping 'sanity_level' warning (%s)", - work["id"]) + self.log.warning( + "Unable to download work %s ('sanity_level' warning)", + work["id"]) continue work["date_url"] = self._date_from_url(url) yield Message.Url, url, text.nameext_from_url(url, work) diff --git a/test/results/pixiv.py b/test/results/pixiv.py index 038ddb5d..0674369b 100644 --- a/test/results/pixiv.py +++ b/test/results/pixiv.py @@ -163,6 +163,14 @@ __tests__ = ( "#count" : ">= 10", }, +{ + "#url" : "https://www.pixiv.net/artworks/966412", + "#comment" : "limit_sanity_level_360.png (#4327, #5180)", + "#category": ("", "pixiv", "work"), + "#class" : pixiv.PixivWorkExtractor, + "#count" : 0, +}, + { "#url" : "https://www.pixiv.net/en/artworks/966412", "#category": ("", "pixiv", "work"), From 347af7f5c8423ce0f2ee9119d37143c0e319b590 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 19 Apr 2024 21:42:22 +0200 Subject: [PATCH 118/154] [twitter] handle missing 'expanded_url' fields (#5463, #5490) --- gallery_dl/extractor/twitter.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index bcd6cba0..e5799e22 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -243,8 +243,8 @@ class TwitterExtractor(Extractor): # collect URLs from entities for url in tweet["entities"].get("urls") or (): - url = url["expanded_url"] - if "//twitpic.com/" not in url or "/photos/" in url: + url = url.get("expanded_url") or url.get("url") or "" + if not url or "//twitpic.com/" not in url or "/photos/" in url: continue if url.startswith("http:"): url = "https" + url[4:] @@ -336,7 +336,10 @@ class TwitterExtractor(Extractor): urls = entities.get("urls") if urls: for url in urls: - content = content.replace(url["url"], url["expanded_url"]) + try: + content = content.replace(url["url"], url["expanded_url"]) + except KeyError: + pass txt, _, tco = content.rpartition(" ") tdata["content"] = txt if tco.startswith("https://t.co/") else content @@ -403,7 +406,10 @@ class TwitterExtractor(Extractor): urls = entities["description"].get("urls") if urls: for url in urls: - descr = descr.replace(url["url"], url["expanded_url"]) + try: + descr = descr.replace(url["url"], url["expanded_url"]) + except KeyError: + pass udata["description"] = descr if "url" in entities: From 068ccfe0b3b7b7438baa0deb75b565fc3dfd01d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 19 Apr 2024 22:41:39 +0200 Subject: [PATCH 119/154] [tests] allow filtering extractor result tests by URL or comment python test_results.py twitter:+/i/web/ python test_results.py twitter:~twitpic --- test/test_results.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/test/test_results.py b/test/test_results.py index 05946182..8175e3f2 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -442,7 +442,15 @@ def generate_tests(): tests = results.category(category) if subcategory: - tests = [t for t in tests if t["#category"][-1] == subcategory] + if subcategory.startswith("+"): + url = subcategory[1:] + tests = [t for t in tests if url in t["#url"]] + elif subcategory.startswith("~"): + com = subcategory[1:] + tests = [t for t in tests + if "#comment" in t and com in t["#comment"].lower()] + else: + tests = [t for t in tests if t["#category"][-1] == subcategory] else: tests = results.all() From ddffeeaa7befe26079c8ec19cee83d78ba118e7f Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 20 Apr 2024 00:05:39 +0200 Subject: [PATCH 120/154] [exhentai] detect CAPTCHAs during login (#5492) --- gallery_dl/extractor/exhentai.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index acad95ce..2cce281b 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -95,7 +95,11 @@ class ExhentaiExtractor(Extractor): self.cookies.clear() response = self.request(url, method="POST", headers=headers, data=data) - if b"You are now logged in as:" not in response.content: + content = response.content + if b"You are now logged in as:" not in content: + if b"The captcha was not entered correctly" in content: + raise exception.AuthenticationError( + "CAPTCHA required. Use cookies instead.") raise exception.AuthenticationError() # collect more cookies From 14b38264e06aa2366202c1a6ae58b75b0973214b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 20 Apr 2024 20:27:37 +0200 Subject: [PATCH 121/154] [output] extend 'output.colors' (#2566) allow specifying ANSI colors for all loglevels (debug, info, warning, error) --- docs/configuration.rst | 17 +++++++++++++++-- gallery_dl/output.py | 35 +++++++++++++++++++++++++++-------- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index a013f6c8..1f069259 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -4923,8 +4923,21 @@ Type Default ``{"success": "1;32", "skip": "2"}`` Description - Controls the `ANSI colors `__ - used with |mode: color|__ for successfully downloaded or skipped files. + Controls the + `ANSI colors `__ + used for various outputs. + + Output for |mode: color|__ + + * ``success``: successfully downloaded files + * ``skip``: skipped files + + Logging Messages: + + * ``debug``: debug logging messages + * ``info``: info logging messages + * ``warning``: warning logging messages + * ``error``: error logging messages .. __: `output.mode`_ diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 2bcc222f..4ef71db0 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -21,6 +21,7 @@ from . 
import config, util, formatter LOG_FORMAT = "[{name}][{levelname}] {message}" LOG_FORMAT_DATE = "%Y-%m-%d %H:%M:%S" LOG_LEVEL = logging.INFO +LOG_LEVELS = ("debug", "info", "warning", "error") class Logger(logging.Logger): @@ -129,7 +130,7 @@ class Formatter(logging.Formatter): def __init__(self, fmt, datefmt): if isinstance(fmt, dict): - for key in ("debug", "info", "warning", "error"): + for key in LOG_LEVELS: value = fmt[key] if key in fmt else LOG_FORMAT fmt[key] = (formatter.parse(value).format_map, "{asctime" in value) @@ -187,16 +188,34 @@ def configure_logging(loglevel): # stream logging handler handler = root.handlers[0] opts = config.interpolate(("output",), "log") + + colors = config.interpolate(("output",), "colors") or {} + if colors and not opts: + opts = LOG_FORMAT + if opts: if isinstance(opts, str): - opts = {"format": opts} - if handler.level == LOG_LEVEL and "level" in opts: + logfmt = opts + opts = {} + elif "format" in opts: + logfmt = opts["format"] + else: + logfmt = LOG_FORMAT + + if not isinstance(logfmt, dict) and colors: + ansifmt = "\033[{}m{}\033[0m".format + lf = {} + for level in LOG_LEVELS: + c = colors.get(level) + lf[level] = ansifmt(c, logfmt) if c else logfmt + logfmt = lf + + handler.setFormatter(Formatter( + logfmt, opts.get("format-date", LOG_FORMAT_DATE))) + + if "level" in opts and handler.level == LOG_LEVEL: handler.setLevel(opts["level"]) - if "format" in opts or "format-date" in opts: - handler.setFormatter(Formatter( - opts.get("format", LOG_FORMAT), - opts.get("format-date", LOG_FORMAT_DATE), - )) + if minlevel > handler.level: minlevel = handler.level From 20e2c0042b590ec243fc1fe18ce34c010739db9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 20 Apr 2024 20:49:28 +0200 Subject: [PATCH 122/154] [output] enable colors by default --- docs/configuration.rst | 14 ++++++++++++-- gallery_dl/__init__.py | 2 +- gallery_dl/output.py | 27 ++++++++++++++++++++++----- 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 1f069259..ec32e3b9 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -4921,7 +4921,17 @@ output.colors Type ``object`` (`key` -> `ANSI color`) Default - ``{"success": "1;32", "skip": "2"}`` + .. code:: json + + { + "success": "1;32", + "skip" : "2", + "debug" : "0;37", + "info" : "1;37", + "warning": "1;33", + "error" : "1;31" + } + Description Controls the `ANSI colors `__ @@ -4947,7 +4957,7 @@ output.ansi Type ``bool`` Default - ``false`` + ``true`` Description | On Windows, enable ANSI escape sequences and colored output | by setting the ``ENABLE_VIRTUAL_TERMINAL_PROCESSING`` flag for stdout and stderr. diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index fb3c5cb7..01a51904 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -86,7 +86,7 @@ def main(): signal.signal(signal_num, signal.SIG_IGN) # enable ANSI escape sequences on Windows - if util.WINDOWS and config.get(("output",), "ansi"): + if util.WINDOWS and config.get(("output",), "ansi", True): from ctypes import windll, wintypes, byref kernel32 = windll.kernel32 mode = wintypes.DWORD() diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 4ef71db0..5882d142 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -14,6 +14,15 @@ import functools import unicodedata from . 
import config, util, formatter +COLORS_DEFAULT = { + "success": "1;32", + "skip" : "2", + "debug" : "0;37", + "info" : "1;37", + "warning": "1;33", + "error" : "1;31", +} + # -------------------------------------------------------------------- # Logging @@ -189,7 +198,9 @@ def configure_logging(loglevel): handler = root.handlers[0] opts = config.interpolate(("output",), "log") - colors = config.interpolate(("output",), "colors") or {} + colors = config.interpolate(("output",), "colors") + if colors is None: + colors = COLORS_DEFAULT if colors and not opts: opts = LOG_FORMAT @@ -326,9 +337,12 @@ def select(): mode = config.get(("output",), "mode") if mode is None or mode == "auto": - if hasattr(sys.stdout, "isatty") and sys.stdout.isatty(): - output = ColorOutput() if ANSI else TerminalOutput() - else: + try: + if sys.stdout.isatty(): + output = ColorOutput() if ANSI else TerminalOutput() + else: + output = PipeOutput() + except Exception: output = PipeOutput() elif isinstance(mode, dict): output = CustomOutput(mode) @@ -407,7 +421,10 @@ class ColorOutput(TerminalOutput): def __init__(self): TerminalOutput.__init__(self) - colors = config.get(("output",), "colors") or {} + colors = config.interpolate(("output",), "colors") + if colors is None: + colors = COLORS_DEFAULT + self.color_skip = "\033[{}m".format( colors.get("skip", "2")) self.color_success = "\r\033[{}m".format( From bef0bd3b495156e5751f23cb51c8d735107b5540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 20 Apr 2024 21:00:26 +0200 Subject: [PATCH 123/154] add '--no-colors' command-line option --- docs/options.md | 5 +++-- gallery_dl/__init__.py | 5 +++++ gallery_dl/option.py | 5 +++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/options.md b/docs/options.md index 45ce7eca..5b2de40b 100644 --- a/docs/options.md +++ b/docs/options.md @@ -29,6 +29,7 @@ ## Output Options: -q, --quiet Activate quiet mode + -w, --warning Print only warnings and errors -v, --verbose Print various debugging information -g, --get-urls Print URLs instead of downloading -G, --resolve-urls Print URLs instead of downloading; resolve @@ -48,12 +49,12 @@ extractors but cannot be handled, to FILE --write-pages Write downloaded intermediary pages to files in the current directory to debug problems + --no-colors Do not emit ANSI color codes in output ## Downloader Options: -r, --limit-rate RATE Maximum download rate (e.g. 500k or 2.5M) -R, --retries N Maximum number of retries for failed HTTP - requests or -1 for infinite retries (default: - 4) + requests or -1 for infinite retries (default: 4) --http-timeout SECONDS Timeout for HTTP connections (default: 30.0) --sleep SECONDS Number of seconds to wait before each download. 
This can be either a constant value or a range diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 01a51904..c9439f43 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -38,6 +38,11 @@ def main(): except ImportError: import toml config.load(args.configs_toml, strict=True, loads=toml.loads) + if not args.colors: + output.ANSI = False + config.set((), "colors", False) + if util.WINDOWS: + config.set(("output",), "ansi", False) if args.filename: filename = args.filename if filename == "/O": diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 9ede0622..95165c22 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -325,6 +325,11 @@ def build_parser(): help=("Write downloaded intermediary pages to files " "in the current directory to debug problems"), ) + output.add_argument( + "--no-colors", + dest="colors", action="store_false", + help=("Do not emit ANSI color codes in output"), + ) downloader = parser.add_argument_group("Downloader Options") downloader.add_argument( From cd241bea0acf955885cc260cdbe744b5cf878335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 25 Apr 2024 01:01:35 +0200 Subject: [PATCH 124/154] [downloader:http] add MIME type and signature for .m4v files (#5505) --- gallery_dl/downloader/http.py | 5 ++++- test/test_downloader.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 657e3e78..54750ac7 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -401,6 +401,8 @@ MIME_TYPES = { "video/webm": "webm", "video/ogg" : "ogg", "video/mp4" : "mp4", + "video/m4v" : "m4v", + "video/x-m4v": "m4v", "video/quicktime": "mov", "audio/wav" : "wav", @@ -443,7 +445,8 @@ SIGNATURE_CHECKS = { "cur" : lambda s: s[0:4] == b"\x00\x00\x02\x00", "psd" : lambda s: s[0:4] == b"8BPS", "mp4" : lambda s: (s[4:8] == b"ftyp" and s[8:11] in ( - b"mp4", b"avc", b"iso", b"M4V")), + b"mp4", b"avc", b"iso")), + "m4v" : lambda s: s[4:11] == b"ftypM4V", "mov" : lambda s: s[4:12] == b"ftypqt ", "webm": lambda s: s[0:4] == b"\x1A\x45\xDF\xA3", "ogg" : lambda s: s[0:4] == b"OggS", diff --git a/test/test_downloader.py b/test/test_downloader.py index 8027af50..f63b68a2 100644 --- a/test/test_downloader.py +++ b/test/test_downloader.py @@ -303,7 +303,7 @@ SAMPLES = { ("mp4" , b"????ftypmp4"), ("mp4" , b"????ftypavc1"), ("mp4" , b"????ftypiso3"), - ("mp4" , b"????ftypM4V"), + ("m4v" , b"????ftypM4V"), ("mov" , b"????ftypqt "), ("webm", b"\x1A\x45\xDF\xA3"), ("ogg" , b"OggS"), From 85550a37c4b462013a07b422d172c8582464b12a Mon Sep 17 00:00:00 2001 From: Delphox Date: Thu, 25 Apr 2024 13:19:40 -0300 Subject: [PATCH 125/154] [twitter] support fixvx urls --- gallery_dl/extractor/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index e5799e22..688478cd 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -16,7 +16,7 @@ import json import re BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?" 
- r"(?:(?:[fv]x)?twitter|(?:fixup)?x)\.com") + r"(?:(?:[fv]x)?twitter|(?:fix(?:up|v))?x)\.com") class TwitterExtractor(Extractor): From 6969963125b55d5e530fe72236bf5363088cd3d3 Mon Sep 17 00:00:00 2001 From: Delphox Date: Thu, 25 Apr 2024 13:25:39 -0300 Subject: [PATCH 126/154] [furaffinity] support fxfuraffinity and fxraffinity urls --- gallery_dl/extractor/furaffinity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 56721d0f..83181822 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -11,7 +11,7 @@ from .common import Extractor, Message from .. import text, util -BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?furaffinity\.net" +BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?f(?:[xu]|xfu)raffinity\.net" class FuraffinityExtractor(Extractor): From 1886721d8206c45eaf20637a512903b0586c5169 Mon Sep 17 00:00:00 2001 From: Delphox Date: Thu, 25 Apr 2024 13:28:30 -0300 Subject: [PATCH 127/154] update tests --- test/results/furaffinity.py | 12 ++++++++++++ test/results/twitter.py | 6 ++++++ 2 files changed, 18 insertions(+) diff --git a/test/results/furaffinity.py b/test/results/furaffinity.py index 187029bb..6d3e47c7 100644 --- a/test/results/furaffinity.py +++ b/test/results/furaffinity.py @@ -121,6 +121,18 @@ __tests__ = ( "#class" : furaffinity.FuraffinityPostExtractor, }, +{ + "#url" : "https://fxfuraffinity.net/view/21835115/", + "#category": ("", "furaffinity", "post"), + "#class" : furaffinity.FuraffinityPostExtractor, +}, + +{ + "#url" : "https://fxraffinity.net/view/21835115/", + "#category": ("", "furaffinity", "post"), + "#class" : furaffinity.FuraffinityPostExtractor, +}, + { "#url" : "https://sfw.furaffinity.net/view/21835115/", "#category": ("", "furaffinity", "post"), diff --git a/test/results/twitter.py b/test/results/twitter.py index ac119a33..9cba93b0 100644 --- a/test/results/twitter.py +++ b/test/results/twitter.py @@ -60,6 +60,12 @@ __tests__ = ( "#class" : twitter.TwitterUserExtractor, }, +{ + "#url" : "https://fixvx.com/supernaturepics", + "#category": ("", "twitter", "user"), + "#class" : twitter.TwitterUserExtractor, +}, + { "#url" : "https://x.com/supernaturepics", "#category": ("", "twitter", "user"), From 5aefa9465c3261f156022245c5cd2dced2a8c361 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 25 Apr 2024 21:58:34 +0200 Subject: [PATCH 128/154] [furaffinity] simplify pattern, match original domain first --- gallery_dl/extractor/furaffinity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 83181822..d7238719 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -11,7 +11,7 @@ from .common import Extractor, Message from .. 
import text, util -BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?f(?:[xu]|xfu)raffinity\.net" +BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?f(?:u|x|xfu)raffinity\.net" class FuraffinityExtractor(Extractor): From 3ba5fd9efda671834f2571578e962e4e6f5c4abb Mon Sep 17 00:00:00 2001 From: cenodis <57576911+cenodis@users.noreply.github.com> Date: Fri, 26 Apr 2024 22:51:56 +0200 Subject: [PATCH 129/154] [mastodon] Use boolean instead of integer keys for accounts/statuses endpoint --- gallery_dl/extractor/mastodon.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 030d7d1a..93d0f512 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -227,8 +227,8 @@ class MastodonAPI(): exclude_replies=False): """Fetch an account's statuses""" endpoint = "/v1/accounts/{}/statuses".format(account_id) - params = {"only_media" : "1" if only_media else "0", - "exclude_replies": "1" if exclude_replies else "0"} + params = {"only_media" : "true" if only_media else "false", + "exclude_replies": "true" if exclude_replies else "false"} return self._pagination(endpoint, params) def status(self, status_id): From 9b1995dda3ca0e1151660f749027ddcbd98fe2c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 30 Apr 2024 15:53:32 +0200 Subject: [PATCH 130/154] [mastodon] add 'favorite', 'list', and 'hashtag' extractors (#5529) --- docs/supportedsites.md | 6 ++-- gallery_dl/extractor/mastodon.py | 54 ++++++++++++++++++++++++++++++-- test/results/mastodonsocial.py | 27 ++++++++++++++++ 3 files changed, 81 insertions(+), 6 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index dc24a29e..a90ce159 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1622,19 +1622,19 @@ Consider all listed sites to potentially be NSFW. 
mastodon.social https://mastodon.social/ - Bookmarks, Followed Users, Images from Statuses, User Profiles + Bookmarks, Favorites, Followed Users, Hashtags, Lists, Images from Statuses, User Profiles OAuth Pawoo https://pawoo.net/ - Bookmarks, Followed Users, Images from Statuses, User Profiles + Bookmarks, Favorites, Followed Users, Hashtags, Lists, Images from Statuses, User Profiles OAuth baraag https://baraag.net/ - Bookmarks, Followed Users, Images from Statuses, User Profiles + Bookmarks, Favorites, Followed Users, Hashtags, Lists, Images from Statuses, User Profiles OAuth diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 93d0f512..a021f00c 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -136,6 +136,36 @@ class MastodonBookmarkExtractor(MastodonExtractor): return MastodonAPI(self).account_bookmarks() +class MastodonFavoriteExtractor(MastodonExtractor): + """Extractor for mastodon favorites""" + subcategory = "favorite" + pattern = BASE_PATTERN + r"/favourites" + example = "https://mastodon.social/favourites" + + def statuses(self): + return MastodonAPI(self).account_favorites() + + +class MastodonListExtractor(MastodonExtractor): + """Extractor for mastodon lists""" + subcategory = "list" + pattern = BASE_PATTERN + r"/lists/(\w+)" + example = "https://mastodon.social/lists/12345" + + def statuses(self): + return MastodonAPI(self).timelines_list(self.item) + + +class MastodonHashtagExtractor(MastodonExtractor): + """Extractor for mastodon hashtags""" + subcategory = "hashtag" + pattern = BASE_PATTERN + r"/tags/(\w+)" + example = "https://mastodon.social/tags/NAME" + + def statuses(self): + return MastodonAPI(self).timelines_tag(self.item) + + class MastodonFollowingExtractor(MastodonExtractor): """Extractor for followed mastodon users""" subcategory = "following" @@ -205,37 +235,55 @@ class MastodonAPI(): raise exception.NotFoundError("account") def account_bookmarks(self): + """Statuses the user has bookmarked""" endpoint = "/v1/bookmarks" return self._pagination(endpoint, None) + def account_favorites(self): + """Statuses the user has favourited""" + endpoint = "/v1/favourites" + return self._pagination(endpoint, None) + def account_following(self, account_id): + """Accounts which the given account is following""" endpoint = "/v1/accounts/{}/following".format(account_id) return self._pagination(endpoint, None) def account_lookup(self, username): + """Quickly lookup a username to see if it is available""" endpoint = "/v1/accounts/lookup" params = {"acct": username} return self._call(endpoint, params).json() def account_search(self, query, limit=40): - """Search for accounts""" + """Search for matching accounts by username or display name""" endpoint = "/v1/accounts/search" params = {"q": query, "limit": limit} return self._call(endpoint, params).json() def account_statuses(self, account_id, only_media=True, exclude_replies=False): - """Fetch an account's statuses""" + """Statuses posted to the given account""" endpoint = "/v1/accounts/{}/statuses".format(account_id) params = {"only_media" : "true" if only_media else "false", "exclude_replies": "true" if exclude_replies else "false"} return self._pagination(endpoint, params) def status(self, status_id): - """Fetch a status""" + """Obtain information about a status""" endpoint = "/v1/statuses/" + status_id return self._call(endpoint).json() + def timelines_list(self, list_id): + """View statuses in the given list timeline""" + endpoint = "/v1/timelines/list/" + 
list_id + return self._pagination(endpoint, None) + + def timelines_tag(self, hashtag): + """View public statuses containing the given hashtag""" + endpoint = "/v1/timelines/tag/" + hashtag + return self._pagination(endpoint, None) + def _call(self, endpoint, params=None): if endpoint.startswith("http"): url = endpoint diff --git a/test/results/mastodonsocial.py b/test/results/mastodonsocial.py index 8c22bcf3..aa4a7b8a 100644 --- a/test/results/mastodonsocial.py +++ b/test/results/mastodonsocial.py @@ -74,6 +74,33 @@ __tests__ = ( "#url" : "https://mastodon.social/bookmarks", "#category": ("mastodon", "mastodon.social", "bookmark"), "#class" : mastodon.MastodonBookmarkExtractor, + "#auth" : True, + "#urls" : "https://files.mastodon.social/media_attachments/files/111/331/603/082/304/823/original/e12cde371c88c1b0.png", +}, + +{ + "#url" : "https://mastodon.social/favourites", + "#category": ("mastodon", "mastodon.social", "favorite"), + "#class" : mastodon.MastodonFavoriteExtractor, + "#auth" : True, + "#urls" : "https://files.mastodon.social/media_attachments/files/111/331/603/082/304/823/original/e12cde371c88c1b0.png", +}, + +{ + "#url" : "https://mastodon.social/lists/92653", + "#category": ("mastodon", "mastodon.social", "list"), + "#class" : mastodon.MastodonListExtractor, + "#auth" : True, + "#pattern" : r"https://files\.mastodon\.social/media_attachments/files/(\d+/){3,}original/\w+", + "#range" : "1-10", +}, + +{ + "#url" : "https://mastodon.social/tags/mastodon", + "#category": ("mastodon", "mastodon.social", "hashtag"), + "#class" : mastodon.MastodonHashtagExtractor, + "#pattern" : r"https://files\.mastodon\.social/media_attachments/files/(\d+/){3,}original/\w+", + "#range" : "1-10", }, { From 3cf5366143fbfa58948cc3633c3fa894a0c9935c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 30 Apr 2024 20:07:23 +0200 Subject: [PATCH 131/154] [mastodon] add support for card images --- docs/configuration.rst | 10 ++++++++++ gallery_dl/extractor/mastodon.py | 13 +++++++++++++ test/results/mastodonsocial.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index ec32e3b9..5624d1a5 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2616,6 +2616,16 @@ Description user IDs. +extractor.[mastodon].cards +-------------------------- +Type + ``bool`` +Default + ``false`` +Description + Fetch media from cards. 
+ + extractor.[mastodon].reblogs ---------------------------- Type diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index a021f00c..cb7f701c 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -29,6 +29,7 @@ class MastodonExtractor(BaseExtractor): self.instance = self.root.partition("://")[2] self.reblogs = self.config("reblogs", False) self.replies = self.config("replies", True) + self.cards = self.config("cards", False) def items(self): for status in self.statuses(): @@ -48,6 +49,17 @@ class MastodonExtractor(BaseExtractor): if status["reblog"]: attachments.extend(status["reblog"]["media_attachments"]) + if self.cards: + card = status.get("card") + if card: + url = card.get("image") + if url: + card["weburl"] = card.get("url") + card["url"] = url + card["id"] = "card" + "".join( + url.split("/")[6:-2]).lstrip("0") + attachments.append(card) + status["instance"] = self.instance acct = status["account"]["acct"] status["instance_remote"] = \ @@ -120,6 +132,7 @@ class MastodonUserExtractor(MastodonExtractor): api.account_id_by_username(self.item), only_media=( not self.reblogs and + not self.cards and not self.config("text-posts", False) ), exclude_replies=not self.replies, diff --git a/test/results/mastodonsocial.py b/test/results/mastodonsocial.py index aa4a7b8a..1eed8a24 100644 --- a/test/results/mastodonsocial.py +++ b/test/results/mastodonsocial.py @@ -164,4 +164,36 @@ __tests__ = ( "num" : int, }, +{ + "#url" : "https://mastodon.social/@technewsbot@assortedflotsam.com/112360601113258881", + "#comment" : "card image", + "#category": ("mastodon", "mastodon.social", "status"), + "#class" : mastodon.MastodonStatusExtractor, + "#options" : {"cards": True}, + "#urls" : "https://files.mastodon.social/cache/preview_cards/images/095/900/335/original/83f0b4a793c84123.jpg", + + "media": { + "author_name" : "Tom Warren", + "author_url" : "https://www.theverge.com/authors/tom-warren", + "blurhash" : "UHBDWMCjVGM0k,XjnPM#0h+vkpb^RkjYSh$*", + "description" : "Microsoft’s big Xbox games showcase will take place on June 9th. 
It will include more games than last year and a special Call of Duty Direct will follow.",
        "embed_url"   : "",
        "height"      : 628,
        "html"        : "",
        "id"          : "card95900335",
        "image"       : "https://files.mastodon.social/cache/preview_cards/images/095/900/335/original/83f0b4a793c84123.jpg",
        "image_description": "The Xbox showcase illustration",
        "language"    : "en",
        "provider_name": "The Verge",
        "provider_url": "",
        "published_at": "2024-04-30T14:15:30.341Z",
        "title"       : "The Xbox games showcase airs June 9th, followed by a Call of Duty Direct",
        "type"        : "link",
        "url"         : "https://files.mastodon.social/cache/preview_cards/images/095/900/335/original/83f0b4a793c84123.jpg",
        "weburl"      : "https://www.theverge.com/2024/4/30/24145262/xbox-games-showcase-summer-2024-call-of-duty-direct",
        "width"       : 1200,
    },

},

)

From 6db1837a6da82d712a268f0503b0e85b28b66d65 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Wed, 1 May 2024 15:41:21 +0200
Subject: [PATCH 132/154] [bluesky] filter reposts only for user timelines
 (#5528)

---
 gallery_dl/extractor/bluesky.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index 84c31878..c97bf65e 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -317,7 +317,7 @@ class BlueskyAPI():
     def get_author_feed(self, actor, filter="posts_and_author_threads"):
         endpoint = "app.bsky.feed.getAuthorFeed"
         params = {
-            "actor" : self._did_from_actor(actor),
+            "actor" : self._did_from_actor(actor, True),
             "filter": filter,
             "limit" : "100",
         }
@@ -327,7 +327,7 @@ class BlueskyAPI():
         endpoint = "app.bsky.feed.getFeed"
         params = {
             "feed" : "at://{}/app.bsky.feed.generator/{}".format(
-                self._did_from_actor(actor, False), feed),
+                self._did_from_actor(actor), feed),
             "limit": "100",
         }
         return self._pagination(endpoint, params)
@@ -344,7 +344,7 @@ class BlueskyAPI():
         endpoint = "app.bsky.feed.getListFeed"
         params = {
             "list" : "at://{}/app.bsky.graph.list/{}".format(
-                self._did_from_actor(actor, False), list),
+                self._did_from_actor(actor), list),
             "limit": "100",
         }
         return self._pagination(endpoint, params)
@@ -391,7 +391,7 @@ class BlueskyAPI():
         }
         return self._pagination(endpoint, params, "posts")
 
-    def _did_from_actor(self, actor, user_did=True):
+    def _did_from_actor(self, actor, user_did=False):
         if actor.startswith("did:"):
             did = actor
         else:

From d11ec009086d20b561704ec5bc7cfb2cefd974c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Wed, 1 May 2024 03:26:42 +0200
Subject: [PATCH 133/154] [common] fix _cfgpath for BaseExtractor objects

After the changes in 0c178846734e6149b41e82502da1c038ecfd17e1,
_cfgpath was missing its 'category' value since that hadn't been
initialized yet.
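
A simplified sketch of the initialization order (not the real classes,
which live in gallery_dl/extractor/common.py):

```
class Extractor():
    def __init__(self, match):
        # for BaseExtractor subclasses, 'category' is still ""
        # at this point, so the config path lacked its middle part
        self._cfgpath = ("extractor", self.category, self.subcategory)

class BaseExtractor(Extractor):
    def __init__(self, match):
        Extractor.__init__(self, match)
        if not self.category:
            self._init_category()
        # rebuild the path now that 'category' is known
        self._cfgpath = ("extractor", self.category, self.subcategory)
```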
--- gallery_dl/extractor/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index aab27779..90b117d3 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -766,6 +766,7 @@ class BaseExtractor(Extractor): Extractor.__init__(self, match) if not self.category: self._init_category() + self._cfgpath = ("extractor", self.category, self.subcategory) def _init_category(self): for index, group in enumerate(self.groups): From 8ed70b32563b88a32287e534d0291edc127b677c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 1 May 2024 03:30:08 +0200 Subject: [PATCH 134/154] [tests] mark tests with missing auth as 'only_matching' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … instead of skipping them completely --- test/test_results.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/test/test_results.py b/test/test_results.py index 8175e3f2..aaa71ecd 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -54,6 +54,7 @@ AUTH_CONFIG = ( "cookies", "api-key", "client-id", + "access-token", "refresh-token", ) @@ -88,6 +89,19 @@ class TestExtractorResults(unittest.TestCase): result.pop("#comment", None) only_matching = (len(result) <= 3) + auth = result.get("#auth") + if auth is None: + auth = (result["#category"][1] in AUTH) + elif not auth: + for key in AUTH_CONFIG: + config.set((), key, None) + + if auth: + extr = result["#class"].from_url(result["#url"]) + if not any(extr.config(key) for key in AUTH_CONFIG): + self._skipped.append((result["#url"], "no auth")) + only_matching = True + if only_matching: content = False else: @@ -95,21 +109,6 @@ class TestExtractorResults(unittest.TestCase): for key, value in result["#options"].items(): key = key.split(".") config.set(key[:-1], key[-1], value) - - auth = result.get("#auth") - if auth is None: - auth = (result["#category"][1] in AUTH) - elif not auth: - for key in AUTH_CONFIG: - config.set((), key, None) - - if auth: - extr = result["#class"].from_url(result["#url"]) - if not any(extr.config(key) for key in AUTH_CONFIG): - msg = "no auth" - self._skipped.append((result["#url"], msg)) - self.skipTest(msg) - if "#range" in result: config.set((), "image-range" , result["#range"]) config.set((), "chapter-range", result["#range"]) From 619bf5c644c2bdbdf4290696f9530e235160577c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 1 May 2024 22:46:24 +0200 Subject: [PATCH 135/154] [inkbunny] retry API calls with a loop instead of recursion --- gallery_dl/extractor/inkbunny.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index 62586af5..2ae8cbe0 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -330,15 +330,18 @@ class InkbunnyAPI(): def _call(self, endpoint, params): url = "https://inkbunny.net/api_" + endpoint + ".php" params["sid"] = self.session_id - data = self.extractor.request(url, params=params).json() - if "error_code" in data: + while True: + data = self.extractor.request(url, params=params).json() + + if "error_code" not in data: + return data + if str(data["error_code"]) == "2": self.authenticate(invalidate=True) - return self._call(endpoint, params) - raise exception.StopExtraction(data.get("error_message")) + continue - return data + raise 
exception.StopExtraction(data.get("error_message"))
 
     def _pagination_search(self, params):
         params["page"] = 1

From d0cead105bd6f053fe5780a341f9878aa2387997 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Thu, 2 May 2024 17:24:59 +0200
Subject: =?UTF-8?q?[formatter]=20allow=20dots=20etc=20in?=
 =?UTF-8?q?=20'=E2=80=A6'=20literals=20(#5539)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

don't parse fields starting with '

this disables the ability to directly apply […] to '…' literals,
but that's not really useful anyway and can still be done with _lit

---
 gallery_dl/formatter.py |  7 +++----
 test/test_formatter.py  | 27 +++++++++++++++++++--------
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index b83cf21c..0b212d5e 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -243,13 +243,12 @@ class TemplateFStringFormatter(FStringFormatter):
 
 
 def parse_field_name(field_name):
+    if field_name[0] == "'":
+        return "_lit", (operator.itemgetter(field_name[1:-1]),)
+
     first, rest = _string.formatter_field_name_split(field_name)
     funcs = []
 
-    if first[0] == "'":
-        funcs.append(operator.itemgetter(first[1:-1]))
-        first = "_lit"
-
     for is_attr, key in rest:
         if is_attr:
             func = operator.attrgetter
diff --git a/test/test_formatter.py b/test/test_formatter.py
index 89cb1aad..73e958cc 100644
--- a/test/test_formatter.py
+++ b/test/test_formatter.py
@@ -336,14 +336,14 @@ class TestFormatter(unittest.TestCase):
 
     def test_literals(self):
         value = "foo"
-        self._run_test("{'foo'}" , value)
-        self._run_test("{'foo'!u}" , value.upper())
-        self._run_test("{'f00':R0/o/}" , value)
-        self._run_test("{'foobar'[:3]}", value)
-        self._run_test("{z|'foo'}" , value)
-        self._run_test("{z|''|'foo'}" , value)
-        self._run_test("{z|''}" , "")
-        self._run_test("{''|''}" , "")
+        self._run_test("{'foo'}" , value)
+        self._run_test("{'foo'!u}" , value.upper())
+        self._run_test("{'f00':R0/o/}", value)
+
+        self._run_test("{z|'foo'}" , value)
+        self._run_test("{z|''|'foo'}" , value)
+        self._run_test("{z|'foo'!u}" , value.upper())
+        self._run_test("{z|'f00':R0/o/}", value)
 
         self._run_test("{_lit[foo]}" , value)
         self._run_test("{_lit[foo]!u}" , value.upper())
@@ -351,6 +351,17 @@ class TestFormatter(unittest.TestCase):
         self._run_test("{_lit[foobar][:3]}", value)
         self._run_test("{z|_lit[foo]}" , value)
 
+        # empty (#4492)
+        self._run_test("{z|''}" , "")
+        self._run_test("{''|''}", "")
+
+        # special characters (dots, brackets, single quotes) (#5539)
+        self._run_test("{'f.o.o'}" , "f.o.o")
+        self._run_test("{_lit[f.o.o]}", "f.o.o")
+        self._run_test("{_lit[f'o'o]}", "f'o'o")
+        self._run_test("{'f.[].[]'}" , "f.[].[]")
+        self._run_test("{z|'f.[].[]'}", "f.[].[]")
+
     def test_template(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
             path1 = os.path.join(tmpdirname, "tpl1")

From bd8e4797e51d01e09f3e2b52938832da35203996 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Thu, 2 May 2024 18:12:19 +0200
Subject: [PATCH 137/154] [vsco] add 'avatar' extractor (#5341)

---
 docs/supportedsites.md       |  2 +-
 gallery_dl/extractor/vsco.py | 30 ++++++++++++++++++++++++++++++
 test/results/vsco.py         | 10 ++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index a90ce159..034c8c6e 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -940,7 +940,7 @@ Consider all listed sites to potentially be NSFW.
VSCO https://vsco.co/ - Collections, individual Images, Spaces, User Profiles + Avatars, Collections, individual Images, Spaces, User Profiles diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py index 41141c6f..c112f4ae 100644 --- a/gallery_dl/extractor/vsco.py +++ b/gallery_dl/extractor/vsco.py @@ -46,6 +46,8 @@ class VscoExtractor(Extractor): url = "https://image-{}.vsco.co/{}".format(cdn, path) elif cdn.isdecimal(): url = "https://image.vsco.co/" + base + elif img["responsive_url"].startswith("http"): + url = img["responsive_url"] else: url = "https://" + img["responsive_url"] @@ -238,6 +240,34 @@ class VscoSpacesExtractor(VscoExtractor): yield Message.Queue, url, space +class VscoAvatarExtractor(VscoExtractor): + """Extractor for vsco.co user avatars""" + subcategory = "avatar" + pattern = USER_PATTERN + r"/avatar" + example = "https://vsco.co/USER/avatar" + + def images(self): + url = "{}/{}/gallery".format(self.root, self.user) + page = self.request(url).text + piid = text.extr(page, '"profileImageId":"', '"') + + url = "https://im.vsco.co/" + piid + # needs GET request, since HEAD does not redirect to full URL + response = self.request(url, allow_redirects=False) + + return ({ + "_id" : piid, + "is_video" : False, + "grid_name" : "", + "upload_date" : 0, + "responsive_url": response.headers["Location"], + "video_url" : "", + "image_meta" : None, + "width" : 0, + "height" : 0, + },) + + class VscoImageExtractor(VscoExtractor): """Extractor for individual images on vsco.co""" subcategory = "image" diff --git a/test/results/vsco.py b/test/results/vsco.py index 6fa9eb69..0553b4c3 100644 --- a/test/results/vsco.py +++ b/test/results/vsco.py @@ -55,6 +55,16 @@ __tests__ = ( ), }, +{ + "#url" : "https://vsco.co/vsco/avatar", + "#category": ("", "vsco", "avatar"), + "#class" : vsco.VscoAvatarExtractor, + "#urls" : "https://image-aws-us-west-2.vsco.co/3c69ae/304128/652d9f3b39a6007526dda683/vscoprofile-avatar.jpg", + "#sha1_content" : "57cd648759e34a6daefc5c79542ddb4595b9b677", + + "id": "652d9f3b39a6007526dda683", +}, + { "#url" : "https://vsco.co/erenyildiz/media/5d34b93ef632433030707ce2", "#category": ("", "vsco", "image"), From 06d102f19afd6a5abb83dddf1196e4983e53809a Mon Sep 17 00:00:00 2001 From: Jan Wikholm Date: Wed, 1 May 2024 19:29:46 +0300 Subject: [PATCH 138/154] optimize _find_most_recently_used_file for exact profile When reading cookies from the browser, the user is able to give either just the browser name, or also provide profile/container information. If an exact profile is provided, there is no need to find the latest profile with `os.walk` which is very expensive. This change optimizes that case and the performance increase is significant (~8 sec to 0.6 sec). ``` $ time gallery-dl --config-ignore -d . -D . --cookies-from-browser FIREFOX https://imgur.com/OO4UNqJ [cookies][info] Extracted 16 cookies from Firefox ./imgur_OO4UNqJ.jpg real 0m8.429s user 0m0.216s sys 0m0.431s $ time gallery-dl --config-ignore -d . -D . 
--cookies-from-browser FIREFOX:bgamf5r6.default-release https://imgur.com/OO4UNqJ [cookies][info] Extracted 16 cookies from Firefox ./imgur_OO4UNqJ.jpg real 0m0.456s user 0m0.183s sys 0m0.011s $ gallery-dl --version 1.26.9 ``` --- gallery_dl/cookies.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 71a45f00..4dd80864 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -1001,6 +1001,12 @@ def _decrypt_windows_dpapi(ciphertext): def _find_most_recently_used_file(root, filename): + # if the provided root points to an exact profile path + # check if it contains the wanted filename + first_choice = os.path.join(root, filename) + if os.path.exists(first_choice): + return first_choice + # if there are multiple browser profiles, take the most recently used one paths = [] for curr_root, dirs, files in os.walk(root): From 699592498b6466d7e1a4eeb3ca969d088ec6ff30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 2 May 2024 22:54:15 +0200 Subject: [PATCH 139/154] [tests] use random port number for local HTTP server MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … and explicitly bind to 127.0.0.1 instead of all interfaces --- test/test_downloader.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/test/test_downloader.py b/test/test_downloader.py index f63b68a2..9f9fb3b8 100644 --- a/test/test_downloader.py +++ b/test/test_downloader.py @@ -174,9 +174,17 @@ class TestHTTPDownloader(TestDownloaderBase): TestDownloaderBase.setUpClass() cls.downloader = downloader.find("http")(cls.job) - port = 8088 - cls.address = "http://127.0.0.1:{}".format(port) - server = http.server.HTTPServer(("", port), HttpRequestHandler) + host = "127.0.0.1" + port = 0 # select random not-in-use port + + try: + server = http.server.HTTPServer((host, port), HttpRequestHandler) + except OSError as exc: + raise unittest.SkipTest( + "cannot spawn local HTTP server ({})".format(exc)) + + host, port = server.server_address + cls.address = "http://{}:{}".format(host, port) threading.Thread(target=server.serve_forever, daemon=True).start() def _run_test(self, ext, input, output, From dc9d83e64bc15c65100449a48c7390f194cab71e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 3 May 2024 02:03:59 +0200 Subject: [PATCH 140/154] [output] support 'NO_COLOR' environment variable --- gallery_dl/__init__.py | 2 +- gallery_dl/output.py | 34 +++++++++++++++++++--------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index c9439f43..7ca405aa 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -91,7 +91,7 @@ def main(): signal.signal(signal_num, signal.SIG_IGN) # enable ANSI escape sequences on Windows - if util.WINDOWS and config.get(("output",), "ansi", True): + if util.WINDOWS and config.get(("output",), "ansi", output.COLORS): from ctypes import windll, wintypes, byref kernel32 = windll.kernel32 mode = wintypes.DWORD() diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 5882d142..35185452 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -14,6 +14,11 @@ import functools import unicodedata from . 
import config, util, formatter + +# -------------------------------------------------------------------- +# Globals + +COLORS = not os.environ.get("NO_COLOR") COLORS_DEFAULT = { "success": "1;32", "skip" : "2", @@ -21,7 +26,20 @@ COLORS_DEFAULT = { "info" : "1;37", "warning": "1;33", "error" : "1;31", -} +} if COLORS else {} + +if util.WINDOWS: + ANSI = COLORS and os.environ.get("TERM") == "ANSI" + OFFSET = 1 + CHAR_SKIP = "# " + CHAR_SUCCESS = "* " + CHAR_ELLIPSIES = "..." +else: + ANSI = COLORS + OFFSET = 0 + CHAR_SKIP = "# " + CHAR_SUCCESS = "✔ " + CHAR_ELLIPSIES = "…" # -------------------------------------------------------------------- @@ -550,17 +568,3 @@ def shorten_string_eaw(txt, limit, sep="…", cache=EAWCache()): right -= 1 return txt[:left] + sep + txt[right+1:] - - -if util.WINDOWS: - ANSI = os.environ.get("TERM") == "ANSI" - OFFSET = 1 - CHAR_SKIP = "# " - CHAR_SUCCESS = "* " - CHAR_ELLIPSIES = "..." -else: - ANSI = True - OFFSET = 0 - CHAR_SKIP = "# " - CHAR_SUCCESS = "✔ " - CHAR_ELLIPSIES = "…" From 11109d5badcd7c047288b77af4feeeef6a7a8284 Mon Sep 17 00:00:00 2001 From: Delphox Date: Wed, 8 May 2024 12:15:47 -0300 Subject: [PATCH 141/154] [furaffinity] match xfuraffinity.com --- gallery_dl/extractor/furaffinity.py | 2 +- test/results/furaffinity.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index d7238719..6040187e 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -11,7 +11,7 @@ from .common import Extractor, Message from .. import text, util -BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?f(?:u|x|xfu)raffinity\.net" +BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?(?:f[ux]|f?xfu)raffinity\.net" class FuraffinityExtractor(Extractor): diff --git a/test/results/furaffinity.py b/test/results/furaffinity.py index 6d3e47c7..fffaec56 100644 --- a/test/results/furaffinity.py +++ b/test/results/furaffinity.py @@ -127,6 +127,12 @@ __tests__ = ( "#class" : furaffinity.FuraffinityPostExtractor, }, +{ + "#url" : "https://xfuraffinity.net/view/21835115/", + "#category": ("", "furaffinity", "post"), + "#class" : furaffinity.FuraffinityPostExtractor, +}, + { "#url" : "https://fxraffinity.net/view/21835115/", "#category": ("", "furaffinity", "post"), From b27ffd790a67e2ca4e8f8e8f016cfefecf2e75bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 9 May 2024 15:14:08 +0200 Subject: [PATCH 142/154] [poipiku] fix downloading R-18 posts (#5567) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … by automatically sending a `POIPIKU_CONTENTS_VIEW_MODE=1` cookie to enable "adult" mode. 
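
A standalone sketch of the same idea using plain requests (for
illustration only; the extractor sets the cookie itself in _init()):

```
import requests

session = requests.Session()
# with this cookie set, poipiku serves R-18 posts instead of
# the age-restriction placeholder
session.cookies.set(
    "POIPIKU_CONTENTS_VIEW_MODE", "1", domain="poipiku.com")
page = session.get("https://poipiku.com/").text
```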
---
 gallery_dl/extractor/poipiku.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
index f42016fc..5cc964a3 100644
--- a/gallery_dl/extractor/poipiku.py
+++ b/gallery_dl/extractor/poipiku.py
@@ -23,6 +23,10 @@ class PoipikuExtractor(Extractor):
     archive_fmt = "{post_id}_{num}"
     request_interval = (0.5, 1.5)
 
+    def _init(self):
+        self.cookies.set(
+            "POIPIKU_CONTENTS_VIEW_MODE", "1", domain="poipiku.com")
+
     def items(self):
         password = self.config("password", "")
 

From 88f94190f4d0fd7937b4ac6ed0e673391b13fb7f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Fri, 10 May 2024 01:05:28 +0200
Subject: [PATCH 143/154] [archive] move DownloadArchive into its own module
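
The public interface is unchanged; a rough usage sketch of the
relocated class (illustrative only, assumes a gallery-dl install that
already includes this patch; path and kwdict are made up):

```
from gallery_dl import archive

db = archive.DownloadArchive("/tmp/test-archive.sqlite3", "{category}_{id}")
entry = {"category": "example", "id": 1}

if not db.check(entry):  # not recorded yet
    ...                  # download would happen here
    db.add(entry)        # remember it for future runs
db.close()
```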
---
 gallery_dl/archive.py              | 56 ++++++++++++++++++++++++++++++
 gallery_dl/job.py                  | 38 +++++++++++++------
 gallery_dl/postprocessor/common.py | 25 ++++++-------
 gallery_dl/util.py                 | 44 -----------------------
 4 files changed, 95 insertions(+), 68 deletions(-)
 create mode 100644 gallery_dl/archive.py

diff --git a/gallery_dl/archive.py b/gallery_dl/archive.py
new file mode 100644
index 00000000..302dc5aa
--- /dev/null
+++ b/gallery_dl/archive.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Download Archives"""
+
+import os
+import sqlite3
+from . import formatter
+
+
+class DownloadArchive():
+
+    def __init__(self, path, format_string, pragma=None,
+                 cache_key="_archive_key"):
+        try:
+            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
+        except sqlite3.OperationalError:
+            os.makedirs(os.path.dirname(path))
+            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
+        con.isolation_level = None
+
+        self.keygen = formatter.parse(format_string).format_map
+        self.connection = con
+        self.close = con.close
+        self.cursor = cursor = con.cursor()
+        self._cache_key = cache_key
+
+        if pragma:
+            for stmt in pragma:
+                cursor.execute("PRAGMA " + stmt)
+
+        try:
+            cursor.execute("CREATE TABLE IF NOT EXISTS archive "
+                           "(entry TEXT PRIMARY KEY) WITHOUT ROWID")
+        except sqlite3.OperationalError:
+            # fallback for missing WITHOUT ROWID support (#553)
+            cursor.execute("CREATE TABLE IF NOT EXISTS archive "
+                           "(entry TEXT PRIMARY KEY)")
+
+    def add(self, kwdict):
+        """Add item described by 'kwdict' to archive"""
+        key = kwdict.get(self._cache_key) or self.keygen(kwdict)
+        self.cursor.execute(
+            "INSERT OR IGNORE INTO archive (entry) VALUES (?)", (key,))
+
+    def check(self, kwdict):
+        """Return True if the item described by 'kwdict' exists in archive"""
+        key = kwdict[self._cache_key] = self.keygen(kwdict)
+        self.cursor.execute(
+            "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
+        return self.cursor.fetchone()
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index eb10a0ce..6c2d2ba8 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -11,10 +11,23 @@
 import errno
 import logging
 import functools
 import collections
-from . import extractor, downloader, postprocessor
-from . import config, text, util, path, formatter, output, exception, version
+
+from . import (
+    extractor,
+    downloader,
+    postprocessor,
+    archive,
+    config,
+    exception,
+    formatter,
+    output,
+    path,
+    text,
+    util,
+    version,
+)
 from .extractor.message import Message
-from .output import stdout_write
+stdout_write = output.stdout_write
 
 
 class Job():
@@ -507,23 +520,24 @@ class DownloadJob(Job):
             # monkey-patch method to do nothing and always return True
             self.download = pathfmt.fix_extension
 
-        archive = cfg("archive")
-        if archive:
-            archive = util.expand_path(archive)
+        archive_path = cfg("archive")
+        if archive_path:
+            archive_path = util.expand_path(archive_path)
             archive_format = (cfg("archive-prefix", extr.category) +
                               cfg("archive-format", extr.archive_fmt))
             archive_pragma = (cfg("archive-pragma"))
             try:
-                if "{" in archive:
-                    archive = formatter.parse(archive).format_map(kwdict)
-                self.archive = util.DownloadArchive(
-                    archive, archive_format, archive_pragma)
+                if "{" in archive_path:
+                    archive_path = formatter.parse(
+                        archive_path).format_map(kwdict)
+                self.archive = archive.DownloadArchive(
+                    archive_path, archive_format, archive_pragma)
             except Exception as exc:
                 extr.log.warning(
                     "Failed to open download archive at '%s' (%s: %s)",
-                    archive, exc.__class__.__name__, exc)
+                    archive_path, exc.__class__.__name__, exc)
             else:
-                extr.log.debug("Using download archive '%s'", archive)
+                extr.log.debug("Using download archive '%s'", archive_path)
 
         skip = cfg("skip", True)
         if skip:
diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py
index 1d2fba87..d4e16034 100644
--- a/gallery_dl/postprocessor/common.py
+++ b/gallery_dl/postprocessor/common.py
@@ -8,7 +8,7 @@
 
 """Common classes and constants used by postprocessor modules."""
 
-from .. import util, formatter
+from .. import util, formatter, archive
 
 
 class PostProcessor():
@@ -22,30 +22,31 @@ class PostProcessor():
         return self.__class__.__name__
 
     def _init_archive(self, job, options, prefix=None):
-        archive = options.get("archive")
-        if archive:
+        archive_path = options.get("archive")
+        if archive_path:
             extr = job.extractor
-            archive = util.expand_path(archive)
+            archive_path = util.expand_path(archive_path)
             if not prefix:
                 prefix = "_" + self.name.upper() + "_"
             archive_format = (
                 options.get("archive-prefix", extr.category) +
                 options.get("archive-format", prefix + extr.archive_fmt))
             try:
-                if "{" in archive:
-                    archive = formatter.parse(archive).format_map(
+                if "{" in archive_path:
+                    archive_path = formatter.parse(archive_path).format_map(
                         job.pathfmt.kwdict)
-                self.archive = util.DownloadArchive(
-                    archive, archive_format,
+                self.archive = archive.DownloadArchive(
+                    archive_path, archive_format,
                     options.get("archive-pragma"),
                     "_archive_" + self.name)
             except Exception as exc:
                 self.log.warning(
                     "Failed to open %s archive at '%s' (%s: %s)",
-                    self.name, archive, exc.__class__.__name__, exc)
+                    self.name, archive_path, exc.__class__.__name__, exc)
             else:
-                self.log.debug("Using %s archive '%s'", self.name, archive)
+                self.log.debug(
+                    "Using %s archive '%s'", self.name, archive_path)
             return True
-        else:
-            self.archive = None
+
+        self.archive = None
         return False
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 0e6f04a9..861ec7eb 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -16,7 +16,6 @@ import time
 import random
 import getpass
 import hashlib
-import sqlite3
 import binascii
 import datetime
 import functools
@@ -852,46 +851,3 @@ class FilterPredicate():
             raise
         except Exception as exc:
             raise exception.FilterError(exc)
-
-
-class DownloadArchive():
-
-    def __init__(self, path, format_string, pragma=None,
-                 cache_key="_archive_key"):
-        try:
-            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
-        except sqlite3.OperationalError:
-            os.makedirs(os.path.dirname(path))
-            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
-        con.isolation_level = None
-
-        from . import formatter
-        self.keygen = formatter.parse(format_string).format_map
-        self.close = con.close
-        self.cursor = cursor = con.cursor()
-        self._cache_key = cache_key
-
-        if pragma:
-            for stmt in pragma:
-                cursor.execute("PRAGMA " + stmt)
-
-        try:
-            cursor.execute("CREATE TABLE IF NOT EXISTS archive "
-                           "(entry TEXT PRIMARY KEY) WITHOUT ROWID")
-        except sqlite3.OperationalError:
-            # fallback for missing WITHOUT ROWID support (#553)
-            cursor.execute("CREATE TABLE IF NOT EXISTS archive "
-                           "(entry TEXT PRIMARY KEY)")
-
-    def check(self, kwdict):
-        """Return True if the item described by 'kwdict' exists in archive"""
-        key = kwdict[self._cache_key] = self.keygen(kwdict)
-        self.cursor.execute(
-            "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
-        return self.cursor.fetchone()
-
-    def add(self, kwdict):
-        """Add item described by 'kwdict' to archive"""
-        key = kwdict.get(self._cache_key) or self.keygen(kwdict)
-        self.cursor.execute(
-            "INSERT OR IGNORE INTO archive (entry) VALUES (?)", (key,))

From 215abbc3e440a7aa6842dd3c558ad68e49b4cbd2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Fri, 10 May 2024 01:15:44 +0200
Subject: [PATCH 144/154] [archive] implement DownloadArchiveMemory class
 (#5255)

keeps archive IDs in memory and only writes them to disk in a
'finalize' step
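
A sketch of the difference in behavior (illustrative; class and method
names as in the diff below, path and kwdicts are made up):

```
from gallery_dl import archive

# DownloadArchive:       add() writes to the database immediately
# DownloadArchiveMemory: add() only collects keys in a set
db = archive.DownloadArchiveMemory("/tmp/test-archive.sqlite3", "{id}")

for i in range(3):
    db.add({"id": i})  # kept in memory only at this point

db.finalize()  # flushes all collected keys in one transaction
db.close()
```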
---
 gallery_dl/archive.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/gallery_dl/archive.py b/gallery_dl/archive.py
index 302dc5aa..5f05bbfd 100644
--- a/gallery_dl/archive.py
+++ b/gallery_dl/archive.py
@@ -54,3 +54,45 @@ class DownloadArchive():
         self.cursor.execute(
             "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
         return self.cursor.fetchone()
+
+    def finalize(self):
+        pass
+
+
+class DownloadArchiveMemory(DownloadArchive):
+
+    def __init__(self, path, format_string, pragma=None,
+                 cache_key="_archive_key"):
+        DownloadArchive.__init__(self, path, format_string, pragma, cache_key)
+        self.keys = set()
+
+    def add(self, kwdict):
+        self.keys.add(
+            kwdict.get(self._cache_key) or
+            self.keygen(kwdict))
+
+    def check(self, kwdict):
+        key = kwdict[self._cache_key] = self.keygen(kwdict)
+        if key in self.keys:
+            return True
+        self.cursor.execute(
+            "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
+        return self.cursor.fetchone()
+
+    def finalize(self):
+        if not self.keys:
+            return
+
+        cursor = self.cursor
+        with self.connection:
+            try:
+                cursor.execute("BEGIN")
+            except sqlite3.OperationalError:
+                pass
+
+            stmt = "INSERT OR IGNORE INTO archive (entry) VALUES (?)"
+            if len(self.keys) < 100:
+                for key in self.keys:
+                    cursor.execute(stmt, (key,))
+            else:
+                cursor.executemany(stmt, ((key,) for key in self.keys))

From 33006fe12624eb10ebc8f50dbfadf98575ff0b65 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Fri, 10 May 2024 18:20:08 +0200
Subject: [PATCH 145/154] [common] disable 'check_hostname' for non-urllib3
 SSLContexts

e.g. when 'browser' is set to a non-empty value and gallery-dl creates
its own SSLContext instance instead of using requests' and urllib3's
defaults.

urllib3 disables this option for its default contexts, since it does
this check on its own.

Fixes "ValueError: Cannot set verify_mode to CERT_NONE when
check_hostname is enabled" when using --no-check-certificate.
(#3614, #4891, #5576)
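
The underlying constraint can be reproduced with the ssl module alone
(a minimal sketch, not gallery-dl code):

```
import ssl

ctx = ssl.create_default_context()
# assigning CERT_NONE first would raise the ValueError quoted above
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
```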
Fixes "ValueError: Cannot set verify_mode to CERT_NONE when check_hostname is enabled" when using --no-check-certificate. (#3614, #4891, #5576) --- gallery_dl/extractor/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 90b117d3..7efd06a2 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -840,6 +840,7 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address): if ssl_ciphers: ssl_context.set_ecdh_curve("prime256v1") ssl_context.set_ciphers(ssl_ciphers) + ssl_context.check_hostname = False else: ssl_context = None From 28039229fec4c481efced928d5a5fd8931b5e1c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 10 May 2024 19:17:39 +0200 Subject: [PATCH 146/154] [common] use 'create_urllib3_context' for creating SSLContexts enables dumping TLS session keys by setting SSLKEYLOGFILE (#5215) as well as other potentially useful settings. --- gallery_dl/extractor/common.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 7efd06a2..d80dea2a 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -22,6 +22,7 @@ import threading from requests.adapters import HTTPAdapter from .message import Message from .. import config, text, util, cache, exception +urllib3 = requests.packages.urllib3 class Extractor(): @@ -834,12 +835,8 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address): pass if ssl_options or ssl_ciphers: - ssl_context = ssl.create_default_context() - if ssl_options: - ssl_context.options |= ssl_options - if ssl_ciphers: - ssl_context.set_ecdh_curve("prime256v1") - ssl_context.set_ciphers(ssl_ciphers) + ssl_context = urllib3.connection.create_urllib3_context( + options=ssl_options or None, ciphers=ssl_ciphers) ssl_context.check_hostname = False else: ssl_context = None @@ -960,8 +957,6 @@ SSL_CIPHERS = { } -urllib3 = requests.packages.urllib3 - # detect brotli support try: BROTLI = urllib3.response.brotli is not None From fd734b92223a02c0c392e4eece6bf82ba0da1fc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 10 May 2024 22:17:53 +0200 Subject: [PATCH 147/154] [archive] add 'archive-mode' option (#5255) --- docs/configuration.rst | 18 +++++++++++++++++- gallery_dl/job.py | 8 +++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 5624d1a5..3f4539f8 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -844,6 +844,22 @@ Description An alternative `format string`_ to build archive IDs with. +extractor.*.archive-mode +------------------------ +Type + ``string`` +Default + ``"file"`` +Description + Controls when to write `archive IDs `__ + to the archive database. + + * ``"file"``: Write IDs immediately + after completing or skipping a file download. + * ``"memory"``: Keep IDs in memory + and only write them after successful job completion. + + extractor.*.archive-prefix -------------------------- Type @@ -6172,7 +6188,7 @@ Description * format * General format string for logging messages - or a dictionary with format strings for each loglevel. + or an ``object`` with format strings for each loglevel. 
---
 gallery_dl/extractor/common.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 7efd06a2..d80dea2a 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -22,6 +22,7 @@ import threading
 from requests.adapters import HTTPAdapter
 from .message import Message
 from .. import config, text, util, cache, exception
+urllib3 = requests.packages.urllib3
 
 
 class Extractor():
@@ -834,12 +835,8 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address):
         pass
 
     if ssl_options or ssl_ciphers:
-        ssl_context = ssl.create_default_context()
-        if ssl_options:
-            ssl_context.options |= ssl_options
-        if ssl_ciphers:
-            ssl_context.set_ecdh_curve("prime256v1")
-            ssl_context.set_ciphers(ssl_ciphers)
+        ssl_context = urllib3.connection.create_urllib3_context(
+            options=ssl_options or None, ciphers=ssl_ciphers)
         ssl_context.check_hostname = False
     else:
         ssl_context = None
@@ -960,8 +957,6 @@ SSL_CIPHERS = {
 }
 
 
-urllib3 = requests.packages.urllib3
-
 # detect brotli support
 try:
     BROTLI = urllib3.response.brotli is not None

From fd734b92223a02c0c392e4eece6bf82ba0da1fc8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Fri, 10 May 2024 22:17:53 +0200
Subject: [PATCH 147/154] [archive] add 'archive-mode' option (#5255)

---
 docs/configuration.rst | 18 +++++++++++++++++-
 gallery_dl/job.py      |  8 +++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 5624d1a5..3f4539f8 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -844,6 +844,22 @@ Description
     An alternative `format string`_ to build archive IDs with.
 
 
+extractor.*.archive-mode
+------------------------
+Type
+    ``string``
+Default
+    ``"file"``
+Description
+    Controls when to write `archive IDs `__
+    to the archive database.
+
+    * ``"file"``: Write IDs immediately
+      after completing or skipping a file download.
+    * ``"memory"``: Keep IDs in memory
+      and only write them after successful job completion.
+
+
 extractor.*.archive-prefix
 --------------------------
 Type
@@ -6172,7 +6188,7 @@ Description
 
     * format
         * General format string for logging messages
-          or a dictionary with format strings for each loglevel.
+          or an ``object`` with format strings for each loglevel.
 
           In addition to the default
           `LogRecord attributes `__,
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 6c2d2ba8..b4efb26f 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -436,6 +436,8 @@ class DownloadJob(Job):
 
     def handle_finalize(self):
         if self.archive:
+            if not self.status:
+                self.archive.finalize()
             self.archive.close()
 
         pathfmt = self.pathfmt
@@ -530,7 +532,11 @@ class DownloadJob(Job):
                 if "{" in archive_path:
                     archive_path = formatter.parse(
                         archive_path).format_map(kwdict)
-                self.archive = archive.DownloadArchive(
+                if cfg("archive-mode") == "memory":
+                    archive_cls = archive.DownloadArchiveMemory
+                else:
+                    archive_cls = archive.DownloadArchive
+                self.archive = archive_cls(
                     archive_path, archive_format, archive_pragma)
             except Exception as exc:
                 extr.log.warning(

From d2f50ecf0954c51297a1f7491e9318550695b36a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Fri, 10 May 2024 22:56:51 +0200
Subject: [PATCH 148/154] add 'skip-filter' option (#5255)

---
 docs/configuration.rst |  9 +++++++++
 gallery_dl/job.py      | 15 ++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 3f4539f8..4eebca2c 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -337,6 +337,15 @@ Description
     filename extension (``file.1.ext``, ``file.2.ext``, etc.)
 
 
+extractor.*.skip-filter
+-----------------------
+Type
+    ``string``
+Description
+    Python expression controlling which skipped files to count towards
+    ``"abort"`` / ``"terminate"`` / ``"exit"``.
+
+
 extractor.*.sleep
 -----------------
 Type
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index b4efb26f..2822fc9a 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -468,9 +468,12 @@ class DownloadJob(Job):
             for callback in self.hooks["skip"]:
                 callback(pathfmt)
         if self._skipexc:
-            self._skipcnt += 1
-            if self._skipcnt >= self._skipmax:
-                raise self._skipexc()
+            if not self._skipftr or self._skipftr(pathfmt.kwdict):
+                self._skipcnt += 1
+                if self._skipcnt >= self._skipmax:
+                    raise self._skipexc()
+            else:
+                self._skipcnt = 0
 
     def download(self, url):
         """Download 'url'"""
@@ -559,6 +562,12 @@ class DownloadJob(Job):
             elif skip == "exit":
                 self._skipexc = SystemExit
             self._skipmax = text.parse_int(smax)
+
+            skip_filter = cfg("skip-filter")
+            if skip_filter:
+                self._skipftr = util.compile_expression(skip_filter)
+            else:
+                self._skipftr = None
         else:
             # monkey-patch methods to always return False
             pathfmt.exists = lambda x=None: False

From 5b6b5dd81b0e8726e04e8b5e9f50665d0a3eddbc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Sat, 11 May 2024 15:34:21 +0200
Subject: [PATCH 149/154] [8chan] fix downloaded files by sending 'TOS' cookie
 (#5578)

---
 gallery_dl/extractor/8chan.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index fc16f43c..a4b09977 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -26,6 +26,9 @@ class _8chanExtractor(Extractor):
         self.root = "https://8chan." + match.group(1)
         Extractor.__init__(self, match)
 
+    def _init(self):
+        self.cookies.set("TOS", "1", domain=self.root.rpartition("/")[2])
+
     @memcache()
     def cookies_prepare(self):
         # fetch captcha cookies

From f178839fc2db94be9545a5508594ddb5f377a754 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Sat, 11 May 2024 18:32:52 +0200
Subject: [PATCH 150/154] [exhentai] fix multi-page viewer detection (#4969)

---
 gallery_dl/extractor/exhentai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 2cce281b..5329fee3 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -441,7 +441,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             raise exception.AuthorizationError()
         if page.startswith(("Key missing", "Gallery not found")):
             raise exception.NotFoundError("gallery")
-        if "hentai.org/mpv/" in page:
+        if page.count("hentai.org/mpv/") > 1:
             self.log.warning("Enabled Multi-Page Viewer is not supported")
         return page
 

From 90b37416434146c2dc0bcb1906a6b1cfb518f5ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Sat, 11 May 2024 19:46:35 +0200
Subject: [PATCH 151/154] [cookies] set proper 'expires' value for Chrome
 session cookies

https://github.com/yt-dlp/yt-dlp/pull/9747

---
 gallery_dl/cookies.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py
index 4dd80864..b4986c1e 100644
--- a/gallery_dl/cookies.py
+++ b/gallery_dl/cookies.py
@@ -146,7 +146,8 @@ def load_cookies_chrome(cookiejar, browser_name, profile=None,
         set_cookie(Cookie(
             0, name, value, None, False,
             domain, bool(domain), domain.startswith("."),
-            path, bool(path), secure, expires, False, None, None, {},
+            path, bool(path), secure, expires or None, False,
+            None, None, {},
         ))
 
     if failed_cookies > 0:

From 3b6f306a8bfae77db43459a31efc84623f2188f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Sat, 11 May 2024 22:01:34 +0200
Subject: [PATCH 152/154] [exhentai] fix blank page detection

---
 gallery_dl/extractor/exhentai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 5329fee3..18054035 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -50,7 +50,7 @@ class ExhentaiExtractor(Extractor):
 
     def request(self, url, **kwargs):
         response = Extractor.request(self, url, **kwargs)
-        if response.history and response.headers.get("Content-Length") == "0":
+        if "Cache-Control" not in response.headers and not response.content:
             self.log.info("blank page")
             raise exception.AuthorizationError()
         return response

From 33b07c46036331d45a88a91b165d3de4feb130d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Sat, 11 May 2024 23:54:05 +0200
Subject: [PATCH 153/154] [twitter] wait for rate limit reset before 429 error
 (#5532)
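
The idea as a standalone sketch (hypothetical helper, header names as
in the diff below):

```
import time

def seconds_until_reset(response):
    # Twitter exposes its rate-limit state via response headers
    remaining = int(response.headers.get("x-rate-limit-remaining", 6))
    if remaining >= 6:
        return 0.0  # plenty of requests left, no need to wait
    reset = int(response.headers.get("x-rate-limit-reset", 0))
    return max(0.0, reset - time.time())
```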
---
 gallery_dl/extractor/twitter.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 688478cd..7ace6a72 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -12,6 +12,7 @@ from .common import Extractor, Message
 from .. import text, util, exception
 from ..cache import cache, memcache
 import itertools
+import random
 import json
 import re
 
@@ -1300,6 +1301,11 @@ class TwitterAPI():
             if csrf_token:
                 self.headers["x-csrf-token"] = csrf_token
 
+            remaining = int(response.headers.get("x-rate-limit-remaining", 6))
+            if remaining < 6 and remaining <= random.randrange(1, 6):
+                self._handle_ratelimit(response)
+                continue
+
             try:
                 data = response.json()
             except ValueError:
@@ -1353,13 +1359,7 @@ class TwitterAPI():
                     not self.headers["x-twitter-auth-type"]:
                 raise exception.AuthorizationError("Login required")
             elif response.status_code == 429:
-                # rate limit exceeded
-                if self.extractor.config("ratelimit") == "abort":
-                    raise exception.StopExtraction("Rate limit exceeded")
-
-                until = response.headers.get("x-rate-limit-reset")
-                seconds = None if until else 60
-                self.extractor.wait(until=until, seconds=seconds)
+                self._handle_ratelimit(response)
                 continue
 
             # error
@@ -1702,6 +1702,13 @@ class TwitterAPI():
                 return
             variables["cursor"] = cursor
 
+    def _handle_ratelimit(self, response):
+        if self.extractor.config("ratelimit") == "abort":
+            raise exception.StopExtraction("Rate limit exceeded")
+
+        until = response.headers.get("x-rate-limit-reset")
+        self.extractor.wait(until=until, seconds=None if until else 60)
+
     def _process_tombstone(self, entry, tombstone):
         text = (tombstone.get("richText") or tombstone["text"])["text"]
         tweet_id = entry["entryId"].rpartition("-")[2]
@@ -1716,7 +1723,6 @@ class TwitterAPI():
 
 @cache(maxage=365*86400, keyarg=1)
 def _login_impl(extr, username, password):
-    import random
 
     def process(data, params=None):
         response = extr.request(

From 07d962d60aed598f0ee8578df914c38e5fc939aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Sun, 12 May 2024 16:52:38 +0200
Subject: [PATCH 154/154] [workflows:tests] remove Python 3.5

currently causing errors in the setup phase:

    Error: Could not find a version that satisfies the requirement pip
    Error: No matching distribution found for pip
    Error: The process '/usr/bin/bash' failed with exit code 1

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 18a30c72..6c031739 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"]
+        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"]
 
     steps:
       - uses: actions/checkout@v4