Merge branch 'mikf:master' into rawkuma

thatDudo 8 months ago committed by GitHub
commit 0f2dc855b1

@ -0,0 +1,56 @@
name: docker

on:
  push:
    tags:
      - v[0-9]+.[0-9]+.[0-9]+

permissions:
  packages: write

jobs:
  docker:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      # https://github.com/docker/setup-buildx-action
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # https://github.com/docker/login-action
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GHCR_TOKEN }}

      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      # https://github.com/docker/metadata-action
      - name: Generate Docker tags
        uses: docker/metadata-action@v5
        id: metadata
        with:
          images: |
            mikf123/gallery-dl
            ghcr.io/mikf/gallery-dl
          tags: |
            type=sha,format=long,prefix=
            type=ref,event=tag

      # https://github.com/docker/build-push-action
      - name: Build image
        uses: docker/build-push-action@v5
        with:
          push: true
          tags: ${{ steps.metadata.outputs.tags }}
          labels: ${{ steps.metadata.outputs.labels }}
          platforms: linux/amd64

@ -11,12 +11,12 @@ jobs:
      matrix:
        os: ["windows-latest", "macOS-latest"]
        architecture: ["x64"]
-       python-version: ["3.11"]
+       python-version: ["3.12"]
        python-packages: [""]
        include:
        - os: "ubuntu-latest"
          architecture: "x64"
-         python-version: "3.11"
+         python-version: "3.12"
          python-packages: "secretstorage"
        - os: "windows-2019"
          architecture: "x86"
@ -24,7 +24,7 @@ jobs:
python-packages: "toml" python-packages: "toml"
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }} ${{ matrix.architecture }} - name: Set up Python ${{ matrix.python-version }} ${{ matrix.architecture }}
uses: actions/setup-python@v4 uses: actions/setup-python@v4

@ -15,10 +15,10 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-       python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "pypy3.9"]
+       python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"]
    steps:
-   - uses: actions/checkout@v3
+   - uses: actions/checkout@v4
    - name: Check file permissions
      run: |
@ -40,7 +40,7 @@ jobs:
        3.4|3.5)
          # don't install yt-dlp
          ;;
-       3.6)
+       3.6|3.7)
          # install from PyPI
          pip install yt-dlp
          ;;

@ -1,5 +1,383 @@
# Changelog
## 1.26.7 - 2024-01-21
### Extractors
#### Additions
- [2ch] add support ([#1009](https://github.com/mikf/gallery-dl/issues/1009), [#3540](https://github.com/mikf/gallery-dl/issues/3540), [#4444](https://github.com/mikf/gallery-dl/issues/4444))
- [deviantart:avatar] add `formats` option ([#4995](https://github.com/mikf/gallery-dl/issues/4995))
- [hatenablog] add support ([#5036](https://github.com/mikf/gallery-dl/issues/5036), [#5037](https://github.com/mikf/gallery-dl/issues/5037))
- [mangadex] add `list` extractor ([#5025](https://github.com/mikf/gallery-dl/issues/5025))
- [steamgriddb] add support ([#5033](https://github.com/mikf/gallery-dl/issues/5033), [#5041](https://github.com/mikf/gallery-dl/issues/5041))
- [wikimedia] add support ([#1443](https://github.com/mikf/gallery-dl/issues/1443), [#2906](https://github.com/mikf/gallery-dl/issues/2906), [#3660](https://github.com/mikf/gallery-dl/issues/3660), [#2340](https://github.com/mikf/gallery-dl/issues/2340))
- [wikimedia] support `fandom` wikis ([#2677](https://github.com/mikf/gallery-dl/issues/2677), [#3378](https://github.com/mikf/gallery-dl/issues/3378))
#### Fixes
- [blogger] fix `lh-*.googleusercontent.com` URLs ([#5091](https://github.com/mikf/gallery-dl/issues/5091))
- [bunkr] update domain ([#5088](https://github.com/mikf/gallery-dl/issues/5088))
- [deviantart] fix AttributeError for URLs without username ([#5065](https://github.com/mikf/gallery-dl/issues/5065))
- [deviantart] fix `KeyError: 'premium_folder_data'` ([#5063](https://github.com/mikf/gallery-dl/issues/5063))
- [deviantart:avatar] fix exception when `comments` are enabled ([#4995](https://github.com/mikf/gallery-dl/issues/4995))
- [fuskator] make metadata extraction non-fatal ([#5039](https://github.com/mikf/gallery-dl/issues/5039))
- [gelbooru] only log "Incomplete API response" for favorites ([#5045](https://github.com/mikf/gallery-dl/issues/5045))
- [giantessbooru] update domain
- [issuu] fix extraction
- [nijie] fix download URLs of single image posts ([#5049](https://github.com/mikf/gallery-dl/issues/5049))
- [patreon] fix `KeyError: 'name'` ([#5048](https://github.com/mikf/gallery-dl/issues/5048), [#5069](https://github.com/mikf/gallery-dl/issues/5069), [#5093](https://github.com/mikf/gallery-dl/issues/5093))
- [pixiv] update API headers ([#5029](https://github.com/mikf/gallery-dl/issues/5029))
- [realbooru] fix download URLs of older posts
- [twitter] revert to using `media` timeline by default ([#4953](https://github.com/mikf/gallery-dl/issues/4953))
- [vk] transform image URLs to non-blurred versions ([#5017](https://github.com/mikf/gallery-dl/issues/5017))
#### Improvements
- [batoto] support more mirror domains ([#5042](https://github.com/mikf/gallery-dl/issues/5042))
- [batoto] improve v2 manga URL pattern
- [gelbooru] support `all` tag and URLs with empty tags ([#5076](https://github.com/mikf/gallery-dl/issues/5076))
- [patreon] download `m3u8` manifests with ytdl
- [sankaku] support post URLs with alphanumeric IDs ([#5073](https://github.com/mikf/gallery-dl/issues/5073))
#### Metadata
- [batoto] improve `manga_id` extraction ([#5042](https://github.com/mikf/gallery-dl/issues/5042))
- [erome] fix `count` metadata
- [kemonoparty] add `revision_hash` metadata ([#4706](https://github.com/mikf/gallery-dl/issues/4706), [#4727](https://github.com/mikf/gallery-dl/issues/4727), [#5013](https://github.com/mikf/gallery-dl/issues/5013))
- [paheal] fix `source` metadata
- [webtoons] extract more metadata ([#5061](https://github.com/mikf/gallery-dl/issues/5061), [#5094](https://github.com/mikf/gallery-dl/issues/5094))
#### Removals
- [chevereto] remove `pixl.li`
- [hbrowse] remove module
- [nitter] remove `nitter.lacontrevoie.fr`
## 1.26.6 - 2024-01-06
### Extractors
#### Additions
- [batoto] add `chapter` and `manga` extractors ([#1434](https://github.com/mikf/gallery-dl/issues/1434), [#2111](https://github.com/mikf/gallery-dl/issues/2111), [#4979](https://github.com/mikf/gallery-dl/issues/4979))
- [deviantart] add `avatar` and `background` extractors ([#4995](https://github.com/mikf/gallery-dl/issues/4995))
- [poringa] add support ([#4675](https://github.com/mikf/gallery-dl/issues/4675), [#4962](https://github.com/mikf/gallery-dl/issues/4962))
- [szurubooru] support `snootbooru.com` ([#5023](https://github.com/mikf/gallery-dl/issues/5023))
- [zzup] add `gallery` extractor ([#4517](https://github.com/mikf/gallery-dl/issues/4517), [#4604](https://github.com/mikf/gallery-dl/issues/4604), [#4659](https://github.com/mikf/gallery-dl/issues/4659), [#4863](https://github.com/mikf/gallery-dl/issues/4863), [#5016](https://github.com/mikf/gallery-dl/issues/5016))
#### Fixes
- [gelbooru] fix `favorite` extractor ([#4903](https://github.com/mikf/gallery-dl/issues/4903))
- [idolcomplex] fix extraction & update URL patterns ([#5002](https://github.com/mikf/gallery-dl/issues/5002))
- [imagechest] fix loading more than 10 images in a gallery ([#4469](https://github.com/mikf/gallery-dl/issues/4469))
- [jpgfish] update domain
- [komikcast] fix `manga` extractor ([#5027](https://github.com/mikf/gallery-dl/issues/5027))
- [komikcast] update domain ([#5027](https://github.com/mikf/gallery-dl/issues/5027))
- [lynxchan] update `bbw-chan` domain ([#4970](https://github.com/mikf/gallery-dl/issues/4970))
- [manganelo] fix extraction & recognize `.to` TLDs ([#5005](https://github.com/mikf/gallery-dl/issues/5005))
- [paheal] restore `extension` metadata ([#4976](https://github.com/mikf/gallery-dl/issues/4976))
- [rule34us] add fallback for `video-cdn1` videos ([#4985](https://github.com/mikf/gallery-dl/issues/4985))
- [weibo] fix AttributeError in `user` extractor ([#5022](https://github.com/mikf/gallery-dl/issues/5022))
#### Improvements
- [gelbooru] show error for invalid API responses ([#4903](https://github.com/mikf/gallery-dl/issues/4903))
- [rule34] recognize URLs with `www` subdomain ([#4984](https://github.com/mikf/gallery-dl/issues/4984))
- [twitter] raise error for invalid `strategy` values ([#4953](https://github.com/mikf/gallery-dl/issues/4953))
#### Metadata
- [fanbox] add `metadata` option ([#4921](https://github.com/mikf/gallery-dl/issues/4921))
- [nijie] add `count` metadata ([#146](https://github.com/mikf/gallery-dl/issues/146))
- [pinterest] add `count` metadata ([#4981](https://github.com/mikf/gallery-dl/issues/4981))
### Miscellaneous
- fix and update zsh completion ([#4972](https://github.com/mikf/gallery-dl/issues/4972))
- fix `--cookies-from-browser` macOS Firefox profile path
## 1.26.5 - 2023-12-23
### Extractors
#### Additions
- [deviantart] add `intermediary` option ([#4955](https://github.com/mikf/gallery-dl/issues/4955))
- [inkbunny] add `unread` extractor ([#4934](https://github.com/mikf/gallery-dl/issues/4934))
- [mastodon] support non-numeric status IDs ([#4936](https://github.com/mikf/gallery-dl/issues/4936))
- [myhentaigallery] recognize `/g/` URLs ([#4920](https://github.com/mikf/gallery-dl/issues/4920))
- [postmill] add support ([#4917](https://github.com/mikf/gallery-dl/issues/4917), [#4919](https://github.com/mikf/gallery-dl/issues/4919))
- [shimmie2] support `rule34hentai.net` ([#861](https://github.com/mikf/gallery-dl/issues/861), [#4789](https://github.com/mikf/gallery-dl/issues/4789), [#4945](https://github.com/mikf/gallery-dl/issues/4945))
#### Fixes
- [deviantart] add workaround for integer `client-id` values ([#4924](https://github.com/mikf/gallery-dl/issues/4924))
- [exhentai] fix error for infinite `fallback-retries` ([#4911](https://github.com/mikf/gallery-dl/issues/4911))
- [inkbunny] stop pagination on empty results
- [patreon] fix bootstrap data extraction again ([#4904](https://github.com/mikf/gallery-dl/issues/4904))
- [tumblr] fix exception after waiting for rate limit ([#4916](https://github.com/mikf/gallery-dl/issues/4916))
#### Improvements
- [exhentai] output continuation URL when interrupted ([#4782](https://github.com/mikf/gallery-dl/issues/4782))
- [inkbunny] improve `/submissionsviewall.php` patterns ([#4934](https://github.com/mikf/gallery-dl/issues/4934))
- [tumblr] support infinite `fallback-retries`
- [twitter] default to `tweets` timeline when `replies` are enabled ([#4953](https://github.com/mikf/gallery-dl/issues/4953))
#### Metadata
- [danbooru] provide `tags` as list ([#4942](https://github.com/mikf/gallery-dl/issues/4942))
- [deviantart] set `is_original` for intermediary URLs to `false`
- [twitter] remove `date_liked` ([#3850](https://github.com/mikf/gallery-dl/issues/3850), [#4108](https://github.com/mikf/gallery-dl/issues/4108), [#4657](https://github.com/mikf/gallery-dl/issues/4657))
### Docker
- add Docker instructions to README ([#4850](https://github.com/mikf/gallery-dl/issues/4850))
- fix auto-generation of `latest` tags
## 1.26.4 - 2023-12-10
### Extractors
#### Additions
- [exhentai] add `fallback-retries` option ([#4792](https://github.com/mikf/gallery-dl/issues/4792))
- [urlgalleries] add `gallery` extractor ([#919](https://github.com/mikf/gallery-dl/issues/919), [#1184](https://github.com/mikf/gallery-dl/issues/1184), [#2905](https://github.com/mikf/gallery-dl/issues/2905), [#4886](https://github.com/mikf/gallery-dl/issues/4886))
#### Fixes
- [nijie] fix image URLs of multi-image posts ([#4876](https://github.com/mikf/gallery-dl/issues/4876))
- [patreon] fix bootstrap data extraction ([#4904](https://github.com/mikf/gallery-dl/issues/4904), [#4906](https://github.com/mikf/gallery-dl/issues/4906))
- [twitter] fix `/media` timelines ([#4898](https://github.com/mikf/gallery-dl/issues/4898), [#4899](https://github.com/mikf/gallery-dl/issues/4899))
- [twitter] retry API requests when response contains incomplete results ([#4811](https://github.com/mikf/gallery-dl/issues/4811))
#### Improvements
- [exhentai] store more cookies when logging in with username & password ([#4881](https://github.com/mikf/gallery-dl/issues/4881))
- [twitter] generalize "Login Required" errors ([#4734](https://github.com/mikf/gallery-dl/issues/4734), [#4324](https://github.com/mikf/gallery-dl/issues/4324))
### Options
- add `-e/--error-file` command-line and `output.errorfile` config option ([#4732](https://github.com/mikf/gallery-dl/issues/4732))
### Miscellaneous
- automatically build and push Docker images
- prompt for passwords on login when necessary
- fix `util.dump_response()` to work with `bytes` header values
## 1.26.3 - 2023-11-27
### Extractors
#### Additions
- [behance] support `text` modules ([#4799](https://github.com/mikf/gallery-dl/issues/4799))
- [behance] add `modules` option ([#4799](https://github.com/mikf/gallery-dl/issues/4799))
- [blogger] support `www.micmicidol.club` ([#4759](https://github.com/mikf/gallery-dl/issues/4759))
- [erome] add `count` metadata ([#4812](https://github.com/mikf/gallery-dl/issues/4812))
- [exhentai] add `gp` option ([#4576](https://github.com/mikf/gallery-dl/issues/4576))
- [fapello] support `.su` TLD ([#4840](https://github.com/mikf/gallery-dl/issues/4840), [#4841](https://github.com/mikf/gallery-dl/issues/4841))
- [pixeldrain] add `file` and `album` extractors ([#4839](https://github.com/mikf/gallery-dl/issues/4839))
- [pixeldrain] add `api-key` option ([#4839](https://github.com/mikf/gallery-dl/issues/4839))
- [tmohentai] add `gallery` extractor ([#4808](https://github.com/mikf/gallery-dl/issues/4808), [#4832](https://github.com/mikf/gallery-dl/issues/4832))
#### Fixes
- [cyberdrop] update to site layout changes
- [exhentai] handle `Downloading … requires GP` errors ([#4576](https://github.com/mikf/gallery-dl/issues/4576), [#4763](https://github.com/mikf/gallery-dl/issues/4763))
- [exhentai] fix empty API URL with `"source": "hitomi"` ([#4829](https://github.com/mikf/gallery-dl/issues/4829))
- [hentaifoundry] check for and update expired sessions ([#4694](https://github.com/mikf/gallery-dl/issues/4694))
- [hiperdex] fix `manga` metadata
- [idolcomplex] update to site layout changes
- [imagefap] fix resolution of single images
- [instagram] fix exception on empty `video_versions` ([#4795](https://github.com/mikf/gallery-dl/issues/4795))
- [mangaread] fix extraction
- [mastodon] fix reblogs ([#4580](https://github.com/mikf/gallery-dl/issues/4580))
- [nitter] fix video extraction ([#4853](https://github.com/mikf/gallery-dl/issues/4853), [#4855](https://github.com/mikf/gallery-dl/issues/4855))
- [pornhub] fix `user` metadata for gifs
- [tumblr] fix `day` extractor
- [wallpapercave] fix extraction
- [warosu] fix file URLs
- [webtoons] fix pagination when receiving an HTTP redirect
- [xvideos] fix metadata extraction
- [zerochan] fix metadata extraction
#### Improvements
- [hentaicosplays] force `https://` for download URLs
- [oauth] warn when cache is enabled but not writeable ([#4771](https://github.com/mikf/gallery-dl/issues/4771))
- [sankaku] update URL patterns
- [twitter] ignore promoted Tweets ([#3894](https://github.com/mikf/gallery-dl/issues/3894), [#4790](https://github.com/mikf/gallery-dl/issues/4790))
- [weibo] detect redirects to login page ([#4773](https://github.com/mikf/gallery-dl/issues/4773))
#### Removals
- [foolslide] remove `powermanga.org`
### Downloaders
#### Changes
- [http] treat files not passing `filesize-min`/`-max` as skipped ([#4821](https://github.com/mikf/gallery-dl/issues/4821))
### Options
#### Additions
- add `metadata-extractor` option ([#4549](https://github.com/mikf/gallery-dl/issues/4549))
- support `metadata-*` names for `*-metadata` options
(for example `url-metadata` is now also recognized as `metadata-url`)
### CLI
#### Additions
- implement `-I/--input-file-comment` and `-x/--input-file-delete` options ([#4732](https://github.com/mikf/gallery-dl/issues/4732))
- add `--ugoira` as a general version of `--ugoira-conv` and co.
- add `--mtime` as a general version of `--mtime-from-date`
- add `--cbz`
#### Fixes
- allow `--mtime-from-date` to work with Weibo's metadata structure
### Miscellaneous
#### Additions
- add a simple Dockerfile ([#4831](https://github.com/mikf/gallery-dl/issues/4831))
## 1.26.2 - 2023-11-04
### Extractors
#### Additions
- [4archive] add `thread` and `board` extractors ([#1262](https://github.com/mikf/gallery-dl/issues/1262), [#2418](https://github.com/mikf/gallery-dl/issues/2418), [#4400](https://github.com/mikf/gallery-dl/issues/4400), [#4710](https://github.com/mikf/gallery-dl/issues/4710), [#4714](https://github.com/mikf/gallery-dl/issues/4714))
- [hitomi] recognize `imageset` gallery URLs ([#4756](https://github.com/mikf/gallery-dl/issues/4756))
- [kemonoparty] add `revision_index` metadata field ([#4727](https://github.com/mikf/gallery-dl/issues/4727))
- [misskey] support `misskey.design` ([#4713](https://github.com/mikf/gallery-dl/issues/4713))
- [reddit] support Reddit Mobile share links ([#4693](https://github.com/mikf/gallery-dl/issues/4693))
- [sankaku] support `/posts/` tag search URLs ([#4740](https://github.com/mikf/gallery-dl/issues/4740))
- [twitter] recognize `fixupx.com` URLs ([#4755](https://github.com/mikf/gallery-dl/issues/4755))
#### Fixes
- [exhentai] update to site layout changes ([#4730](https://github.com/mikf/gallery-dl/issues/4730), [#4754](https://github.com/mikf/gallery-dl/issues/4754))
- [exhentai] provide fallback URLs ([#1021](https://github.com/mikf/gallery-dl/issues/1021), [#4745](https://github.com/mikf/gallery-dl/issues/4745))
- [exhentai] disable `DH` ciphers to avoid `DH_KEY_TOO_SMALL` errors ([#1021](https://github.com/mikf/gallery-dl/issues/1021), [#4593](https://github.com/mikf/gallery-dl/issues/4593))
- [idolcomplex] disable sending Referer headers ([#4726](https://github.com/mikf/gallery-dl/issues/4726))
- [instagram] update API headers
- [kemonoparty] fix parsing of non-standard `date` values ([#4676](https://github.com/mikf/gallery-dl/issues/4676))
- [patreon] fix `campaign_id` extraction ([#4699](https://github.com/mikf/gallery-dl/issues/4699), [#4715](https://github.com/mikf/gallery-dl/issues/4715), [#4736](https://github.com/mikf/gallery-dl/issues/4736), [#4738](https://github.com/mikf/gallery-dl/issues/4738))
- [pixiv] load cookies for non-OAuth URLs ([#4760](https://github.com/mikf/gallery-dl/issues/4760))
- [twitter] fix avatars without `date` information ([#4696](https://github.com/mikf/gallery-dl/issues/4696))
- [twitter] restore truncated retweet texts ([#3430](https://github.com/mikf/gallery-dl/issues/3430), [#4690](https://github.com/mikf/gallery-dl/issues/4690))
- [weibo] fix Sina Visitor requests
#### Improvements
- [behance] unescape embed URLs ([#4742](https://github.com/mikf/gallery-dl/issues/4742))
- [fantia] simplify `tags` to a list of strings ([#4752](https://github.com/mikf/gallery-dl/issues/4752))
- [kemonoparty] limit `title` length ([#4741](https://github.com/mikf/gallery-dl/issues/4741))
- [nijie] set 1-2s delay between requests to avoid 429 errors
- [patreon] provide ways to manually specify a user's campaign_id
- `https://www.patreon.com/id:12345`
- `https://www.patreon.com/USER?c=12345`
- `https://www.patreon.com/USER?campaign_id=12345`
- [twitter] cache `user_by_…` results ([#4719](https://github.com/mikf/gallery-dl/issues/4719))
### Post Processors
#### Fixes
- [metadata] ignore non-string tag values ([#4764](https://github.com/mikf/gallery-dl/issues/4764))
### Miscellaneous
#### Fixes
- prevent crash when `stdout.line_buffering` is not defined ([#642](https://github.com/mikf/gallery-dl/issues/642))
## 1.26.1 - 2023-10-21
### Extractors
#### Additions
- [bunkr] add extractor for media URLs ([#4684](https://github.com/mikf/gallery-dl/issues/4684))
- [chevereto] add generic extractors for `chevereto` sites ([#4664](https://github.com/mikf/gallery-dl/issues/4664))
- `deltaporno.com` ([#1381](https://github.com/mikf/gallery-dl/issues/1381))
- `img.kiwi`
- `jpgfish`
- `pixl.li` ([#3179](https://github.com/mikf/gallery-dl/issues/3179), [#4357](https://github.com/mikf/gallery-dl/issues/4357))
- [deviantart] implement `"group": "skip"` ([#4630](https://github.com/mikf/gallery-dl/issues/4630))
- [fantia] add `content_count` and `content_num` metadata fields ([#4627](https://github.com/mikf/gallery-dl/issues/4627))
- [imgbb] add `displayname` and `user_id` metadata ([#4626](https://github.com/mikf/gallery-dl/issues/4626))
- [kemonoparty] support post revisions; add `revisions` option ([#4498](https://github.com/mikf/gallery-dl/issues/4498), [#4597](https://github.com/mikf/gallery-dl/issues/4597))
- [kemonoparty] support searches ([#3385](https://github.com/mikf/gallery-dl/issues/3385), [#4057](https://github.com/mikf/gallery-dl/issues/4057))
- [kemonoparty] support discord URLs with channel IDs ([#4662](https://github.com/mikf/gallery-dl/issues/4662))
- [moebooru] add `metadata` option ([#4646](https://github.com/mikf/gallery-dl/issues/4646))
- [newgrounds] support multi-image posts ([#4642](https://github.com/mikf/gallery-dl/issues/4642))
- [sankaku] support `/posts/` URLs ([#4688](https://github.com/mikf/gallery-dl/issues/4688))
- [twitter] add `sensitive` metadata field ([#4619](https://github.com/mikf/gallery-dl/issues/4619))
#### Fixes
- [4chanarchives] disable Referer headers by default ([#4686](https://github.com/mikf/gallery-dl/issues/4686))
- [bunkr] fix `/d/` file URLs ([#4685](https://github.com/mikf/gallery-dl/issues/4685))
- [deviantart] expand nested comment replies ([#4653](https://github.com/mikf/gallery-dl/issues/4653))
- [deviantart] disable `jwt` ([#4652](https://github.com/mikf/gallery-dl/issues/4652))
- [hentaifoundry] fix `.swf` file downloads ([#4641](https://github.com/mikf/gallery-dl/issues/4641))
- [imgbb] fix `user` metadata extraction ([#4626](https://github.com/mikf/gallery-dl/issues/4626))
- [imgbb] update pagination end condition ([#4626](https://github.com/mikf/gallery-dl/issues/4626))
- [kemonoparty] update API endpoints ([#4676](https://github.com/mikf/gallery-dl/issues/4676), [#4677](https://github.com/mikf/gallery-dl/issues/4677))
- [patreon] update `campaign_id` path ([#4639](https://github.com/mikf/gallery-dl/issues/4639))
- [reddit] fix wrong previews ([#4649](https://github.com/mikf/gallery-dl/issues/4649))
- [redgifs] fix `niches` extraction ([#4666](https://github.com/mikf/gallery-dl/issues/4666), [#4667](https://github.com/mikf/gallery-dl/issues/4667))
- [twitter] fix crash due to missing `source` ([#4620](https://github.com/mikf/gallery-dl/issues/4620))
- [warosu] fix extraction ([#4634](https://github.com/mikf/gallery-dl/issues/4634))
### Post Processors
#### Additions
- support `{_filename}`, `{_directory}`, and `{_path}` replacement fields for `--exec` ([#4633](https://github.com/mikf/gallery-dl/issues/4633))
### Miscellaneous
#### Improvements
- avoid temporary copies with `--cookies-from-browser` by opening cookie databases in read-only mode
## 1.26.0 - 2023-10-03
### Extractors
#### Additions
- [behance] add `date` metadata field ([#4417](https://github.com/mikf/gallery-dl/issues/4417))
- [danbooru] support `booru.borvar.art` ([#4096](https://github.com/mikf/gallery-dl/issues/4096))
- [danbooru] support `donmai.moe`
- [deviantart] add `is_original` metadata field ([#4559](https://github.com/mikf/gallery-dl/issues/4559))
- [e621] support `e6ai.net` ([#4320](https://github.com/mikf/gallery-dl/issues/4320))
- [exhentai] add `fav` option ([#4409](https://github.com/mikf/gallery-dl/issues/4409))
- [gelbooru_v02] support `xbooru.com` ([#4493](https://github.com/mikf/gallery-dl/issues/4493))
- [instagram] add `following` extractor ([#1848](https://github.com/mikf/gallery-dl/issues/1848))
- [pillowfort] support `/tagged/` URLs ([#4570](https://github.com/mikf/gallery-dl/issues/4570))
- [pornhub] add `gif` support ([#4463](https://github.com/mikf/gallery-dl/issues/4463))
- [reddit] add `previews` option ([#4322](https://github.com/mikf/gallery-dl/issues/4322))
- [redgifs] add `niches` extractor ([#4311](https://github.com/mikf/gallery-dl/issues/4311), [#4312](https://github.com/mikf/gallery-dl/issues/4312))
- [redgifs] support `order` parameter for user URLs ([#4583](https://github.com/mikf/gallery-dl/issues/4583))
- [twitter] add `user` extractor and `include` option ([#4275](https://github.com/mikf/gallery-dl/issues/4275))
- [twitter] add `tweet-endpoint` option ([#4307](https://github.com/mikf/gallery-dl/issues/4307))
- [twitter] add `date_original` metadata for retweets ([#4337](https://github.com/mikf/gallery-dl/issues/4337), [#4443](https://github.com/mikf/gallery-dl/issues/4443))
- [twitter] extract `source` metadata ([#4459](https://github.com/mikf/gallery-dl/issues/4459))
- [twitter] support `x.com` URLs ([#4452](https://github.com/mikf/gallery-dl/issues/4452))
#### Improvements
- include `Referer` header in all HTTP requests ([#4490](https://github.com/mikf/gallery-dl/issues/4490), [#4518](https://github.com/mikf/gallery-dl/issues/4518))
(can be disabled with `referer` option)
- [behance] show errors for mature content ([#4417](https://github.com/mikf/gallery-dl/issues/4417))
- [deviantart] re-add `quality` option and `/intermediary/` transform
- [fantia] improve metadata extraction ([#4126](https://github.com/mikf/gallery-dl/issues/4126))
- [instagram] better error messages for invalid users ([#4606](https://github.com/mikf/gallery-dl/issues/4606))
- [mangadex] support multiple values for `lang` ([#4093](https://github.com/mikf/gallery-dl/issues/4093))
- [mastodon] support `/@USER/following` URLs ([#4608](https://github.com/mikf/gallery-dl/issues/4608))
- [moebooru] match search URLs with empty `tags` ([#4354](https://github.com/mikf/gallery-dl/issues/4354))
- [pillowfort] extract `b2_lg_url` media ([#4570](https://github.com/mikf/gallery-dl/issues/4570))
- [reddit] improve comment metadata ([#4482](https://github.com/mikf/gallery-dl/issues/4482))
- [reddit] ignore `/message/compose` URLs ([#4482](https://github.com/mikf/gallery-dl/issues/4482), [#4581](https://github.com/mikf/gallery-dl/issues/4581))
- [redgifs] provide `collection` metadata as separate field ([#4508](https://github.com/mikf/gallery-dl/issues/4508))
- [redgifs] match `gfycat` image URLs ([#4558](https://github.com/mikf/gallery-dl/issues/4558))
- [twitter] improve error messages for single Tweets ([#4369](https://github.com/mikf/gallery-dl/issues/4369))
#### Fixes
- [acidimg] fix extraction
- [architizer] fix extraction ([#4537](https://github.com/mikf/gallery-dl/issues/4537))
- [behance] fix and update `user` extractor ([#4417](https://github.com/mikf/gallery-dl/issues/4417))
- [behance] fix cookie usage ([#4417](https://github.com/mikf/gallery-dl/issues/4417))
- [behance] handle videos without `renditions` ([#4523](https://github.com/mikf/gallery-dl/issues/4523))
- [bunkr] fix media domain for `cdn9` ([#4386](https://github.com/mikf/gallery-dl/issues/4386), [#4412](https://github.com/mikf/gallery-dl/issues/4412))
- [bunkr] fix extracting `.wmv` files ([#4419](https://github.com/mikf/gallery-dl/issues/4419))
- [bunkr] fix media domain for `cdn-pizza.bunkr.ru` ([#4489](https://github.com/mikf/gallery-dl/issues/4489))
- [bunkr] fix extraction ([#4514](https://github.com/mikf/gallery-dl/issues/4514), [#4532](https://github.com/mikf/gallery-dl/issues/4532), [#4529](https://github.com/mikf/gallery-dl/issues/4529), [#4540](https://github.com/mikf/gallery-dl/issues/4540))
- [deviantart] fix full resolution URLs for non-downloadable images ([#293](https://github.com/mikf/gallery-dl/issues/293), [#4548](https://github.com/mikf/gallery-dl/issues/4548), [#4563](https://github.com/mikf/gallery-dl/issues/4563))
- [deviantart] fix shortened URLs ([#4316](https://github.com/mikf/gallery-dl/issues/4316))
- [deviantart] fix search ([#4384](https://github.com/mikf/gallery-dl/issues/4384))
- [deviantart] update Eclipse API endpoints ([#4553](https://github.com/mikf/gallery-dl/issues/4553), [#4615](https://github.com/mikf/gallery-dl/issues/4615))
- [deviantart] use private tokens for `is_mature` posts ([#4563](https://github.com/mikf/gallery-dl/issues/4563))
- [flickr] update default API credentials ([#4332](https://github.com/mikf/gallery-dl/issues/4332))
- [giantessbooru] fix extraction ([#4373](https://github.com/mikf/gallery-dl/issues/4373))
- [hiperdex] fix crash for titles containing Unicode characters ([#4325](https://github.com/mikf/gallery-dl/issues/4325))
- [hiperdex] fix `manga` metadata
- [imagefap] fix pagination ([#3013](https://github.com/mikf/gallery-dl/issues/3013))
- [imagevenue] fix extraction ([#4473](https://github.com/mikf/gallery-dl/issues/4473))
- [instagram] fix private posts with long shortcodes ([#4362](https://github.com/mikf/gallery-dl/issues/4362))
- [instagram] fix video preview archive IDs ([#2135](https://github.com/mikf/gallery-dl/issues/2135), [#4455](https://github.com/mikf/gallery-dl/issues/4455))
- [instagram] handle exceptions due to missing media ([#4555](https://github.com/mikf/gallery-dl/issues/4555))
- [issuu] fix extraction ([#4420](https://github.com/mikf/gallery-dl/issues/4420))
- [jpgfish] update domain to `jpg1.su` ([#4494](https://github.com/mikf/gallery-dl/issues/4494))
- [kemonoparty] update `favorite` API endpoint ([#4522](https://github.com/mikf/gallery-dl/issues/4522))
- [lensdump] fix extraction ([#4352](https://github.com/mikf/gallery-dl/issues/4352))
- [mangakakalot] update domain
- [reddit] fix `preview.redd.it` URLs ([#4470](https://github.com/mikf/gallery-dl/issues/4470))
- [patreon] fix extraction ([#4547](https://github.com/mikf/gallery-dl/issues/4547))
- [pixiv] handle errors for private novels ([#4481](https://github.com/mikf/gallery-dl/issues/4481))
- [pornhub] fix extraction ([#4301](https://github.com/mikf/gallery-dl/issues/4301))
- [pururin] fix extraction ([#4375](https://github.com/mikf/gallery-dl/issues/4375))
- [subscribestar] fix preview detection ([#4468](https://github.com/mikf/gallery-dl/issues/4468))
- [twitter] fix crash on private user ([#4349](https://github.com/mikf/gallery-dl/issues/4349))
- [twitter] fix `TweetWithVisibilityResults` ([#4369](https://github.com/mikf/gallery-dl/issues/4369))
- [twitter] fix crash when `sortIndex` is undefined ([#4499](https://github.com/mikf/gallery-dl/issues/4499))
- [zerochan] fix `tags` extraction ([#4315](https://github.com/mikf/gallery-dl/issues/4315), [#4319](https://github.com/mikf/gallery-dl/issues/4319))
#### Removals
- [gfycat] remove module
- [shimmie2] remove `meme.museum`
### Post Processors
#### Changes
- update `finalize` events
- add `finalize-error` and `finalize-success` events that trigger
depending on whether error(s) did or did not happen
- change `finalize` to always trigger regardless of error status
#### Additions
- add `python` post processor
- add `prepare-after` event ([#4083](https://github.com/mikf/gallery-dl/issues/4083))
- [ugoira] add `"framerate": "uniform"` ([#4421](https://github.com/mikf/gallery-dl/issues/4421))
#### Improvements
- [ugoira] extend `ffmpeg-output` ([#4421](https://github.com/mikf/gallery-dl/issues/4421))
#### Fixes
- [ugoira] restore `libx264-prevent-odd` ([#4407](https://github.com/mikf/gallery-dl/issues/4407))
- [ugoira] fix high frame rates ([#4421](https://github.com/mikf/gallery-dl/issues/4421))
### Downloaders
#### Fixes
- [http] close connection when file already exists ([#4403](https://github.com/mikf/gallery-dl/issues/4403))
### Options
#### Additions
- support `parent>child` categories for child extractor options,
for example an `imgur` album from a `reddit` thread with `reddit>imgur`
- implement `subconfigs` option ([#4440](https://github.com/mikf/gallery-dl/issues/4440))
- add `"ascii+"` as a special `path-restrict` value ([#4371](https://github.com/mikf/gallery-dl/issues/4371))
#### Removals
- remove `pyopenssl` option
### Tests
#### Improvements
- move extractor results into their own, separate files ([#4504](https://github.com/mikf/gallery-dl/issues/4504))
- include fallback URLs in content tests ([#3163](https://github.com/mikf/gallery-dl/issues/3163))
- various test method improvements
### Miscellaneous
#### Fixes
- [formatter] use value of last alternative ([#4492](https://github.com/mikf/gallery-dl/issues/4492))
- fix imports when running `__main__.py` ([#4581](https://github.com/mikf/gallery-dl/issues/4581))
- fix symlink resolution in `__main__.py`
- fix default Firefox user agent string
## 1.25.8 - 2023-07-15
### Changes
- update default User-Agent header to Firefox 115 ESR

@ -0,0 +1,5 @@
FROM python:alpine
RUN python3 -m pip install -U gallery-dl yt-dlp
RUN apk update
RUN apk add ffmpeg
ENTRYPOINT [ "gallery-dl" ]

@ -72,9 +72,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for

-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.8/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.7/gallery-dl.exe>`__
  (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.8/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.7/gallery-dl.bin>`__

Nightly Builds
@ -132,6 +132,43 @@ For macOS users with MacPorts:
sudo port install gallery-dl
Docker
--------
Using the Dockerfile in the repository:
.. code:: bash
git clone https://github.com/mikf/gallery-dl.git
cd gallery-dl/
docker build -t gallery-dl:latest .
Pulling image from `Docker Hub <https://hub.docker.com/r/mikf123/gallery-dl>`__:
.. code:: bash
docker pull mikf123/gallery-dl
docker tag mikf123/gallery-dl gallery-dl
Pulling image from `GitHub Container Registry <https://github.com/mikf/gallery-dl/pkgs/container/gallery-dl>`__:
.. code:: bash
docker pull ghcr.io/mikf/gallery-dl
docker tag ghcr.io/mikf/gallery-dl gallery-dl
To run the container you will probably want to attach some directories on the host so that the config file and downloads can persist across runs.

Make sure to either download the example config file referenced in the repo and place it in the mounted volume location, or touch an empty file there.

If you gave the container a different tag or are using podman, adjust the commands accordingly. Run ``docker image ls`` to check the image name if you are unsure.

The ``--rm`` flag removes the container after every run, so you always start from a fresh environment. If you set up a CI/CD pipeline to automatically build the container, you can also add a ``--pull=newer`` flag so that a newer image, if one is available, gets pulled before running.
.. code:: bash
docker run --rm -v $HOME/Downloads/:/gallery-dl/ -v $HOME/.config/gallery-dl/gallery-dl.conf:/etc/gallery-dl.conf -it gallery-dl:latest
You can also add a shell alias for ``gallery-dl``, or create a simple bash script and drop it somewhere in your ``$PATH`` to act as a shim for this command.
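For example, a small wrapper script placed in ``$PATH`` (a sketch, assuming the ``gallery-dl:latest`` tag and the mount points from the command above) could look like this:

.. code:: bash

    #!/bin/sh
    # Shim that runs the containerized gallery-dl with the host's
    # config file and download directory mounted into the container
    exec docker run --rm -it \
        -v "$HOME/Downloads/:/gallery-dl/" \
        -v "$HOME/.config/gallery-dl/gallery-dl.conf:/etc/gallery-dl.conf" \
        gallery-dl:latest "$@"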
Usage
=====

@ -166,6 +166,8 @@ Description
extractor.*.parent-metadata
---------------------------
extractor.*.metadata-parent
----------------------------
Type
* ``bool``
* ``string``
@ -377,7 +379,7 @@ Description
The username and password to use when attempting to log in to
another site.

-Specifying a username and password is required for
+Specifying username and password is required for

* ``nijie``
@ -413,6 +415,10 @@ Description
(*) The password value for these sites should be
the API key found in your user profile, not the actual account password.
Note: Leave the ``password`` value empty or undefined
to get prompted for a password when performing a login
(see `getpass() <https://docs.python.org/3/library/getpass.html#getpass.getpass>`__).
extractor.*.netrc
-----------------
@ -621,6 +627,20 @@ Description
`ssl.SSLContext.set_ciphers() <https://docs.python.org/3/library/ssl.html#ssl.SSLContext.set_ciphers>`__
extractor.*.tls12
-----------------
Type
``bool``
Default
* ``true``
* ``false`` for ``patreon``, ``pixiv:series``
Description
Allow selecting TLS 1.2 cipher suites.
Can be disabled to alter TLS fingerprints
and potentially bypass Cloudflare blocks.
extractor.*.keywords
--------------------
Type
@ -642,12 +662,12 @@ Description
`format strings`_.
extractor.*.metadata-url
------------------------
extractor.*.url-metadata
------------------------
Type
``string``
Default
``null``
Description
Insert a file's download URL into its metadata dictionary as the given name.
@ -658,12 +678,12 @@ Description
with a ``metadata`` post processor, etc.
extractor.*.metadata-path
-------------------------
extractor.*.path-metadata
-------------------------
Type
``string``
Default
``null``
Description
Insert a reference to the current
`PathFormat <https://github.com/mikf/gallery-dl/blob/v1.24.2/gallery_dl/path.py#L27>`__
@ -673,12 +693,24 @@ Description
to access the current file's filename as ``"{gdl_path.filename}"``.
extractor.*.metadata-extractor
------------------------------
extractor.*.extractor-metadata
------------------------------
Type
``string``
Description
Insert a reference to the current
`Extractor <https://github.com/mikf/gallery-dl/blob/v1.26.2/gallery_dl/extractor/common.py#L26>`__
object into metadata dictionaries as the given name.
extractor.*.metadata-http
-------------------------
extractor.*.http-metadata
-------------------------
Type
``string``
Default
``null``
Description
Insert an ``object`` containing a file's HTTP headers and
``filename``, ``extension``, and ``date`` parsed from them
@ -689,12 +721,12 @@ Description
and its parsed form as ``"{gdl_http[date]}"``.
extractor.*.metadata-version
----------------------------
extractor.*.version-metadata
----------------------------
Type
``string``
Default
``null``
Description
Insert an ``object`` containing gallery-dl's version info into
metadata dictionaries as the given name.
@ -1048,6 +1080,25 @@ Description
after a colon ``:``, for example ``{date:%Y%m%d}``.
extractor.*.write-pages
-----------------------
Type
* ``bool``
* ``string``
Default
``false``
Description
During data extraction,
write received HTTP request data
to enumerated files in the current working directory.
Special values:
* ``"all"``: Include HTTP request and response headers. Hide ``Authorization``, ``Cookie``, and ``Set-Cookie`` values.
* ``"ALL"``: Include all HTTP request and response headers.
Extractor-specific Options
==========================
@ -1110,6 +1161,19 @@ Description
The maximum possible value appears to be ``1920``.
extractor.behance.modules
-------------------------
Type
``list`` of ``strings``
Default
``["image", "video", "mediacollection", "embed"]``
Description
Selects which gallery modules to download from.
Supported module types are
``image``, ``video``, ``mediacollection``, ``embed``, ``text``.
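For example, a configuration sketch that limits downloads to image and text modules might look like:

.. code:: json

    {
        "extractor": {
            "behance": {
                "modules": ["image", "text"]
            }
        }
    }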
extractor.blogger.videos
------------------------
Type
@ -1306,13 +1370,21 @@ Description
extractor.deviantart.group
--------------------------
Type
-``bool``
+* ``bool``
* ``string``
Default
``true``
Description
Check whether the profile name in a given URL
belongs to a group or a regular user.
When disabled, assume every given profile name
belongs to a regular user.
Special values:
* ``"skip"``: Skip groups
extractor.deviantart.include
----------------------------
@ -1329,11 +1401,28 @@ Description
when processing a user profile.
Possible values are
-``"gallery"``, ``"scraps"``, ``"journal"``, ``"favorite"``, ``"status"``.
+``"avatar"``,
``"background"``,
``"gallery"``,
``"scraps"``,
``"journal"``,
``"favorite"``,
``"status"``.
It is possible to use ``"all"`` instead of listing all values separately.
extractor.deviantart.intermediary
---------------------------------
Type
``bool``
Default
``true``
Description
For older non-downloadable images,
download a higher-quality ``/intermediary/`` version.
extractor.deviantart.journals
-----------------------------
Type
@ -1360,7 +1449,7 @@ Description
of otherwise non-downloadable, low-resolution images
to be able to download them in full resolution.

-Note: This got patched by DeviantArt on 2023-09-19 and no longer works.
+Note: No longer functional as of 2023-10-11

extractor.deviantart.mature
@ -1429,6 +1518,19 @@ Description
when a `refresh token <extractor.deviantart.refresh-token_>`__ is provided.
extractor.deviantart.quality
----------------------------
Type
``integer``
Default
``100``
Description
JPEG quality level of newer images for which
an original file download is not available.
Note: Only has an effect when `deviantart.jwt <extractor.deviantart.jwt_>`__ is disabled.
extractor.deviantart.refresh-token
----------------------------------
Type
@ -1457,6 +1559,19 @@ Description
Minimum wait time in seconds before API requests.
extractor.deviantart.avatar.formats
-----------------------------------
Type
``list`` of ``strings``
Example
``["original.jpg", "big.jpg", "big.gif", ".png"]``
Description
Avatar URL formats to return.
| Each format is parsed as ``SIZE.EXT``.
| Leave ``SIZE`` empty to download the regular, small avatar format.
extractor.[E621].metadata
-------------------------
Type
@ -1467,7 +1582,7 @@ Default
``false``
Example
* ``notes,pools``
-* ``["notes", "pools"``
+* ``["notes", "pools"]``
Description
Extract additional metadata (notes, pool metadata) if available.
@ -1504,6 +1619,17 @@ Description
* ``"exhentai.org"``: Use ``exhentai.org`` for all URLs * ``"exhentai.org"``: Use ``exhentai.org`` for all URLs
extractor.exhentai.fallback-retries
-----------------------------------
Type
``integer``
Default
``2``
Description
Number of times a failed image gets retried
or ``-1`` for infinite retries.
extractor.exhentai.fav
----------------------
Type
@ -1520,6 +1646,20 @@ Description
to already favorited galleries.
extractor.exhentai.gp
---------------------
Type
``string``
Default
``"resized"``
Description
Selects how to handle "you do not have enough GP" errors.
* `"resized"`: Continue downloading `non-original <extractor.exhentai.original_>`__ images.
* `"stop"`: Stop the current extractor run.
* `"wait"`: Wait for user input before retrying the current image.
extractor.exhentai.limits
-------------------------
Type
@ -1584,6 +1724,21 @@ Description
* ``false``: Ignore embeds.
extractor.fanbox.metadata
-------------------------
Type
* ``bool``
* ``string``
* ``list`` of ``strings``
Default
``false``
Example
* ``user,plan``
* ``["user", "plan"]``
Description
Extract ``plan`` and extended ``user`` metadata.
extractor.flickr.access-token & .access-token-secret
----------------------------------------------------
Type
@ -2051,7 +2206,22 @@ Type
Default
``false``
Description
-Extract ``username`` metadata
+Extract ``username`` metadata.
extractor.kemonoparty.revisions
-------------------------------
Type
* ``bool``
* ``string``
Default
``false``
Description
Extract post revisions.
Set this to ``"unique"`` to filter out duplicate revisions.
Note: This requires 1 additional HTTP request per post.
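As a sketch, enabling unique revisions in a configuration file could look like:

.. code:: json

    {
        "extractor": {
            "kemonoparty": {
                "revisions": "unique"
            }
        }
    }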
extractor.khinsider.format
@ -2236,6 +2406,18 @@ Description
Fetch media from replies to other notes.
extractor.[moebooru].pool.metadata
----------------------------------
Type
``bool``
Default
``false``
Description
Extract extended ``pool`` metadata.
Note: Not supported by all ``moebooru`` instances.
extractor.newgrounds.flash
--------------------------
Type
@ -2481,6 +2663,14 @@ Description
Download from video pins.
extractor.pixeldrain.api-key
----------------------------
Type
``string``
Description
Your account's `API key <https://pixeldrain.com/user/api_keys>`__
extractor.pixiv.include
-----------------------
Type
@ -2625,6 +2815,16 @@ Description
Also search Plurk comments for URLs.
extractor.[postmill].save-link-post-body
----------------------------------------
Type
``bool``
Default
``false``
Description
Whether or not to save the body for link/image posts.
extractor.reactor.gif
---------------------
Type
@ -2809,6 +3009,19 @@ Description
restrict it to only one possible format.
extractor.sankaku.id-format
---------------------------
Type
``string``
Default
``"numeric"``
Description
Format of ``id`` metadata fields.
* ``"alphanumeric"`` or ``"alnum"``: 11-character alphanumeric IDs (``y0abGlDOr2o``)
* ``"numeric"`` or ``"legacy"``: numeric IDs (``360451``)
extractor.sankaku.refresh
-------------------------
Type
@ -2892,6 +3105,176 @@ Description
Download video files.
extractor.steamgriddb.animated
------------------------------
Type
``bool``
Default
``true``
Description
Include animated assets when downloading from a list of assets.
extractor.steamgriddb.epilepsy
------------------------------
Type
``bool``
Default
``true``
Description
Include assets tagged with epilepsy when downloading from a list of assets.
extractor.steamgriddb.dimensions
--------------------------------
Type
* ``string``
* ``list`` of ``strings``
Default
``"all"``
Examples
* ``"1024x512,512x512"``
* ``["460x215", "920x430"]``
Description
Only include assets that are in the specified dimensions. ``all`` can be
used to specify all dimensions. Valid values are:
* Grids: ``460x215``, ``920x430``, ``600x900``, ``342x482``, ``660x930``,
``512x512``, ``1024x1024``
* Heroes: ``1920x620``, ``3840x1240``, ``1600x650``
* Logos: N/A (will be ignored)
* Icons: ``8x8``, ``10x10``, ``14x14``, ``16x16``, ``20x20``, ``24x24``,
``28x28``, ``32x32``, ``35x35``, ``40x40``, ``48x48``, ``54x54``,
``56x56``, ``57x57``, ``60x60``, ``64x64``, ``72x72``, ``76x76``,
``80x80``, ``90x90``, ``96x96``, ``100x100``, ``114x114``, ``120x120``,
``128x128``, ``144x144``, ``150x150``, ``152x152``, ``160x160``,
``180x180``, ``192x192``, ``194x194``, ``256x256``, ``310x310``,
``512x512``, ``768x768``, ``1024x1024``
extractor.steamgriddb.file-types
--------------------------------
Type
* ``string``
* ``list`` of ``strings``
Default
``"all"``
Examples
* ``"png,jpeg"``
* ``["jpeg", "webp"]``
Description
Only include assets that are in the specified file types. ``all`` can be
used to specify all file types. Valid values are:
* Grids: ``png``, ``jpeg``, ``jpg``, ``webp``
* Heroes: ``png``, ``jpeg``, ``jpg``, ``webp``
* Logos: ``png``, ``webp``
* Icons: ``png``, ``ico``
extractor.steamgriddb.download-fake-png
---------------------------------------
Type
``bool``
Default
``true``
Description
Download fake PNGs alongside the real file.
extractor.steamgriddb.humor
---------------------------
Type
``bool``
Default
``true``
Description
Include assets tagged with humor when downloading from a list of assets.
extractor.steamgriddb.languages
-------------------------------
Type
* ``string``
* ``list`` of ``strings``
Default
``"all"``
Examples
* ``"en,km"``
* ``["fr", "it"]``
Description
Only include assets that are in the specified languages. ``all`` can be
used to specify all languages. Valid values are `ISO 639-1 <https://en.wikipedia.org/wiki/ISO_639-1>`__
language codes.
extractor.steamgriddb.nsfw
--------------------------
Type
``bool``
Default
``true``
Description
Include assets tagged with adult content when downloading from a list of assets.
extractor.steamgriddb.sort
--------------------------
Type
``string``
Default
``score_desc``
Description
Set the chosen sorting method when downloading from a list of assets. Can be one of:
* ``score_desc`` (Highest Score (Beta))
* ``score_asc`` (Lowest Score (Beta))
* ``score_old_desc`` (Highest Score (Old))
* ``score_old_asc`` (Lowest Score (Old))
* ``age_desc`` (Newest First)
* ``age_asc`` (Oldest First)
extractor.steamgriddb.static
----------------------------
Type
``bool``
Default
``true``
Description
Include static assets when downloading from a list of assets.
extractor.steamgriddb.styles
----------------------------
Type
* ``string``
* ``list`` of ``strings``
Default
``all``
Examples
* ``white,black``
* ``["no_logo", "white_logo"]``
Description
Only include assets that are in the specified styles. ``all`` can be used
to specify all styles. Valid values are:
* Grids: ``alternate``, ``blurred``, ``no_logo``, ``material``, ``white_logo``
* Heroes: ``alternate``, ``blurred``, ``material``
* Logos: ``official``, ``white``, ``black``, ``custom``
* Icons: ``official``, ``custom``
extractor.steamgriddb.untagged
------------------------------
Type
``bool``
Default
``true``
Description
Include untagged assets when downloading from a list of assets.
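Putting a few of these options together, a hypothetical ``steamgriddb`` configuration (values chosen only for illustration) might look like:

.. code:: json

    {
        "extractor": {
            "steamgriddb": {
                "dimensions": ["460x215", "920x430"],
                "file-types": ["png", "webp"],
                "styles": "alternate,blurred",
                "animated": false,
                "sort": "age_desc"
            }
        }
    }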
extractor.[szurubooru].username & .token
----------------------------------------
Type
@ -3035,7 +3418,8 @@ Type
Default
``2``
Description
-Number of retries for fetching full-resolution images.
+Number of retries for fetching full-resolution images
or ``-1`` for infinite retries.
extractor.twibooru.api-key
@ -3064,6 +3448,16 @@ Description
See `Filters <https://twibooru.org/filters>`__ for details.
extractor.twitter.ads
---------------------
Type
``bool``
Default
``false``
Description
Fetch media from promoted Tweets.
extractor.twitter.cards
-----------------------
Type
@ -3142,8 +3536,6 @@ Description
for each Tweet in said timeline.
Note: This requires at least 1 additional API call per initial Tweet.
Age-restricted replies cannot be expanded when using the
`syndication <extractor.twitter.syndication_>`__ API.
extractor.twitter.include
@ -3211,30 +3603,6 @@ Description
``4096x4096``, ``orig``, ``large``, ``medium``, and ``small``.
extractor.twitter.syndication
-----------------------------
Type
* ``bool``
* ``string``
Default
``false``
Description
Controls how to retrieve age-restricted content when not logged in.
* ``false``: Skip age-restricted Tweets.
* ``true``: Download using Twitter's syndication API.
* ``"extended"``: Try to fetch Tweet metadata using the normal API
in addition to the syndication API. This requires additional HTTP
requests in some cases (e.g. when `retweets <extractor.twitter.retweets_>`_
are enabled).
Note: This does not apply to search results (including
`timeline strategies <extractor.twitter.timeline.strategy_>`__).
To retrieve such content from search results, you must log in and
disable "Hide sensitive content" in your `search settings
<https://twitter.com/settings/search>`__.
extractor.twitter.logout
------------------------
Type
@ -4300,6 +4668,24 @@ Description
The default format string here is ``"{message}"``.
output.errorfile
----------------
Type
* |Path|_
* |Logging Configuration|_
Description
File to write input URLs which returned an error to.
The default format string here is also ``"{message}"``.
When combined with
``-I``/``--input-file-comment`` or
``-x``/``--input-file-delete``,
this option will cause *all* input URLs from these files
to be commented/deleted after processing them
and not just successful ones.
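A minimal sketch of setting this in a configuration file (the path is a placeholder; the same effect can be had with ``-e``/``--error-file`` on the command line):

.. code:: json

    {
        "output": {
            "errorfile": "~/gallery-dl/errors.txt"
        }
    }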
output.num-to-str
-----------------
Type
@ -5234,9 +5620,14 @@ How To
* login and visit the `apps <https://www.reddit.com/prefs/apps/>`__
  section of your account's preferences
* click the "are you a developer? create an app..." button
-* fill out the form, choose "installed app", preferably set
-  "http://localhost:6414/" as "redirect uri" and finally click
-  "create app"
+* fill out the form:
+
+  * choose a name
+  * select "installed app"
+  * set ``http://localhost:6414/`` as "redirect uri"
+  * solve the "I'm not a robot" reCAPTCHA if needed
+  * click "create app"
* copy the client id (third line, under your application's name and
  "installed app") and put it in your configuration file
  as ``"client-id"``

@ -176,16 +176,15 @@
"imgur": "imgur":
{ {
"#": "use different directory and filename formats when coming from a reddit post", "#": "general imgur settings",
"directory": "filename": "{id}.{extension}"
{ },
"'_reddit' in locals()": []
}, "reddit>imgur":
"filename": {
{ "#": "special settings for imgur URLs found in reddit posts",
"'_reddit' in locals()": "{_reddit[id]} {id}.{extension}", "directory": [],
"" : "{id}.{extension}" "filename": "{_reddit[id]} {_reddit[title]} {id}.{extension}"
}
}, },
"tumblr": "tumblr":

@ -75,6 +75,7 @@
"client-id": null, "client-id": null,
"client-secret": null, "client-secret": null,
"refresh-token": null, "refresh-token": null,
"auto-watch": false, "auto-watch": false,
"auto-unwatch": false, "auto-unwatch": false,
"comments": false, "comments": false,
@ -84,11 +85,13 @@
"group": true, "group": true,
"include": "gallery", "include": "gallery",
"journals": "html", "journals": "html",
"jwt": false,
"mature": true, "mature": true,
"metadata": false, "metadata": false,
"original": true, "original": true,
"pagination": "api", "pagination": "api",
"public": true, "public": true,
"quality": 100,
"wait-min": 0 "wait-min": 0
}, },
"e621": "e621":

@ -0,0 +1,9 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>gallery-dl</title>
</head>
<body>
</body>
</html>

@ -0,0 +1,12 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>gallery-dl - OAuth Redirect</title>
<script>
window.location.href = "http://localhost:6414/" + window.location.search;
</script>
</head>
<body>
</body>
</html>

@ -6,8 +6,6 @@
## General Options: ## General Options:
-h, --help Print this help message and exit -h, --help Print this help message and exit
--version Print program version and exit --version Print program version and exit
-i, --input-file FILE Download URLs found in FILE ('-' for stdin).
More than one --input-file can be specified
-f, --filename FORMAT Filename format string for downloaded files -f, --filename FORMAT Filename format string for downloaded files
('/O' for "original" filenames) ('/O' for "original" filenames)
-d, --destination PATH Target location for file downloads -d, --destination PATH Target location for file downloads
@ -19,6 +17,16 @@
--clear-cache MODULE Delete cached login sessions, cookies, etc. for --clear-cache MODULE Delete cached login sessions, cookies, etc. for
MODULE (ALL to delete everything) MODULE (ALL to delete everything)
## Input Options:
-i, --input-file FILE Download URLs found in FILE ('-' for stdin).
More than one --input-file can be specified
-I, --input-file-comment FILE
Download URLs found in FILE. Comment them out
after they were downloaded successfully.
-x, --input-file-delete FILE
Download URLs found in FILE. Delete them after
they were downloaded successfully.
## Output Options: ## Output Options:
-q, --quiet Activate quiet mode -q, --quiet Activate quiet mode
-v, --verbose Print various debugging information -v, --verbose Print various debugging information
@ -31,6 +39,7 @@
-E, --extractor-info Print extractor defaults and settings -E, --extractor-info Print extractor defaults and settings
-K, --list-keywords Print a list of available keywords and example -K, --list-keywords Print a list of available keywords and example
values for the given URLs values for the given URLs
-e, --error-file FILE Add input URLs which returned an error to FILE
--list-modules Print a list of available extractor modules --list-modules Print a list of available extractor modules
--list-extractors Print a list of extractor classes with --list-extractors Print a list of extractor classes with
description, (sub)category and example URL description, (sub)category and example URL
@ -43,7 +52,8 @@
## Downloader Options: ## Downloader Options:
-r, --limit-rate RATE Maximum download rate (e.g. 500k or 2.5M) -r, --limit-rate RATE Maximum download rate (e.g. 500k or 2.5M)
-R, --retries N Maximum number of retries for failed HTTP -R, --retries N Maximum number of retries for failed HTTP
requests or -1 for infinite retries (default: 4) requests or -1 for infinite retries (default:
4)
--http-timeout SECONDS Timeout for HTTP connections (default: 30.0) --http-timeout SECONDS Timeout for HTTP connections (default: 30.0)
--sleep SECONDS Number of seconds to wait before each download. --sleep SECONDS Number of seconds to wait before each download.
This can be either a constant value or a range This can be either a constant value or a range
@ -110,23 +120,24 @@
and other delegated URLs and other delegated URLs
## Post-processing Options: ## Post-processing Options:
--zip Store downloaded files in a ZIP archive -P, --postprocessor NAME Activate the specified post processor
--ugoira-conv Convert Pixiv Ugoira to WebM (requires FFmpeg) -O, --postprocessor-option KEY=VALUE
--ugoira-conv-lossless Convert Pixiv Ugoira to WebM in VP9 lossless Additional post processor options
mode
--ugoira-conv-copy Convert Pixiv Ugoira to MKV without re-encoding
any frames
--write-metadata Write metadata to separate JSON files --write-metadata Write metadata to separate JSON files
--write-info-json Write gallery metadata to a info.json file --write-info-json Write gallery metadata to a info.json file
--write-tags Write image tags to separate text files --write-tags Write image tags to separate text files
--mtime-from-date Set file modification times according to 'date' --zip Store downloaded files in a ZIP archive
metadata --cbz Store downloaded files in a CBZ archive
--exec CMD Execute CMD for each downloaded file. Example: --mtime NAME Set file modification times according to
--exec "convert {} {}.png && rm {}" metadata selected by NAME. Examples: 'date' or
--exec-after CMD Execute CMD after all files were downloaded 'status[date]'
successfully. Example: --exec-after "cd {} && --ugoira FORMAT Convert Pixiv Ugoira to FORMAT using FFmpeg.
Supported formats are 'webm', 'mp4', 'gif',
'vp8', 'vp9', 'vp9-lossless', 'copy'.
--exec CMD Execute CMD for each downloaded file. Supported
replacement fields are {} or {_path},
{_directory}, {_filename}. Example: --exec
"convert {} {}.png && rm {}"
--exec-after CMD Execute CMD after all files were downloaded.
Example: --exec-after "cd {_directory} &&
convert * ../doc.pdf" convert * ../doc.pdf"
-P, --postprocessor NAME Activate the specified post processor
-O, --postprocessor-option OPT
Additional '<key>=<value>' post processor
options

@ -1,7 +1,7 @@
# Supported Sites # Supported Sites
<!-- auto-generated by scripts/supportedsites.py --> <!-- auto-generated by scripts/supportedsites.py -->
Consider all sites to be NSFW unless otherwise known. Consider all listed sites to potentially be NSFW.
<table> <table>
<thead valign="bottom"> <thead valign="bottom">
@ -13,6 +13,12 @@ Consider all sites to be NSFW unless otherwise known.
</tr> </tr>
</thead> </thead>
<tbody valign="top"> <tbody valign="top">
<tr>
<td>2ch</td>
<td>https://2ch.hk/</td>
<td>Boards, Threads</td>
<td></td>
</tr>
<tr> <tr>
<td>2chen</td> <td>2chen</td>
<td>https://sturdychan.help/</td> <td>https://sturdychan.help/</td>
@ -31,6 +37,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Pools, Popular Images, Posts, Tag Searches</td> <td>Pools, Popular Images, Posts, Tag Searches</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>4archive</td>
<td>https://4archive.org/</td>
<td>Boards, Threads</td>
<td></td>
</tr>
<tr> <tr>
<td>4chan</td> <td>4chan</td>
<td>https://www.4chan.org/</td> <td>https://www.4chan.org/</td>
@ -91,6 +103,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Albums, Artwork Listings, Challenges, Followed Users, individual Images, Likes, Search Results, User Profiles</td> <td>Albums, Artwork Listings, Challenges, Followed Users, individual Images, Likes, Search Results, User Profiles</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>BATO.TO</td>
<td>https://bato.to/</td>
<td>Chapters, Manga</td>
<td></td>
</tr>
<tr> <tr>
<td>BBC</td> <td>BBC</td>
<td>https://bbc.co.uk/</td> <td>https://bbc.co.uk/</td>
@ -103,16 +121,10 @@ Consider all sites to be NSFW unless otherwise known.
<td>Collections, Galleries, User Profiles</td> <td>Collections, Galleries, User Profiles</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>Blogger</td>
<td>https://www.blogger.com/</td>
<td>Blogs, Labels, Posts, Search Results</td>
<td></td>
</tr>
<tr> <tr>
<td>Bunkr</td> <td>Bunkr</td>
<td>https://bunkrr.su/</td> <td>https://bunkr.sk/</td>
<td>Albums</td> <td>Albums, Media Files</td>
<td></td> <td></td>
</tr> </tr>
<tr> <tr>
@ -148,7 +160,7 @@ Consider all sites to be NSFW unless otherwise known.
<tr> <tr>
<td>DeviantArt</td> <td>DeviantArt</td>
<td>https://www.deviantart.com/</td> <td>https://www.deviantart.com/</td>
<td>Collections, Deviations, Favorites, Folders, Followed Users, Galleries, Gallery Searches, Journals, Popular Images, Scraps, Search Results, Sta.sh, Status Updates, Tag Searches, User Profiles, Watches</td> <td>Avatars, Backgrounds, Collections, Deviations, Favorites, Folders, Followed Users, Galleries, Gallery Searches, Journals, Popular Images, Scraps, Search Results, Sta.sh, Status Updates, Tag Searches, User Profiles, Watches</td>
<td><a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a></td> <td><a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a></td>
</tr> </tr>
<tr> <tr>
@ -254,9 +266,9 @@ Consider all sites to be NSFW unless otherwise known.
<td></td> <td></td>
</tr> </tr>
<tr> <tr>
<td>HBrowse</td> <td>HatenaBlog</td>
<td>https://www.hbrowse.com/</td> <td>https://hatenablog.com</td>
<td>Chapters, Manga</td> <td>Archive, Individual Posts, Home Feed, Search Results</td>
<td></td> <td></td>
</tr> </tr>
<tr> <tr>
@ -400,7 +412,7 @@ Consider all sites to be NSFW unless otherwise known.
<tr> <tr>
<td>Inkbunny</td> <td>Inkbunny</td>
<td>https://inkbunny.net/</td> <td>https://inkbunny.net/</td>
<td>Favorites, Followed Users, Pools, Posts, Search Results, User Profiles</td> <td>Favorites, Followed Users, Pools, Posts, Search Results, Unread Submissions, User Profiles</td>
<td>Supported</td> <td>Supported</td>
</tr> </tr>
<tr> <tr>
@ -427,12 +439,6 @@ Consider all sites to be NSFW unless otherwise known.
<td>Games</td> <td>Games</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>JPG Fish</td>
<td>https://jpg1.su/</td>
<td>Albums, individual Images, User Profiles</td>
<td></td>
</tr>
<tr> <tr>
<td>Keenspot</td> <td>Keenspot</td>
<td>http://www.keenspot.com/</td> <td>http://www.keenspot.com/</td>
@ -453,7 +459,7 @@ Consider all sites to be NSFW unless otherwise known.
</tr> </tr>
<tr> <tr>
<td>Komikcast</td> <td>Komikcast</td>
<td>https://komikcast.site/</td> <td>https://komikcast.lol/</td>
<td>Chapters, Manga</td> <td>Chapters, Manga</td>
<td></td> <td></td>
</tr> </tr>
@ -502,7 +508,7 @@ Consider all sites to be NSFW unless otherwise known.
<tr> <tr>
<td>MangaDex</td> <td>MangaDex</td>
<td>https://mangadex.org/</td> <td>https://mangadex.org/</td>
<td>Chapters, Followed Feed, Manga</td> <td>Chapters, Followed Feed, Lists, Manga</td>
<td>Supported</td> <td>Supported</td>
</tr> </tr>
<tr> <tr>
@ -595,12 +601,6 @@ Consider all sites to be NSFW unless otherwise known.
<td>Albums</td> <td>Albums</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>Nudecollect</td>
<td>https://nudecollect.com/</td>
<td>Albums, individual Images</td>
<td></td>
</tr>
<tr> <tr>
<td>Patreon</td> <td>Patreon</td>
<td>https://www.patreon.com/</td> <td>https://www.patreon.com/</td>
@ -643,6 +643,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>All Pins, Created Pins, Pins, pin.it Links, related Pins, Search Results, Sections, User Profiles</td> <td>All Pins, Created Pins, Pins, pin.it Links, related Pins, Search Results, Sections, User Profiles</td>
<td><a href="https://github.com/mikf/gallery-dl#cookies">Cookies</a></td> <td><a href="https://github.com/mikf/gallery-dl#cookies">Cookies</a></td>
</tr> </tr>
<tr>
<td>pixeldrain</td>
<td>https://pixeldrain.com/</td>
<td>Albums, Files</td>
<td></td>
</tr>
<tr> <tr>
<td>Pixhost</td> <td>Pixhost</td>
<td>https://pixhost.to/</td> <td>https://pixhost.to/</td>
@ -679,6 +685,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Posts, User Profiles</td> <td>Posts, User Profiles</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>Poringa</td>
<td>http://www.poringa.net/</td>
<td>Posts Images, Search Results, User Profiles</td>
<td></td>
</tr>
<tr> <tr>
<td>Porn Image</td> <td>Porn Image</td>
<td>https://porn-images-xxx.com/</td> <td>https://porn-images-xxx.com/</td>
@ -718,7 +730,7 @@ Consider all sites to be NSFW unless otherwise known.
<tr> <tr>
<td>Reddit</td> <td>Reddit</td>
<td>https://www.reddit.com/</td> <td>https://www.reddit.com/</td>
<td>Home Feed, individual Images, Submissions, Subreddits, User Profiles</td> <td>Home Feed, individual Images, Redirects, Submissions, Subreddits, User Profiles</td>
<td><a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a></td> <td><a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a></td>
</tr> </tr>
<tr> <tr>
@ -805,6 +817,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Presentations</td> <td>Presentations</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>SteamGridDB</td>
<td>https://www.steamgriddb.com</td>
<td>Individual Assets, Grids, Heroes, Icons, Logos</td>
<td></td>
</tr>
<tr> <tr>
<td>SubscribeStar</td> <td>SubscribeStar</td>
<td>https://www.subscribestar.com/</td> <td>https://www.subscribestar.com/</td>
@ -829,6 +847,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Galleries</td> <td>Galleries</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>TMOHentai</td>
<td>https://tmohentai.com/</td>
<td>Galleries</td>
<td></td>
</tr>
<tr> <tr>
<td>Toyhouse</td> <td>Toyhouse</td>
<td>https://toyhou.se/</td> <td>https://toyhou.se/</td>
@ -883,6 +907,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Files</td> <td>Files</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>Urlgalleries</td>
<td>https://urlgalleries.net/</td>
<td>Galleries</td>
<td></td>
</tr>
<tr> <tr>
<td>Vipergirls</td> <td>Vipergirls</td>
<td>https://vipergirls.to/</td> <td>https://vipergirls.to/</td>
@ -985,6 +1015,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>individual Images, Tag Searches</td> <td>individual Images, Tag Searches</td>
<td>Supported</td> <td>Supported</td>
</tr> </tr>
<tr>
<td>Zzup</td>
<td>https://zzup.com/</td>
<td>Galleries</td>
<td></td>
</tr>
<tr> <tr>
<td>かべうち</td> <td>かべうち</td>
<td>https://kabe-uchiroom.com/</td> <td>https://kabe-uchiroom.com/</td>
@ -998,6 +1034,44 @@ Consider all sites to be NSFW unless otherwise known.
<td></td> <td></td>
</tr> </tr>
<tr>
<td colspan="4"><strong>Blogger Instances</strong></td>
</tr>
<tr>
<td>Blogspot</td>
<td>https://www.blogger.com/</td>
<td>Blogs, Labels, Posts, Search Results</td>
<td></td>
</tr>
<tr>
<td>MIC MIC IDOL</td>
<td>https://www.micmicidol.club/</td>
<td>Blogs, Labels, Posts, Search Results</td>
<td></td>
</tr>
<tr>
<td colspan="4"><strong>Chevereto Instances</strong></td>
</tr>
<tr>
<td>JPG Fish</td>
<td>https://jpg4.su/</td>
<td>Albums, individual Images, User Profiles</td>
<td></td>
</tr>
<tr>
<td>IMG.Kiwi</td>
<td>https://img.kiwi/</td>
<td>Albums, individual Images, User Profiles</td>
<td></td>
</tr>
<tr>
<td>DeltaPorno</td>
<td>https://gallery.deltaporno.com/</td>
<td>Albums, individual Images, User Profiles</td>
<td></td>
</tr>
<tr> <tr>
<td colspan="4"><strong>Danbooru Instances</strong></td> <td colspan="4"><strong>Danbooru Instances</strong></td>
</tr> </tr>
@ -1137,7 +1211,7 @@ Consider all sites to be NSFW unless otherwise known.
</tr> </tr>
<tr> <tr>
<td>Bbw-chan</td> <td>Bbw-chan</td>
<td>https://bbw-chan.nl/</td> <td>https://bbw-chan.link/</td>
<td>Boards, Threads</td> <td>Boards, Threads</td>
<td></td> <td></td>
</tr> </tr>
@ -1163,6 +1237,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Favorites, Followed Users, Images from Notes, User Profiles</td> <td>Favorites, Followed Users, Images from Notes, User Profiles</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>Misskey.design</td>
<td>https://misskey.design/</td>
<td>Favorites, Followed Users, Images from Notes, User Profiles</td>
<td></td>
</tr>
<tr> <tr>
<td>Lesbian.energy</td> <td>Lesbian.energy</td>
<td>https://lesbian.energy/</td> <td>https://lesbian.energy/</td>
@ -1201,12 +1281,6 @@ Consider all sites to be NSFW unless otherwise known.
<td>Media Files, Replies, Search Results, Tweets</td> <td>Media Files, Replies, Search Results, Tweets</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>Nitter.lacontrevoie.fr</td>
<td>https://nitter.lacontrevoie.fr/</td>
<td>Media Files, Replies, Search Results, Tweets</td>
<td></td>
</tr>
<tr> <tr>
<td>Nitter.1d4.us</td> <td>Nitter.1d4.us</td>
<td>https://nitter.1d4.us/</td> <td>https://nitter.1d4.us/</td>
@ -1254,6 +1328,16 @@ Consider all sites to be NSFW unless otherwise known.
<td></td> <td></td>
</tr> </tr>
<tr>
<td colspan="4"><strong>Postmill Instances</strong></td>
</tr>
<tr>
<td>Raddle</td>
<td>https://raddle.me/</td>
<td>Forums, Home Feed, Individual Posts, Search Results, Tag Searches, User Profiles</td>
<td></td>
</tr>
<tr> <tr>
<td colspan="4"><strong>Reactor Instances</strong></td> <td colspan="4"><strong>Reactor Instances</strong></td>
</tr> </tr>
@ -1285,12 +1369,6 @@ Consider all sites to be NSFW unless otherwise known.
<tr> <tr>
<td colspan="4"><strong>Shimmie2 Instances</strong></td> <td colspan="4"><strong>Shimmie2 Instances</strong></td>
</tr> </tr>
<tr>
<td>meme.museum</td>
<td>https://meme.museum/</td>
<td>Posts, Tag Searches</td>
<td></td>
</tr>
<tr> <tr>
<td>Loudbooru</td> <td>Loudbooru</td>
<td>https://loudbooru.com/</td> <td>https://loudbooru.com/</td>
@ -1299,7 +1377,7 @@ Consider all sites to be NSFW unless otherwise known.
</tr> </tr>
<tr> <tr>
<td>Giantessbooru</td> <td>Giantessbooru</td>
<td>https://giantessbooru.com/</td> <td>https://sizechangebooru.com/</td>
<td>Posts, Tag Searches</td> <td>Posts, Tag Searches</td>
<td></td> <td></td>
</tr> </tr>
@ -1315,6 +1393,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Posts, Tag Searches</td> <td>Posts, Tag Searches</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>Rule34Hentai</td>
<td>https://rule34hentai.net/</td>
<td>Posts, Tag Searches</td>
<td></td>
</tr>
<tr> <tr>
<td colspan="4"><strong>szurubooru Instances</strong></td> <td colspan="4"><strong>szurubooru Instances</strong></td>
@ -1331,6 +1415,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Posts, Tag Searches</td> <td>Posts, Tag Searches</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>Snootbooru</td>
<td>https://snootbooru.com/</td>
<td>Posts, Tag Searches</td>
<td></td>
</tr>
<tr> <tr>
<td colspan="4"><strong>URL Shorteners</strong></td> <td colspan="4"><strong>URL Shorteners</strong></td>
@ -1370,6 +1460,82 @@ Consider all sites to be NSFW unless otherwise known.
<td></td> <td></td>
</tr> </tr>
<tr>
<td colspan="4"><strong>Wikimedia Instances</strong></td>
</tr>
<tr>
<td>Wikipedia</td>
<td>https://www.wikipedia.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wiktionary</td>
<td>https://www.wiktionary.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikiquote</td>
<td>https://www.wikiquote.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikibooks</td>
<td>https://www.wikibooks.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikisource</td>
<td>https://www.wikisource.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikinews</td>
<td>https://www.wikinews.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikiversity</td>
<td>https://www.wikiversity.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikispecies</td>
<td>https://species.wikimedia.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikimedia Commons</td>
<td>https://commons.wikimedia.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>MediaWiki</td>
<td>https://www.mediawiki.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Fandom</td>
<td>https://www.fandom.com/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Super Mario Wiki</td>
<td>https://www.mariowiki.com/</td>
<td>Articles</td>
<td></td>
</tr>
<tr> <tr>
<td colspan="4"><strong>Moebooru and MyImouto</strong></td> <td colspan="4"><strong>Moebooru and MyImouto</strong></td>
</tr> </tr>
@ -1456,16 +1622,6 @@ Consider all sites to be NSFW unless otherwise known.
<td></td> <td></td>
</tr> </tr>
<tr>
<td colspan="4"><strong>FoOlSlide Instances</strong></td>
</tr>
<tr>
<td>PowerManga</td>
<td>https://read.powermanga.org/</td>
<td>Chapters, Manga</td>
<td></td>
</tr>
<tr> <tr>
<td colspan="4"><strong>Mastodon Instances</strong></td> <td colspan="4"><strong>Mastodon Instances</strong></td>
</tr> </tr>

@ -18,19 +18,6 @@ __email__ = "mike_faehrmann@web.de"
__version__ = version.__version__ __version__ = version.__version__
def progress(urls, pformat):
"""Wrapper around urls to output a simple progress indicator"""
if pformat is True:
pformat = "[{current}/{total}] {url}\n"
else:
pformat += "\n"
pinfo = {"total": len(urls)}
for pinfo["current"], pinfo["url"] in enumerate(urls, 1):
output.stderr_write(pformat.format_map(pinfo))
yield pinfo["url"]
def main(): def main():
try: try:
parser = option.build_parser() parser = option.build_parser()
@ -58,7 +45,7 @@ def main():
elif filename.startswith("\\f"): elif filename.startswith("\\f"):
filename = "\f" + filename[2:] filename = "\f" + filename[2:]
config.set((), "filename", filename) config.set((), "filename", filename)
if args.directory: if args.directory is not None:
config.set((), "base-directory", args.directory) config.set((), "base-directory", args.directory)
config.set((), "directory", ()) config.set((), "directory", ())
if args.postprocessors: if args.postprocessors:
@ -128,6 +115,7 @@ def main():
output.configure_logging(args.loglevel) output.configure_logging(args.loglevel)
if args.loglevel >= logging.ERROR: if args.loglevel >= logging.ERROR:
config.set(("output",), "mode", "null") config.set(("output",), "mode", "null")
config.set(("downloader",), "progress", None)
elif args.loglevel <= logging.DEBUG: elif args.loglevel <= logging.DEBUG:
import platform import platform
import requests import requests
@ -224,7 +212,7 @@ def main():
return config.initialize() return config.initialize()
else: else:
if not args.urls and not args.inputfiles: if not args.urls and not args.input_files:
parser.error( parser.error(
"The following arguments are required: URL\n" "The following arguments are required: URL\n"
"Use 'gallery-dl --help' to get a list of all options.") "Use 'gallery-dl --help' to get a list of all options.")
@ -238,50 +226,62 @@ def main():
else: else:
jobtype = args.jobtype or job.DownloadJob jobtype = args.jobtype or job.DownloadJob
urls = args.urls input_manager = InputManager()
if args.inputfiles: input_manager.log = input_log = logging.getLogger("inputfile")
for inputfile in args.inputfiles:
try:
if inputfile == "-":
if sys.stdin:
urls += util.parse_inputfile(sys.stdin, log)
else:
log.warning(
"input file: stdin is not readable")
else:
with open(inputfile, encoding="utf-8") as file:
urls += util.parse_inputfile(file, log)
except OSError as exc:
log.warning("input file: %s", exc)
# unsupported file logging handler # unsupported file logging handler
handler = output.setup_logging_handler( handler = output.setup_logging_handler(
"unsupportedfile", fmt="{message}") "unsupportedfile", fmt="{message}")
if handler: if handler:
ulog = logging.getLogger("unsupported") ulog = job.Job.ulog = logging.getLogger("unsupported")
ulog.addHandler(handler) ulog.addHandler(handler)
ulog.propagate = False ulog.propagate = False
job.Job.ulog = ulog
# error file logging handler
handler = output.setup_logging_handler(
"errorfile", fmt="{message}", mode="a")
if handler:
elog = input_manager.err = logging.getLogger("errorfile")
elog.addHandler(handler)
elog.propagate = False
# collect input URLs
input_manager.add_list(args.urls)
if args.input_files:
for input_file, action in args.input_files:
try:
path = util.expand_path(input_file)
input_manager.add_file(path, action)
except Exception as exc:
input_log.error(exc)
return getattr(exc, "code", 128)
pformat = config.get(("output",), "progress", True) pformat = config.get(("output",), "progress", True)
if pformat and len(urls) > 1 and args.loglevel < logging.ERROR: if pformat and len(input_manager.urls) > 1 and \
urls = progress(urls, pformat) args.loglevel < logging.ERROR:
else: input_manager.progress(pformat)
urls = iter(urls)
# process input URLs
retval = 0 retval = 0
url = next(urls, None) for url in input_manager:
while url is not None:
try: try:
log.debug("Starting %s for '%s'", jobtype.__name__, url) log.debug("Starting %s for '%s'", jobtype.__name__, url)
if isinstance(url, util.ExtendedUrl):
if isinstance(url, ExtendedUrl):
for opts in url.gconfig: for opts in url.gconfig:
config.set(*opts) config.set(*opts)
with config.apply(url.lconfig): with config.apply(url.lconfig):
retval |= jobtype(url.value).run() status = jobtype(url.value).run()
else: else:
retval |= jobtype(url).run() status = jobtype(url).run()
if status:
retval |= status
input_manager.error()
else:
input_manager.success()
except exception.TerminateExtraction: except exception.TerminateExtraction:
pass pass
except exception.RestartExtraction: except exception.RestartExtraction:
@ -290,9 +290,9 @@ def main():
except exception.NoExtractorError: except exception.NoExtractorError:
log.error("Unsupported URL '%s'", url) log.error("Unsupported URL '%s'", url)
retval |= 64 retval |= 64
input_manager.error()
url = next(urls, None) input_manager.next()
return retval return retval
except KeyboardInterrupt: except KeyboardInterrupt:
@ -304,3 +304,226 @@ def main():
if exc.errno != errno.EPIPE: if exc.errno != errno.EPIPE:
raise raise
return 1 return 1
class InputManager():
def __init__(self):
self.urls = []
self.files = ()
self.log = self.err = None
self._url = ""
self._item = None
self._index = 0
self._pformat = None
def add_url(self, url):
self.urls.append(url)
def add_list(self, urls):
self.urls += urls
def add_file(self, path, action=None):
"""Process an input file.
Lines starting with '#' and empty lines will be ignored.
Lines starting with '-' will be interpreted as a key-value pair
separated by an '=', where
'key' is a dot-separated option name and
'value' is a JSON-parsable string.
These configuration options will be applied
while processing the next URL only.
Lines starting with '-G' are the same as above, except these options
will be applied for *all* following URLs, i.e. they are Global.
Everything else will be used as a potential URL.
Example input file:
# setting global options
-G base-directory = "/tmp/"
-G skip = false
# setting local options for the next URL
-filename="spaces_are_optional.jpg"
-skip = true
https://example.org/
# next URL uses default filename and 'skip' is false.
https://example.com/index.htm # comment1
https://example.com/404.htm # comment2
"""
if path == "-" and not action:
try:
lines = sys.stdin.readlines()
except Exception:
raise exception.InputFileError("stdin is not readable")
path = None
else:
try:
with open(path, encoding="utf-8") as fp:
lines = fp.readlines()
except Exception as exc:
raise exception.InputFileError(str(exc))
if self.files:
self.files[path] = lines
else:
self.files = {path: lines}
if action == "c":
action = self._action_comment
elif action == "d":
action = self._action_delete
else:
action = None
gconf = []
lconf = []
indicies = []
strip_comment = None
append = self.urls.append
for n, line in enumerate(lines):
line = line.strip()
if not line or line[0] == "#":
# empty line or comment
continue
elif line[0] == "-":
# config spec
if len(line) >= 2 and line[1] == "G":
conf = gconf
line = line[2:]
else:
conf = lconf
line = line[1:]
if action:
indicies.append(n)
key, sep, value = line.partition("=")
if not sep:
raise exception.InputFileError(
"Invalid KEY=VALUE pair '%s' on line %s in %s",
line, n+1, path)
try:
value = util.json_loads(value.strip())
except ValueError as exc:
self.log.debug("%s: %s", exc.__class__.__name__, exc)
raise exception.InputFileError(
"Unable to parse '%s' on line %s in %s",
value, n+1, path)
key = key.strip().split(".")
conf.append((key[:-1], key[-1], value))
else:
# url
if " #" in line or "\t#" in line:
if strip_comment is None:
import re
strip_comment = re.compile(r"\s+#.*").sub
line = strip_comment("", line)
if gconf or lconf:
url = ExtendedUrl(line, gconf, lconf)
gconf = []
lconf = []
else:
url = line
if action:
indicies.append(n)
append((url, path, action, indicies))
indicies = []
else:
append(url)
def progress(self, pformat=True):
if pformat is True:
pformat = "[{current}/{total}] {url}\n"
else:
pformat += "\n"
self._pformat = pformat.format_map
def next(self):
self._index += 1
def success(self):
if self._item:
self._rewrite()
def error(self):
if self.err:
if self._item:
url, path, action, indicies = self._item
lines = self.files[path]
out = "".join(lines[i] for i in indicies)
if out and out[-1] == "\n":
out = out[:-1]
self._rewrite()
else:
out = str(self._url)
self.err.info(out)
def _rewrite(self):
url, path, action, indicies = self._item
lines = self.files[path]
action(lines, indicies)
try:
with open(path, "w", encoding="utf-8") as fp:
fp.writelines(lines)
except Exception as exc:
self.log.warning(
"Unable to update '%s' (%s: %s)",
path, exc.__class__.__name__, exc)
@staticmethod
def _action_comment(lines, indicies):
for i in indicies:
lines[i] = "# " + lines[i]
@staticmethod
def _action_delete(lines, indicies):
for i in indicies:
lines[i] = ""
def __iter__(self):
self._index = 0
return self
def __next__(self):
try:
url = self.urls[self._index]
except IndexError:
raise StopIteration
if isinstance(url, tuple):
self._item = url
url = url[0]
else:
self._item = None
self._url = url
if self._pformat:
output.stderr_write(self._pformat({
"total" : len(self.urls),
"current": self._index + 1,
"url" : url,
}))
return url
class ExtendedUrl():
"""URL with attached config key-value pairs"""
__slots__ = ("value", "gconfig", "lconfig")
def __init__(self, url, gconf, lconf):
self.value = url
self.gconfig = gconf
self.lconfig = lconf
def __str__(self):
return self.value

@ -9,10 +9,10 @@
import sys import sys
if __package__ is None and not hasattr(sys, "frozen"): if not __package__ and not hasattr(sys, "frozen"):
import os.path import os.path
path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) path = os.path.realpath(os.path.abspath(__file__))
sys.path.insert(0, os.path.realpath(path)) sys.path.insert(0, os.path.dirname(os.path.dirname(path)))
import gallery_dl import gallery_dl

@ -47,7 +47,7 @@ def load_cookies(cookiejar, browser_specification):
def load_cookies_firefox(cookiejar, profile=None, container=None, domain=None): def load_cookies_firefox(cookiejar, profile=None, container=None, domain=None):
path, container_id = _firefox_cookies_database(profile, container) path, container_id = _firefox_cookies_database(profile, container)
with DatabaseCopy(path) as db: with DatabaseConnection(path) as db:
sql = ("SELECT name, value, host, path, isSecure, expiry " sql = ("SELECT name, value, host, path, isSecure, expiry "
"FROM moz_cookies") "FROM moz_cookies")
@ -100,7 +100,7 @@ def load_cookies_chrome(cookiejar, browser_name, profile=None,
path = _chrome_cookies_database(profile, config) path = _chrome_cookies_database(profile, config)
_log_debug("Extracting cookies from %s", path) _log_debug("Extracting cookies from %s", path)
with DatabaseCopy(path) as db: with DatabaseConnection(path) as db:
db.text_factory = bytes db.text_factory = bytes
decryptor = get_cookie_decryptor( decryptor = get_cookie_decryptor(
config["directory"], config["keyring"], keyring) config["directory"], config["keyring"], keyring)
@ -215,9 +215,11 @@ def _firefox_cookies_database(profile=None, container=None):
def _firefox_browser_directory(): def _firefox_browser_directory():
if sys.platform in ("win32", "cygwin"): if sys.platform in ("win32", "cygwin"):
return os.path.expandvars(r"%APPDATA%\Mozilla\Firefox\Profiles") return os.path.expandvars(
r"%APPDATA%\Mozilla\Firefox\Profiles")
if sys.platform == "darwin": if sys.platform == "darwin":
return os.path.expanduser("~/Library/Application Support/Firefox") return os.path.expanduser(
"~/Library/Application Support/Firefox/Profiles")
return os.path.expanduser("~/.mozilla/firefox") return os.path.expanduser("~/.mozilla/firefox")
@ -814,7 +816,7 @@ class DataParser:
self.skip_to(len(self._data), description) self.skip_to(len(self._data), description)
class DatabaseCopy(): class DatabaseConnection():
def __init__(self, path): def __init__(self, path):
self.path = path self.path = path
@ -822,13 +824,27 @@ class DatabaseCopy():
self.directory = None self.directory = None
def __enter__(self): def __enter__(self):
try:
# https://www.sqlite.org/uri.html#the_uri_path
path = self.path.replace("?", "%3f").replace("#", "%23")
if util.WINDOWS:
path = "/" + os.path.abspath(path)
uri = "file:{}?mode=ro&immutable=1".format(path)
self.database = sqlite3.connect(
uri, uri=True, isolation_level=None, check_same_thread=False)
return self.database
except Exception as exc:
_log_debug("Falling back to temporary database copy (%s: %s)",
exc.__class__.__name__, exc)
try: try:
self.directory = tempfile.TemporaryDirectory(prefix="gallery-dl-") self.directory = tempfile.TemporaryDirectory(prefix="gallery-dl-")
path_copy = os.path.join(self.directory.name, "copy.sqlite") path_copy = os.path.join(self.directory.name, "copy.sqlite")
shutil.copyfile(self.path, path_copy) shutil.copyfile(self.path, path_copy)
self.database = db = sqlite3.connect( self.database = sqlite3.connect(
path_copy, isolation_level=None, check_same_thread=False) path_copy, isolation_level=None, check_same_thread=False)
return db return self.database
except BaseException: except BaseException:
if self.directory: if self.directory:
self.directory.cleanup() self.directory.cleanup()
@ -836,7 +852,8 @@ class DatabaseCopy():
def __exit__(self, exc, value, tb): def __exit__(self, exc, value, tb):
self.database.close() self.database.close()
self.directory.cleanup() if self.directory:
self.directory.cleanup()
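For reference, a self-contained sketch of the read-only connection that the renamed DatabaseConnection now attempts before falling back to a temporary copy; the path handling mirrors the code above, with os.name standing in for util.WINDOWS:

    import os
    import sqlite3

    def connect_readonly(path):
        # escape characters that have special meaning in SQLite URIs
        path = path.replace("?", "%3f").replace("#", "%23")
        if os.name == "nt":  # assumption: equivalent to util.WINDOWS
            path = "/" + os.path.abspath(path)
        uri = "file:{}?mode=ro&immutable=1".format(path)
        # immutable=1 allows reading a database a running browser keeps locked
        return sqlite3.connect(uri, uri=True, isolation_level=None,
                               check_same_thread=False)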
def Popen_communicate(*args): def Popen_communicate(*args):

@ -200,13 +200,15 @@ class HttpDownloader(DownloaderBase):
self.log.warning( self.log.warning(
"File size smaller than allowed minimum (%s < %s)", "File size smaller than allowed minimum (%s < %s)",
size, self.minsize) size, self.minsize)
return False pathfmt.temppath = ""
return True
if self.maxsize and size > self.maxsize: if self.maxsize and size > self.maxsize:
self.release_conn(response) self.release_conn(response)
self.log.warning( self.log.warning(
"File size larger than allowed maximum (%s > %s)", "File size larger than allowed maximum (%s > %s)",
size, self.maxsize) size, self.maxsize)
return False pathfmt.temppath = ""
return True
build_path = False build_path = False

@ -21,6 +21,7 @@ Exception
| +-- FilenameFormatError | +-- FilenameFormatError
| +-- DirectoryFormatError | +-- DirectoryFormatError
+-- FilterError +-- FilterError
+-- InputFileError
+-- NoExtractorError +-- NoExtractorError
+-- StopExtraction +-- StopExtraction
+-- TerminateExtraction +-- TerminateExtraction
@ -99,6 +100,15 @@ class FilterError(GalleryDLException):
code = 32 code = 32
class InputFileError(GalleryDLException):
"""Error when parsing input file"""
code = 32
def __init__(self, message, *args):
GalleryDLException.__init__(
self, message % args if args else message)
class NoExtractorError(GalleryDLException): class NoExtractorError(GalleryDLException):
"""No extractor can handle the given URL""" """No extractor can handle the given URL"""
code = 64 code = 64

@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://2ch.hk/"""
from .common import Extractor, Message
from .. import text, util
class _2chThreadExtractor(Extractor):
"""Extractor for 2ch threads"""
category = "2ch"
subcategory = "thread"
root = "https://2ch.hk"
directory_fmt = ("{category}", "{board}", "{thread} {title}")
filename_fmt = "{tim}{filename:? //}.{extension}"
archive_fmt = "{board}_{thread}_{tim}"
pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"
example = "https://2ch.hk/a/res/12345.html"
def __init__(self, match):
Extractor.__init__(self, match)
self.board, self.thread = match.groups()
def items(self):
url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
posts = self.request(url).json()["threads"][0]["posts"]
op = posts[0]
title = op.get("subject") or text.remove_html(op["comment"])
thread = {
"board" : self.board,
"thread": self.thread,
"title" : text.unescape(title)[:50],
}
yield Message.Directory, thread
for post in posts:
files = post.get("files")
if files:
post["post_name"] = post["name"]
post["date"] = text.parse_timestamp(post["timestamp"])
del post["files"]
del post["name"]
for file in files:
file.update(thread)
file.update(post)
file["filename"] = file["fullname"].rpartition(".")[0]
file["tim"], _, file["extension"] = \
file["name"].rpartition(".")
yield Message.Url, self.root + file["path"], file
class _2chBoardExtractor(Extractor):
"""Extractor for 2ch boards"""
category = "2ch"
subcategory = "board"
root = "https://2ch.hk"
pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"
example = "https://2ch.hk/a/"
def __init__(self, match):
Extractor.__init__(self, match)
self.board = match.group(1)
def items(self):
# index page
url = "{}/{}/index.json".format(self.root, self.board)
index = self.request(url).json()
index["_extractor"] = _2chThreadExtractor
for thread in index["threads"]:
url = "{}/{}/res/{}.html".format(
self.root, self.board, thread["thread_num"])
yield Message.Queue, url, index
# pages 1..n
for n in util.advance(index["pages"], 1):
url = "{}/{}/{}.json".format(self.root, self.board, n)
page = self.request(url).json()
page["_extractor"] = _2chThreadExtractor
for thread in page["threads"]:
url = "{}/{}/res/{}.html".format(
self.root, self.board, thread["thread_num"])
yield Message.Queue, url, page
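As a quick usage sketch, either extractor can be exercised through the regular job interface; the thread URL below is the placeholder from the example attribute above:

    from gallery_dl import job

    # queues every file of the thread for download with the defaults above
    job.DownloadJob("https://2ch.hk/a/res/12345.html").run()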

@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://4archive.org/"""
from .common import Extractor, Message
from .. import text, util
class _4archiveThreadExtractor(Extractor):
"""Extractor for 4archive threads"""
category = "4archive"
subcategory = "thread"
directory_fmt = ("{category}", "{board}", "{thread} {title}")
filename_fmt = "{no} {filename}.{extension}"
archive_fmt = "{board}_{thread}_{no}"
root = "https://4archive.org"
referer = False
pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)/thread/(\d+)"
example = "https://4archive.org/board/a/thread/12345/"
def __init__(self, match):
Extractor.__init__(self, match)
self.board, self.thread = match.groups()
def items(self):
url = "{}/board/{}/thread/{}".format(
self.root, self.board, self.thread)
page = self.request(url).text
data = self.metadata(page)
posts = self.posts(page)
if not data["title"]:
data["title"] = posts[0]["com"][:50]
for post in posts:
post.update(data)
post["time"] = int(util.datetime_to_timestamp(post["date"]))
yield Message.Directory, post
if "url" in post:
yield Message.Url, post["url"], text.nameext_from_url(
post["filename"], post)
def metadata(self, page):
return {
"board" : self.board,
"thread": text.parse_int(self.thread),
"title" : text.unescape(text.extr(
page, 'class="subject">', "</span>"))
}
def posts(self, page):
return [
self.parse(post)
for post in page.split('class="postContainer')[1:]
]
@staticmethod
def parse(post):
extr = text.extract_from(post)
data = {
"name": extr('class="name">', "</span>"),
"date": text.parse_datetime(
extr('class="dateTime postNum" >', "<").strip(),
"%Y-%m-%d %H:%M:%S"),
"no" : text.parse_int(extr('href="#p', '"')),
}
if 'class="file"' in post:
extr('class="fileText"', ">File: <a")
data.update({
"url" : extr('href="', '"'),
"filename": extr(
'rel="noreferrer noopener"', "</a>").strip()[1:],
"size" : text.parse_bytes(extr(" (", ", ")[:-1]),
"width" : text.parse_int(extr("", "x")),
"height" : text.parse_int(extr("", "px")),
})
extr("<blockquote ", "")
data["com"] = text.unescape(text.remove_html(
extr(">", "</blockquote>")))
return data
class _4archiveBoardExtractor(Extractor):
"""Extractor for 4archive boards"""
category = "4archive"
subcategory = "board"
root = "https://4archive.org"
pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)(?:/(\d+))?/?$"
example = "https://4archive.org/board/a/"
def __init__(self, match):
Extractor.__init__(self, match)
self.board = match.group(1)
self.num = text.parse_int(match.group(2), 1)
def items(self):
data = {"_extractor": _4archiveThreadExtractor}
while True:
url = "{}/board/{}/{}".format(self.root, self.board, self.num)
page = self.request(url).text
if 'class="thread"' not in page:
return
for thread in text.extract_iter(page, 'class="thread" id="t', '"'):
url = "{}/board/{}/thread/{}".format(
self.root, self.board, thread)
yield Message.Queue, url, data
self.num += 1

@ -20,6 +20,7 @@ class _4chanarchivesThreadExtractor(Extractor):
directory_fmt = ("{category}", "{board}", "{thread} - {title}") directory_fmt = ("{category}", "{board}", "{thread} - {title}")
filename_fmt = "{no}-{filename}.{extension}" filename_fmt = "{no}-{filename}.{extension}"
archive_fmt = "{board}_{thread}_{no}" archive_fmt = "{board}_{thread}_{no}"
referer = False
pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)/thread/(\d+)" pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)/thread/(\d+)"
example = "https://4chanarchives.com/board/a/thread/12345/" example = "https://4chanarchives.com/board/a/thread/12345/"

@ -10,11 +10,13 @@ import sys
import re import re
modules = [ modules = [
"2ch",
"2chan", "2chan",
"2chen", "2chen",
"35photo", "35photo",
"3dbooru", "3dbooru",
"4chan", "4chan",
"4archive",
"4chanarchives", "4chanarchives",
"500px", "500px",
"8chan", "8chan",
@ -23,11 +25,13 @@ modules = [
"architizer", "architizer",
"artstation", "artstation",
"aryion", "aryion",
"batoto",
"bbc", "bbc",
"behance", "behance",
"blogger", "blogger",
"bunkr", "bunkr",
"catbox", "catbox",
"chevereto",
"comicvine", "comicvine",
"cyberdrop", "cyberdrop",
"danbooru", "danbooru",
@ -50,7 +54,7 @@ modules = [
"gelbooru_v01", "gelbooru_v01",
"gelbooru_v02", "gelbooru_v02",
"gofile", "gofile",
"hbrowse", "hatenablog",
"hentai2read", "hentai2read",
"hentaicosplays", "hentaicosplays",
"hentaifoundry", "hentaifoundry",
@ -73,7 +77,6 @@ modules = [
"issuu", "issuu",
"itaku", "itaku",
"itchio", "itchio",
"jpgfish",
"jschan", "jschan",
"kabeuchi", "kabeuchi",
"keenspot", "keenspot",
@ -106,7 +109,6 @@ modules = [
"nitter", "nitter",
"nozomi", "nozomi",
"nsfwalbum", "nsfwalbum",
"nudecollect",
"paheal", "paheal",
"patreon", "patreon",
"philomena", "philomena",
@ -116,12 +118,15 @@ modules = [
"piczel", "piczel",
"pillowfort", "pillowfort",
"pinterest", "pinterest",
"pixeldrain",
"pixiv", "pixiv",
"pixnet", "pixnet",
"plurk", "plurk",
"poipiku", "poipiku",
"poringa",
"pornhub", "pornhub",
"pornpics", "pornpics",
"postmill",
"pururin", "pururin",
"rawkuma", "rawkuma",
"reactor", "reactor",
@ -142,17 +147,20 @@ modules = [
"smugmug", "smugmug",
"soundgasm", "soundgasm",
"speakerdeck", "speakerdeck",
"steamgriddb",
"subscribestar", "subscribestar",
"szurubooru", "szurubooru",
"tapas", "tapas",
"tcbscans", "tcbscans",
"telegraph", "telegraph",
"tmohentai",
"toyhouse", "toyhouse",
"tsumino", "tsumino",
"tumblr", "tumblr",
"tumblrgallery", "tumblrgallery",
"twibooru", "twibooru",
"twitter", "twitter",
"urlgalleries",
"unsplash", "unsplash",
"uploadir", "uploadir",
"urlshortener", "urlshortener",
@ -170,9 +178,11 @@ modules = [
"weibo", "weibo",
"wikiart", "wikiart",
"wikifeet", "wikifeet",
"wikimedia",
"xhamster", "xhamster",
"xvideos", "xvideos",
"zerochan", "zerochan",
"zzup",
"booru", "booru",
"moebooru", "moebooru",
"foolfuuka", "foolfuuka",

@ -40,7 +40,7 @@ class AryionExtractor(Extractor):
if username: if username:
self.cookies_update(self._login_impl(username, password)) self.cookies_update(self._login_impl(username, password))
@cache(maxage=14*24*3600, keyarg=1) @cache(maxage=14*86400, keyarg=1)
def _login_impl(self, username, password): def _login_impl(self, username, password):
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)

@ -0,0 +1,123 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://bato.to/"""
from .common import Extractor, ChapterExtractor, MangaExtractor
from .. import text, exception
import re
BASE_PATTERN = (r"(?:https?://)?(?:"
r"(?:ba|d|h|m|w)to\.to|"
r"(?:(?:manga|read)toto|batocomic|[xz]bato)\.(?:com|net|org)|"
r"comiko\.(?:net|org)|"
r"bat(?:otoo|o?two)\.com)")
class BatotoBase():
"""Base class for batoto extractors"""
category = "batoto"
root = "https://bato.to"
def request(self, url, **kwargs):
kwargs["encoding"] = "utf-8"
return Extractor.request(self, url, **kwargs)
class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
"""Extractor for bato.to manga chapters"""
pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)"
example = "https://bato.to/title/12345-MANGA/54321"
def __init__(self, match):
self.root = text.root_from_url(match.group(0))
self.chapter_id = match.group(1)
url = "{}/title/0/{}".format(self.root, self.chapter_id)
ChapterExtractor.__init__(self, match, url)
def metadata(self, page):
extr = text.extract_from(page)
manga, info, _ = extr("<title>", "<").rsplit(" - ", 3)
manga_id = text.extr(
extr('rel="canonical" href="', '"'), "/title/", "/")
match = re.match(
r"(?:Volume\s+(\d+) )?"
r"\w+\s+(\d+)(.*)", info)
if match:
volume, chapter, minor = match.groups()
title = text.remove_html(extr(
"selected>", "</option")).partition(" : ")[2]
else:
volume = chapter = 0
minor = ""
title = info
return {
"manga" : text.unescape(manga),
"manga_id" : text.parse_int(manga_id),
"title" : text.unescape(title),
"volume" : text.parse_int(volume),
"chapter" : text.parse_int(chapter),
"chapter_minor": minor,
"chapter_id" : text.parse_int(self.chapter_id),
"date" : text.parse_timestamp(extr(' time="', '"')[:-3]),
}
def images(self, page):
images_container = text.extr(page, 'pageOpts', ':[0,0]}"')
images_container = text.unescape(images_container)
return [
(url, None)
for url in text.extract_iter(images_container, r"\"", r"\"")
]
class BatotoMangaExtractor(BatotoBase, MangaExtractor):
"""Extractor for bato.to manga"""
reverse = False
chapterclass = BatotoChapterExtractor
pattern = (BASE_PATTERN +
r"/(?:title/(\d+)[^/?#]*|series/(\d+)(?:/[^/?#]*)?)/?$")
example = "https://bato.to/title/12345-MANGA/"
def __init__(self, match):
self.root = text.root_from_url(match.group(0))
self.manga_id = match.group(1) or match.group(2)
url = "{}/title/{}".format(self.root, self.manga_id)
MangaExtractor.__init__(self, match, url)
def chapters(self, page):
extr = text.extract_from(page)
warning = extr(' class="alert alert-warning">', "</div><")
if warning:
raise exception.StopExtraction("'%s'", text.remove_html(warning))
data = {
"manga_id": text.parse_int(self.manga_id),
"manga" : text.unescape(extr(
"<title>", "<").rpartition(" - ")[0]),
}
extr('<div data-hk="0-0-0-0"', "")
results = []
while True:
href = extr('<a href="/title/', '"')
if not href:
break
chapter = href.rpartition("-ch_")[2]
chapter, sep, minor = chapter.partition(".")
data["chapter"] = text.parse_int(chapter)
data["chapter_minor"] = sep + minor
data["date"] = text.parse_datetime(
extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ")
url = "{}/title/{}".format(self.root, href)
results.append((url, data.copy()))
return results

@ -89,6 +89,17 @@ class BehanceGalleryExtractor(BehanceExtractor):
BehanceExtractor.__init__(self, match) BehanceExtractor.__init__(self, match)
self.gallery_id = match.group(1) self.gallery_id = match.group(1)
def _init(self):
BehanceExtractor._init(self)
modules = self.config("modules")
if modules:
if isinstance(modules, str):
modules = modules.split(",")
self.modules = set(modules)
else:
self.modules = {"image", "video", "mediacollection", "embed"}
def items(self): def items(self):
data = self.get_gallery_data() data = self.get_gallery_data()
imgs = self.get_images(data) imgs = self.get_images(data)
@ -97,7 +108,8 @@ class BehanceGalleryExtractor(BehanceExtractor):
yield Message.Directory, data yield Message.Directory, data
for data["num"], (url, module) in enumerate(imgs, 1): for data["num"], (url, module) in enumerate(imgs, 1):
data["module"] = module data["module"] = module
data["extension"] = text.ext_from_url(url) data["extension"] = (module.get("extension") or
text.ext_from_url(url))
yield Message.Url, url, data yield Message.Url, url, data
def get_gallery_data(self): def get_gallery_data(self):
@ -133,13 +145,17 @@ class BehanceGalleryExtractor(BehanceExtractor):
append = result.append append = result.append
for module in data["modules"]: for module in data["modules"]:
mtype = module["__typename"] mtype = module["__typename"][:-6].lower()
if mtype == "ImageModule": if mtype not in self.modules:
self.log.debug("Skipping '%s' module", mtype)
continue
if mtype == "image":
url = module["imageSizes"]["size_original"]["url"] url = module["imageSizes"]["size_original"]["url"]
append((url, module)) append((url, module))
elif mtype == "VideoModule": elif mtype == "video":
try: try:
renditions = module["videoData"]["renditions"] renditions = module["videoData"]["renditions"]
except Exception: except Exception:
@ -158,7 +174,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
append((url, module)) append((url, module))
elif mtype == "MediaCollectionModule": elif mtype == "mediacollection":
for component in module["components"]: for component in module["components"]:
for size in component["imageSizes"].values(): for size in component["imageSizes"].values():
if size: if size:
@ -167,10 +183,16 @@ class BehanceGalleryExtractor(BehanceExtractor):
append(("/".join(parts), module)) append(("/".join(parts), module))
break break
elif mtype == "EmbedModule": elif mtype == "embed":
embed = module.get("originalEmbed") or module.get("fluidEmbed") embed = module.get("originalEmbed") or module.get("fluidEmbed")
if embed: if embed:
append(("ytdl:" + text.extr(embed, 'src="', '"'), module)) embed = text.unescape(text.extr(embed, 'src="', '"'))
module["extension"] = "mp4"
append(("ytdl:" + embed, module))
elif mtype == "text":
module["extension"] = "txt"
append(("text:" + module["text"], module))
return result return result

@ -8,30 +8,22 @@
"""Extractors for Blogger blogs""" """Extractors for Blogger blogs"""
from .common import Extractor, Message from .common import BaseExtractor, Message
from .. import text, util from .. import text, util
import re import re
BASE_PATTERN = (
r"(?:blogger:(?:https?://)?([^/]+)|"
r"(?:https?://)?([\w-]+\.blogspot\.com))")
class BloggerExtractor(BaseExtractor):
class BloggerExtractor(Extractor):
"""Base class for blogger extractors""" """Base class for blogger extractors"""
category = "blogger" basecategory = "blogger"
directory_fmt = ("{category}", "{blog[name]}", directory_fmt = ("blogger", "{blog[name]}",
"{post[date]:%Y-%m-%d} {post[title]}") "{post[date]:%Y-%m-%d} {post[title]}")
filename_fmt = "{num:>03}.{extension}" filename_fmt = "{num:>03}.{extension}"
archive_fmt = "{post[id]}_{num}" archive_fmt = "{post[id]}_{num}"
root = "https://www.blogger.com"
def __init__(self, match):
Extractor.__init__(self, match)
self.blog = match.group(1) or match.group(2)
def _init(self): def _init(self):
self.api = BloggerAPI(self) self.api = BloggerAPI(self)
self.blog = self.root.rpartition("/")[2]
self.videos = self.config("videos", True) self.videos = self.config("videos", True)
def items(self): def items(self):
@ -45,7 +37,7 @@ class BloggerExtractor(Extractor):
findall_image = re.compile( findall_image = re.compile(
r'src="(https?://(?:' r'src="(https?://(?:'
r'blogger\.googleusercontent\.com/img|' r'blogger\.googleusercontent\.com/img|'
r'lh\d+\.googleusercontent\.com/|' r'lh\d+(?:-\w+)?\.googleusercontent\.com|'
r'\d+\.bp\.blogspot\.com)/[^"]+)').findall r'\d+\.bp\.blogspot\.com)/[^"]+)').findall
findall_video = re.compile( findall_video = re.compile(
r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
@ -92,6 +84,18 @@ class BloggerExtractor(Extractor):
"""Return additional metadata""" """Return additional metadata"""
BASE_PATTERN = BloggerExtractor.update({
"blogspot": {
"root": None,
"pattern": r"[\w-]+\.blogspot\.com",
},
"micmicidol": {
"root": "https://www.micmicidol.club",
"pattern": r"(?:www\.)?micmicidol\.club",
},
})
class BloggerPostExtractor(BloggerExtractor): class BloggerPostExtractor(BloggerExtractor):
"""Extractor for a single blog post""" """Extractor for a single blog post"""
subcategory = "post" subcategory = "post"
@ -100,7 +104,7 @@ class BloggerPostExtractor(BloggerExtractor):
def __init__(self, match): def __init__(self, match):
BloggerExtractor.__init__(self, match) BloggerExtractor.__init__(self, match)
self.path = match.group(3) self.path = match.group(match.lastindex)
def posts(self, blog): def posts(self, blog):
return (self.api.post_by_path(blog["id"], self.path),) return (self.api.post_by_path(blog["id"], self.path),)
@ -124,7 +128,7 @@ class BloggerSearchExtractor(BloggerExtractor):
def __init__(self, match): def __init__(self, match):
BloggerExtractor.__init__(self, match) BloggerExtractor.__init__(self, match)
self.query = text.unquote(match.group(3)) self.query = text.unquote(match.group(match.lastindex))
def posts(self, blog): def posts(self, blog):
return self.api.blog_search(blog["id"], self.query) return self.api.blog_search(blog["id"], self.query)
@ -141,7 +145,7 @@ class BloggerLabelExtractor(BloggerExtractor):
def __init__(self, match): def __init__(self, match):
BloggerExtractor.__init__(self, match) BloggerExtractor.__init__(self, match)
self.label = text.unquote(match.group(3)) self.label = text.unquote(match.group(match.lastindex))
def posts(self, blog): def posts(self, blog):
return self.api.blog_posts(blog["id"], self.label) return self.api.blog_posts(blog["id"], self.label)

@ -6,12 +6,14 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
"""Extractors for https://bunkrr.su/""" """Extractors for https://bunkr.sk/"""
from .lolisafe import LolisafeAlbumExtractor from .lolisafe import LolisafeAlbumExtractor
from .. import text from .. import text
from urllib.parse import urlsplit, urlunsplit from urllib.parse import urlsplit, urlunsplit
BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:sk|[rs]u|la|is|to)"
MEDIA_DOMAIN_OVERRIDES = { MEDIA_DOMAIN_OVERRIDES = {
"cdn9.bunkr.ru" : "c9.bunkr.ru", "cdn9.bunkr.ru" : "c9.bunkr.ru",
"cdn12.bunkr.ru": "media-files12.bunkr.la", "cdn12.bunkr.ru": "media-files12.bunkr.la",
@ -25,11 +27,11 @@ CDN_HOSTED_EXTENSIONS = (
class BunkrAlbumExtractor(LolisafeAlbumExtractor): class BunkrAlbumExtractor(LolisafeAlbumExtractor):
"""Extractor for bunkrr.su albums""" """Extractor for bunkr.sk albums"""
category = "bunkr" category = "bunkr"
root = "https://bunkrr.su" root = "https://bunkr.sk"
pattern = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)/a/([^/?#]+)" pattern = BASE_PATTERN + r"/a/([^/?#]+)"
example = "https://bunkrr.su/a/ID" example = "https://bunkr.sk/a/ID"
def fetch_album(self, album_id): def fetch_album(self, album_id):
# album metadata # album metadata
@ -38,36 +40,67 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
page, "<h1", "</div>").partition(">")[2]) page, "<h1", "</div>").partition(">")[2])
count, _, size = info[1].split(None, 2) count, _, size = info[1].split(None, 2)
# files
cdn = None
files = []
append = files.append
pos = page.index('class="grid-images') pos = page.index('class="grid-images')
for url in text.extract_iter(page, '<a href="', '"', pos): urls = list(text.extract_iter(page, '<a href="', '"', pos))
if url.startswith("/"):
if not cdn: return self._extract_files(urls), {
# fetch cdn root from download page
durl = "{}/d/{}".format(self.root, url[3:])
cdn = text.extr(self.request(
durl).text, 'link.href = "', '"')
cdn = cdn[:cdn.index("/", 8)]
url = cdn + url[2:]
url = text.unescape(url)
if url.lower().endswith(CDN_HOSTED_EXTENSIONS):
scheme, domain, path, query, fragment = urlsplit(url)
if domain in MEDIA_DOMAIN_OVERRIDES:
domain = MEDIA_DOMAIN_OVERRIDES[domain]
else:
domain = domain.replace("cdn", "media-files", 1)
url = urlunsplit((scheme, domain, path, query, fragment))
append({"file": url})
return files, {
"album_id" : self.album_id, "album_id" : self.album_id,
"album_name" : text.unescape(info[0]), "album_name" : text.unescape(info[0]),
"album_size" : size[1:-1], "album_size" : size[1:-1],
"description": text.unescape(info[2]) if len(info) > 2 else "", "description": text.unescape(info[2]) if len(info) > 2 else "",
"count" : len(files), "count" : len(urls),
}
def _extract_files(self, urls):
for url in urls:
if url.startswith("/"):
try:
url = self._extract_file(text.unescape(url))
except Exception as exc:
self.log.error("%s: %s", exc.__class__.__name__, exc)
continue
else:
if url.lower().endswith(CDN_HOSTED_EXTENSIONS):
scheme, domain, path, query, fragment = urlsplit(url)
if domain in MEDIA_DOMAIN_OVERRIDES:
domain = MEDIA_DOMAIN_OVERRIDES[domain]
else:
domain = domain.replace("cdn", "media-files", 1)
url = urlunsplit((scheme, domain, path, query, fragment))
yield {"file": text.unescape(url)}
def _extract_file(self, path):
page = self.request(self.root + path).text
if path[1] == "v":
url = text.extr(page, '<source src="', '"')
else:
url = text.extr(page, '<img src="', '"')
if not url:
url = text.rextract(
page, ' href="', '"', page.rindex("Download"))[0]
return url
class BunkrMediaExtractor(BunkrAlbumExtractor):
"""Extractor for bunkr.sk media links"""
subcategory = "media"
directory_fmt = ("{category}",)
pattern = BASE_PATTERN + r"/[vid]/([^/?#]+)"
example = "https://bunkr.sk/v/FILENAME"
def fetch_album(self, album_id):
try:
url = self._extract_file(urlsplit(self.url).path)
except Exception as exc:
self.log.error("%s: %s", exc.__class__.__name__, exc)
return (), {}
return ({"file": text.unescape(url)},), {
"album_id" : "",
"album_name" : "",
"album_size" : -1,
"description": "",
"count" : 1,
} }
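The host rewrite that used to live inline is kept in _extract_files(): known hosts are mapped through MEDIA_DOMAIN_OVERRIDES, anything else swaps its "cdn" prefix for "media-files". A standalone sketch of just that step (the override table is abbreviated here for illustration):

from urllib.parse import urlsplit, urlunsplit

# Abbreviated override table, for illustration only.
MEDIA_DOMAIN_OVERRIDES = {"cdn9.bunkr.ru": "c9.bunkr.ru"}

def rewrite_media_host(url):
    scheme, domain, path, query, fragment = urlsplit(url)
    if domain in MEDIA_DOMAIN_OVERRIDES:
        domain = MEDIA_DOMAIN_OVERRIDES[domain]
    else:
        domain = domain.replace("cdn", "media-files", 1)
    return urlunsplit((scheme, domain, path, query, fragment))

print(rewrite_media_host("https://cdn12.bunkr.ru/video.mp4"))
# -> https://media-files12.bunkr.ru/video.mp4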

@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for Chevereto galleries"""
from .common import BaseExtractor, Message
from .. import text
class CheveretoExtractor(BaseExtractor):
"""Base class for chevereto extractors"""
basecategory = "chevereto"
directory_fmt = ("{category}", "{user}", "{album}",)
archive_fmt = "{id}"
def __init__(self, match):
BaseExtractor.__init__(self, match)
self.path = match.group(match.lastindex)
def _pagination(self, url):
while url:
page = self.request(url).text
for item in text.extract_iter(
page, '<div class="list-item-image ', 'image-container'):
yield text.extr(item, '<a href="', '"')
url = text.extr(page, '<a data-pagination="next" href="', '" ><')
BASE_PATTERN = CheveretoExtractor.update({
"jpgfish": {
"root": "https://jpg4.su",
"pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
},
"imgkiwi": {
"root": "https://img.kiwi",
"pattern": r"img\.kiwi",
},
"deltaporno": {
"root": "https://gallery.deltaporno.com",
"pattern": r"gallery\.deltaporno\.com",
},
})
class CheveretoImageExtractor(CheveretoExtractor):
"""Extractor for chevereto Images"""
subcategory = "image"
pattern = BASE_PATTERN + r"(/im(?:g|age)/[^/?#]+)"
example = "https://jpg2.su/img/TITLE.ID"
def items(self):
url = self.root + self.path
extr = text.extract_from(self.request(url).text)
image = {
"id" : self.path.rpartition(".")[2],
"url" : extr('<meta property="og:image" content="', '"'),
"album": text.extr(extr("Added to <a", "/a>"), ">", "<"),
"user" : extr('username: "', '"'),
}
text.nameext_from_url(image["url"], image)
yield Message.Directory, image
yield Message.Url, image["url"], image
class CheveretoAlbumExtractor(CheveretoExtractor):
"""Extractor for chevereto Albums"""
subcategory = "album"
pattern = BASE_PATTERN + r"(/a(?:lbum)?/[^/?#]+(?:/sub)?)"
example = "https://jpg2.su/album/TITLE.ID"
def items(self):
url = self.root + self.path
data = {"_extractor": CheveretoImageExtractor}
if self.path.endswith("/sub"):
albums = self._pagination(url)
else:
albums = (url,)
for album in albums:
for image in self._pagination(album):
yield Message.Queue, image, data
class CheveretoUserExtractor(CheveretoExtractor):
"""Extractor for chevereto Users"""
subcategory = "user"
pattern = BASE_PATTERN + r"(/(?!img|image|a(?:lbum)?)[^/?#]+(?:/albums)?)"
example = "https://jpg2.su/USER"
def items(self):
url = self.root + self.path
if self.path.endswith("/albums"):
data = {"_extractor": CheveretoAlbumExtractor}
else:
data = {"_extractor": CheveretoImageExtractor}
for url in self._pagination(url):
yield Message.Queue, url, data

@@ -32,13 +32,15 @@ class Extractor():
     directory_fmt = ("{category}",)
     filename_fmt = "{filename}.{extension}"
     archive_fmt = ""
-    root = ""
     cookies_domain = ""
+    referer = True
+    ciphers = None
+    tls12 = True
     browser = None
+    root = ""
     request_interval = 0.0
     request_interval_min = 0.0
     request_timestamp = 0.0
-    tls12 = True
def __init__(self, match): def __init__(self, match):
self.log = logging.getLogger(self.category) self.log = logging.getLogger(self.category)
@ -76,6 +78,12 @@ class Extractor():
def config(self, key, default=None): def config(self, key, default=None):
return config.interpolate(self._cfgpath, key, default) return config.interpolate(self._cfgpath, key, default)
def config2(self, key, key2, default=None, sentinel=util.SENTINEL):
value = self.config(key, sentinel)
if value is not sentinel:
return value
return self.config(key2, default)
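The new config2() helper checks a primary key first and falls back to a second key only when the primary was never set, so an explicit None or False still wins. A minimal standalone sketch of the same sentinel-based fallback (plain dict instead of the real config module):

_SENTINEL = object()

def config2(settings, key, key2, default=None):
    # Return settings[key] if it was set at all (even to None or False);
    # only fall back to key2, and then to the default, when key is unset.
    value = settings.get(key, _SENTINEL)
    if value is not _SENTINEL:
        return value
    return settings.get(key2, default)

print(config2({"username": "alice"}, "user", "username"))   # alice
print(config2({"user": None}, "user", "username", "anon"))  # None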
def config_deprecated(self, key, deprecated, default=None, def config_deprecated(self, key, deprecated, default=None,
sentinel=util.SENTINEL, history=set()): sentinel=util.SENTINEL, history=set()):
value = self.config(deprecated, sentinel) value = self.config(deprecated, sentinel)
@ -94,6 +102,9 @@ class Extractor():
def config_accumulate(self, key): def config_accumulate(self, key):
return config.accumulate(self._cfgpath, key) return config.accumulate(self._cfgpath, key)
def config_instance(self, key, default=None):
return default
def _config_shared(self, key, default=None): def _config_shared(self, key, default=None):
return config.interpolate_common( return config.interpolate_common(
("extractor",), self._cfgpath, key, default) ("extractor",), self._cfgpath, key, default)
@ -128,6 +139,18 @@ class Extractor():
kwargs["timeout"] = self._timeout kwargs["timeout"] = self._timeout
if "verify" not in kwargs: if "verify" not in kwargs:
kwargs["verify"] = self._verify kwargs["verify"] = self._verify
if "json" in kwargs:
json = kwargs["json"]
if json is not None:
kwargs["data"] = util.json_dumps(json).encode()
del kwargs["json"]
headers = kwargs.get("headers")
if headers:
headers["Content-Type"] = "application/json"
else:
kwargs["headers"] = {"Content-Type": "application/json"}
response = None response = None
tries = 1 tries = 1
@ -225,7 +248,7 @@ class Extractor():
password = None password = None
if username: if username:
password = self.config("password") password = self.config("password") or util.LazyPrompt()
elif self.config("netrc", False): elif self.config("netrc", False):
try: try:
info = netrc.netrc().authenticators(self.category) info = netrc.netrc().authenticators(self.category)
@@ -304,16 +327,17 @@ class Extractor():
         headers["User-Agent"] = useragent
         headers["Accept"] = "*/*"
         headers["Accept-Language"] = "en-US,en;q=0.5"
+        ssl_ciphers = self.ciphers

         if BROTLI:
             headers["Accept-Encoding"] = "gzip, deflate, br"
         else:
             headers["Accept-Encoding"] = "gzip, deflate"

-        custom_referer = self.config("referer", True)
-        if custom_referer:
-            if isinstance(custom_referer, str):
-                headers["Referer"] = custom_referer
+        referer = self.config("referer", self.referer)
+        if referer:
+            if isinstance(referer, str):
+                headers["Referer"] = referer
             elif self.root:
                 headers["Referer"] = self.root + "/"
@@ -505,12 +529,15 @@ class Extractor():
         if include == "all":
             include = extractors
         elif isinstance(include, str):
-            include = include.split(",")
+            include = include.replace(" ", "").split(",")

         result = [(Message.Version, 1)]
         for category in include:
-            if category in extractors:
+            try:
                 extr, url = extractors[category]
+            except KeyError:
+                self.log.warning("Invalid include '%s'", category)
+            else:
                 result.append((Message.Queue, url, {"_extractor": extr}))
         return iter(result)
@ -711,9 +738,10 @@ class BaseExtractor(Extractor):
for index, group in enumerate(match.groups()): for index, group in enumerate(match.groups()):
if group is not None: if group is not None:
if index: if index:
self.category, self.root = self.instances[index-1] self.category, self.root, info = self.instances[index-1]
if not self.root: if not self.root:
self.root = text.root_from_url(match.group(0)) self.root = text.root_from_url(match.group(0))
self.config_instance = info.get
else: else:
self.root = group self.root = group
self.category = group.partition("://")[2] self.category = group.partition("://")[2]
@ -733,7 +761,7 @@ class BaseExtractor(Extractor):
root = info["root"] root = info["root"]
if root: if root:
root = root.rstrip("/") root = root.rstrip("/")
instance_list.append((category, root)) instance_list.append((category, root, info))
pattern = info.get("pattern") pattern = info.get("pattern")
if not pattern: if not pattern:

@ -7,6 +7,7 @@
"""Extractors for https://cyberdrop.me/""" """Extractors for https://cyberdrop.me/"""
from . import lolisafe from . import lolisafe
from .common import Message
from .. import text from .. import text
@@ -16,24 +17,43 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
     pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)/a/([^/?#]+)"
     example = "https://cyberdrop.me/a/ID"

+    def items(self):
+        files, data = self.fetch_album(self.album_id)
+
+        yield Message.Directory, data
+        for data["num"], file in enumerate(files, 1):
+            file.update(data)
+            text.nameext_from_url(file["name"], file)
+            file["name"], sep, file["id"] = file["filename"].rpartition("-")
+            yield Message.Url, file["url"], file
+
     def fetch_album(self, album_id):
-        url = self.root + "/a/" + self.album_id
-        extr = text.extract_from(self.request(url).text)
-        files = []
-        append = files.append
-        while True:
-            url = text.unescape(extr('id="file" href="', '"'))
-            if not url:
-                break
-            append({"file": url,
-                    "_fallback": (self.root + url[url.find("/", 8):],)})
-        return files, {
+        url = "{}/a/{}".format(self.root, album_id)
+        page = self.request(url).text
+        extr = text.extract_from(page)
+
+        desc = extr('property="og:description" content="', '"')
+        if desc.startswith("A privacy-focused censorship-resistant file "
+                           "sharing platform free for everyone."):
+            desc = ""
+        extr('id="title"', "")
+
+        album = {
             "album_id" : self.album_id,
-            "album_name" : extr("name: '", "'"),
-            "date" : text.parse_timestamp(extr("timestamp: ", ",")),
-            "album_size" : text.parse_int(extr("totalSize: ", ",")),
-            "description": extr("description: `", "`"),
-            "count" : len(files),
+            "album_name" : text.unescape(extr('title="', '"')),
+            "album_size" : text.parse_bytes(extr(
+                '<p class="title">', "B")),
+            "date" : text.parse_datetime(extr(
+                '<p class="title">', '<'), "%d.%m.%Y"),
+            "description": text.unescape(text.unescape(  # double
+                desc.rpartition(" [R")[0])),
         }
+
+        file_ids = list(text.extract_iter(page, 'id="file" href="/f/', '"'))
+        album["count"] = len(file_ids)
+        return self._extract_files(file_ids), album
+
+    def _extract_files(self, file_ids):
+        for file_id in file_ids:
+            url = "{}/api/f/{}".format(self.root, file_id)
+            yield self.request(url).json()
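Each entry yielded by _extract_files() is the JSON object returned by cyberdrop's per-file endpoint; the new items() method reads its "name" and "url" fields. A rough standalone sketch of the same request with plain requests (the file id here is made up):

import requests

# "abc123" is a hypothetical id, used only to show the endpoint shape.
def fetch_file_info(root, file_id):
    return requests.get("{}/api/f/{}".format(root, file_id), timeout=30).json()

info = fetch_file_info("https://cyberdrop.me", "abc123")
print(info["name"], info["url"])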

@@ -20,7 +20,7 @@ class DanbooruExtractor(BaseExtractor):
     page_limit = 1000
     page_start = None
     per_page = 200
-    request_interval = 1.0
+    request_interval = (0.5, 1.5)

     def _init(self):
         self.ugoira = self.config("ugoira", False)
@@ -36,7 +36,7 @@ class DanbooruExtractor(BaseExtractor):
         username, api_key = self._get_auth_info()
         if username:
             self.log.debug("Using HTTP Basic Auth for user '%s'", username)
-            self.session.auth = (username, api_key)
+            self.session.auth = util.HTTPBasicAuth(username, api_key)
def skip(self, num): def skip(self, num):
pages = num // self.per_page pages = num // self.per_page
@ -72,6 +72,25 @@ class DanbooruExtractor(BaseExtractor):
post["date"] = text.parse_datetime( post["date"] = text.parse_datetime(
post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
post["tags"] = (
post["tag_string"].split(" ")
if post["tag_string"] else ())
post["tags_artist"] = (
post["tag_string_artist"].split(" ")
if post["tag_string_artist"] else ())
post["tags_character"] = (
post["tag_string_character"].split(" ")
if post["tag_string_character"] else ())
post["tags_copyright"] = (
post["tag_string_copyright"].split(" ")
if post["tag_string_copyright"] else ())
post["tags_general"] = (
post["tag_string_general"].split(" ")
if post["tag_string_general"] else ())
post["tags_meta"] = (
post["tag_string_meta"].split(" ")
if post["tag_string_meta"] else ())
if post["extension"] == "zip": if post["extension"] == "zip":
if self.ugoira: if self.ugoira:
post["frames"] = self._ugoira_frames(post) post["frames"] = self._ugoira_frames(post)
@ -150,7 +169,8 @@ class DanbooruExtractor(BaseExtractor):
BASE_PATTERN = DanbooruExtractor.update({ BASE_PATTERN = DanbooruExtractor.update({
"danbooru": { "danbooru": {
"root": None, "root": None,
"pattern": r"(?:danbooru|hijiribe|sonohara|safebooru)\.donmai\.us", "pattern": r"(?:(?:danbooru|hijiribe|sonohara|safebooru)\.donmai\.us"
r"|donmai\.moe)",
}, },
"atfbooru": { "atfbooru": {
"root": "https://booru.allthefallen.moe", "root": "https://booru.allthefallen.moe",
@ -158,7 +178,7 @@ BASE_PATTERN = DanbooruExtractor.update({
}, },
"aibooru": { "aibooru": {
"root": None, "root": None,
"pattern": r"(?:safe.)?aibooru\.online", "pattern": r"(?:safe\.)?aibooru\.online",
}, },
"booruvar": { "booruvar": {
"root": "https://booru.borvar.art", "root": "https://booru.borvar.art",

@ -38,15 +38,17 @@ class DeviantartExtractor(Extractor):
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.user = match.group(1) or match.group(2) self.user = (match.group(1) or match.group(2) or "").lower()
self.offset = 0 self.offset = 0
def _init(self): def _init(self):
self.jwt = self.config("jwt", False) self.jwt = self.config("jwt", False)
self.flat = self.config("flat", True) self.flat = self.config("flat", True)
self.extra = self.config("extra", False) self.extra = self.config("extra", False)
self.quality = self.config("quality", "100")
self.original = self.config("original", True) self.original = self.config("original", True)
self.comments = self.config("comments", False) self.comments = self.config("comments", False)
self.intermediary = self.config("intermediary", True)
self.api = DeviantartOAuthAPI(self) self.api = DeviantartOAuthAPI(self)
self.group = False self.group = False
@ -59,6 +61,9 @@ class DeviantartExtractor(Extractor):
else: else:
self.unwatch = None self.unwatch = None
if self.quality:
self.quality = ",q_{}".format(self.quality)
if self.original != "image": if self.original != "image":
self._update_content = self._update_content_default self._update_content = self._update_content_default
else: else:
@@ -87,14 +92,19 @@ class DeviantartExtractor(Extractor):
         return True

     def items(self):
-        if self.user and self.config("group", True):
-            profile = self.api.user_profile(self.user)
-            self.group = not profile
-            if self.group:
-                self.subcategory = "group-" + self.subcategory
-                self.user = self.user.lower()
-            else:
-                self.user = profile["user"]["username"]
+        if self.user:
+            group = self.config("group", True)
+            if group:
+                profile = self.api.user_profile(self.user)
+                if profile:
+                    self.user = profile["user"]["username"]
+                    self.group = False
+                elif group == "skip":
+                    self.log.info("Skipping group '%s'", self.user)
+                    raise exception.StopExtraction()
+                else:
+                    self.subcategory = "group-" + self.subcategory
+                    self.group = True
for deviation in self.deviations(): for deviation in self.deviations():
if isinstance(deviation, tuple): if isinstance(deviation, tuple):
@ -125,6 +135,19 @@ class DeviantartExtractor(Extractor):
self._update_content(deviation, content) self._update_content(deviation, content)
elif self.jwt: elif self.jwt:
self._update_token(deviation, content) self._update_token(deviation, content)
elif content["src"].startswith("https://images-wixmp-"):
if self.intermediary and deviation["index"] <= 790677560:
# https://github.com/r888888888/danbooru/issues/4069
intermediary, count = re.subn(
r"(/f/[^/]+/[^/]+)/v\d+/.*",
r"/intermediary\1", content["src"], 1)
if count:
deviation["is_original"] = False
deviation["_fallback"] = (content["src"],)
content["src"] = intermediary
if self.quality:
content["src"] = re.sub(
r",q_\d+", self.quality, content["src"], 1)
yield self.commit(deviation, content) yield self.commit(deviation, content)
@ -212,7 +235,7 @@ class DeviantartExtractor(Extractor):
if self.comments: if self.comments:
deviation["comments"] = ( deviation["comments"] = (
self.api.comments(deviation["deviationid"], target="deviation") self._extract_comments(deviation["deviationid"], "deviation")
if deviation["stats"]["comments"] else () if deviation["stats"]["comments"] else ()
) )
@ -332,7 +355,11 @@ class DeviantartExtractor(Extractor):
yield url, folder yield url, folder
def _update_content_default(self, deviation, content): def _update_content_default(self, deviation, content):
public = False if "premium_folder_data" in deviation else None if "premium_folder_data" in deviation or deviation.get("is_mature"):
public = False
else:
public = None
data = self.api.deviation_download(deviation["deviationid"], public) data = self.api.deviation_download(deviation["deviationid"], public)
content.update(data) content.update(data)
deviation["is_original"] = True deviation["is_original"] = True
@ -355,6 +382,9 @@ class DeviantartExtractor(Extractor):
if not sep: if not sep:
return return
# 'images-wixmp' returns 401 errors, but just 'wixmp' still works
url = url.replace("//images-wixmp", "//wixmp", 1)
# header = b'{"typ":"JWT","alg":"none"}' # header = b'{"typ":"JWT","alg":"none"}'
payload = ( payload = (
b'{"sub":"urn:app:","iss":"urn:app:","obj":[[{"path":"/f/' + b'{"sub":"urn:app:","iss":"urn:app:","obj":[[{"path":"/f/' +
@ -363,14 +393,37 @@ class DeviantartExtractor(Extractor):
) )
deviation["_fallback"] = (content["src"],) deviation["_fallback"] = (content["src"],)
deviation["is_original"] = True
content["src"] = ( content["src"] = (
"{}?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.{}.".format( "{}?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.{}.".format(
url, url,
# base64 of 'header' is precomputed as 'eyJ0eX...' # base64 of 'header' is precomputed as 'eyJ0eX...'
# binascii.a2b_base64(header).rstrip(b"=\n").decode(), # binascii.b2a_base64(header).rstrip(b"=\n").decode(),
binascii.b2a_base64(payload).rstrip(b"=\n").decode()) binascii.b2a_base64(payload).rstrip(b"=\n").decode())
) )
def _extract_comments(self, target_id, target_type="deviation"):
results = None
comment_ids = [None]
while comment_ids:
comments = self.api.comments(
target_id, target_type, comment_ids.pop())
if results:
results.extend(comments)
else:
results = comments
# parent comments, i.e. nodes with at least one child
parents = {c["parentid"] for c in comments}
# comments with more than one reply
replies = {c["commentid"] for c in comments if c["replies"]}
# add comment UUIDs with replies that are not parent to any node
comment_ids.extend(replies - parents)
return results
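Aside: the loop above is a work-list traversal over comment threads. Each API call returns a flat page of comments up to 5 levels deep, and any comment that has replies but never appears as someone's parent in that page must root a subtree that still needs its own fetch. A toy illustration of the set logic (made-up ids, not API data):

comments = [
    {"commentid": "A", "parentid": None, "replies": 2},
    {"commentid": "B", "parentid": "A",  "replies": 0},
    {"commentid": "C", "parentid": "A",  "replies": 3},  # replies not in this page
]
parents = {c["parentid"] for c in comments}
replies = {c["commentid"] for c in comments if c["replies"]}
print(replies - parents)  # {'C'} -> fetch the thread below comment C next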
def _limited_request(self, url, **kwargs): def _limited_request(self, url, **kwargs):
"""Limits HTTP requests to one every 2 seconds""" """Limits HTTP requests to one every 2 seconds"""
kwargs["fatal"] = None kwargs["fatal"] = None
@ -399,9 +452,11 @@ class DeviantartExtractor(Extractor):
return None return None
dev = self.api.deviation(deviation["deviationid"], False) dev = self.api.deviation(deviation["deviationid"], False)
folder = dev["premium_folder_data"] folder = deviation["premium_folder_data"]
username = dev["author"]["username"] username = dev["author"]["username"]
has_access = folder["has_access"]
# premium_folder_data is no longer present when user has access (#5063)
has_access = ("premium_folder_data" not in dev) or folder["has_access"]
if not has_access and folder["type"] == "watchers" and \ if not has_access and folder["type"] == "watchers" and \
self.config("auto-watch"): self.config("auto-watch"):
@@ -459,11 +514,13 @@ class DeviantartUserExtractor(DeviantartExtractor):
     def items(self):
         base = "{}/{}/".format(self.root, self.user)
         return self._dispatch_extractors((
-            (DeviantartGalleryExtractor , base + "gallery"),
-            (DeviantartScrapsExtractor  , base + "gallery/scraps"),
-            (DeviantartJournalExtractor , base + "posts"),
-            (DeviantartStatusExtractor  , base + "posts/statuses"),
-            (DeviantartFavoriteExtractor, base + "favourites"),
+            (DeviantartAvatarExtractor    , base + "avatar"),
+            (DeviantartBackgroundExtractor, base + "banner"),
+            (DeviantartGalleryExtractor   , base + "gallery"),
+            (DeviantartScrapsExtractor    , base + "gallery/scraps"),
+            (DeviantartJournalExtractor   , base + "posts"),
+            (DeviantartStatusExtractor    , base + "posts/statuses"),
+            (DeviantartFavoriteExtractor  , base + "favourites"),
         ), ("gallery",))
@ -484,6 +541,70 @@ class DeviantartGalleryExtractor(DeviantartExtractor):
return self._folder_urls(folders, "gallery", DeviantartFolderExtractor) return self._folder_urls(folders, "gallery", DeviantartFolderExtractor)
class DeviantartAvatarExtractor(DeviantartExtractor):
"""Extractor for an artist's avatar"""
subcategory = "avatar"
archive_fmt = "a_{_username}_{index}"
pattern = BASE_PATTERN + r"/avatar"
example = "https://www.deviantart.com/USER/avatar/"
def deviations(self):
name = self.user.lower()
profile = self.api.user_profile(name)
if not profile:
return ()
user = profile["user"]
icon = user["usericon"]
index = icon.rpartition("?")[2]
formats = self.config("formats")
if not formats:
url = icon.replace("/avatars/", "/avatars-big/", 1)
return (self._make_deviation(url, user, index, ""),)
if isinstance(formats, str):
formats = formats.replace(" ", "").split(",")
results = []
for fmt in formats:
fmt, _, ext = fmt.rpartition(".")
if fmt:
fmt = "-" + fmt
url = "https://a.deviantart.net/avatars{}/{}/{}/{}.{}?{}".format(
fmt, name[0], name[1], name, ext, index)
results.append(self._make_deviation(url, user, index, fmt))
return results
def _make_deviation(self, url, user, index, fmt):
return {
"author" : user,
"category" : "avatar",
"index" : text.parse_int(index),
"is_deleted" : False,
"is_downloadable": False,
"published_time" : 0,
"title" : "avatar" + fmt,
"stats" : {"comments": 0},
"content" : {"src": url},
}
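The `formats` option takes a comma-separated string or list; each entry is split on its last "." into a size suffix and an extension and plugged into the avatar URL. A hedged illustration of that construction ("original.jpg" and "big.gif" are example values; which suffixes actually exist is up to DeviantArt's CDN, not this code):

name, index = "someartist", "3"
for fmt in ("original.jpg", "big.gif"):
    fmt, _, ext = fmt.rpartition(".")
    if fmt:
        fmt = "-" + fmt
    print("https://a.deviantart.net/avatars{}/{}/{}/{}.{}?{}".format(
        fmt, name[0], name[1], name, ext, index))
# https://a.deviantart.net/avatars-original/s/o/someartist.jpg?3
# https://a.deviantart.net/avatars-big/s/o/someartist.gif?3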
class DeviantartBackgroundExtractor(DeviantartExtractor):
"""Extractor for an artist's banner"""
subcategory = "background"
archive_fmt = "b_{index}"
pattern = BASE_PATTERN + r"/ba(?:nner|ckground)"
example = "https://www.deviantart.com/USER/banner/"
def deviations(self):
try:
return (self.api.user_profile(self.user.lower())
["cover_deviation"]["cover_deviation"],)
except Exception:
return ()
class DeviantartFolderExtractor(DeviantartExtractor): class DeviantartFolderExtractor(DeviantartExtractor):
"""Extractor for deviations inside an artist's gallery folder""" """Extractor for deviations inside an artist's gallery folder"""
subcategory = "folder" subcategory = "folder"
@ -674,7 +795,7 @@ class DeviantartStatusExtractor(DeviantartExtractor):
deviation["stats"] = {"comments": comments_count} deviation["stats"] = {"comments": comments_count}
if self.comments: if self.comments:
deviation["comments"] = ( deviation["comments"] = (
self.api.comments(deviation["statusid"], target="status") self._extract_comments(deviation["statusid"], "status")
if comments_count else () if comments_count else ()
) )
@ -951,8 +1072,9 @@ class DeviantartOAuthAPI():
self.strategy = extractor.config("pagination") self.strategy = extractor.config("pagination")
self.public = extractor.config("public", True) self.public = extractor.config("public", True)
self.client_id = extractor.config("client-id") client_id = extractor.config("client-id")
if self.client_id: if client_id:
self.client_id = str(client_id)
self.client_secret = extractor.config("client-secret") self.client_secret = extractor.config("client-secret")
else: else:
self.client_id = self.CLIENT_ID self.client_id = self.CLIENT_ID
@ -960,7 +1082,7 @@ class DeviantartOAuthAPI():
token = extractor.config("refresh-token") token = extractor.config("refresh-token")
if token is None or token == "cache": if token is None or token == "cache":
token = "#" + str(self.client_id) token = "#" + self.client_id
if not _refresh_token_cache(token): if not _refresh_token_cache(token):
token = None token = None
self.refresh_token_key = token self.refresh_token_key = token
@@ -1048,17 +1170,28 @@ class DeviantartOAuthAPI():
                   "mature_content": self.mature}
         return self._pagination_list(endpoint, params)

-    def comments(self, id, target, offset=0):
+    def comments(self, target_id, target_type="deviation",
+                 comment_id=None, offset=0):
         """Fetch comments posted on a target"""
-        endpoint = "/comments/{}/{}".format(target, id)
-        params = {"maxdepth": "5", "offset": offset, "limit": 50,
-                  "mature_content": self.mature}
+        endpoint = "/comments/{}/{}".format(target_type, target_id)
+        params = {
+            "commentid" : comment_id,
+            "maxdepth" : "5",
+            "offset" : offset,
+            "limit" : 50,
+            "mature_content": self.mature,
+        }
         return self._pagination_list(endpoint, params=params, key="thread")
def deviation(self, deviation_id, public=None): def deviation(self, deviation_id, public=None):
"""Query and return info about a single Deviation""" """Query and return info about a single Deviation"""
endpoint = "/deviation/" + deviation_id endpoint = "/deviation/" + deviation_id
deviation = self._call(endpoint, public=public) deviation = self._call(endpoint, public=public)
if deviation.get("is_mature") and public is None and \
self.refresh_token_key:
deviation = self._call(endpoint, public=False)
if self.metadata: if self.metadata:
self._metadata((deviation,)) self._metadata((deviation,))
if self.folders: if self.folders:
@ -1176,7 +1309,7 @@ class DeviantartOAuthAPI():
self.log.info("Requesting public access token") self.log.info("Requesting public access token")
data = {"grant_type": "client_credentials"} data = {"grant_type": "client_credentials"}
auth = (self.client_id, self.client_secret) auth = util.HTTPBasicAuth(self.client_id, self.client_secret)
response = self.extractor.request( response = self.extractor.request(
url, method="POST", data=data, auth=auth, fatal=False) url, method="POST", data=data, auth=auth, fatal=False)
data = response.json() data = response.json()
@ -1214,8 +1347,12 @@ class DeviantartOAuthAPI():
return data return data
if not fatal and status != 429: if not fatal and status != 429:
return None return None
if data.get("error_description") == "User not found.":
error = data.get("error_description")
if error == "User not found.":
raise exception.NotFoundError("user or group") raise exception.NotFoundError("user or group")
if error == "Deviation not downloadable.":
raise exception.AuthorizationError()
self.log.debug(response.text) self.log.debug(response.text)
msg = "API responded with {} {}".format( msg = "API responded with {} {}".format(
@ -1239,6 +1376,17 @@ class DeviantartOAuthAPI():
self.log.error(msg) self.log.error(msg)
return data return data
def _switch_tokens(self, results, params):
if len(results) < params["limit"]:
return True
if not self.extractor.jwt:
for item in results:
if item.get("is_mature"):
return True
return False
def _pagination(self, endpoint, params, def _pagination(self, endpoint, params,
extend=True, public=None, unpack=False, key="results"): extend=True, public=None, unpack=False, key="results"):
warn = True warn = True
@ -1257,7 +1405,7 @@ class DeviantartOAuthAPI():
results = [item["journal"] for item in results results = [item["journal"] for item in results
if "journal" in item] if "journal" in item]
if extend: if extend:
if public and len(results) < params["limit"]: if public and self._switch_tokens(results, params):
if self.refresh_token_key: if self.refresh_token_key:
self.log.debug("Switching to private access token") self.log.debug("Switching to private access token")
public = False public = False
@ -1265,9 +1413,10 @@ class DeviantartOAuthAPI():
elif data["has_more"] and warn: elif data["has_more"] and warn:
warn = False warn = False
self.log.warning( self.log.warning(
"Private deviations detected! Run 'gallery-dl " "Private or mature deviations detected! "
"oauth:deviantart' and follow the instructions to " "Run 'gallery-dl oauth:deviantart' and follow the "
"be able to access them.") "instructions to be able to access them.")
# "statusid" cannot be used instead # "statusid" cannot be used instead
if results and "deviationid" in results[0]: if results and "deviationid" in results[0]:
if self.metadata: if self.metadata:
@ -1377,12 +1526,14 @@ class DeviantartEclipseAPI():
self.csrf_token = None self.csrf_token = None
def deviation_extended_fetch(self, deviation_id, user, kind=None): def deviation_extended_fetch(self, deviation_id, user, kind=None):
endpoint = "/_napi/da-browse/shared_api/deviation/extended_fetch" endpoint = "/_puppy/dadeviation/init"
params = { params = {
"deviationid" : deviation_id, "deviationid" : deviation_id,
"username" : user, "username" : user,
"type" : kind, "type" : kind,
"include_session": "false", "include_session" : "false",
"expand" : "deviation.related",
"da_minor_version": "20230710",
} }
return self._call(endpoint, params) return self._call(endpoint, params)
@ -1410,7 +1561,7 @@ class DeviantartEclipseAPI():
return self._pagination(endpoint, params) return self._pagination(endpoint, params)
def search_deviations(self, params): def search_deviations(self, params):
endpoint = "/_napi/da-browse/api/networkbar/search/deviations" endpoint = "/_puppy/dabrowse/search/deviations"
return self._pagination(endpoint, params, key="deviations") return self._pagination(endpoint, params, key="deviations")
def user_info(self, user, expand=False): def user_info(self, user, expand=False):
@ -1497,7 +1648,7 @@ class DeviantartEclipseAPI():
return token return token
@cache(maxage=100*365*86400, keyarg=0) @cache(maxage=36500*86400, keyarg=0)
def _refresh_token_cache(token): def _refresh_token_cache(token):
if token and token[0] == "#": if token and token[0] == "#":
return None return None

@@ -44,20 +44,26 @@ class EromeExtractor(Extractor):
            pos = page.index('<div class="user-profile', pos)
            user, pos = text.extract(
                page, 'href="https://www.erome.com/', '"', pos)

+           urls = []
+           groups = page.split('<div class="media-group"')
+           for group in util.advance(groups, 1):
+               url = (text.extr(group, '<source src="', '"') or
+                      text.extr(group, 'data-src="', '"'))
+               if url:
+                   urls.append(url)
+
            data = {
                "album_id" : album_id,
                "title" : text.unescape(title),
                "user" : text.unquote(user),
+               "count" : len(urls),
                "_http_headers": {"Referer": url},
            }

            yield Message.Directory, data
-           groups = page.split('<div class="media-group"')
-           for data["num"], group in enumerate(util.advance(groups, 1), 1):
-               url = (text.extr(group, '<source src="', '"') or
-                      text.extr(group, 'data-src="', '"'))
-               if url:
-                   yield Message.Url, url, text.nameext_from_url(url, data)
+           for data["num"], url in enumerate(urls, 1):
+               yield Message.Url, url, text.nameext_from_url(url, data)
def albums(self): def albums(self):
return () return ()

@ -26,7 +26,8 @@ class ExhentaiExtractor(Extractor):
cookies_domain = ".exhentai.org" cookies_domain = ".exhentai.org"
cookies_names = ("ipb_member_id", "ipb_pass_hash") cookies_names = ("ipb_member_id", "ipb_pass_hash")
root = "https://exhentai.org" root = "https://exhentai.org"
request_interval = 5.0 request_interval = (3.0, 6.0)
ciphers = "DEFAULT:!DH"
LIMIT = False LIMIT = False
@ -39,20 +40,13 @@ class ExhentaiExtractor(Extractor):
if domain == "auto": if domain == "auto":
domain = ("ex" if self.version == "ex" else "e-") + "hentai.org" domain = ("ex" if self.version == "ex" else "e-") + "hentai.org"
self.root = "https://" + domain self.root = "https://" + domain
self.api_url = self.root + "/api.php"
self.cookies_domain = "." + domain self.cookies_domain = "." + domain
Extractor.initialize(self) Extractor.initialize(self)
if self.version != "ex": if self.version != "ex":
self.cookies.set("nw", "1", domain=self.cookies_domain) self.cookies.set("nw", "1", domain=self.cookies_domain)
self.original = self.config("original", True)
limits = self.config("limits", False)
if limits and limits.__class__ is int:
self.limits = limits
self._remaining = 0
else:
self.limits = False
def request(self, url, **kwargs): def request(self, url, **kwargs):
response = Extractor.request(self, url, **kwargs) response = Extractor.request(self, url, **kwargs)
@ -73,16 +67,18 @@ class ExhentaiExtractor(Extractor):
if username: if username:
return self.cookies_update(self._login_impl(username, password)) return self.cookies_update(self._login_impl(username, password))
self.log.info("no username given; using e-hentai.org") if self.version == "ex":
self.root = "https://e-hentai.org" self.log.info("No username or cookies given; using e-hentai.org")
self.cookies_domain = ".e-hentai.org" self.root = "https://e-hentai.org"
self.cookies.set("nw", "1", domain=self.cookies_domain) self.cookies_domain = ".e-hentai.org"
self.cookies.set("nw", "1", domain=self.cookies_domain)
self.original = False self.original = False
self.limits = False self.limits = False
@cache(maxage=90*24*3600, keyarg=1) @cache(maxage=90*86400, keyarg=1)
def _login_impl(self, username, password): def _login_impl(self, username, password):
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)
url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01" url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01"
headers = { headers = {
"Referer": "https://e-hentai.org/bounce_login.php?b=d&bt=1-1", "Referer": "https://e-hentai.org/bounce_login.php?b=d&bt=1-1",
@ -96,10 +92,19 @@ class ExhentaiExtractor(Extractor):
"ipb_login_submit": "Login!", "ipb_login_submit": "Login!",
} }
self.cookies.clear()
response = self.request(url, method="POST", headers=headers, data=data) response = self.request(url, method="POST", headers=headers, data=data)
if b"You are now logged in as:" not in response.content: if b"You are now logged in as:" not in response.content:
raise exception.AuthenticationError() raise exception.AuthenticationError()
return {c: response.cookies[c] for c in self.cookies_names}
# collect more cookies
url = self.root + "/favorites.php"
response = self.request(url)
if response.history:
self.request(url)
return self.cookies
class ExhentaiGalleryExtractor(ExhentaiExtractor): class ExhentaiGalleryExtractor(ExhentaiExtractor):
@ -112,18 +117,38 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def __init__(self, match): def __init__(self, match):
ExhentaiExtractor.__init__(self, match) ExhentaiExtractor.__init__(self, match)
self.key = {}
self.count = 0
self.gallery_id = text.parse_int(match.group(2) or match.group(5)) self.gallery_id = text.parse_int(match.group(2) or match.group(5))
self.gallery_token = match.group(3) self.gallery_token = match.group(3)
self.image_token = match.group(4) self.image_token = match.group(4)
self.image_num = text.parse_int(match.group(6), 1) self.image_num = text.parse_int(match.group(6), 1)
self.key_start = None
self.key_show = None
self.key_next = None
self.count = 0
self.data = None
def _init(self): def _init(self):
source = self.config("source") source = self.config("source")
if source == "hitomi": if source == "hitomi":
self.items = self._items_hitomi self.items = self._items_hitomi
limits = self.config("limits", False)
if limits and limits.__class__ is int:
self.limits = limits
self._remaining = 0
else:
self.limits = False
self.fallback_retries = self.config("fallback-retries", 2)
self.original = self.config("original", True)
def finalize(self):
if self.data:
self.log.info("Use '%s/s/%s/%s-%s' as input URL "
"to continue downloading from the current position",
self.root, self.data["image_token"],
self.gallery_id, self.data["num"])
def favorite(self, slot="0"): def favorite(self, slot="0"):
url = self.root + "/gallerypopups.php" url = self.root + "/gallerypopups.php"
params = { params = {
@ -145,39 +170,32 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
gpage = self._gallery_page() gpage = self._gallery_page()
self.image_token = text.extr(gpage, 'hentai.org/s/', '"') self.image_token = text.extr(gpage, 'hentai.org/s/', '"')
if not self.image_token: if not self.image_token:
self.log.error("Failed to extract initial image token")
self.log.debug("Page content:\n%s", gpage) self.log.debug("Page content:\n%s", gpage)
return raise exception.StopExtraction(
"Failed to extract initial image token")
ipage = self._image_page() ipage = self._image_page()
else: else:
ipage = self._image_page() ipage = self._image_page()
part = text.extr(ipage, 'hentai.org/g/', '"') part = text.extr(ipage, 'hentai.org/g/', '"')
if not part: if not part:
self.log.error("Failed to extract gallery token")
self.log.debug("Page content:\n%s", ipage) self.log.debug("Page content:\n%s", ipage)
return raise exception.StopExtraction(
"Failed to extract gallery token")
self.gallery_token = part.split("/")[1] self.gallery_token = part.split("/")[1]
gpage = self._gallery_page() gpage = self._gallery_page()
data = self.get_metadata(gpage) self.data = data = self.get_metadata(gpage)
self.count = text.parse_int(data["filecount"]) self.count = text.parse_int(data["filecount"])
yield Message.Directory, data yield Message.Directory, data
def _validate_response(response):
# declared inside 'items()' to be able to access 'data'
if not response.history and response.headers.get(
"content-type", "").startswith("text/html"):
self._report_limits(data)
return True
images = itertools.chain( images = itertools.chain(
(self.image_from_page(ipage),), self.images_from_api()) (self.image_from_page(ipage),), self.images_from_api())
for url, image in images: for url, image in images:
data.update(image) data.update(image)
if self.limits: if self.limits:
self._check_limits(data) self._check_limits(data)
if "/fullimg.php" in url: if "/fullimg" in url:
data["_http_validate"] = _validate_response data["_http_validate"] = self._validate_response
else: else:
data["_http_validate"] = None data["_http_validate"] = None
yield Message.Url, url, data yield Message.Url, url, data
@ -185,6 +203,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
fav = self.config("fav") fav = self.config("fav")
if fav is not None: if fav is not None:
self.favorite(fav) self.favorite(fav)
self.data = None
def _items_hitomi(self): def _items_hitomi(self):
if self.config("metadata", False): if self.config("metadata", False):
@ -208,6 +227,11 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def metadata_from_page(self, page): def metadata_from_page(self, page):
extr = text.extract_from(page) extr = text.extract_from(page)
api_url = extr('var api_url = "', '"')
if api_url:
self.api_url = api_url
data = { data = {
"gid" : self.gallery_id, "gid" : self.gallery_id,
"token" : self.gallery_token, "token" : self.gallery_token,
@ -225,7 +249,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
'>Visible:</td><td class="gdt2">', '<'), '>Visible:</td><td class="gdt2">', '<'),
"language" : extr('>Language:</td><td class="gdt2">', ' '), "language" : extr('>Language:</td><td class="gdt2">', ' '),
"filesize" : text.parse_bytes(extr( "filesize" : text.parse_bytes(extr(
'>File Size:</td><td class="gdt2">', '<').rstrip("Bb")), '>File Size:</td><td class="gdt2">', '<').rstrip("Bbi")),
"filecount" : extr('>Length:</td><td class="gdt2">', ' '), "filecount" : extr('>Length:</td><td class="gdt2">', ' '),
"favorites" : extr('id="favcount">', ' '), "favorites" : extr('id="favcount">', ' '),
"rating" : extr(">Average: ", "<"), "rating" : extr(">Average: ", "<"),
@ -251,14 +275,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
return data return data
def metadata_from_api(self): def metadata_from_api(self):
url = self.root + "/api.php"
data = { data = {
"method": "gdata", "method" : "gdata",
"gidlist": ((self.gallery_id, self.gallery_token),), "gidlist" : ((self.gallery_id, self.gallery_token),),
"namespace": 1, "namespace": 1,
} }
data = self.request(url, method="POST", json=data).json() data = self.request(self.api_url, method="POST", json=data).json()
if "error" in data: if "error" in data:
raise exception.StopExtraction(data["error"]) raise exception.StopExtraction(data["error"])
@@ -269,54 +292,71 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         pos = page.index('<div id="i3"><a onclick="return load_image(') + 26
         extr = text.extract_from(page, pos)

-        self.key["next"] = extr("'", "'")
+        self.key_next = extr("'", "'")
         iurl = extr('<img id="img" src="', '"')
-        orig = extr('hentai.org/fullimg.php', '"')
+        nl = extr(" nl(", ")").strip("\"'")
+        orig = extr('hentai.org/fullimg', '"')

         try:
             if self.original and orig:
-                url = self.root + "/fullimg.php" + text.unescape(orig)
+                url = self.root + "/fullimg" + text.unescape(orig)
                 data = self._parse_original_info(extr('ownload original', '<'))
+                data["_fallback"] = self._fallback_original(nl, url)
             else:
                 url = iurl
                 data = self._parse_image_info(url)
+                data["_fallback"] = self._fallback_1280(nl, self.image_num)
         except IndexError:
             self.log.debug("Page content:\n%s", page)
             raise exception.StopExtraction(
                 "Unable to parse image info for '%s'", url)

         data["num"] = self.image_num
-        data["image_token"] = self.key["start"] = extr('var startkey="', '";')
-        self.key["show"] = extr('var showkey="', '";')
+        data["image_token"] = self.key_start = extr('var startkey="', '";')
+        data["_url_1280"] = iurl
+        data["_nl"] = nl
+        self.key_show = extr('var showkey="', '";')

-        self._check_509(iurl, data)
-        return url, text.nameext_from_url(iurl, data)
+        self._check_509(iurl)
+        return url, text.nameext_from_url(url, data)
def images_from_api(self): def images_from_api(self):
"""Get image url and data from api calls""" """Get image url and data from api calls"""
api_url = self.root + "/api.php" api_url = self.api_url
nextkey = self.key["next"] nextkey = self.key_next
request = { request = {
"method" : "showpage", "method" : "showpage",
"gid" : self.gallery_id, "gid" : self.gallery_id,
"page" : 0,
"imgkey" : nextkey, "imgkey" : nextkey,
"showkey": self.key["show"], "showkey": self.key_show,
} }
for request["page"] in range(self.image_num + 1, self.count + 1): for request["page"] in range(self.image_num + 1, self.count + 1):
page = self.request(api_url, method="POST", json=request).json() page = self.request(api_url, method="POST", json=request).json()
i3 = page["i3"]
i6 = page["i6"]
imgkey = nextkey imgkey = nextkey
nextkey, pos = text.extract(page["i3"], "'", "'") nextkey, pos = text.extract(i3, "'", "'")
imgurl , pos = text.extract(page["i3"], 'id="img" src="', '"', pos) imgurl , pos = text.extract(i3, 'id="img" src="', '"', pos)
origurl, pos = text.extract(page["i7"], '<a href="', '"') nl , pos = text.extract(i3, " nl(", ")", pos)
nl = (nl or "").strip("\"'")
try: try:
if self.original and origurl: pos = i6.find("hentai.org/fullimg")
if self.original and pos >= 0:
origurl, pos = text.rextract(i6, '"', '"', pos)
url = text.unescape(origurl) url = text.unescape(origurl)
data = self._parse_original_info(text.extract( data = self._parse_original_info(text.extract(
page["i7"], "ownload original", "<", pos)[0]) i6, "ownload original", "<", pos)[0])
data["_fallback"] = self._fallback_original(nl, url)
else: else:
url = imgurl url = imgurl
data = self._parse_image_info(url) data = self._parse_image_info(url)
data["_fallback"] = self._fallback_1280(
nl, request["page"], imgkey)
except IndexError: except IndexError:
self.log.debug("Page content:\n%s", page) self.log.debug("Page content:\n%s", page)
raise exception.StopExtraction( raise exception.StopExtraction(
@ -324,34 +364,54 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data["num"] = request["page"] data["num"] = request["page"]
data["image_token"] = imgkey data["image_token"] = imgkey
data["_url_1280"] = imgurl
data["_nl"] = nl
self._check_509(imgurl, data) self._check_509(imgurl)
yield url, text.nameext_from_url(imgurl, data) yield url, text.nameext_from_url(url, data)
request["imgkey"] = nextkey request["imgkey"] = nextkey
def _report_limits(self, data): def _validate_response(self, response):
if not response.history and response.headers.get(
"content-type", "").startswith("text/html"):
page = response.text
self.log.warning("'%s'", page)
if " requires GP" in page:
gp = self.config("gp")
if gp == "stop":
raise exception.StopExtraction("Not enough GP")
elif gp == "wait":
input("Press ENTER to continue.")
return response.url
self.log.info("Falling back to non-original downloads")
self.original = False
return self.data["_url_1280"]
self._report_limits()
return True
def _report_limits(self):
ExhentaiExtractor.LIMIT = True ExhentaiExtractor.LIMIT = True
raise exception.StopExtraction( raise exception.StopExtraction("Image limit reached!")
"Image limit reached! "
"Continue with '%s/s/%s/%s-%s' as URL after resetting it.",
self.root, data["image_token"], self.gallery_id, data["num"])
def _check_limits(self, data): def _check_limits(self, data):
if not self._remaining or data["num"] % 25 == 0: if not self._remaining or data["num"] % 25 == 0:
self._update_limits() self._update_limits()
self._remaining -= data["cost"] self._remaining -= data["cost"]
if self._remaining <= 0: if self._remaining <= 0:
self._report_limits(data) self._report_limits()
def _check_509(self, url, data): def _check_509(self, url):
# full 509.gif URLs # full 509.gif URLs
# - https://exhentai.org/img/509.gif # - https://exhentai.org/img/509.gif
# - https://ehgt.org/g/509.gif # - https://ehgt.org/g/509.gif
if url.endswith(("hentai.org/img/509.gif", if url.endswith(("hentai.org/img/509.gif",
"ehgt.org/g/509.gif")): "ehgt.org/g/509.gif")):
self.log.debug(url) self.log.debug(url)
self._report_limits(data) self._report_limits()
def _update_limits(self): def _update_limits(self):
url = "https://e-hentai.org/home.php" url = "https://e-hentai.org/home.php"
@ -390,6 +450,27 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
raise exception.NotFoundError("image page") raise exception.NotFoundError("image page")
return page return page
def _fallback_original(self, nl, fullimg):
url = "{}?nl={}".format(fullimg, nl)
for _ in util.repeat(self.fallback_retries):
yield url
def _fallback_1280(self, nl, num, token=None):
if not token:
token = self.key_start
for _ in util.repeat(self.fallback_retries):
url = "{}/s/{}/{}-{}?nl={}".format(
self.root, token, self.gallery_id, num, nl)
page = self.request(url, fatal=False).text
if page.startswith(("Invalid page", "Keep trying")):
return
url, data = self.image_from_page(page)
yield url
nl = data["_nl"]
@staticmethod @staticmethod
def _parse_image_info(url): def _parse_image_info(url):
for part in url.split("/")[4:]: for part in url.split("/")[4:]:

@ -8,6 +8,7 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text from .. import text
from ..cache import memcache
import re import re
BASE_PATTERN = ( BASE_PATTERN = (
@ -27,8 +28,20 @@ class FanboxExtractor(Extractor):
_warning = True _warning = True
def _init(self): def _init(self):
self.headers = {"Origin": self.root}
self.embeds = self.config("embeds", True) self.embeds = self.config("embeds", True)
includes = self.config("metadata")
if includes:
if isinstance(includes, str):
includes = includes.split(",")
elif not isinstance(includes, (list, tuple)):
includes = ("user", "plan")
self._meta_user = ("user" in includes)
self._meta_plan = ("plan" in includes)
else:
self._meta_user = self._meta_plan = False
if self._warning: if self._warning:
if not self.cookies_check(("FANBOXSESSID",)): if not self.cookies_check(("FANBOXSESSID",)):
self.log.warning("no 'FANBOXSESSID' cookie set") self.log.warning("no 'FANBOXSESSID' cookie set")
@ -43,11 +56,9 @@ class FanboxExtractor(Extractor):
"""Return all relevant post objects""" """Return all relevant post objects"""
def _pagination(self, url): def _pagination(self, url):
headers = {"Origin": self.root}
while url: while url:
url = text.ensure_http_scheme(url) url = text.ensure_http_scheme(url)
body = self.request(url, headers=headers).json()["body"] body = self.request(url, headers=self.headers).json()["body"]
for item in body["items"]: for item in body["items"]:
try: try:
yield self._get_post_data(item["id"]) yield self._get_post_data(item["id"])
@ -58,9 +69,8 @@ class FanboxExtractor(Extractor):
def _get_post_data(self, post_id): def _get_post_data(self, post_id):
"""Fetch and process post data""" """Fetch and process post data"""
headers = {"Origin": self.root}
url = "https://api.fanbox.cc/post.info?postId="+post_id url = "https://api.fanbox.cc/post.info?postId="+post_id
post = self.request(url, headers=headers).json()["body"] post = self.request(url, headers=self.headers).json()["body"]
content_body = post.pop("body", None) content_body = post.pop("body", None)
if content_body: if content_body:
@ -98,8 +108,47 @@ class FanboxExtractor(Extractor):
post["text"] = content_body.get("text") if content_body else None post["text"] = content_body.get("text") if content_body else None
post["isCoverImage"] = False post["isCoverImage"] = False
if self._meta_user:
post["user"] = self._get_user_data(post["creatorId"])
if self._meta_plan:
plans = self._get_plan_data(post["creatorId"])
post["plan"] = plans[post["feeRequired"]]
return content_body, post return content_body, post
@memcache(keyarg=1)
def _get_user_data(self, creator_id):
url = "https://api.fanbox.cc/creator.get"
params = {"creatorId": creator_id}
data = self.request(url, params=params, headers=self.headers).json()
user = data["body"]
user.update(user.pop("user"))
return user
@memcache(keyarg=1)
def _get_plan_data(self, creator_id):
url = "https://api.fanbox.cc/plan.listCreator"
params = {"creatorId": creator_id}
data = self.request(url, params=params, headers=self.headers).json()
plans = {0: {
"id" : "",
"title" : "",
"fee" : 0,
"description" : "",
"coverImageUrl" : "",
"creatorId" : creator_id,
"hasAdultContent": None,
"paymentMethod" : None,
}}
for plan in data["body"]:
del plan["user"]
plans[plan["fee"]] = plan
return plans
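_get_user_data() and _get_plan_data() are decorated with memcache(keyarg=1), so the creator profile and plan list are fetched once per creatorId and reused for every post; plans are keyed by their fee, so plans[post["feeRequired"]] picks the matching tier, with fee 0 mapping to the synthetic free entry. A simplified stand-in for that keyarg-based memoization (toy code, not gallery-dl's cache module):

import functools

# Memoize on the creator_id argument so many posts by the same creator
# trigger only one lookup.
def memcache_keyarg1(func):
    cache = {}
    @functools.wraps(func)
    def wrapper(self, creator_id):
        if creator_id not in cache:
            cache[creator_id] = func(self, creator_id)
        return cache[creator_id]
    return wrapper

class Demo:
    calls = 0
    @memcache_keyarg1
    def _get_plan_data(self, creator_id):
        Demo.calls += 1
        return {0: {"fee": 0, "creatorId": creator_id}}

d = Demo()
d._get_plan_data("alice")
d._get_plan_data("alice")
print(Demo.calls)  # 1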
def _get_urls_from_post(self, content_body, post): def _get_urls_from_post(self, content_body, post):
num = 0 num = 0
cover_image = post.get("coverImageUrl") cover_image = post.get("coverImageUrl")

@ -42,7 +42,11 @@ class FantiaExtractor(Extractor):
post = self._get_post_data(post_id) post = self._get_post_data(post_id)
post["num"] = 0 post["num"] = 0
for content in self._get_post_contents(post): contents = self._get_post_contents(post)
post["content_count"] = len(contents)
post["content_num"] = 0
for content in contents:
files = self._process_content(post, content) files = self._process_content(post, content)
yield Message.Directory, post yield Message.Directory, post
@ -59,6 +63,8 @@ class FantiaExtractor(Extractor):
post["content_filename"] or file["file_url"], post) post["content_filename"] or file["file_url"], post)
yield Message.Url, file["file_url"], post yield Message.Url, file["file_url"], post
post["content_num"] += 1
def posts(self): def posts(self):
"""Return post IDs""" """Return post IDs"""
@ -102,7 +108,7 @@ class FantiaExtractor(Extractor):
"fanclub_user_name": resp["fanclub"]["user"]["name"], "fanclub_user_name": resp["fanclub"]["user"]["name"],
"fanclub_name": resp["fanclub"]["name"], "fanclub_name": resp["fanclub"]["name"],
"fanclub_url": self.root+"/fanclubs/"+str(resp["fanclub"]["id"]), "fanclub_url": self.root+"/fanclubs/"+str(resp["fanclub"]["id"]),
"tags": resp["tags"], "tags": [t["name"] for t in resp["tags"]],
"_data": resp, "_data": resp,
} }
@ -131,6 +137,7 @@ class FantiaExtractor(Extractor):
post["content_filename"] = content.get("filename") or "" post["content_filename"] = content.get("filename") or ""
post["content_id"] = content["id"] post["content_id"] = content["id"]
post["content_comment"] = content.get("comment") or "" post["content_comment"] = content.get("comment") or ""
post["content_num"] += 1
post["plan"] = content["plan"] or self._empty_plan post["plan"] = content["plan"] or self._empty_plan
files = [] files = []

@ -10,6 +10,9 @@ from .common import Extractor, Message
from .. import text, exception from .. import text, exception
BASE_PATTERN = r"(?:https?://)?(?:www\.)?fapello\.(?:com|su)"
class FapelloPostExtractor(Extractor): class FapelloPostExtractor(Extractor):
"""Extractor for individual posts on fapello.com""" """Extractor for individual posts on fapello.com"""
category = "fapello" category = "fapello"
@ -17,16 +20,16 @@ class FapelloPostExtractor(Extractor):
directory_fmt = ("{category}", "{model}") directory_fmt = ("{category}", "{model}")
filename_fmt = "{model}_{id}.{extension}" filename_fmt = "{model}_{id}.{extension}"
archive_fmt = "{type}_{model}_{id}" archive_fmt = "{type}_{model}_{id}"
pattern = (r"(?:https?://)?(?:www\.)?fapello\.com" pattern = BASE_PATTERN + r"/(?!search/|popular_videos/)([^/?#]+)/(\d+)"
r"/(?!search/|popular_videos/)([^/?#]+)/(\d+)")
example = "https://fapello.com/MODEL/12345/" example = "https://fapello.com/MODEL/12345/"
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.root = text.root_from_url(match.group(0))
self.model, self.id = match.groups() self.model, self.id = match.groups()
def items(self): def items(self):
url = "https://fapello.com/{}/{}/".format(self.model, self.id) url = "{}/{}/{}/".format(self.root, self.model, self.id)
page = text.extr( page = text.extr(
self.request(url, allow_redirects=False).text, self.request(url, allow_redirects=False).text,
'class="uk-align-center"', "</div>", None) 'class="uk-align-center"', "</div>", None)
@ -48,27 +51,29 @@ class FapelloModelExtractor(Extractor):
"""Extractor for all posts from a fapello model""" """Extractor for all posts from a fapello model"""
category = "fapello" category = "fapello"
subcategory = "model" subcategory = "model"
pattern = (r"(?:https?://)?(?:www\.)?fapello\.com" pattern = (BASE_PATTERN + r"/(?!top-(?:likes|followers)|popular_videos"
r"/(?!top-(?:likes|followers)|popular_videos"
r"|videos|trending|search/?$)" r"|videos|trending|search/?$)"
r"([^/?#]+)/?$") r"([^/?#]+)/?$")
example = "https://fapello.com/model/" example = "https://fapello.com/model/"
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.root = text.root_from_url(match.group(0))
self.model = match.group(1) self.model = match.group(1)
def items(self): def items(self):
num = 1 num = 1
data = {"_extractor": FapelloPostExtractor} data = {"_extractor": FapelloPostExtractor}
while True: while True:
url = "https://fapello.com/ajax/model/{}/page-{}/".format( url = "{}/ajax/model/{}/page-{}/".format(
self.model, num) self.root, self.model, num)
page = self.request(url).text page = self.request(url).text
if not page: if not page:
return return
for url in text.extract_iter(page, '<a href="', '"'): for url in text.extract_iter(page, '<a href="', '"'):
if url == "javascript:void(0);":
continue
yield Message.Queue, url, data yield Message.Queue, url, data
num += 1 num += 1
@ -77,13 +82,14 @@ class FapelloPathExtractor(Extractor):
"""Extractor for models and posts from fapello.com paths""" """Extractor for models and posts from fapello.com paths"""
category = "fapello" category = "fapello"
subcategory = "path" subcategory = "path"
pattern = (r"(?:https?://)?(?:www\.)?fapello\.com" pattern = (BASE_PATTERN +
r"/(?!search/?$)(top-(?:likes|followers)|videos|trending" r"/(?!search/?$)(top-(?:likes|followers)|videos|trending"
r"|popular_videos/[^/?#]+)/?$") r"|popular_videos/[^/?#]+)/?$")
example = "https://fapello.com/trending/" example = "https://fapello.com/trending/"
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.root = text.root_from_url(match.group(0))
self.path = match.group(1) self.path = match.group(1)
def items(self): def items(self):
@ -93,9 +99,14 @@ class FapelloPathExtractor(Extractor):
else: else:
data = {"_extractor": FapelloPostExtractor} data = {"_extractor": FapelloPostExtractor}
if "fapello.su" in self.root:
self.path = self.path.replace("-", "/")
if self.path == "trending":
data = {"_extractor": FapelloModelExtractor}
while True: while True:
page = self.request("https://fapello.com/ajax/{}/page-{}/".format( page = self.request("{}/ajax/{}/page-{}/".format(
self.path, num)).text self.root, self.path, num)).text
if not page: if not page:
return return

@ -24,6 +24,8 @@ class FoolfuukaExtractor(BaseExtractor):
BaseExtractor.__init__(self, match) BaseExtractor.__init__(self, match)
if self.category == "b4k": if self.category == "b4k":
self.remote = self._remote_direct self.remote = self._remote_direct
elif self.category == "archivedmoe":
self.referer = False
def items(self): def items(self):
yield Message.Directory, self.metadata() yield Message.Directory, self.metadata()
@ -53,9 +55,12 @@ class FoolfuukaExtractor(BaseExtractor):
def remote(self, media): def remote(self, media):
"""Resolve a remote media link""" """Resolve a remote media link"""
-        needle = '<meta http-equiv="Refresh" content="0; url='
        page = self.request(media["remote_media_link"]).text
-        return text.extr(page, needle, '"')
+        url = text.extr(page, 'http-equiv="Refresh" content="0; url=', '"')
+        if url.endswith(".webm") and \
+                url.startswith("https://thebarchive.com/"):
+            return url[:-1]
+        return url
@staticmethod @staticmethod
def _remote_direct(media): def _remote_direct(media):
@ -169,7 +174,7 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
directory_fmt = ("{category}", "search", "{search}") directory_fmt = ("{category}", "search", "{search}")
pattern = BASE_PATTERN + r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)" pattern = BASE_PATTERN + r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)"
example = "https://archived.moe/_/search/text/QUERY/" example = "https://archived.moe/_/search/text/QUERY/"
request_interval = 1.0 request_interval = (0.5, 1.5)
def __init__(self, match): def __init__(self, match):
FoolfuukaExtractor.__init__(self, match) FoolfuukaExtractor.__init__(self, match)

@ -38,10 +38,6 @@ class FoolslideExtractor(BaseExtractor):
BASE_PATTERN = FoolslideExtractor.update({ BASE_PATTERN = FoolslideExtractor.update({
"powermanga": {
"root": "https://read.powermanga.org",
"pattern": r"read(?:er)?\.powermanga\.org",
},
}) })

@ -22,7 +22,7 @@ class FuskatorGalleryExtractor(GalleryExtractor):
def __init__(self, match): def __init__(self, match):
self.gallery_hash = match.group(1) self.gallery_hash = match.group(1)
url = "{}/thumbs/{}/".format(self.root, self.gallery_hash) url = "{}/thumbs/{}/index.html".format(self.root, self.gallery_hash)
GalleryExtractor.__init__(self, match, url) GalleryExtractor.__init__(self, match, url)
def metadata(self, page): def metadata(self, page):
@ -50,15 +50,16 @@ class FuskatorGalleryExtractor(GalleryExtractor):
"gallery_id" : text.parse_int(gallery_id), "gallery_id" : text.parse_int(gallery_id),
"gallery_hash": self.gallery_hash, "gallery_hash": self.gallery_hash,
"title" : text.unescape(title[:-15]), "title" : text.unescape(title[:-15]),
"views" : data["hits"], "views" : data.get("hits"),
"score" : data["rating"], "score" : data.get("rating"),
"tags" : data["tags"].split(","), "tags" : (data.get("tags") or "").split(","),
"count" : len(data["images"]),
} }
def images(self, page): def images(self, page):
for image in self.data["images"]: return [
yield "https:" + image["imageUrl"], image ("https:" + image["imageUrl"], image)
for image in self.data["images"]
]
class FuskatorSearchExtractor(Extractor): class FuskatorSearchExtractor(Extractor):

@ -23,7 +23,7 @@ class GelbooruBase():
root = "https://gelbooru.com" root = "https://gelbooru.com"
offset = 0 offset = 0
def _api_request(self, params, key="post"): def _api_request(self, params, key="post", log=False):
if "s" not in params: if "s" not in params:
params["s"] = "post" params["s"] = "post"
params["api_key"] = self.api_key params["api_key"] = self.api_key
@ -32,10 +32,14 @@ class GelbooruBase():
url = self.root + "/index.php?page=dapi&q=index&json=1" url = self.root + "/index.php?page=dapi&q=index&json=1"
data = self.request(url, params=params).json() data = self.request(url, params=params).json()
-        if key not in data:
-            return ()
-        posts = data[key]
+        try:
+            posts = data[key]
+        except KeyError:
+            if log:
+                self.log.error("Incomplete API response (missing '%s')", key)
+                self.log.debug("%s", data)
+            return []

        if not isinstance(posts, list):
            return (posts,)
        return posts
@ -114,7 +118,7 @@ class GelbooruBase():
class GelbooruTagExtractor(GelbooruBase, class GelbooruTagExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02TagExtractor): gelbooru_v02.GelbooruV02TagExtractor):
"""Extractor for images from gelbooru.com based on search-tags""" """Extractor for images from gelbooru.com based on search-tags"""
pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]+)" pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]*)"
example = "https://gelbooru.com/index.php?page=post&s=list&tags=TAG" example = "https://gelbooru.com/index.php?page=post&s=list&tags=TAG"
@ -165,15 +169,16 @@ class GelbooruFavoriteExtractor(GelbooruBase,
"id" : self.favorite_id, "id" : self.favorite_id,
"limit": "1", "limit": "1",
} }
count = self._api_request(params, "@attributes")[0]["count"]
count = self._api_request(params, "@attributes", True)[0]["count"]
if count <= self.offset: if count <= self.offset:
return return
pnum, last = divmod(count + 1, self.per_page)
if self.offset >= last: pnum, last = divmod(count-1, self.per_page)
if self.offset > last:
# page number change
self.offset -= last self.offset -= last
diff, self.offset = divmod(self.offset, self.per_page) diff, self.offset = divmod(self.offset-1, self.per_page)
pnum -= diff + 1 pnum -= diff + 1
skip = self.offset skip = self.offset
@ -182,9 +187,9 @@ class GelbooruFavoriteExtractor(GelbooruBase,
params["limit"] = self.per_page params["limit"] = self.per_page
while True: while True:
favs = self._api_request(params, "favorite") favs = self._api_request(params, "favorite", True)
favs.reverse() favs.reverse()
if skip: if skip:
favs = favs[skip:] favs = favs[skip:]
skip = 0 skip = 0
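
The reworked offset handling above is easier to follow outside the extractor. A minimal sketch of the same arithmetic, assuming a fixed page size; the function name, the `per_page` default, and the return convention are illustrative and not part of the change:

```python
def locate_start_page(count, offset, per_page=100):
    """Sketch of the favorites offset/page arithmetic above (assumed names).

    The API returns favorites in fixed-size pages; the extractor walks them
    from the newest page backwards and reverses each page, so a user-supplied
    offset has to be translated into "first page to request" plus "items to
    skip on that page".
    """
    if count <= offset:
        return None  # nothing left to yield

    # index of the newest (partially filled) page and its item count - 1
    pnum, last = divmod(count - 1, per_page)
    if offset > last:
        # the offset reaches past the partial page: drop whole pages
        offset -= last
        diff, offset = divmod(offset - 1, per_page)
        pnum -= diff + 1

    return pnum, offset
```

With `count=203`, `per_page=100`, and `offset=10`, for instance, this yields page 1 with 7 items skipped, matching the corrected `divmod(count-1, ...)` / `offset > last` logic in the diff.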

@ -22,14 +22,10 @@ class GelbooruV02Extractor(booru.BooruExtractor):
def _init(self): def _init(self):
self.api_key = self.config("api-key") self.api_key = self.config("api-key")
self.user_id = self.config("user-id") self.user_id = self.config("user-id")
-        try:
-            self.api_root = INSTANCES[self.category]["api_root"]
-        except KeyError:
-            self.api_root = self.root
+        self.api_root = self.config_instance("api_root") or self.root

        if self.category == "realbooru":
-            self.items = self._items_realbooru
+            self._file_url = self._file_url_realbooru
            self._tags = self._tags_realbooru
def _api_request(self, params): def _api_request(self, params):
@ -128,28 +124,6 @@ class GelbooruV02Extractor(booru.BooruExtractor):
self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2]) self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
return url return url
def _items_realbooru(self):
from .common import Message
data = self.metadata()
for post in self.posts():
try:
html = self._html(post)
url = post["file_url"] = text.rextract(
html, 'href="', '"', html.index(">Original<"))[0]
except Exception:
self.log.debug("Unable to fetch download URL for post %s "
"(md5: %s)", post.get("id"), post.get("md5"))
continue
text.nameext_from_url(url, post)
post.update(data)
self._prepare(post)
self._tags(post, html)
yield Message.Directory, post
yield Message.Url, url, post
def _tags_realbooru(self, post, page): def _tags_realbooru(self, post, page):
tag_container = text.extr(page, 'id="tagLink"', '</div>') tag_container = text.extr(page, 'id="tagLink"', '</div>')
tags = collections.defaultdict(list) tags = collections.defaultdict(list)
@ -161,14 +135,14 @@ class GelbooruV02Extractor(booru.BooruExtractor):
post["tags_" + key] = " ".join(value) post["tags_" + key] = " ".join(value)
-INSTANCES = {
+BASE_PATTERN = GelbooruV02Extractor.update({
"realbooru": { "realbooru": {
"root": "https://realbooru.com", "root": "https://realbooru.com",
"pattern": r"realbooru\.com", "pattern": r"realbooru\.com",
}, },
"rule34": { "rule34": {
"root": "https://rule34.xxx", "root": "https://rule34.xxx",
"pattern": r"rule34\.xxx", "pattern": r"(?:www\.)?rule34\.xxx",
"api_root": "https://api.rule34.xxx", "api_root": "https://api.rule34.xxx",
}, },
"safebooru": { "safebooru": {
@ -187,16 +161,14 @@ INSTANCES = {
"root": "https://xbooru.com", "root": "https://xbooru.com",
"pattern": r"xbooru\.com", "pattern": r"xbooru\.com",
}, },
-}
+})

-BASE_PATTERN = GelbooruV02Extractor.update(INSTANCES)
class GelbooruV02TagExtractor(GelbooruV02Extractor): class GelbooruV02TagExtractor(GelbooruV02Extractor):
subcategory = "tag" subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}") directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}" archive_fmt = "t_{search_tags}_{id}"
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)" pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]*)"
example = "https://safebooru.org/index.php?page=post&s=list&tags=TAG" example = "https://safebooru.org/index.php?page=post&s=list&tags=TAG"
def __init__(self, match): def __init__(self, match):
@ -208,6 +180,8 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor):
return {"search_tags": self.tags} return {"search_tags": self.tags}
def posts(self): def posts(self):
if self.tags == "all":
self.tags = ""
return self._pagination({"tags": self.tags}) return self._pagination({"tags": self.tags})

@ -73,7 +73,7 @@ class GofileFolderExtractor(Extractor):
def _get_website_token(self): def _get_website_token(self):
self.log.debug("Fetching website token") self.log.debug("Fetching website token")
page = self.request(self.root + "/dist/js/alljs.js").text page = self.request(self.root + "/dist/js/alljs.js").text
return text.extr(page, 'fetchData.websiteToken = "', '"') return text.extr(page, 'fetchData.wt = "', '"')
def _get_content(self, content_id, password=None): def _get_content(self, content_id, password=None):
if password is not None: if password is not None:
@ -81,7 +81,7 @@ class GofileFolderExtractor(Extractor):
return self._api_request("getContent", { return self._api_request("getContent", {
"contentId" : content_id, "contentId" : content_id,
"token" : self.api_token, "token" : self.api_token,
"websiteToken": self.website_token, "wt" : self.website_token,
"password" : password, "password" : password,
}) })
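
The gofile change above only renames the token plumbing: the front-end script now exposes `fetchData.wt`, and `getContent` expects the value under `wt`. A rough standalone sketch of that flow; the `api.gofile.io` base URL and the helper names are assumptions for illustration, not taken from the diff:

```python
import re
import requests

ROOT = "https://gofile.io"
API = "https://api.gofile.io"  # assumed API base for this sketch

def fetch_website_token(session):
    # scrape the token that the site's own JavaScript embeds as fetchData.wt
    js = session.get(ROOT + "/dist/js/alljs.js").text
    match = re.search(r'fetchData\.wt\s*=\s*"([^"]+)"', js)
    return match.group(1) if match else None

def get_content(session, content_id, api_token, password=None):
    params = {
        "contentId": content_id,
        "token"    : api_token,
        "wt"       : fetch_website_token(session),  # was "websiteToken"
        "password" : password,
    }
    return session.get(API + "/getContent", params=params).json()
```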

@ -0,0 +1,167 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://hatenablog.com"""
import re
from .common import Extractor, Message
from .. import text
BASE_PATTERN = (
r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?"
r"([\w-]+\.(?:hatenablog\.(?:com|jp)"
r"|hatenadiary\.com|hateblo\.jp)))"
)
QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$"
class HatenablogExtractor(Extractor):
"""Base class for HatenaBlog extractors"""
category = "hatenablog"
directory_fmt = ("{category}", "{domain}")
filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}"
archive_fmt = "{filename}"
def __init__(self, match):
Extractor.__init__(self, match)
self.domain = match.group(1) or match.group(2)
def _init(self):
self._find_img = re.compile(r'<img +([^>]+)').finditer
def _handle_article(self, article: str):
extr = text.extract_from(article)
date = text.parse_datetime(extr('<time datetime="', '"'))
entry_link = text.unescape(extr('<a href="', '"'))
entry = entry_link.partition("/entry/")[2]
title = text.unescape(extr('>', '<'))
content = extr(
'<div class="entry-content hatenablog-entry">', '</div>')
images = []
for i in self._find_img(content):
attributes = i.group(1)
if 'class="hatena-fotolife"' not in attributes:
continue
image = text.unescape(text.extr(attributes, 'src="', '"'))
images.append(image)
data = {
"domain": self.domain,
"date": date,
"entry": entry,
"title": title,
"count": len(images),
}
yield Message.Directory, data
for data["num"], url in enumerate(images, 1):
yield Message.Url, url, text.nameext_from_url(url, data)
class HatenablogEntriesExtractor(HatenablogExtractor):
"""Base class for a list of entries"""
allowed_parameters = ()
def __init__(self, match):
HatenablogExtractor.__init__(self, match)
self.path = match.group(3)
self.query = {key: value for key, value in text.parse_query(
match.group(4)).items() if self._acceptable_query(key)}
def _init(self):
HatenablogExtractor._init(self)
self._find_pager_url = re.compile(
r' class="pager-next">\s*<a href="([^"]+)').search
def items(self):
url = "https://" + self.domain + self.path
query = self.query
while url:
page = self.request(url, params=query).text
extr = text.extract_from(page)
attributes = extr('<body ', '>')
if "page-archive" in attributes:
yield from self._handle_partial_articles(extr)
else:
yield from self._handle_full_articles(extr)
match = self._find_pager_url(page)
url = text.unescape(match.group(1)) if match else None
query = None
def _handle_partial_articles(self, extr):
while True:
section = extr('<section class="archive-entry', '</section>')
if not section:
break
url = "hatenablog:" + text.unescape(text.extr(
section, '<a class="entry-title-link" href="', '"'))
data = {"_extractor": HatenablogEntryExtractor}
yield Message.Queue, url, data
def _handle_full_articles(self, extr):
while True:
attributes = extr('<article ', '>')
if not attributes:
break
if "no-entry" in attributes:
continue
article = extr('', '</article>')
yield from self._handle_article(article)
def _acceptable_query(self, key):
return key == "page" or key in self.allowed_parameters
class HatenablogEntryExtractor(HatenablogExtractor):
"""Extractor for a single entry URL"""
subcategory = "entry"
pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE
example = "https://BLOG.hatenablog.com/entry/PATH"
def __init__(self, match):
HatenablogExtractor.__init__(self, match)
self.path = match.group(3)
def items(self):
url = "https://" + self.domain + "/entry/" + self.path
page = self.request(url).text
extr = text.extract_from(page)
while True:
attributes = extr('<article ', '>')
if "no-entry" in attributes:
continue
article = extr('', '</article>')
return self._handle_article(article)
class HatenablogHomeExtractor(HatenablogEntriesExtractor):
"""Extractor for a blog's home page"""
subcategory = "home"
pattern = BASE_PATTERN + r"(/?)" + QUERY_RE
example = "https://BLOG.hatenablog.com"
class HatenablogArchiveExtractor(HatenablogEntriesExtractor):
"""Extractor for a blog's archive page"""
subcategory = "archive"
pattern = (BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?"
r"|/category/[^?#]+)?)" + QUERY_RE)
example = "https://BLOG.hatenablog.com/archive/2024"
class HatenablogSearchExtractor(HatenablogEntriesExtractor):
"""Extractor for a blog's search results"""
subcategory = "search"
pattern = BASE_PATTERN + r"(/search)" + QUERY_RE
example = "https://BLOG.hatenablog.com/search?q=QUERY"
allowed_parameters = ("q",)

@ -1,92 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://www.hbrowse.com/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text, util, exception
class HbrowseBase():
"""Base class for hbrowse extractors"""
category = "hbrowse"
root = "https://www.hbrowse.com"
def parse_page(self, page, data):
"""Parse metadata on 'page' and add it to 'data'"""
data, pos = text.extract_all(page, (
('manga' , '<td class="listLong">', '</td>'),
('artist', '<td class="listLong">', '</td>'),
('total' , '<td class="listLong">', ' '),
('origin', '<td class="listLong">', '</td>'),
), values=data)
if not data["manga"] and "<b>Warning</b>" in page:
msg = page.rpartition(">")[2].strip()
raise exception.StopExtraction("Site is not accessible: '%s'", msg)
tags = text.extract(page, 'class="listTable"', '</table>', pos)[0]
data["manga"] = text.unescape(data["manga"])
data["total"] = text.parse_int(data["total"])
data["artist"] = text.remove_html(data["artist"])
data["origin"] = text.remove_html(data["origin"])
data["tags"] = list(text.extract_iter(tags, 'href="/browse/', '"'))
return data
class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor):
"""Extractor for manga-chapters from hbrowse.com"""
directory_fmt = ("{category}", "{manga_id} {manga}", "c{chapter:>05}")
filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
"{page:>03}.{extension}")
archive_fmt = "{manga_id}_{chapter}_{page}"
pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))"
example = "https://www.hbrowse.com/12345/c00000"
def __init__(self, match):
self.path, self.gid, self.chapter = match.groups()
self.path += "/"
ChapterExtractor.__init__(self, match)
def metadata(self, page):
return self.parse_page(page, {
"manga_id": text.parse_int(self.gid),
"chapter": text.parse_int(self.chapter)
})
def images(self, page):
base = self.root + "/data" + self.path
json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
return [(base + name, None) for name in util.json_loads(json_data)]
class HbrowseMangaExtractor(HbrowseBase, MangaExtractor):
"""Extractor for manga from hbrowse.com"""
chapterclass = HbrowseChapterExtractor
reverse = False
pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$"
example = "https://www.hbrowse.com/12345"
def chapters(self, page):
results = []
data = self.parse_page(page, {
"manga_id": text.parse_int(
self.manga_url.rstrip("/").rpartition("/")[2])
})
pos = 0
needle = '<td class="listMiddle">\n<a class="listLink" href="'
while True:
url, pos = text.extract(page, needle, '"', pos)
if not url:
return results
title, pos = text.extract(page, '>View ', '<', pos)
data["chapter"] = text.parse_int(url.rpartition("/")[2][1:])
data["title"] = title
results.append((text.urljoin(self.root, url), data.copy()))

@ -42,7 +42,7 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor):
def images(self, page): def images(self, page):
return [ return [
(url, None) (url.replace("http:", "https:", 1), None)
for url in text.extract_iter( for url in text.extract_iter(
page, '<amp-img class="auto-style" src="', '"') page, '<amp-img class="auto-style" src="', '"')
] ]

@ -72,13 +72,11 @@ class HentaifoundryExtractor(Extractor):
extr = text.extract_from(page, page.index('id="picBox"')) extr = text.extract_from(page, page.index('id="picBox"'))
data = { data = {
"index" : text.parse_int(path.rsplit("/", 2)[1]),
"title" : text.unescape(extr('class="imageTitle">', '<')), "title" : text.unescape(extr('class="imageTitle">', '<')),
"artist" : text.unescape(extr('/profile">', '<')), "artist" : text.unescape(extr('/profile">', '<')),
"width" : text.parse_int(extr('width="', '"')), "_body" : extr(
"height" : text.parse_int(extr('height="', '"')), '<div class="boxbody"', '<div class="boxfooter"'),
"index" : text.parse_int(path.rsplit("/", 2)[1]),
"src" : text.urljoin(self.root, text.unescape(extr(
'src="', '"'))),
"description": text.unescape(text.remove_html(extr( "description": text.unescape(text.remove_html(extr(
'>Description</div>', '</section>') '>Description</div>', '</section>')
.replace("\r\n", "\n"), "", "")), .replace("\r\n", "\n"), "", "")),
@ -92,6 +90,20 @@ class HentaifoundryExtractor(Extractor):
">Tags </span>", "</div>")), ">Tags </span>", "</div>")),
} }
body = data["_body"]
if "<object " in body:
data["src"] = text.urljoin(self.root, text.unescape(text.extr(
body, 'name="movie" value="', '"')))
data["width"] = text.parse_int(text.extr(
body, "name='width' value='", "'"))
data["height"] = text.parse_int(text.extr(
body, "name='height' value='", "'"))
else:
data["src"] = text.urljoin(self.root, text.unescape(text.extr(
body, 'src="', '"')))
data["width"] = text.parse_int(text.extr(body, 'width="', '"'))
data["height"] = text.parse_int(text.extr(body, 'height="', '"'))
return text.nameext_from_url(data["src"], data) return text.nameext_from_url(data["src"], data)
def _parse_story(self, html): def _parse_story(self, html):
@ -121,9 +133,25 @@ class HentaifoundryExtractor(Extractor):
return text.nameext_from_url(data["src"], data) return text.nameext_from_url(data["src"], data)
-    def _init_site_filters(self):
+    def _request_check(self, url, **kwargs):
+        self.request = self._request_original

+        # check for Enter button / front page
+        # and update PHPSESSID and content filters if necessary
+        response = self.request(url, **kwargs)
+        content = response.content
+        if len(content) < 5000 and \
+                b'<div id="entryButtonContainer"' in content:
+            self._init_site_filters(False)
+            response = self.request(url, **kwargs)
+        return response

+    def _init_site_filters(self, check_cookies=True):
        """Set site-internal filters to show all images"""
-        if self.cookies.get("PHPSESSID", domain=self.cookies_domain):
+        if check_cookies and self.cookies.get(
+                "PHPSESSID", domain=self.cookies_domain):
+            self._request_original = self.request
+            self.request = self._request_check
            return

        url = self.root + "/?enterAgree=1"

@ -30,10 +30,10 @@ class HiperdexBase():
extr = text.extract_from(page) extr = text.extract_from(page)
return { return {
"manga" : text.unescape(extr(
"<title>", "<").rpartition(" Manga - ")[0].strip()),
"url" : text.unescape(extr( "url" : text.unescape(extr(
'property="og:url" content="', '"')), 'property="og:url" content="', '"')),
"manga" : text.unescape(extr(
'"headline": "', '"')),
"score" : text.parse_float(extr( "score" : text.parse_float(extr(
'id="averagerate">', '<')), 'id="averagerate">', '<')),
"author" : text.remove_html(extr( "author" : text.remove_html(extr(

@ -21,7 +21,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
category = "hitomi" category = "hitomi"
root = "https://hitomi.la" root = "https://hitomi.la"
pattern = (r"(?:https?://)?hitomi\.la" pattern = (r"(?:https?://)?hitomi\.la"
r"/(?:manga|doujinshi|cg|gamecg|galleries|reader)" r"/(?:manga|doujinshi|cg|gamecg|imageset|galleries|reader)"
r"/(?:[^/?#]+-)?(\d+)") r"/(?:[^/?#]+-)?(\d+)")
example = "https://hitomi.la/manga/TITLE-867789.html" example = "https://hitomi.la/manga/TITLE-867789.html"

@ -15,14 +15,17 @@ from .. import text, util, exception
import collections import collections
import re import re
BASE_PATTERN = r"(?:https?://)?idol\.sankakucomplex\.com(?:/[a-z]{2})?"
class IdolcomplexExtractor(SankakuExtractor): class IdolcomplexExtractor(SankakuExtractor):
"""Base class for idolcomplex extractors""" """Base class for idolcomplex extractors"""
category = "idolcomplex" category = "idolcomplex"
root = "https://idol.sankakucomplex.com"
cookies_domain = "idol.sankakucomplex.com" cookies_domain = "idol.sankakucomplex.com"
cookies_names = ("login", "pass_hash") cookies_names = ("_idolcomplex_session",)
root = "https://" + cookies_domain referer = False
request_interval = 5.0 request_interval = (3.0, 6.0)
def __init__(self, match): def __init__(self, match):
SankakuExtractor.__init__(self, match) SankakuExtractor.__init__(self, match)
@ -31,14 +34,19 @@ class IdolcomplexExtractor(SankakuExtractor):
self.start_post = 0 self.start_post = 0
def _init(self): def _init(self):
self.extags = self.config("tags", False) self.find_pids = re.compile(
r" href=[\"#]/\w\w/posts/([0-9a-f]+)"
).findall
self.find_tags = re.compile(
r'tag-type-([^"]+)">\s*<a [^>]*?href="/[^?]*\?tags=([^"]+)'
).findall
def items(self): def items(self):
self.login() self.login()
data = self.metadata() data = self.metadata()
for post_id in util.advance(self.post_ids(), self.start_post): for post_id in util.advance(self.post_ids(), self.start_post):
post = self._parse_post(post_id) post = self._extract_post(post_id)
url = post["file_url"] url = post["file_url"]
post.update(data) post.update(data)
text.nameext_from_url(url, post) text.nameext_from_url(url, post)
@ -62,67 +70,79 @@ class IdolcomplexExtractor(SankakuExtractor):
self.logged_in = False self.logged_in = False
-    @cache(maxage=90*24*3600, keyarg=1)
+    @cache(maxage=90*86400, keyarg=1)
    def _login_impl(self, username, password):
        self.log.info("Logging in as %s", username)

-        url = self.root + "/user/authenticate"
+        url = self.root + "/users/login"
+        page = self.request(url).text

+        headers = {
+            "Referer": url,
+        }
+        url = self.root + (text.extr(page, '<form action="', '"') or
+                           "/en/user/authenticate")
        data = {
+            "authenticity_token": text.unescape(text.extr(
+                page, 'name="authenticity_token" value="', '"')),
            "url"           : "",
            "user[name]"    : username,
            "user[password]": password,
            "commit"        : "Login",
        }
-        response = self.request(url, method="POST", data=data)
+        response = self.request(url, method="POST", headers=headers, data=data)

-        if not response.history or response.url != self.root + "/user/home":
+        if not response.history or response.url.endswith("/user/home"):
            raise exception.AuthenticationError()

-        cookies = response.history[0].cookies
-        return {c: cookies[c] for c in self.cookies_names}
+        return {c.name: c.value for c in response.history[0].cookies}
def _parse_post(self, post_id): def _extract_post(self, post_id):
"""Extract metadata of a single post""" url = self.root + "/posts/" + post_id
url = self.root + "/post/show/" + post_id
page = self.request(url, retries=10).text page = self.request(url, retries=10).text
extr = text.extract extr = text.extract_from(page)
tags , pos = extr(page, "<title>", " | ") tags = extr("<title>", " | ")
vavg , pos = extr(page, "itemprop=ratingValue>", "<", pos) vavg = extr('itemprop="ratingValue">', "<")
vcnt , pos = extr(page, "itemprop=reviewCount>", "<", pos) vcnt = extr('itemprop="reviewCount">', "<")
_ , pos = extr(page, "Posted: <", "", pos) pid = extr(">Post ID:", "<")
created, pos = extr(page, ' title="', '"', pos) created = extr(' title="', '"')
rating = extr(page, "<li>Rating: ", "<", pos)[0]
file_url, pos = extr(page, '<li>Original: <a href="', '"', pos) file_url = extr('>Original:', 'id=')
if file_url: if file_url:
width , pos = extr(page, '>', 'x', pos) file_url = extr(' href="', '"')
height, pos = extr(page, '', ' ', pos) width = extr(">", "x")
height = extr("", " ")
else: else:
width , pos = extr(page, '<object width=', ' ', pos) width = extr('<object width=', ' ')
height, pos = extr(page, 'height=', '>', pos) height = extr('height=', '>')
file_url = extr(page, '<embed src="', '"', pos)[0] file_url = extr('<embed src="', '"')
rating = extr(">Rating:", "<br")
data = { data = {
"id": text.parse_int(post_id), "id" : text.parse_int(pid),
"md5": file_url.rpartition("/")[2].partition(".")[0], "md5" : file_url.rpartition("/")[2].partition(".")[0],
"tags": text.unescape(tags), "tags" : text.unescape(tags),
"vote_average": text.parse_float(vavg), "vote_average": text.parse_float(vavg),
"vote_count": text.parse_int(vcnt), "vote_count" : text.parse_int(vcnt),
"created_at": created, "created_at" : created,
"rating": (rating or "?")[0].lower(), "date" : text.parse_datetime(
"file_url": "https:" + text.unescape(file_url), created, "%Y-%m-%d %H:%M:%S.%f"),
"width": text.parse_int(width), "rating" : text.remove_html(rating).lower(),
"height": text.parse_int(height), "file_url" : "https:" + text.unescape(file_url),
"width" : text.parse_int(width),
"height" : text.parse_int(height),
} }
if self.extags: tags = collections.defaultdict(list)
tags = collections.defaultdict(list) tags_list = []
tags_html = text.extr(page, '<ul id=tag-sidebar>', '</ul>') tags_html = text.extr(page, '<ul id="tag-sidebar"', '</ul>')
pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)') for tag_type, tag_name in self.find_tags(tags_html or ""):
for tag_type, tag_name in pattern.findall(tags_html or ""): tags[tag_type].append(text.unquote(tag_name))
tags[tag_type].append(text.unquote(tag_name)) for key, value in tags.items():
for key, value in tags.items(): data["tags_" + key] = " ".join(value)
data["tags_" + key] = " ".join(value) tags_list += value
data["tags"] = " ".join(tags_list)
return data return data
@ -132,8 +152,8 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
subcategory = "tag" subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}") directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}" archive_fmt = "t_{search_tags}_{id}"
pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)" pattern = BASE_PATTERN + r"/(?:posts/?)?\?([^#]*)"
example = "https://idol.sankakucomplex.com/?tags=TAGS" example = "https://idol.sankakucomplex.com/en/posts?tags=TAGS"
per_page = 20 per_page = 20
def __init__(self, match): def __init__(self, match):
@ -177,15 +197,17 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
while True: while True:
page = self.request(self.root, params=params, retries=10).text page = self.request(self.root, params=params, retries=10).text
pos = page.find("<div id=more-popular-posts-link>") + 1 pos = ((page.find('id="more-popular-posts-link"') + 1) or
yield from text.extract_iter(page, '" id=p', '>', pos) (page.find('<span class="thumb') + 1))
yield from self.find_pids(page, pos)
next_url = text.extract(page, 'next-page-url="', '"', pos)[0] next_url = text.extract(page, 'next-page-url="', '"', pos)[0]
if not next_url: if not next_url:
return return
next_params = text.parse_query(text.unescape( next_params = text.parse_query(text.unescape(text.unescape(
next_url).lstrip("?/")) next_url).lstrip("?/")))
if "next" in next_params: if "next" in next_params:
# stop if the same "next" value occurs twice in a row (#265) # stop if the same "next" value occurs twice in a row (#265)
@ -200,8 +222,8 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor):
subcategory = "pool" subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool}") directory_fmt = ("{category}", "pool", "{pool}")
archive_fmt = "p_{pool}_{id}" archive_fmt = "p_{pool}_{id}"
pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pool/show/(\d+)" pattern = BASE_PATTERN + r"/pools?/show/(\d+)"
example = "https://idol.sankakucomplex.com/pool/show/12345" example = "https://idol.sankakucomplex.com/pools/show/12345"
per_page = 24 per_page = 24
def __init__(self, match): def __init__(self, match):
@ -218,15 +240,16 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor):
return {"pool": self.pool_id} return {"pool": self.pool_id}
def post_ids(self): def post_ids(self):
url = self.root + "/pool/show/" + self.pool_id url = self.root + "/pools/show/" + self.pool_id
params = {"page": self.start_page} params = {"page": self.start_page}
while True: while True:
page = self.request(url, params=params, retries=10).text page = self.request(url, params=params, retries=10).text
ids = list(text.extract_iter(page, '" id=p', '>')) pos = page.find('id="pool-show"') + 1
post_ids = self.find_pids(page, pos)
yield from ids yield from post_ids
if len(ids) < self.per_page: if len(post_ids) < self.per_page:
return return
params["page"] += 1 params["page"] += 1
@ -235,8 +258,8 @@ class IdolcomplexPostExtractor(IdolcomplexExtractor):
"""Extractor for single images from idol.sankakucomplex.com""" """Extractor for single images from idol.sankakucomplex.com"""
subcategory = "post" subcategory = "post"
archive_fmt = "{id}" archive_fmt = "{id}"
pattern = r"(?:https?://)?idol\.sankakucomplex\.com/post/show/(\d+)" pattern = BASE_PATTERN + r"/posts?/(?:show/)?([0-9a-f]+)"
example = "https://idol.sankakucomplex.com/post/show/12345" example = "https://idol.sankakucomplex.com/posts/0123456789abcdef"
def __init__(self, match): def __init__(self, match):
IdolcomplexExtractor.__init__(self, match) IdolcomplexExtractor.__init__(self, match)

@ -44,7 +44,7 @@ class ImagechestGalleryExtractor(GalleryExtractor):
} }
def images(self, page): def images(self, page):
if " More Files</button>" in page: if ' load-all">' in page:
url = "{}/p/{}/loadAll".format(self.root, self.gallery_id) url = "{}/p/{}/loadAll".format(self.root, self.gallery_id)
headers = { headers = {
"X-Requested-With": "XMLHttpRequest", "X-Requested-With": "XMLHttpRequest",

@ -126,14 +126,15 @@ class ImagefapImageExtractor(ImagefapExtractor):
url = "{}/photo/{}/".format(self.root, self.image_id) url = "{}/photo/{}/".format(self.root, self.image_id)
page = self.request(url).text page = self.request(url).text
+        url, pos = text.extract(
+            page, 'original="', '"')
        info, pos = text.extract(
-            page, '<script type="application/ld+json">', '</script>')
+            page, '<script type="application/ld+json">', '</script>', pos)
        image_id, pos = text.extract(
            page, 'id="imageid_input" value="', '"', pos)
        gallery_id, pos = text.extract(
            page, 'id="galleryid_input" value="', '"', pos)
        info = util.json_loads(info)
-        url = info["contentUrl"]
return url, text.nameext_from_url(url, { return url, text.nameext_from_url(url, {
"title": text.unescape(info["name"]), "title": text.unescape(info["name"]),

@ -64,7 +64,7 @@ class ImgbbExtractor(Extractor):
if username: if username:
self.cookies_update(self._login_impl(username, password)) self.cookies_update(self._login_impl(username, password))
@cache(maxage=360*24*3600, keyarg=1) @cache(maxage=365*86400, keyarg=1)
def _login_impl(self, username, password): def _login_impl(self, username, password):
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)
@ -84,6 +84,13 @@ class ImgbbExtractor(Extractor):
raise exception.AuthenticationError() raise exception.AuthenticationError()
return self.cookies return self.cookies
def _extract_resource(self, page):
return util.json_loads(text.extr(
page, "CHV.obj.resource=", "};") + "}")
def _extract_user(self, page):
return self._extract_resource(page).get("user") or {}
def _pagination(self, page, endpoint, params): def _pagination(self, page, endpoint, params):
data = None data = None
seek, pos = text.extract(page, 'data-seek="', '"') seek, pos = text.extract(page, 'data-seek="', '"')
@ -99,7 +106,7 @@ class ImgbbExtractor(Extractor):
for img in text.extract_iter(page, "data-object='", "'"): for img in text.extract_iter(page, "data-object='", "'"):
yield util.json_loads(text.unquote(img)) yield util.json_loads(text.unquote(img))
if data: if data:
if params["seek"] == data["seekEnd"]: if not data["seekEnd"] or params["seek"] == data["seekEnd"]:
return return
params["seek"] = data["seekEnd"] params["seek"] = data["seekEnd"]
params["page"] += 1 params["page"] += 1
@ -124,12 +131,14 @@ class ImgbbAlbumExtractor(ImgbbExtractor):
self.page_url = "https://ibb.co/album/" + self.album_id self.page_url = "https://ibb.co/album/" + self.album_id
def metadata(self, page): def metadata(self, page):
-        album, pos = text.extract(page, '"og:title" content="', '"')
-        user , pos = text.extract(page, 'rel="author">', '<', pos)
+        album = text.extr(page, '"og:title" content="', '"')
+        user = self._extract_user(page)
        return {
            "album_id"   : self.album_id,
            "album_name" : text.unescape(album),
-            "user"       : user.lower() if user else "",
+            "user"       : user.get("username") or "",
+            "user_id"    : user.get("id") or "",
+            "displayname": user.get("name") or "",
        }
def images(self, page): def images(self, page):
@ -158,7 +167,12 @@ class ImgbbUserExtractor(ImgbbExtractor):
self.page_url = "https://{}.imgbb.com/".format(self.user) self.page_url = "https://{}.imgbb.com/".format(self.user)
def metadata(self, page): def metadata(self, page):
return {"user": self.user} user = self._extract_user(page)
return {
"user" : user.get("username") or self.user,
"user_id" : user.get("id") or "",
"displayname": user.get("name") or "",
}
def images(self, page): def images(self, page):
user = text.extr(page, '.obj.resource={"id":"', '"') user = text.extr(page, '.obj.resource={"id":"', '"')
@ -181,15 +195,20 @@ class ImgbbImageExtractor(ImgbbExtractor):
def items(self): def items(self):
url = "https://ibb.co/" + self.image_id url = "https://ibb.co/" + self.image_id
-        extr = text.extract_from(self.request(url).text)
+        page = self.request(url).text
+        extr = text.extract_from(page)
+        user = self._extract_user(page)
        image = {
            "id"    : self.image_id,
-            "title" : text.unescape(extr('"og:title" content="', '"')),
+            "title" : text.unescape(extr(
+                '"og:title" content="', ' hosted at ImgBB"')),
            "url"   : extr('"og:image" content="', '"'),
            "width" : text.parse_int(extr('"og:image:width" content="', '"')),
            "height": text.parse_int(extr('"og:image:height" content="', '"')),
-            "user"  : extr('rel="author">', '<').lower(),
+            "user"       : user.get("username") or "",
+            "user_id"    : user.get("id") or "",
+            "displayname": user.get("name") or "",
        }
image["extension"] = text.ext_from_url(image["url"]) image["extension"] = text.ext_from_url(image["url"])

@ -103,7 +103,8 @@ class InkbunnyPoolExtractor(InkbunnyExtractor):
subcategory = "pool" subcategory = "pool"
pattern = (BASE_PATTERN + r"/(?:" pattern = (BASE_PATTERN + r"/(?:"
r"poolview_process\.php\?pool_id=(\d+)|" r"poolview_process\.php\?pool_id=(\d+)|"
r"submissionsviewall\.php\?([^#]+&mode=pool&[^#]+))") r"submissionsviewall\.php"
r"\?((?:[^#]+&)?mode=pool(?:&[^#]+)?))")
example = "https://inkbunny.net/poolview_process.php?pool_id=12345" example = "https://inkbunny.net/poolview_process.php?pool_id=12345"
def __init__(self, match): def __init__(self, match):
@ -133,7 +134,8 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor):
subcategory = "favorite" subcategory = "favorite"
pattern = (BASE_PATTERN + r"/(?:" pattern = (BASE_PATTERN + r"/(?:"
r"userfavorites_process\.php\?favs_user_id=(\d+)|" r"userfavorites_process\.php\?favs_user_id=(\d+)|"
r"submissionsviewall\.php\?([^#]+&mode=userfavs&[^#]+))") r"submissionsviewall\.php"
r"\?((?:[^#]+&)?mode=userfavs(?:&[^#]+)?))")
example = ("https://inkbunny.net/userfavorites_process.php" example = ("https://inkbunny.net/userfavorites_process.php"
"?favs_user_id=12345") "?favs_user_id=12345")
@ -161,11 +163,31 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor):
return self.api.search(params) return self.api.search(params)
class InkbunnyUnreadExtractor(InkbunnyExtractor):
"""Extractor for unread inkbunny submissions"""
subcategory = "unread"
pattern = (BASE_PATTERN + r"/submissionsviewall\.php"
r"\?((?:[^#]+&)?mode=unreadsubs(?:&[^#]+)?)")
example = ("https://inkbunny.net/submissionsviewall.php"
"?text=&mode=unreadsubs&type=")
def __init__(self, match):
InkbunnyExtractor.__init__(self, match)
self.params = text.parse_query(match.group(1))
def posts(self):
params = self.params.copy()
params.pop("rid", None)
params.pop("mode", None)
params["unread_submissions"] = "yes"
return self.api.search(params)
class InkbunnySearchExtractor(InkbunnyExtractor): class InkbunnySearchExtractor(InkbunnyExtractor):
"""Extractor for inkbunny search results""" """Extractor for inkbunny search results"""
subcategory = "search" subcategory = "search"
-    pattern = (BASE_PATTERN +
-               r"/submissionsviewall\.php\?([^#]+&mode=search&[^#]+)")
+    pattern = (BASE_PATTERN + r"/submissionsviewall\.php"
+               r"\?((?:[^#]+&)?mode=search(?:&[^#]+)?)")
example = ("https://inkbunny.net/submissionsviewall.php" example = ("https://inkbunny.net/submissionsviewall.php"
"?text=TAG&mode=search&type=") "?text=TAG&mode=search&type=")
@ -201,7 +223,8 @@ class InkbunnyFollowingExtractor(InkbunnyExtractor):
subcategory = "following" subcategory = "following"
pattern = (BASE_PATTERN + r"/(?:" pattern = (BASE_PATTERN + r"/(?:"
r"watchlist_process\.php\?mode=watching&user_id=(\d+)|" r"watchlist_process\.php\?mode=watching&user_id=(\d+)|"
r"usersviewall\.php\?([^#]+&mode=watching&[^#]+))") r"usersviewall\.php"
r"\?((?:[^#]+&)?mode=watching(?:&[^#]+)?))")
example = ("https://inkbunny.net/watchlist_process.php" example = ("https://inkbunny.net/watchlist_process.php"
"?mode=watching&user_id=12345") "?mode=watching&user_id=12345")
@ -324,6 +347,9 @@ class InkbunnyAPI():
while True: while True:
data = self._call("search", params) data = self._call("search", params)
if not data["submissions"]:
return
yield from self.detail(data["submissions"]) yield from self.detail(data["submissions"])
if data["page"] >= data["pages_count"]: if data["page"] >= data["pages_count"]:
@ -334,7 +360,7 @@ class InkbunnyAPI():
params["page"] += 1 params["page"] += 1
@cache(maxage=360*24*3600, keyarg=1) @cache(maxage=365*86400, keyarg=1)
def _authenticate_impl(api, username, password): def _authenticate_impl(api, username, password):
api.extractor.log.info("Logging in as %s", username) api.extractor.log.info("Logging in as %s", username)

@ -217,9 +217,10 @@ class InstagramExtractor(Extractor):
data["post_shortcode"]) data["post_shortcode"])
continue continue
if "video_versions" in item: video_versions = item.get("video_versions")
if video_versions:
video = max( video = max(
item["video_versions"], video_versions,
key=lambda x: (x["width"], x["height"], x["type"]), key=lambda x: (x["width"], x["height"], x["type"]),
) )
media = video media = video
@ -710,7 +711,8 @@ class InstagramRestAPI():
def user_by_name(self, screen_name): def user_by_name(self, screen_name):
endpoint = "/v1/users/web_profile_info/" endpoint = "/v1/users/web_profile_info/"
params = {"username": screen_name} params = {"username": screen_name}
-        return self._call(endpoint, params=params)["data"]["user"]
+        return self._call(
+            endpoint, params=params, notfound="user")["data"]["user"]
@memcache(keyarg=1) @memcache(keyarg=1)
def user_by_id(self, user_id): def user_by_id(self, user_id):
@ -777,13 +779,15 @@ class InstagramRestAPI():
kwargs["headers"] = { kwargs["headers"] = {
"Accept" : "*/*", "Accept" : "*/*",
"X-CSRFToken" : extr.csrf_token, "X-CSRFToken" : extr.csrf_token,
"X-Instagram-AJAX": "1006242110",
"X-IG-App-ID" : "936619743392459", "X-IG-App-ID" : "936619743392459",
"X-ASBD-ID" : "198387", "X-ASBD-ID" : "129477",
"X-IG-WWW-Claim" : extr.www_claim, "X-IG-WWW-Claim" : extr.www_claim,
"X-Requested-With": "XMLHttpRequest", "X-Requested-With": "XMLHttpRequest",
"Alt-Used" : "www.instagram.com", "Connection" : "keep-alive",
"Referer" : extr.root + "/", "Referer" : extr.root + "/",
"Sec-Fetch-Dest" : "empty",
"Sec-Fetch-Mode" : "cors",
"Sec-Fetch-Site" : "same-origin",
} }
return extr.request(url, **kwargs).json() return extr.request(url, **kwargs).json()
@ -973,7 +977,7 @@ class InstagramGraphqlAPI():
variables["after"] = extr._update_cursor(info["end_cursor"]) variables["after"] = extr._update_cursor(info["end_cursor"])
@cache(maxage=90*24*3600, keyarg=1) @cache(maxage=90*86400, keyarg=1)
def _login_impl(extr, username, password): def _login_impl(extr, username, password):
extr.log.error("Login with username & password is no longer supported. " extr.log.error("Login with username & password is no longer supported. "
"Use browser cookies instead.") "Use browser cookies instead.")

@ -29,8 +29,9 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
example = "https://issuu.com/issuu/docs/TITLE/" example = "https://issuu.com/issuu/docs/TITLE/"
def metadata(self, page): def metadata(self, page):
pos = page.rindex('id="initial-data"')
data = util.json_loads(text.rextract( data = util.json_loads(text.rextract(
page, '<script data-json="', '"')[0].replace("&quot;", '"')) page, '<script data-json="', '"', pos)[0].replace("&quot;", '"'))
doc = data["initialDocumentData"]["document"] doc = data["initialDocumentData"]["document"]
doc["date"] = text.parse_datetime( doc["date"] = text.parse_datetime(

@ -1,105 +0,0 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://jpg1.su/"""
from .common import Extractor, Message
from .. import text
BASE_PATTERN = r"(?:https?://)?jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)"
class JpgfishExtractor(Extractor):
"""Base class for jpgfish extractors"""
category = "jpgfish"
root = "https://jpg1.su"
directory_fmt = ("{category}", "{user}", "{album}",)
archive_fmt = "{id}"
def _pagination(self, url):
while url:
page = self.request(url).text
for item in text.extract_iter(
page, '<div class="list-item-image ', 'image-container'):
yield text.extract(item, '<a href="', '"')[0]
url = text.extract(
page, '<a data-pagination="next" href="', '" ><')[0]
class JpgfishImageExtractor(JpgfishExtractor):
"""Extractor for jpgfish Images"""
subcategory = "image"
pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))"
example = "https://jpg1.su/img/TITLE.ID"
def __init__(self, match):
JpgfishExtractor.__init__(self, match)
self.path, self.image_id = match.groups()
def items(self):
url = "{}/img/{}".format(self.root, self.path)
extr = text.extract_from(self.request(url).text)
image = {
"id" : self.image_id,
"url" : extr('<meta property="og:image" content="', '"'),
"album": text.extract(extr(
"Added to <a", "/a>"), ">", "<")[0] or "",
"user" : extr('username: "', '"'),
}
text.nameext_from_url(image["url"], image)
yield Message.Directory, image
yield Message.Url, image["url"], image
class JpgfishAlbumExtractor(JpgfishExtractor):
"""Extractor for jpgfish Albums"""
subcategory = "album"
pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?"
example = "https://jpg1.su/album/TITLE.ID"
def __init__(self, match):
JpgfishExtractor.__init__(self, match)
self.album, self.sub_albums = match.groups()
def items(self):
url = "{}/a/{}".format(self.root, self.album)
data = {"_extractor": JpgfishImageExtractor}
if self.sub_albums:
albums = self._pagination(url + "/sub")
else:
albums = (url,)
for album in albums:
for image in self._pagination(album):
yield Message.Queue, image, data
class JpgfishUserExtractor(JpgfishExtractor):
"""Extractor for jpgfish Users"""
subcategory = "user"
pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?"
example = "https://jpg1.su/USER"
def __init__(self, match):
JpgfishExtractor.__init__(self, match)
self.user, self.albums = match.groups()
def items(self):
url = "{}/{}".format(self.root, self.user)
if self.albums:
url += "/albums"
data = {"_extractor": JpgfishAlbumExtractor}
else:
data = {"_extractor": JpgfishImageExtractor}
for url in self._pagination(url):
yield Message.Queue, url, data

@ -9,9 +9,10 @@
"""Extractors for https://kemono.party/""" """Extractors for https://kemono.party/"""
from .common import Extractor, Message from .common import Extractor, Message
-from .. import text, exception
-from ..cache import cache
+from .. import text, util, exception
+from ..cache import cache, memcache
import itertools
+import json
import re import re
BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)" BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)"
@ -24,7 +25,7 @@ class KemonopartyExtractor(Extractor):
category = "kemonoparty" category = "kemonoparty"
root = "https://kemono.party" root = "https://kemono.party"
directory_fmt = ("{category}", "{service}", "{user}") directory_fmt = ("{category}", "{service}", "{user}")
filename_fmt = "{id}_{title}_{num:>02}_{filename[:180]}.{extension}" filename_fmt = "{id}_{title[:180]}_{num:>02}_{filename[:180]}.{extension}"
archive_fmt = "{service}_{user}_{id}_{num}" archive_fmt = "{service}_{user}_{id}_{num}"
cookies_domain = ".kemono.party" cookies_domain = ".kemono.party"
@ -37,10 +38,16 @@ class KemonopartyExtractor(Extractor):
Extractor.__init__(self, match) Extractor.__init__(self, match)
def _init(self): def _init(self):
self.revisions = self.config("revisions")
if self.revisions:
self.revisions_unique = (self.revisions == "unique")
self._prepare_ddosguard_cookies() self._prepare_ddosguard_cookies()
self._find_inline = re.compile( self._find_inline = re.compile(
r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+' r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
self._json_dumps = json.JSONEncoder(
ensure_ascii=False, check_circular=False,
sort_keys=True, separators=(",", ":")).encode
def items(self): def items(self):
find_hash = re.compile(HASH_PATTERN).match find_hash = re.compile(HASH_PATTERN).match
@ -69,9 +76,9 @@ class KemonopartyExtractor(Extractor):
headers["Referer"] = "{}/{}/user/{}/post/{}".format( headers["Referer"] = "{}/{}/user/{}/post/{}".format(
self.root, post["service"], post["user"], post["id"]) self.root, post["service"], post["user"], post["id"])
post["_http_headers"] = headers post["_http_headers"] = headers
post["date"] = text.parse_datetime( post["date"] = self._parse_datetime(
post["published"] or post["added"], post["published"] or post["added"])
"%a, %d %b %Y %H:%M:%S %Z")
if username: if username:
post["username"] = username post["username"] = username
if comments: if comments:
@ -129,7 +136,7 @@ class KemonopartyExtractor(Extractor):
self.cookies_update(self._login_impl( self.cookies_update(self._login_impl(
(username, self.cookies_domain), password)) (username, self.cookies_domain), password))
@cache(maxage=28*24*3600, keyarg=1) @cache(maxage=28*86400, keyarg=1)
def _login_impl(self, username, password): def _login_impl(self, username, password):
username = username[0] username = username[0]
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)
@ -197,14 +204,80 @@ class KemonopartyExtractor(Extractor):
dms = [] dms = []
for dm in text.extract_iter(page, "<article", "</article>"): for dm in text.extract_iter(page, "<article", "</article>"):
footer = text.extr(dm, "<footer", "</footer>")
dms.append({ dms.append({
"body": text.unescape(text.extract( "body": text.unescape(text.extr(
dm, "<pre>", "</pre></", dm, "<pre>", "</pre></",
)[0].strip()), ).strip()),
"date": text.extr(dm, 'datetime="', '"'), "date": text.extr(footer, 'Published: ', '\n'),
}) })
return dms return dms
def _parse_datetime(self, date_string):
if len(date_string) > 19:
date_string = date_string[:19]
return text.parse_datetime(date_string, "%Y-%m-%dT%H:%M:%S")
@memcache(keyarg=1)
def _discord_channels(self, server):
url = "{}/api/v1/discord/channel/lookup/{}".format(
self.root, server)
return self.request(url).json()
def _revisions_post(self, post, url):
post["revision_id"] = 0
try:
revs = self.request(url + "/revisions").json()
except exception.HttpError:
post["revision_hash"] = self._revision_hash(post)
post["revision_index"] = 1
return (post,)
revs.insert(0, post)
for rev in revs:
rev["revision_hash"] = self._revision_hash(rev)
if self.revisions_unique:
uniq = []
last = None
for rev in revs:
if last != rev["revision_hash"]:
last = rev["revision_hash"]
uniq.append(rev)
revs = uniq
idx = len(revs)
for rev in revs:
rev["revision_index"] = idx
idx -= 1
return revs
def _revisions_all(self, url):
revs = self.request(url + "/revisions").json()
idx = len(revs)
for rev in revs:
rev["revision_hash"] = self._revision_hash(rev)
rev["revision_index"] = idx
idx -= 1
return revs
def _revision_hash(self, revision):
rev = revision.copy()
rev.pop("revision_id", None)
rev.pop("added", None)
rev.pop("next", None)
rev.pop("prev", None)
rev["file"] = rev["file"].copy()
rev["file"].pop("name", None)
rev["attachments"] = [a.copy() for a in rev["attachments"]]
for a in rev["attachments"]:
a.pop("name", None)
return util.sha1(self._json_dumps(rev))
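
`_revision_hash` above reduces a revision to its content: volatile fields (revision id, timestamps, prev/next links, per-file names) are dropped, the remainder is serialized as canonical JSON with sorted keys and compact separators, and the SHA-1 of that string identifies the revision. A sketch with standard-library stand-ins for `util.sha1` and the prepared `_json_dumps` encoder:

```python
import hashlib
import json

def revision_hash(revision):
    rev = {k: v for k, v in revision.items()
           if k not in ("revision_id", "added", "next", "prev")}
    rev["file"] = {k: v for k, v in revision.get("file", {}).items()
                   if k != "name"}
    rev["attachments"] = [
        {k: v for k, v in attachment.items() if k != "name"}
        for attachment in revision.get("attachments", ())]
    blob = json.dumps(rev, sort_keys=True, separators=(",", ":"),
                      ensure_ascii=False, check_circular=False)
    return hashlib.sha1(blob.encode()).hexdigest()
```

With the `revisions` option set to `unique`, consecutive revisions whose hashes match are collapsed into one, as implemented in `_revisions_post` above.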
def _validate(response): def _validate(response):
return (response.headers["content-length"] != "9" or return (response.headers["content-length"] != "9" or
@ -214,48 +287,68 @@ def _validate(response):
class KemonopartyUserExtractor(KemonopartyExtractor): class KemonopartyUserExtractor(KemonopartyExtractor):
"""Extractor for all posts from a kemono.party user listing""" """Extractor for all posts from a kemono.party user listing"""
subcategory = "user" subcategory = "user"
pattern = USER_PATTERN + r"/?(?:\?o=(\d+))?(?:$|[?#])" pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|[?#])"
example = "https://kemono.party/SERVICE/user/12345" example = "https://kemono.party/SERVICE/user/12345"
def __init__(self, match): def __init__(self, match):
_, _, service, user_id, offset = match.groups() _, _, service, user_id, self.query = match.groups()
self.subcategory = service self.subcategory = service
KemonopartyExtractor.__init__(self, match) KemonopartyExtractor.__init__(self, match)
self.api_url = "{}/api/{}/user/{}".format(self.root, service, user_id) self.api_url = "{}/api/v1/{}/user/{}".format(
self.root, service, user_id)
self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) self.user_url = "{}/{}/user/{}".format(self.root, service, user_id)
self.offset = text.parse_int(offset)
def posts(self): def posts(self):
url = self.api_url url = self.api_url
params = {"o": self.offset} params = text.parse_query(self.query)
params["o"] = text.parse_int(params.get("o"))
while True: while True:
posts = self.request(url, params=params).json() posts = self.request(url, params=params).json()
yield from posts
cnt = len(posts) if self.revisions:
if cnt < 25: for post in posts:
return post_url = "{}/post/{}".format(self.api_url, post["id"])
params["o"] += cnt yield from self._revisions_post(post, post_url)
else:
yield from posts
if len(posts) < 50:
break
params["o"] += 50
class KemonopartyPostExtractor(KemonopartyExtractor): class KemonopartyPostExtractor(KemonopartyExtractor):
"""Extractor for a single kemono.party post""" """Extractor for a single kemono.party post"""
subcategory = "post" subcategory = "post"
pattern = USER_PATTERN + r"/post/([^/?#]+)" pattern = USER_PATTERN + r"/post/([^/?#]+)(/revisions?(?:/(\d*))?)?"
example = "https://kemono.party/SERVICE/user/12345/post/12345" example = "https://kemono.party/SERVICE/user/12345/post/12345"
def __init__(self, match): def __init__(self, match):
_, _, service, user_id, post_id = match.groups() _, _, service, user_id, post_id, self.revision, self.revision_id = \
match.groups()
self.subcategory = service self.subcategory = service
KemonopartyExtractor.__init__(self, match) KemonopartyExtractor.__init__(self, match)
self.api_url = "{}/api/{}/user/{}/post/{}".format( self.api_url = "{}/api/v1/{}/user/{}/post/{}".format(
self.root, service, user_id, post_id) self.root, service, user_id, post_id)
self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) self.user_url = "{}/{}/user/{}".format(self.root, service, user_id)
def posts(self): def posts(self):
posts = self.request(self.api_url).json() if not self.revision:
return (posts[0],) if len(posts) > 1 else posts post = self.request(self.api_url).json()
if self.revisions:
return self._revisions_post(post, self.api_url)
return (post,)
revs = self._revisions_all(self.api_url)
if not self.revision_id:
return revs
for rev in revs:
if str(rev["revision_id"]) == self.revision_id:
return (rev,)
raise exception.NotFoundError("revision")
class KemonopartyDiscordExtractor(KemonopartyExtractor): class KemonopartyDiscordExtractor(KemonopartyExtractor):
@ -270,11 +363,29 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
def __init__(self, match): def __init__(self, match):
KemonopartyExtractor.__init__(self, match) KemonopartyExtractor.__init__(self, match)
-        _, _, self.server, self.channel, self.channel_name = match.groups()
+        _, _, self.server, self.channel_id, self.channel = match.groups()
self.channel_name = ""
def items(self): def items(self):
self._prepare_ddosguard_cookies() self._prepare_ddosguard_cookies()
if self.channel_id:
self.channel_name = self.channel
else:
if self.channel.isdecimal() and len(self.channel) >= 16:
key = "id"
else:
key = "name"
for channel in self._discord_channels(self.server):
if channel[key] == self.channel:
break
else:
raise exception.NotFoundError("channel")
self.channel_id = channel["id"]
self.channel_name = channel["name"]
find_inline = re.compile( find_inline = re.compile(
r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)" r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)"
r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall
@ -298,8 +409,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
"name": path, "type": "inline", "hash": ""}) "name": path, "type": "inline", "hash": ""})
post["channel_name"] = self.channel_name post["channel_name"] = self.channel_name
post["date"] = text.parse_datetime( post["date"] = self._parse_datetime(post["published"])
post["published"], "%a, %d %b %Y %H:%M:%S %Z")
post["count"] = len(files) post["count"] = len(files)
yield Message.Directory, post yield Message.Directory, post
@ -319,27 +429,17 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
yield Message.Url, url, post yield Message.Url, url, post
def posts(self): def posts(self):
if self.channel is None: url = "{}/api/v1/discord/channel/{}".format(
url = "{}/api/discord/channels/lookup?q={}".format( self.root, self.channel_id)
self.root, self.server) params = {"o": 0}
for channel in self.request(url).json():
if channel["name"] == self.channel_name:
self.channel = channel["id"]
break
else:
raise exception.NotFoundError("channel")
url = "{}/api/discord/channel/{}".format(self.root, self.channel)
params = {"skip": 0}
while True: while True:
posts = self.request(url, params=params).json() posts = self.request(url, params=params).json()
yield from posts yield from posts
cnt = len(posts) if len(posts) < 150:
if cnt < 25:
break break
params["skip"] += cnt params["o"] += 150
class KemonopartyDiscordServerExtractor(KemonopartyExtractor): class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
@ -352,11 +452,7 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
self.server = match.group(3) self.server = match.group(3)
def items(self): def items(self):
url = "{}/api/discord/channels/lookup?q={}".format( for channel in self._discord_channels(self.server):
self.root, self.server)
channels = self.request(url).json()
for channel in channels:
url = "{}/discord/server/{}/channel/{}#{}".format( url = "{}/discord/server/{}/channel/{}#{}".format(
self.root, self.server, channel["id"], channel["name"]) self.root, self.server, channel["id"], channel["name"])
channel["_extractor"] = KemonopartyDiscordExtractor channel["_extractor"] = KemonopartyDiscordExtractor

@ -6,19 +6,19 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
"""Extractors for https://komikcast.site/""" """Extractors for https://komikcast.lol/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text from .. import text
import re import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:site|me|com)" BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:lol|site|me|com)"
class KomikcastBase(): class KomikcastBase():
"""Base class for komikcast extractors""" """Base class for komikcast extractors"""
category = "komikcast" category = "komikcast"
root = "https://komikcast.site" root = "https://komikcast.lol"
@staticmethod @staticmethod
def parse_chapter_string(chapter_string, data=None): def parse_chapter_string(chapter_string, data=None):
@ -46,9 +46,9 @@ class KomikcastBase():
class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
"""Extractor for manga-chapters from komikcast.site""" """Extractor for manga-chapters from komikcast.lol"""
pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)" pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)"
example = "https://komikcast.site/chapter/TITLE/" example = "https://komikcast.lol/chapter/TITLE/"
def metadata(self, page): def metadata(self, page):
info = text.extr(page, "<title>", " - Komikcast<") info = text.extr(page, "<title>", " - Komikcast<")
@ -65,10 +65,10 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
"""Extractor for manga from komikcast.site""" """Extractor for manga from komikcast.lol"""
chapterclass = KomikcastChapterExtractor chapterclass = KomikcastChapterExtractor
pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$" pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$"
example = "https://komikcast.site/komik/TITLE" example = "https://komikcast.lol/komik/TITLE"
def chapters(self, page): def chapters(self, page):
results = [] results = []
@ -76,8 +76,10 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
for item in text.extract_iter( for item in text.extract_iter(
page, '<a class="chapter-link-item" href="', '</a'): page, '<a class="chapter-link-item" href="', '</a'):
url, _, chapter_string = item.rpartition('">Chapter ') url, _, chapter = item.rpartition('">Chapter')
self.parse_chapter_string(chapter_string, data) chapter, sep, minor = chapter.strip().partition(".")
data["chapter"] = text.parse_int(chapter)
data["chapter_minor"] = sep + minor
results.append((url, data.copy())) results.append((url, data.copy()))
return results return results
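
The chapter parsing now splits the number itself instead of going through parse_chapter_string(); roughly:

    def split_chapter(chapter):
        # "12.5" -> (12, ".5"), "7" -> (7, "")
        chapter, sep, minor = chapter.strip().partition(".")
        return int(chapter or 0), sep + minor
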

@ -18,8 +18,8 @@ class LynxchanExtractor(BaseExtractor):
BASE_PATTERN = LynxchanExtractor.update({ BASE_PATTERN = LynxchanExtractor.update({
"bbw-chan": { "bbw-chan": {
"root": "https://bbw-chan.nl", "root": "https://bbw-chan.link",
"pattern": r"bbw-chan\.nl", "pattern": r"bbw-chan\.(?:link|nl)",
}, },
"kohlchan": { "kohlchan": {
"root": "https://kohlchan.net", "root": "https://kohlchan.net",
@ -40,7 +40,7 @@ class LynxchanThreadExtractor(LynxchanExtractor):
filename_fmt = "{postId}{num:?-//} {filename}.{extension}" filename_fmt = "{postId}{num:?-//} {filename}.{extension}"
archive_fmt = "{boardUri}_{postId}_{num}" archive_fmt = "{boardUri}_{postId}_{num}"
pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)" pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
example = "https://bbw-chan.nl/a/res/12345.html" example = "https://endchan.org/a/res/12345.html"
def __init__(self, match): def __init__(self, match):
LynxchanExtractor.__init__(self, match) LynxchanExtractor.__init__(self, match)
@ -71,7 +71,7 @@ class LynxchanBoardExtractor(LynxchanExtractor):
"""Extractor for LynxChan boards""" """Extractor for LynxChan boards"""
subcategory = "board" subcategory = "board"
pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)" pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)"
example = "https://bbw-chan.nl/a/" example = "https://endchan.org/a/"
def __init__(self, match): def __init__(self, match):
LynxchanExtractor.__init__(self, match) LynxchanExtractor.__init__(self, match)

@ -148,6 +148,32 @@ class MangadexFeedExtractor(MangadexExtractor):
return self.api.user_follows_manga_feed() return self.api.user_follows_manga_feed()
class MangadexListExtractor(MangadexExtractor):
"""Extractor for mangadex lists"""
subcategory = "list"
pattern = (BASE_PATTERN +
r"/list/([0-9a-f-]+)(?:/[^/?#]*)?(?:\?tab=(\w+))?")
example = ("https://mangadex.org/list"
"/01234567-89ab-cdef-0123-456789abcdef/NAME")
def __init__(self, match):
MangadexExtractor.__init__(self, match)
if match.group(2) == "feed":
self.subcategory = "list-feed"
else:
self.items = self._items_titles
def chapters(self):
return self.api.list_feed(self.uuid)
def _items_titles(self):
data = {"_extractor": MangadexMangaExtractor}
for item in self.api.list(self.uuid)["relationships"]:
if item["type"] == "manga":
url = "{}/title/{}".format(self.root, item["id"])
yield Message.Queue, url, data
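
Sketch of the API traffic behind the new list extractor (MangaDex API v5, endpoints as used above; authentication and error handling omitted):

    import requests

    API_ROOT = "https://api.mangadex.org"

    def list_manga_ids(list_uuid):
        # /list/{uuid} returns the list object; manga show up as relationships
        data = requests.get(API_ROOT + "/list/" + list_uuid).json()["data"]
        return [rel["id"] for rel in data["relationships"]
                if rel["type"] == "manga"]
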
class MangadexAPI(): class MangadexAPI():
"""Interface for the MangaDex API v5 """Interface for the MangaDex API v5
@ -173,6 +199,12 @@ class MangadexAPI():
params = {"includes[]": ("scanlation_group",)} params = {"includes[]": ("scanlation_group",)}
return self._call("/chapter/" + uuid, params)["data"] return self._call("/chapter/" + uuid, params)["data"]
def list(self, uuid):
return self._call("/list/" + uuid)["data"]
def list_feed(self, uuid):
return self._pagination("/list/" + uuid + "/feed")
@memcache(keyarg=1) @memcache(keyarg=1)
def manga(self, uuid): def manga(self, uuid):
params = {"includes[]": ("artist", "author")} params = {"includes[]": ("artist", "author")}
@ -266,6 +298,6 @@ class MangadexAPI():
return return
@cache(maxage=28*24*3600, keyarg=0) @cache(maxage=28*86400, keyarg=0)
def _refresh_token_cache(username): def _refresh_token_cache(username):
return None return None

@ -10,7 +10,11 @@ from .common import ChapterExtractor, MangaExtractor
from .. import text from .. import text
import re import re
BASE_PATTERN = r"(?:https?://)?((?:chap|read|www\.|m\.)?mangan(?:at|el)o\.com)" BASE_PATTERN = (
r"(?:https?://)?"
r"((?:chap|read|www\.|m\.)?mangan(?:at|el)o"
r"\.(?:to|com))"
)
class ManganeloBase(): class ManganeloBase():
@ -67,10 +71,11 @@ class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor):
def images(self, page): def images(self, page):
page = text.extr( page = text.extr(
page, 'class="container-chapter-reader', '\n<div') page, 'class="container-chapter-reader', 'class="container')
return [ return [
(url, None) (url, None)
for url in text.extract_iter(page, '<img src="', '"') for url in text.extract_iter(page, '<img src="', '"')
if not url.endswith("/gohome.png")
] or [ ] or [
(url, None) (url, None)
for url in text.extract_iter( for url in text.extract_iter(

@ -50,8 +50,8 @@ class MangareadChapterExtractor(MangareadBase, ChapterExtractor):
page = text.extr( page = text.extr(
page, '<div class="reading-content">', '<div class="entry-header') page, '<div class="reading-content">', '<div class="entry-header')
return [ return [
(url.strip(), None) (text.extr(img, 'src="', '"').strip(), None)
for url in text.extract_iter(page, 'data-src="', '"') for img in text.extract_iter(page, '<img id="image-', '>')
] ]

@ -45,6 +45,9 @@ class MastodonExtractor(BaseExtractor):
attachments = status["media_attachments"] attachments = status["media_attachments"]
del status["media_attachments"] del status["media_attachments"]
if status["reblog"]:
attachments.extend(status["reblog"]["media_attachments"])
status["instance"] = self.instance status["instance"] = self.instance
acct = status["account"]["acct"] acct = status["account"]["acct"]
status["instance_remote"] = \ status["instance_remote"] = \
@ -72,7 +75,7 @@ class MastodonExtractor(BaseExtractor):
account["acct"], account["moved"]["acct"]) account["acct"], account["moved"]["acct"])
INSTANCES = { BASE_PATTERN = MastodonExtractor.update({
"mastodon.social": { "mastodon.social": {
"root" : "https://mastodon.social", "root" : "https://mastodon.social",
"pattern" : r"mastodon\.social", "pattern" : r"mastodon\.social",
@ -97,9 +100,7 @@ INSTANCES = {
"client-id" : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o", "client-id" : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o",
"client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY", "client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY",
} }
} }) + "(?:/web)?"
BASE_PATTERN = MastodonExtractor.update(INSTANCES) + "(?:/web)?"
class MastodonUserExtractor(MastodonExtractor): class MastodonUserExtractor(MastodonExtractor):
@ -113,7 +114,10 @@ class MastodonUserExtractor(MastodonExtractor):
return api.account_statuses( return api.account_statuses(
api.account_id_by_username(self.item), api.account_id_by_username(self.item),
only_media=not self.config("text-posts", False), only_media=(
not self.reblogs and
not self.config("text-posts", False)
),
exclude_replies=not self.replies, exclude_replies=not self.replies,
) )
@ -146,7 +150,7 @@ class MastodonFollowingExtractor(MastodonExtractor):
class MastodonStatusExtractor(MastodonExtractor): class MastodonStatusExtractor(MastodonExtractor):
"""Extractor for images from a status""" """Extractor for images from a status"""
subcategory = "status" subcategory = "status"
pattern = BASE_PATTERN + r"/@[^/?#]+/(\d+)" pattern = BASE_PATTERN + r"/@[^/?#]+/(?!following)([^/?#]+)"
example = "https://mastodon.social/@USER/12345" example = "https://mastodon.social/@USER/12345"
def statuses(self): def statuses(self):
@ -168,10 +172,8 @@ class MastodonAPI():
if access_token is None or access_token == "cache": if access_token is None or access_token == "cache":
access_token = _access_token_cache(extractor.instance) access_token = _access_token_cache(extractor.instance)
if not access_token: if not access_token:
try: access_token = extractor.config_instance("access-token")
access_token = INSTANCES[extractor.category]["access-token"]
except (KeyError, TypeError):
pass
if access_token: if access_token:
self.headers = {"Authorization": "Bearer " + access_token} self.headers = {"Authorization": "Bearer " + access_token}
else: else:
@ -271,6 +273,6 @@ class MastodonAPI():
params = None params = None
@cache(maxage=100*365*24*3600, keyarg=0) @cache(maxage=36500*86400, keyarg=0)
def _access_token_cache(instance): def _access_token_cache(instance):
return None return None
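
The maxage rewrites in this commit mostly change how the duration is spelled: 86400 is one day in seconds, so for this cache the value is identical.

    assert 24 * 3600 == 86400
    assert 100 * 365 * 24 * 3600 == 36500 * 86400   # ~100 years either way
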

@ -70,6 +70,10 @@ BASE_PATTERN = MisskeyExtractor.update({
"root": "https://misskey.io", "root": "https://misskey.io",
"pattern": r"misskey\.io", "pattern": r"misskey\.io",
}, },
"misskey.design": {
"root": "https://misskey.design",
"pattern": r"misskey\.design",
},
"lesbian.energy": { "lesbian.energy": {
"root": "https://lesbian.energy", "root": "https://lesbian.energy",
"pattern": r"lesbian\.energy", "pattern": r"lesbian\.energy",

@ -124,6 +124,11 @@ class MoebooruPoolExtractor(MoebooruExtractor):
self.pool_id = match.group(match.lastindex) self.pool_id = match.group(match.lastindex)
def metadata(self): def metadata(self):
if self.config("metadata"):
url = "{}/pool/show/{}.json".format(self.root, self.pool_id)
pool = self.request(url).json()
pool.pop("posts", None)
return {"pool": pool}
return {"pool": text.parse_int(self.pool_id)} return {"pool": text.parse_int(self.pool_id)}
def posts(self): def posts(self):

@ -16,12 +16,12 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
root = "https://myhentaigallery.com" root = "https://myhentaigallery.com"
directory_fmt = ("{category}", "{gallery_id} {artist:?[/] /J, }{title}") directory_fmt = ("{category}", "{gallery_id} {artist:?[/] /J, }{title}")
pattern = (r"(?:https?://)?myhentaigallery\.com" pattern = (r"(?:https?://)?myhentaigallery\.com"
r"/gallery/(?:thumbnails|show)/(\d+)") r"/g(?:allery/(?:thumbnails|show))?/(\d+)")
example = "https://myhentaigallery.com/gallery/thumbnails/12345" example = "https://myhentaigallery.com/g/12345"
def __init__(self, match): def __init__(self, match):
self.gallery_id = match.group(1) self.gallery_id = match.group(1)
url = "{}/gallery/thumbnails/{}".format(self.root, self.gallery_id) url = "{}/g/{}".format(self.root, self.gallery_id)
GalleryExtractor.__init__(self, match, url) GalleryExtractor.__init__(self, match, url)
def _init(self): def _init(self):

@ -46,7 +46,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
"episode" : self.episode, "episode" : self.episode,
"comic" : extr('titleName: "', '"'), "comic" : extr('titleName: "', '"'),
"tags" : [t.strip() for t in text.extract_iter( "tags" : [t.strip() for t in text.extract_iter(
extr("tagList: [", "}],"), '"tagName":"', '"')], extr("tagList: [", "],"), '"tagName":"', '"')],
"title" : extr('"subtitle":"', '"'), "title" : extr('"subtitle":"', '"'),
"author" : [a.strip() for a in text.extract_iter( "author" : [a.strip() for a in text.extract_iter(
extr('"writers":[', ']'), '"name":"', '"')], extr('"writers":[', ']'), '"name":"', '"')],

@ -23,7 +23,7 @@ class NewgroundsExtractor(Extractor):
root = "https://www.newgrounds.com" root = "https://www.newgrounds.com"
cookies_domain = ".newgrounds.com" cookies_domain = ".newgrounds.com"
cookies_names = ("NG_GG_username", "vmk1du5I8m") cookies_names = ("NG_GG_username", "vmk1du5I8m")
request_interval = 1.0 request_interval = (0.5, 1.5)
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self, match) Extractor.__init__(self, match)
@ -54,14 +54,31 @@ class NewgroundsExtractor(Extractor):
if metadata: if metadata:
post.update(metadata) post.update(metadata)
yield Message.Directory, post yield Message.Directory, post
post["num"] = 0
yield Message.Url, url, text.nameext_from_url(url, post) yield Message.Url, url, text.nameext_from_url(url, post)
for num, url in enumerate(text.extract_iter( if "_multi" in post:
post["_comment"], 'data-smartload-src="', '"'), 1): for data in post["_multi"]:
post["num"] = num post["num"] += 1
post["_index"] = "{}_{:>02}".format(post["index"], num) post["_index"] = "{}_{:>02}".format(
post["index"], post["num"])
post.update(data)
url = data["image"]
text.nameext_from_url(url, post)
yield Message.Url, url, post
if "_fallback" in post:
del post["_fallback"]
for url in text.extract_iter(
post["_comment"], 'data-smartload-src="', '"'):
post["num"] += 1
post["_index"] = "{}_{:>02}".format(
post["index"], post["num"])
url = text.ensure_http_scheme(url) url = text.ensure_http_scheme(url)
yield Message.Url, url, text.nameext_from_url(url, post) text.nameext_from_url(url, post)
yield Message.Url, url, post
else: else:
self.log.warning( self.log.warning(
"Unable to get download URL for '%s'", post_url) "Unable to get download URL for '%s'", post_url)
@ -81,7 +98,7 @@ class NewgroundsExtractor(Extractor):
if username: if username:
self.cookies_update(self._login_impl(username, password)) self.cookies_update(self._login_impl(username, password))
@cache(maxage=360*24*3600, keyarg=1) @cache(maxage=365*86400, keyarg=1)
def _login_impl(self, username, password): def _login_impl(self, username, password):
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)
@ -153,8 +170,7 @@ class NewgroundsExtractor(Extractor):
data["post_url"] = post_url data["post_url"] = post_url
return data return data
@staticmethod def _extract_image_data(self, extr, url):
def _extract_image_data(extr, url):
full = text.extract_from(util.json_loads(extr( full = text.extract_from(util.json_loads(extr(
'"full_image_text":', '});'))) '"full_image_text":', '});')))
data = { data = {
@ -172,8 +188,34 @@ class NewgroundsExtractor(Extractor):
index = data["url"].rpartition("/")[2].partition("_")[0] index = data["url"].rpartition("/")[2].partition("_")[0]
data["index"] = text.parse_int(index) data["index"] = text.parse_int(index)
data["_index"] = index data["_index"] = index
image_data = extr("let imageData =", "\n];")
if image_data:
data["_multi"] = self._extract_images_multi(image_data)
else:
art_images = extr('<div class="art-images', '\n</div>')
if art_images:
data["_multi"] = self._extract_images_art(art_images, data)
return data return data
def _extract_images_multi(self, html):
data = util.json_loads(html + "]")
yield from data[1:]
def _extract_images_art(self, html, data):
ext = text.ext_from_url(data["url"])
for url in text.extract_iter(html, 'data-smartload-src="', '"'):
url = text.ensure_http_scheme(url)
url = url.replace("/medium_views/", "/images/", 1)
if text.ext_from_url(url) == "webp":
yield {
"image" : url.replace(".webp", "." + ext),
"_fallback": (url,),
}
else:
yield {"image": url}
@staticmethod @staticmethod
def _extract_audio_data(extr, url): def _extract_audio_data(extr, url):
index = url.split("/")[5] index = url.split("/")[5]

@ -19,6 +19,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
directory_fmt = ("{category}", "{user_id}") directory_fmt = ("{category}", "{user_id}")
filename_fmt = "{image_id}_p{num}.{extension}" filename_fmt = "{image_id}_p{num}.{extension}"
archive_fmt = "{image_id}_{num}" archive_fmt = "{image_id}_{num}"
request_interval = (1.0, 2.0)
def __init__(self, match): def __init__(self, match):
BaseExtractor.__init__(self, match) BaseExtractor.__init__(self, match)
@ -54,9 +55,16 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
else: else:
data["user_id"] = data["artist_id"] data["user_id"] = data["artist_id"]
data["user_name"] = data["artist_name"] data["user_name"] = data["artist_name"]
yield Message.Directory, data
for image in self._extract_images(page): urls = list(self._extract_images(image_id, page))
data["count"] = len(urls)
yield Message.Directory, data
for num, url in enumerate(urls):
image = text.nameext_from_url(url, {
"num": num,
"url": "https:" + url,
})
image.update(data) image.update(data)
if not image["extension"]: if not image["extension"]:
image["extension"] = "jpg" image["extension"] = "jpg"
@ -71,7 +79,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
extr = text.extract_from(page) extr = text.extract_from(page)
keywords = text.unescape(extr( keywords = text.unescape(extr(
'name="keywords" content="', '" />')).split(",") 'name="keywords" content="', '" />')).split(",")
data = { return {
"title" : keywords[0].strip(), "title" : keywords[0].strip(),
"description": text.unescape(extr( "description": text.unescape(extr(
'"description": "', '"').replace("&amp;", "&")), '"description": "', '"').replace("&amp;", "&")),
@ -81,7 +89,6 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
"artist_name": keywords[1], "artist_name": keywords[1],
"tags" : keywords[2:-1], "tags" : keywords[2:-1],
} }
return data
@staticmethod @staticmethod
def _extract_data_horne(page): def _extract_data_horne(page):
@ -89,7 +96,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
extr = text.extract_from(page) extr = text.extract_from(page)
keywords = text.unescape(extr( keywords = text.unescape(extr(
'name="keywords" content="', '" />')).split(",") 'name="keywords" content="', '" />')).split(",")
data = { return {
"title" : keywords[0].strip(), "title" : keywords[0].strip(),
"description": text.unescape(extr( "description": text.unescape(extr(
'property="og:description" content="', '"')), 'property="og:description" content="', '"')),
@ -100,21 +107,17 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
"itemprop='datePublished' content=", "<").rpartition(">")[2], "itemprop='datePublished' content=", "<").rpartition(">")[2],
"%Y-%m-%d %H:%M:%S", 9), "%Y-%m-%d %H:%M:%S", 9),
} }
return data
@staticmethod def _extract_images(self, image_id, page):
def _extract_images(page): if '&#diff_1" ' in page:
"""Extract image URLs from 'page'""" # multiple images
images = text.extract_iter(page, "/view_popup.php", "</a>") url = "{}/view_popup.php?id={}".format(self.root, image_id)
for num, image in enumerate(images): page = self.request(url).text
src = text.extr(image, 'src="', '"') yield from text.extract_iter(
if not src: page, 'href="javascript:void(0);"><img src="', '"')
continue else:
url = ("https:" + src).replace("/__rs_l120x120/", "/") pos = page.find('id="view-center"') + 1
yield text.nameext_from_url(url, { yield text.extract(page, 'itemprop="image" src="', '"', pos)[0]
"num": num,
"url": url,
})
@staticmethod @staticmethod
def _extract_user_name(page): def _extract_user_name(page):
@ -125,15 +128,15 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
return return
username, password = self._get_auth_info() username, password = self._get_auth_info()
self.cookies_update(self._login_impl(username, password)) if username:
return self.cookies_update(self._login_impl(username, password))
@cache(maxage=90*24*3600, keyarg=1) raise exception.AuthenticationError("Username and password required")
def _login_impl(self, username, password):
if not username or not password:
raise exception.AuthenticationError(
"Username and password required")
@cache(maxage=90*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)
url = "{}/login_int.php".format(self.root) url = "{}/login_int.php".format(self.root)
data = {"email": username, "password": password, "save": "on"} data = {"email": username, "password": password, "save": "on"}

@ -96,6 +96,8 @@ class NitterExtractor(BaseExtractor):
for url in text.extract_iter( for url in text.extract_iter(
attachments, '<source src="', '"'): attachments, '<source src="', '"'):
if url[0] == "/":
url = self.root + url
append(text.nameext_from_url(url, {"url": url})) append(text.nameext_from_url(url, {"url": url}))
else: else:
@ -233,10 +235,6 @@ BASE_PATTERN = NitterExtractor.update({
"root": "https://nitter.net", "root": "https://nitter.net",
"pattern": r"nitter\.net", "pattern": r"nitter\.net",
}, },
"nitter.lacontrevoie.fr": {
"root": "https://nitter.lacontrevoie.fr",
"pattern": r"nitter\.lacontrevoie\.fr",
},
"nitter.1d4.us": { "nitter.1d4.us": {
"root": "https://nitter.1d4.us", "root": "https://nitter.1d4.us",
"pattern": r"nitter\.1d4\.us", "pattern": r"nitter\.1d4\.us",

@ -20,6 +20,7 @@ class NsfwalbumAlbumExtractor(GalleryExtractor):
filename_fmt = "{album_id}_{num:>03}_{id}.{extension}" filename_fmt = "{album_id}_{num:>03}_{id}.{extension}"
directory_fmt = ("{category}", "{album_id} {title}") directory_fmt = ("{category}", "{album_id} {title}")
archive_fmt = "{id}" archive_fmt = "{id}"
referer = False
pattern = r"(?:https?://)?(?:www\.)?nsfwalbum\.com(/album/(\d+))" pattern = r"(?:https?://)?(?:www\.)?nsfwalbum\.com(/album/(\d+))"
example = "https://nsfwalbum.com/album/12345" example = "https://nsfwalbum.com/album/12345"
@ -71,8 +72,8 @@ class NsfwalbumAlbumExtractor(GalleryExtractor):
@staticmethod @staticmethod
def _validate_response(response): def _validate_response(response):
return not response.request.url.endswith( return not response.url.endswith(
("/no_image.jpg", "/placeholder.png")) ("/no_image.jpg", "/placeholder.png", "/error.jpg"))
@staticmethod @staticmethod
def _annihilate(value, base=6): def _annihilate(value, base=6):

@ -1,87 +0,0 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://nudecollect.com/"""
from .common import GalleryExtractor
from .. import text
class NudecollectExtractor(GalleryExtractor):
"""Base class for Nudecollect extractors"""
category = "nudecollect"
directory_fmt = ("{category}", "{title}")
filename_fmt = "{slug}_{num:>03}.{extension}"
archive_fmt = "{slug}_{num}"
root = "https://www.nudecollect.com"
def request(self, url, **kwargs):
kwargs["allow_redirects"] = False
return GalleryExtractor.request(self, url, **kwargs)
@staticmethod
def get_title(page):
return text.unescape(text.extr(page, "<title>", "</title>"))[31:]
@staticmethod
def get_image(page):
return text.extr(page, '<img src="', '"')
class NudecollectImageExtractor(NudecollectExtractor):
"""Extractor for individual images from nudecollect.com"""
subcategory = "image"
pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com"
r"(/content/([^/?#]+)/image-(\d+)-pics-(\d+)"
r"-mirror-(\d+)\.html)")
example = ("https://www.nudecollect.com/content/12345_TITLE"
"/image-1-pics-108-mirror-1.html")
def __init__(self, match):
NudecollectExtractor.__init__(self, match)
_, self.slug, self.num, self.count, self.mirror = match.groups()
def metadata(self, page):
return {
"slug" : self.slug,
"title" : self.get_title(page),
"count" : text.parse_int(self.count),
"mirror": text.parse_int(self.mirror),
}
def images(self, page):
return ((self.get_image(page), {"num": text.parse_int(self.num)}),)
class NudecollectAlbumExtractor(NudecollectExtractor):
"""Extractor for image albums on nudecollect.com"""
subcategory = "album"
pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com"
r"/content/([^/?#]+)/(?:index-mirror-(\d+)-(\d+)"
r"|page-\d+-pics-(\d+)-mirror-(\d+))\.html")
example = ("https://www.nudecollect.com/content/12345_TITLE"
"/index-mirror-01-123.html")
def __init__(self, match):
self.slug = match.group(1)
self.mirror = match.group(2) or match.group(5)
self.count = text.parse_int(match.group(3) or match.group(4))
url = "{}/content/{}/image-1-pics-{}-mirror-{}.html".format(
self.root, self.slug, self.count, self.mirror)
NudecollectExtractor.__init__(self, match, url)
def metadata(self, page):
return {
"slug" : self.slug,
"title" : self.get_title(page),
"mirror": text.parse_int(self.mirror),
}
def images(self, page):
url = self.get_image(page)
p1, _, p2 = url.partition("/image0")
ufmt = p1 + "/image{:>05}" + p2[4:]
return [(ufmt.format(num), None) for num in range(1, self.count + 1)]

@ -11,7 +11,7 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, oauth, util, config, exception from .. import text, oauth, util, config, exception
from ..output import stdout_write from ..output import stdout_write
from ..cache import cache from ..cache import cache, memcache
import urllib.parse import urllib.parse
import binascii import binascii
import hashlib import hashlib
@ -31,6 +31,9 @@ class OAuthBase(Extractor):
def _init(self): def _init(self):
self.cache = config.get(("extractor", self.category), "cache", True) self.cache = config.get(("extractor", self.category), "cache", True)
if self.cache and cache is memcache:
self.log.warning("cache file is not writeable")
self.cache = False
def oauth_config(self, key, default=None): def oauth_config(self, key, default=None):
value = config.interpolate(("extractor", self.subcategory), key) value = config.interpolate(("extractor", self.subcategory), key)
@ -180,7 +183,7 @@ class OAuthBase(Extractor):
} }
if auth: if auth:
auth = (client_id, client_secret) auth = util.HTTPBasicAuth(client_id, client_secret)
else: else:
auth = None auth = None
data["client_id"] = client_id data["client_id"] = client_id
@ -355,8 +358,8 @@ class OAuthMastodon(OAuthBase):
yield Message.Version, 1 yield Message.Version, 1
from . import mastodon from . import mastodon
for application in mastodon.INSTANCES.values(): for _, root, application in mastodon.MastodonExtractor.instances:
if self.instance == application["root"].partition("://")[2]: if self.instance == root.partition("://")[2]:
break break
else: else:
application = self._register(self.instance) application = self._register(self.instance)
@ -373,7 +376,7 @@ class OAuthMastodon(OAuthBase):
cache=mastodon._access_token_cache, cache=mastodon._access_token_cache,
) )
@cache(maxage=10*365*24*3600, keyarg=1) @cache(maxage=36500*86400, keyarg=1)
def _register(self, instance): def _register(self, instance):
self.log.info("Registering application for '%s'", instance) self.log.info("Registering application for '%s'", instance)

@ -32,7 +32,7 @@ class PahealExtractor(Extractor):
post["tags"] = text.unquote(post["tags"]) post["tags"] = text.unquote(post["tags"])
post.update(data) post.update(data)
yield Message.Directory, post yield Message.Directory, post
yield Message.Url, url, text.nameext_from_url(url, post) yield Message.Url, url, post
def get_metadata(self): def get_metadata(self):
"""Return general metadata""" """Return general metadata"""
@ -56,14 +56,16 @@ class PahealExtractor(Extractor):
"date" : text.parse_datetime( "date" : text.parse_datetime(
extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"), extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"),
"source" : text.unescape(text.extr( "source" : text.unescape(text.extr(
extr(">Source&nbsp;Link<", "</td>"), "href='", "'")), extr(">Source Link<", "</td>"), "href='", "'")),
} }
dimensions, size, ext = extr("Info</th><td>", ">").split(" // ") dimensions, size, ext = extr("Info</th><td>", "<").split(" // ")
post["width"], _, height = dimensions.partition("x")
post["size"] = text.parse_bytes(size[:-1]) post["size"] = text.parse_bytes(size[:-1])
post["width"], _, height = dimensions.partition("x")
post["height"], _, duration = height.partition(", ") post["height"], _, duration = height.partition(", ")
post["duration"] = text.parse_float(duration[:-1]) post["duration"] = text.parse_float(duration[:-1])
post["filename"] = "{} - {}".format(post_id, post["tags"])
post["extension"] = ext
return post return post
@ -112,6 +114,7 @@ class PahealTagExtractor(PahealExtractor):
tags, data, date = data.split("\n") tags, data, date = data.split("\n")
dimensions, size, ext = data.split(" // ") dimensions, size, ext = data.split(" // ")
tags = text.unescape(tags)
width, _, height = dimensions.partition("x") width, _, height = dimensions.partition("x")
height, _, duration = height.partition(", ") height, _, duration = height.partition(", ")
@ -119,9 +122,11 @@ class PahealTagExtractor(PahealExtractor):
"id": pid, "md5": md5, "file_url": url, "id": pid, "md5": md5, "file_url": url,
"width": width, "height": height, "width": width, "height": height,
"duration": text.parse_float(duration[:-1]), "duration": text.parse_float(duration[:-1]),
"tags": text.unescape(tags), "tags": tags,
"size": text.parse_bytes(size[:-1]), "size": text.parse_bytes(size[:-1]),
"date": text.parse_datetime(date, "%B %d, %Y; %H:%M"), "date": text.parse_datetime(date, "%B %d, %Y; %H:%M"),
"filename" : "{} - {}".format(pid, tags),
"extension": ext,
} }
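
The "Info" cell is still three values joined by " // "; the parsing above amounts to the following sketch (the example string is hypothetical):

    def parse_info(info, post_id, tags):
        # e.g. "1920x1080, 12.3s // 4.2MB // webm"
        dimensions, size, ext = info.split(" // ")
        width, _, rest = dimensions.partition("x")
        height, _, duration = rest.partition(", ")
        return {
            "width": width,
            "height": height,
            "duration": duration.rstrip("s"),
            "size": size,
            "filename": "{} - {}".format(post_id, tags),
            "extension": ext,
        }
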
def _extract_data_ex(self, post): def _extract_data_ex(self, post):

@ -52,19 +52,29 @@ class PatreonExtractor(Extractor):
post["hash"] = fhash post["hash"] = fhash
post["type"] = kind post["type"] = kind
post["num"] += 1 post["num"] += 1
yield Message.Url, url, text.nameext_from_url(name, post) text.nameext_from_url(name, post)
if text.ext_from_url(url) == "m3u8":
url = "ytdl:" + url
post["extension"] = "mp4"
yield Message.Url, url, post
else: else:
self.log.debug("skipping %s (%s %s)", url, fhash, kind) self.log.debug("skipping %s (%s %s)", url, fhash, kind)
@staticmethod def _postfile(self, post):
def _postfile(post):
postfile = post.get("post_file") postfile = post.get("post_file")
if postfile: if postfile:
return (("postfile", postfile["url"], postfile["name"]),) url = postfile["url"]
name = postfile.get("name")
if not name:
if url.startswith("https://stream.mux.com/"):
name = url
else:
name = self._filename(url) or url
return (("postfile", url, name),)
return () return ()
def _images(self, post): def _images(self, post):
for image in post["images"]: for image in post.get("images") or ():
url = image.get("download_url") url = image.get("download_url")
if url: if url:
name = image.get("file_name") or self._filename(url) or url name = image.get("file_name") or self._filename(url) or url
@ -80,7 +90,7 @@ class PatreonExtractor(Extractor):
return () return ()
def _attachments(self, post): def _attachments(self, post):
for attachment in post["attachments"]: for attachment in post.get("attachments") or ():
url = self.request( url = self.request(
attachment["url"], method="HEAD", attachment["url"], method="HEAD",
allow_redirects=False, fatal=False, allow_redirects=False, fatal=False,
@ -249,8 +259,39 @@ class PatreonExtractor(Extractor):
return [genmap[ft] for ft in filetypes] return [genmap[ft] for ft in filetypes]
def _extract_bootstrap(self, page): def _extract_bootstrap(self, page):
return util.json_loads(text.extr( data = text.extr(
page, "window.patreon.bootstrap,", "});") + "}") page, 'id="__NEXT_DATA__" type="application/json">', '</script')
if data:
try:
return (util.json_loads(data)["props"]["pageProps"]
["bootstrapEnvelope"]["bootstrap"])
except Exception as exc:
self.log.debug("%s: %s", exc.__class__.__name__, exc)
bootstrap = text.extr(
page, 'window.patreon = {"bootstrap":', '},"apiServer"')
if bootstrap:
return util.json_loads(bootstrap + "}")
bootstrap = text.extr(
page,
'window.patreon = wrapInProxy({"bootstrap":',
'},"apiServer"')
if bootstrap:
return util.json_loads(bootstrap + "}")
bootstrap = text.extr(page, "window.patreon.bootstrap,", "});")
if bootstrap:
return util.json_loads(bootstrap + "}")
data = text.extr(page, "window.patreon = {", "};\n")
if data:
try:
return util.json_loads("{" + data + "}")["bootstrap"]
except Exception:
pass
raise exception.StopExtraction("Unable to extract bootstrap data")
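
The bootstrap lookup now tries several page layouts in order (the Next.js __NEXT_DATA__ payload, two window.patreon variants, the legacy marker). The fallback chain is essentially the sketch below; the __NEXT_DATA__ branch is omitted for brevity.

    import json

    # (prefix, suffix, tail) triples; extracting between prefix and suffix
    # drops the closing brace, so it is re-appended before parsing
    MARKERS = [
        ('window.patreon = {"bootstrap":', '},"apiServer"', "}"),
        ('window.patreon = wrapInProxy({"bootstrap":', '},"apiServer"', "}"),
        ("window.patreon.bootstrap,", "});", "}"),
    ]

    def extract_bootstrap(page):
        for prefix, suffix, tail in MARKERS:
            start = page.find(prefix)
            if start < 0:
                continue
            start += len(prefix)
            end = page.find(suffix, start)
            if end < 0:
                continue
            return json.loads(page[start:end] + tail)
        raise ValueError("Unable to extract bootstrap data")
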
class PatreonCreatorExtractor(PatreonExtractor): class PatreonCreatorExtractor(PatreonExtractor):
@ -267,34 +308,52 @@ class PatreonCreatorExtractor(PatreonExtractor):
def posts(self): def posts(self):
query = text.parse_query(self.query) query = text.parse_query(self.query)
campaign_id = self._get_campaign_id(query)
filters = self._get_filters(query)
self.log.debug("campaign_id: %s", campaign_id)
url = self._build_url("posts", (
"&filter[campaign_id]=" + campaign_id +
"&filter[contains_exclusive_posts]=true"
"&filter[is_draft]=false" + filters +
"&sort=" + query.get("sort", "-published_at")
))
return self._pagination(url)
creator_id = query.get("u") def _get_campaign_id(self, query):
if creator_id: if self.creator.startswith("id:"):
url = "{}/user/posts?u={}".format(self.root, creator_id) return self.creator[3:]
campaign_id = query.get("c") or query.get("campaign_id")
if campaign_id:
return campaign_id
user_id = query.get("u")
if user_id:
url = "{}/user/posts?u={}".format(self.root, user_id)
else: else:
url = "{}/{}/posts".format(self.root, self.creator) url = "{}/{}/posts".format(self.root, self.creator)
page = self.request(url, notfound="creator").text page = self.request(url, notfound="creator").text
try: try:
data = None
data = self._extract_bootstrap(page) data = self._extract_bootstrap(page)
campaign_id = data["creator"]["data"]["id"] return data["campaign"]["data"]["id"]
except (KeyError, ValueError): except (KeyError, ValueError) as exc:
raise exception.NotFoundError("creator") if data:
self.log.debug(data)
filters = "".join( raise exception.StopExtraction(
"Unable to extract campaign ID (%s: %s)",
exc.__class__.__name__, exc)
def _get_filters(self, query):
return "".join(
"&filter[{}={}".format(key[8:], text.escape(value)) "&filter[{}={}".format(key[8:], text.escape(value))
for key, value in query.items() for key, value in query.items()
if key.startswith("filters[") if key.startswith("filters[")
) )
url = self._build_url("posts", (
"&filter[campaign_id]=" + campaign_id +
"&filter[contains_exclusive_posts]=true"
"&filter[is_draft]=false" + filters +
"&sort=" + query.get("sort", "-published_at")
))
return self._pagination(url)
class PatreonUserExtractor(PatreonExtractor): class PatreonUserExtractor(PatreonExtractor):
"""Extractor for media from creators supported by you""" """Extractor for media from creators supported by you"""

@ -18,7 +18,7 @@ class PhilomenaExtractor(BooruExtractor):
basecategory = "philomena" basecategory = "philomena"
filename_fmt = "{filename}.{extension}" filename_fmt = "{filename}.{extension}"
archive_fmt = "{id}" archive_fmt = "{id}"
request_interval = 1.0 request_interval = (0.5, 1.5)
page_start = 1 page_start = 1
per_page = 50 per_page = 50
@ -32,7 +32,7 @@ class PhilomenaExtractor(BooruExtractor):
post["date"] = text.parse_datetime(post["created_at"]) post["date"] = text.parse_datetime(post["created_at"])
INSTANCES = { BASE_PATTERN = PhilomenaExtractor.update({
"derpibooru": { "derpibooru": {
"root": "https://derpibooru.org", "root": "https://derpibooru.org",
"pattern": r"(?:www\.)?derpibooru\.org", "pattern": r"(?:www\.)?derpibooru\.org",
@ -48,9 +48,7 @@ INSTANCES = {
"pattern": r"furbooru\.org", "pattern": r"furbooru\.org",
"filter_id": "2", "filter_id": "2",
}, },
} })
BASE_PATTERN = PhilomenaExtractor.update(INSTANCES)
class PhilomenaPostExtractor(PhilomenaExtractor): class PhilomenaPostExtractor(PhilomenaExtractor):
@ -176,10 +174,7 @@ class PhilomenaAPI():
if filter_id: if filter_id:
params["filter_id"] = filter_id params["filter_id"] = filter_id
elif not api_key: elif not api_key:
try: params["filter_id"] = extr.config_instance("filter_id") or "2"
params["filter_id"] = INSTANCES[extr.category]["filter_id"]
except (KeyError, TypeError):
params["filter_id"] = "2"
params["page"] = extr.page_start params["page"] = extr.page_start
params["per_page"] = extr.per_page params["per_page"] = extr.per_page

@ -56,7 +56,7 @@ class PillowfortExtractor(Extractor):
post["num"] = 0 post["num"] = 0
for file in files: for file in files:
url = file["url"] url = file["url"] or file.get("b2_lg_url")
if not url: if not url:
continue continue
@ -91,7 +91,7 @@ class PillowfortExtractor(Extractor):
if username: if username:
self.cookies_update(self._login_impl(username, password)) self.cookies_update(self._login_impl(username, password))
@cache(maxage=14*24*3600, keyarg=1) @cache(maxage=14*86400, keyarg=1)
def _login_impl(self, username, password): def _login_impl(self, username, password):
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)
@ -132,7 +132,7 @@ class PillowfortPostExtractor(PillowfortExtractor):
class PillowfortUserExtractor(PillowfortExtractor): class PillowfortUserExtractor(PillowfortExtractor):
"""Extractor for all posts of a pillowfort user""" """Extractor for all posts of a pillowfort user"""
subcategory = "user" subcategory = "user"
pattern = BASE_PATTERN + r"/(?!posts/)([^/?#]+)" pattern = BASE_PATTERN + r"/(?!posts/)([^/?#]+(?:/tagged/[^/?#]+)?)"
example = "https://www.pillowfort.social/USER" example = "https://www.pillowfort.social/USER"
def posts(self): def posts(self):

@ -10,7 +10,6 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache
import itertools import itertools
BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+" BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+"
@ -33,7 +32,6 @@ class PinterestExtractor(Extractor):
self.api = PinterestAPI(self) self.api = PinterestAPI(self)
def items(self): def items(self):
self.api.login()
data = self.metadata() data = self.metadata()
videos = self.config("videos", True) videos = self.config("videos", True)
@ -49,6 +47,7 @@ class PinterestExtractor(Extractor):
carousel_data = pin.get("carousel_data") carousel_data = pin.get("carousel_data")
if carousel_data: if carousel_data:
pin["count"] = len(carousel_data["carousel_slots"])
for num, slot in enumerate(carousel_data["carousel_slots"], 1): for num, slot in enumerate(carousel_data["carousel_slots"], 1):
slot["media_id"] = slot.pop("id") slot["media_id"] = slot.pop("id")
pin.update(slot) pin.update(slot)
@ -67,7 +66,7 @@ class PinterestExtractor(Extractor):
if videos or media.get("duration") is None: if videos or media.get("duration") is None:
pin.update(media) pin.update(media)
pin["num"] = 0 pin["num"] = pin["count"] = 1
pin["media_id"] = "" pin["media_id"] = ""
url = media["url"] url = media["url"]
@ -416,41 +415,6 @@ class PinterestAPI():
options = {"query": query, "scope": "pins", "rs": "typed"} options = {"query": query, "scope": "pins", "rs": "typed"}
return self._pagination("BaseSearch", options) return self._pagination("BaseSearch", options)
def login(self):
"""Login and obtain session cookies"""
username, password = self.extractor._get_auth_info()
if username:
self.cookies.update(self._login_impl(username, password))
@cache(maxage=180*24*3600, keyarg=1)
def _login_impl(self, username, password):
self.extractor.log.info("Logging in as %s", username)
url = self.root + "/resource/UserSessionResource/create/"
options = {
"username_or_email": username,
"password" : password,
}
data = {
"data" : util.json_dumps({"options": options}),
"source_url": "",
}
try:
response = self.extractor.request(
url, method="POST", headers=self.headers,
cookies=self.cookies, data=data)
resource = response.json()["resource_response"]
except (exception.HttpError, ValueError, KeyError):
raise exception.AuthenticationError()
if resource["status"] != "success":
raise exception.AuthenticationError()
return {
cookie.name: cookie.value
for cookie in response.cookies
}
def _call(self, resource, options): def _call(self, resource, options):
url = "{}/resource/{}Resource/get/".format(self.root, resource) url = "{}/resource/{}Resource/get/".format(self.root, resource)
params = { params = {

@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://pixeldrain.com/"""
from .common import Extractor, Message
from .. import text, util
BASE_PATTERN = r"(?:https?://)?pixeldrain\.com"
class PixeldrainExtractor(Extractor):
"""Base class for pixeldrain extractors"""
category = "pixeldrain"
root = "https://pixeldrain.com"
archive_fmt = "{id}"
def _init(self):
api_key = self.config("api-key")
if api_key:
self.session.auth = util.HTTPBasicAuth("", api_key)
def parse_datetime(self, date_string):
return text.parse_datetime(
date_string, "%Y-%m-%dT%H:%M:%S.%fZ")
class PixeldrainFileExtractor(PixeldrainExtractor):
"""Extractor for pixeldrain files"""
subcategory = "file"
filename_fmt = "{filename[:230]} ({id}).{extension}"
pattern = BASE_PATTERN + r"/(?:u|api/file)/(\w+)"
example = "https://pixeldrain.com/u/abcdefgh"
def __init__(self, match):
Extractor.__init__(self, match)
self.file_id = match.group(1)
def items(self):
url = "{}/api/file/{}".format(self.root, self.file_id)
file = self.request(url + "/info").json()
file["url"] = url + "?download"
file["date"] = self.parse_datetime(file["date_upload"])
text.nameext_from_url(file["name"], file)
yield Message.Directory, file
yield Message.Url, file["url"], file
class PixeldrainAlbumExtractor(PixeldrainExtractor):
"""Extractor for pixeldrain albums"""
subcategory = "album"
directory_fmt = ("{category}",
"{album[date]:%Y-%m-%d} {album[title]} ({album[id]})")
filename_fmt = "{num:>03} {filename[:230]} ({id}).{extension}"
pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)"
example = "https://pixeldrain.com/l/abcdefgh"
def __init__(self, match):
Extractor.__init__(self, match)
self.album_id = match.group(1)
def items(self):
url = "{}/api/list/{}".format(self.root, self.album_id)
album = self.request(url).json()
files = album["files"]
album["count"] = album["file_count"]
album["date"] = self.parse_datetime(album["date_created"])
del album["files"]
del album["file_count"]
yield Message.Directory, {"album": album}
for num, file in enumerate(files, 1):
file["album"] = album
file["num"] = num
file["url"] = url = "{}/api/file/{}?download".format(
self.root, file["id"])
file["date"] = self.parse_datetime(file["date_upload"])
text.nameext_from_url(file["name"], file)
yield Message.Url, url, file
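
For context, the two pixeldrain endpoints used above can be exercised directly; the API key goes into HTTP basic auth with an empty username, as in _init() above:

    import requests

    def album_files(album_id, api_key=None):
        auth = ("", api_key) if api_key else None
        album = requests.get(
            "https://pixeldrain.com/api/list/" + album_id, auth=auth).json()
        return [
            ("https://pixeldrain.com/api/file/{}?download".format(f["id"]), f)
            for f in album["files"]
        ]
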

@ -517,6 +517,7 @@ class PixivPixivisionExtractor(PixivExtractor):
directory_fmt = ("{category}", "pixivision", directory_fmt = ("{category}", "pixivision",
"{pixivision_id} {pixivision_title}") "{pixivision_id} {pixivision_title}")
archive_fmt = "V{pixivision_id}_{id}{suffix}.{extension}" archive_fmt = "V{pixivision_id}_{id}{suffix}.{extension}"
cookies_domain = ".pixiv.net"
pattern = r"(?:https?://)?(?:www\.)?pixivision\.net/(?:en/)?a/(\d+)" pattern = r"(?:https?://)?(?:www\.)?pixivision\.net/(?:en/)?a/(\d+)"
example = "https://www.pixivision.net/en/a/12345" example = "https://www.pixivision.net/en/a/12345"
@ -549,6 +550,9 @@ class PixivSeriesExtractor(PixivExtractor):
directory_fmt = ("{category}", "{user[id]} {user[account]}", directory_fmt = ("{category}", "{user[id]} {user[account]}",
"{series[id]} {series[title]}") "{series[id]} {series[title]}")
filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}" filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}"
cookies_domain = ".pixiv.net"
browser = "firefox"
tls12 = False
pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)" pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)"
example = "https://www.pixiv.net/user/12345/series/12345" example = "https://www.pixiv.net/user/12345/series/12345"
@ -590,7 +594,7 @@ class PixivSeriesExtractor(PixivExtractor):
class PixivNovelExtractor(PixivExtractor): class PixivNovelExtractor(PixivExtractor):
"""Extractor for pixiv novels""" """Extractor for pixiv novels"""
subcategory = "novel" subcategory = "novel"
request_interval = 1.0 request_interval = (0.5, 1.5)
pattern = BASE_PATTERN + r"/n(?:ovel/show\.php\?id=|/)(\d+)" pattern = BASE_PATTERN + r"/n(?:ovel/show\.php\?id=|/)(\d+)"
example = "https://www.pixiv.net/novel/show.php?id=12345" example = "https://www.pixiv.net/novel/show.php?id=12345"
@ -822,9 +826,9 @@ class PixivAppAPI():
extractor.session.headers.update({ extractor.session.headers.update({
"App-OS" : "ios", "App-OS" : "ios",
"App-OS-Version": "13.1.2", "App-OS-Version": "16.7.2",
"App-Version" : "7.7.6", "App-Version" : "7.19.1",
"User-Agent" : "PixivIOSApp/7.7.6 (iOS 13.1.2; iPhone11,8)", "User-Agent" : "PixivIOSApp/7.19.1 (iOS 16.7.2; iPhone12,8)",
"Referer" : "https://app-api.pixiv.net/", "Referer" : "https://app-api.pixiv.net/",
}) })
@ -992,6 +996,6 @@ class PixivAppAPI():
params = text.parse_query(query) params = text.parse_query(query)
@cache(maxage=10*365*24*3600, keyarg=0) @cache(maxage=36500*86400, keyarg=0)
def _refresh_token_cache(username): def _refresh_token_cache(username):
return None return None

@ -18,7 +18,7 @@ class PlurkExtractor(Extractor):
"""Base class for plurk extractors""" """Base class for plurk extractors"""
category = "plurk" category = "plurk"
root = "https://www.plurk.com" root = "https://www.plurk.com"
request_interval = 1.0 request_interval = (0.5, 1.5)
def items(self): def items(self):
urls = self._urls_ex if self.config("comments", False) else self._urls urls = self._urls_ex if self.config("comments", False) else self._urls

@ -0,0 +1,138 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for http://www.poringa.net/"""
from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
import itertools
BASE_PATTERN = r"(?:https?://)?(?:www\.)?poringa\.net"
class PoringaExtractor(Extractor):
category = "poringa"
directory_fmt = ("{category}", "{user}", "{post_id}")
filename_fmt = "{post_id}_{title}_{num:>03}_{filename}.{extension}"
archive_fmt = "{post_id}_{num}"
root = "http://www.poringa.net"
def __init__(self, match):
Extractor.__init__(self, match)
self.item = match.group(1)
self.__cookies = True
def items(self):
for post_id in self.posts():
url = "{}/posts/imagenes/{}".format(self.root, post_id)
try:
response = self.request(url)
except exception.HttpError as exc:
self.log.warning(
"Unable to fetch posts for '%s' (%s)", post_id, exc)
continue
if "/registro-login?" in response.url:
self.log.warning("Private post '%s'", post_id)
continue
page = response.text
title, pos = text.extract(
page, 'property="og:title" content="', '"')
try:
pos = page.index('<div class="main-info', pos)
user, pos = text.extract(
page, 'href="http://www.poringa.net/', '"', pos)
except ValueError:
user = None
if not user:
user = "poringa"
data = {
"post_id" : post_id,
"title" : text.unescape(title),
"user" : text.unquote(user),
"_http_headers": {"Referer": url},
}
main_post = text.extr(
page, 'property="dc:content" role="main">', '</div>')
urls = list(text.extract_iter(
main_post, '<img class="imagen" border="0" src="', '"'))
data["count"] = len(urls)
yield Message.Directory, data
for data["num"], url in enumerate(urls, 1):
yield Message.Url, url, text.nameext_from_url(url, data)
def posts(self):
return ()
def request(self, url, **kwargs):
if self.__cookies:
self.__cookies = False
self.cookies_update(_cookie_cache())
for _ in range(5):
response = Extractor.request(self, url, **kwargs)
if response.cookies:
_cookie_cache.update("", response.cookies)
if response.content.find(
b"<title>Please wait a few moments</title>", 0, 600) < 0:
return response
self.sleep(5.0, "check")
def _pagination(self, url, params):
for params["p"] in itertools.count(1):
page = self.request(url, params=params).text
posts_ids = PoringaPostExtractor.pattern.findall(page)
posts_ids = list(dict.fromkeys(posts_ids))
yield from posts_ids
if len(posts_ids) < 19:
return
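
_pagination() walks the search pages via the "p" parameter and de-duplicates post IDs while keeping their order; the dict.fromkeys trick is the interesting bit:

    ids = ["123", "456", "123", "789"]
    unique_in_order = list(dict.fromkeys(ids))   # ['123', '456', '789']
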
class PoringaPostExtractor(PoringaExtractor):
"""Extractor for posts on poringa.net"""
subcategory = "post"
pattern = BASE_PATTERN + r"/posts/imagenes/(\d+)"
example = "http://www.poringa.net/posts/imagenes/12345/TITLE.html"
def posts(self):
return (self.item,)
class PoringaUserExtractor(PoringaExtractor):
subcategory = "user"
pattern = BASE_PATTERN + r"/(\w+)$"
example = "http://www.poringa.net/USER"
def posts(self):
url = self.root + "/buscar/"
params = {"q": self.item}
return self._pagination(url, params)
class PoringaSearchExtractor(PoringaExtractor):
subcategory = "search"
pattern = BASE_PATTERN + r"/buscar/\?&?q=([^&#]+)"
example = "http://www.poringa.net/buscar/?q=QUERY"
def posts(self):
url = self.root + "/buscar/"
params = {"q": self.item}
return self._pagination(url, params)
@cache()
def _cookie_cache():
return ()

@ -143,7 +143,7 @@ class PornhubGifExtractor(PornhubExtractor):
"url" : extr('"contentUrl": "', '"'), "url" : extr('"contentUrl": "', '"'),
"date" : text.parse_datetime( "date" : text.parse_datetime(
extr('"uploadDate": "', '"'), "%Y-%m-%d"), extr('"uploadDate": "', '"'), "%Y-%m-%d"),
"user" : extr('data-mxptext="', '"'), "user" : text.remove_html(extr("Created by:", "</div>")),
} }
yield Message.Directory, gif yield Message.Directory, gif

@ -0,0 +1,203 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for Postmill instances"""
import re
from .common import BaseExtractor, Message
from .. import text, exception
class PostmillExtractor(BaseExtractor):
"""Base class for Postmill extractors"""
basecategory = "postmill"
directory_fmt = ("{category}", "{instance}", "{forum}")
filename_fmt = "{id}_{title[:220]}.{extension}"
archive_fmt = "{filename}"
def _init(self):
self.instance = self.root.partition("://")[2]
self.save_link_post_body = self.config("save-link-post-body", False)
self._search_canonical_url = re.compile(r"/f/([\w\d_]+)/(\d+)/").search
self._search_image_tag = re.compile(
r'<a href="[^"]+"\n +class="submission__image-link"').search
def items(self):
for post_url in self.post_urls():
page = self.request(post_url).text
extr = text.extract_from(page)
title = text.unescape(extr(
'<meta property="og:title" content="', '">'))
date = text.parse_datetime(extr(
'<meta property="og:article:published_time" content="', '">'))
username = extr(
'<meta property="og:article:author" content="', '">')
post_canonical_url = text.unescape(extr(
'<link rel="canonical" href="', '">'))
url = text.unescape(extr(
'<h1 class="submission__title unheaderize inline"><a href="',
'"'))
body = extr(
'<div class="submission__body break-text text-flow">',
'</div>')
match = self._search_canonical_url(post_canonical_url)
forum = match.group(1)
id = int(match.group(2))
is_text_post = url.startswith("/")
is_image_post = self._search_image_tag(page) is not None
data = {
"title": title,
"date": date,
"username": username,
"forum": forum,
"id": id,
"flair": [text.unescape(i) for i in text.extract_iter(
page, '<span class="flair__label">', '</span>')],
"instance": self.instance,
}
urls = []
if is_text_post or self.save_link_post_body:
urls.append((Message.Url, "text:" + body))
if is_image_post:
urls.append((Message.Url, url))
elif not is_text_post:
urls.append((Message.Queue, url))
data["count"] = len(urls)
yield Message.Directory, data
for data["num"], (msg, url) in enumerate(urls, 1):
if url.startswith("text:"):
data["filename"], data["extension"] = "", "htm"
else:
data = text.nameext_from_url(url, data)
yield msg, url, data
class PostmillSubmissionsExtractor(PostmillExtractor):
"""Base class for Postmill submissions extractors"""
whitelisted_parameters = ()
def __init__(self, match):
PostmillExtractor.__init__(self, match)
groups = match.groups()
self.base = groups[-3]
self.sorting_path = groups[-2] or ""
self.query = {key: value for key, value in text.parse_query(
groups[-1]).items() if self.acceptable_query(key)}
def items(self):
url = self.root + self.base + self.sorting_path
while url:
response = self.request(url, params=self.query)
if response.history:
redirect_url = response.url
if redirect_url == self.root + "/login":
raise exception.StopExtraction(
"HTTP redirect to login page (%s)", redirect_url)
page = response.text
for nav in text.extract_iter(page,
'<nav class="submission__nav">',
'</nav>'):
post_url = text.unescape(text.extr(nav, '<a href="', '"'))
yield Message.Queue, text.urljoin(url, post_url), \
{"_extractor": PostmillPostExtractor}
url = text.unescape(text.extr(page,
'<link rel="next" href="', '">'))
def acceptable_query(self, key):
return key in self.whitelisted_parameters or key == "t" or \
(key.startswith("next[") and key.endswith("]"))
BASE_PATTERN = PostmillExtractor.update({
"raddle": {
"root" : None,
"pattern": (r"(?:raddle\.me|"
r"c32zjeghcp5tj3kb72pltz56piei66drc63vkhn5yixiyk4cmerrjtid"
r"\.onion)"),
}
})
QUERY_RE = r"(?:\?([^#]+))?$"
SORTING_RE = r"(/(?:hot|new|active|top|controversial|most_commented))?" + \
QUERY_RE
class PostmillPostExtractor(PostmillExtractor):
"""Extractor for a single submission URL"""
subcategory = "post"
pattern = BASE_PATTERN + r"/f/(\w+)/(\d+)"
example = "https://raddle.me/f/FORUM/123/TITLE"
def __init__(self, match):
PostmillExtractor.__init__(self, match)
self.forum = match.group(3)
self.post_id = match.group(4)
def post_urls(self):
return (self.root + "/f/" + self.forum + "/" + self.post_id,)
class PostmillShortURLExtractor(PostmillExtractor):
"""Extractor for short submission URLs"""
subcategory = "shorturl"
pattern = BASE_PATTERN + r"/(\d+)$"
example = "https://raddle.me/123"
def __init__(self, match):
PostmillExtractor.__init__(self, match)
self.post_id = match.group(3)
def items(self):
url = self.root + "/" + self.post_id
response = self.request(url, method="HEAD", allow_redirects=False)
full_url = text.urljoin(url, response.headers["Location"])
yield Message.Queue, full_url, {"_extractor": PostmillPostExtractor}
class PostmillHomeExtractor(PostmillSubmissionsExtractor):
"""Extractor for the home page"""
subcategory = "home"
pattern = BASE_PATTERN + r"(/(?:featured|subscribed|all)?)" + SORTING_RE
example = "https://raddle.me/"
class PostmillForumExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions on a forum"""
subcategory = "forum"
pattern = BASE_PATTERN + r"(/f/\w+)" + SORTING_RE
example = "https://raddle.me/f/FORUM"
class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions made by a user"""
subcategory = "usersubmissions"
pattern = BASE_PATTERN + r"(/user/\w+/submissions)()" + QUERY_RE
example = "https://raddle.me/user/USER/submissions"
class PostmillTagExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions on a forum with a specific tag"""
subcategory = "tag"
pattern = BASE_PATTERN + r"(/tag/\w+)" + SORTING_RE
example = "https://raddle.me/tag/TAG"
class PostmillSearchExtractor(PostmillSubmissionsExtractor):
"""Extractor for search results"""
subcategory = "search"
pattern = BASE_PATTERN + r"(/search)()\?(q=[^#]+)$"
example = "https://raddle.me/search?q=QUERY"
whitelisted_parameters = ("q",)
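
As a rough illustration of the short-URL handling above: PostmillShortURLExtractor issues a HEAD request without following redirects and queues the Location target as a regular post URL. The standalone sketch below mimics that with plain requests; the raddle.me URL is just a placeholder.

import requests
from urllib.parse import urljoin

def resolve_short_url(short_url):
    # keep the 3xx response so the Location header is still available
    response = requests.head(short_url, allow_redirects=False, timeout=30)
    return urljoin(short_url, response.headers["Location"])

print(resolve_short_url("https://raddle.me/123"))  # full /f/<forum>/<id>/<slug> URL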

@ -18,7 +18,7 @@ class ReactorExtractor(BaseExtractor):
basecategory = "reactor" basecategory = "reactor"
filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}" filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}"
archive_fmt = "{post_id}_{num}" archive_fmt = "{post_id}_{num}"
request_interval = 5.0 request_interval = (3.0, 6.0)
def __init__(self, match): def __init__(self, match):
BaseExtractor.__init__(self, match) BaseExtractor.__init__(self, match)

@ -23,7 +23,7 @@ class ReadcomiconlineBase():
filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}" filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
archive_fmt = "{issue_id}_{page}" archive_fmt = "{issue_id}_{page}"
root = "https://readcomiconline.li" root = "https://readcomiconline.li"
request_interval = (3.0, 7.0) request_interval = (3.0, 6.0)
def request(self, url, **kwargs): def request(self, url, **kwargs):
"""Detect and handle redirects to CAPTCHA pages""" """Detect and handle redirects to CAPTCHA pages"""

@ -115,12 +115,18 @@ class RedditExtractor(Extractor):
continue continue
if url[0] == "/": if url[0] == "/":
url = "https://www.reddit.com" + url url = "https://www.reddit.com" + url
if url.startswith((
"https://www.reddit.com/message/compose",
"https://reddit.com/message/compose",
)):
continue
match = match_submission(url) match = match_submission(url)
if match: if match:
extra.append(match.group(1)) extra.append(match.group(1))
elif not match_user(url) and not match_subreddit(url): elif not match_user(url) and not match_subreddit(url):
if previews and "preview" in data: if previews and "comment" not in data and \
"preview" in data:
data["_fallback"] = self._previews(data) data["_fallback"] = self._previews(data)
yield Message.Queue, text.unescape(url), data yield Message.Queue, text.unescape(url), data
if "_fallback" in data: if "_fallback" in data:
@ -153,7 +159,7 @@ class RedditExtractor(Extractor):
data = meta[item["media_id"]] data = meta[item["media_id"]]
if data["status"] != "valid" or "s" not in data: if data["status"] != "valid" or "s" not in data:
self.log.warning( self.log.warning(
"gallery %s: skipping item %s ('status: %s')", "gallery %s: skipping item %s (status: %s)",
submission["id"], item["media_id"], data.get("status")) submission["id"], item["media_id"], data.get("status"))
continue continue
src = data["s"] src = data["s"]
@ -286,6 +292,29 @@ class RedditImageExtractor(Extractor):
yield Message.Url, url, data yield Message.Url, url, data
class RedditRedirectExtractor(Extractor):
"""Extractor for personalized share URLs produced by the mobile app"""
category = "reddit"
subcategory = "redirect"
pattern = (r"(?:https?://)?(?:"
r"(?:\w+\.)?reddit\.com/(?:(?:r)/([^/?#]+)))"
r"/s/([a-zA-Z0-9]{10})")
example = "https://www.reddit.com/r/SUBREDDIT/s/abc456GHIJ"
def __init__(self, match):
Extractor.__init__(self, match)
self.subreddit = match.group(1)
self.share_url = match.group(2)
def items(self):
url = "https://www.reddit.com/r/" + self.subreddit + "/s/" + \
self.share_url
data = {"_extractor": RedditSubmissionExtractor}
response = self.request(url, method="HEAD", allow_redirects=False,
notfound="submission")
yield Message.Queue, response.headers["Location"], data
class RedditAPI(): class RedditAPI():
"""Interface for the Reddit API """Interface for the Reddit API
@ -394,9 +423,10 @@ class RedditAPI():
"grants/installed_client"), "grants/installed_client"),
"device_id": "DO_NOT_TRACK_THIS_DEVICE"} "device_id": "DO_NOT_TRACK_THIS_DEVICE"}
auth = util.HTTPBasicAuth(self.client_id, "")
response = self.extractor.request( response = self.extractor.request(
url, method="POST", headers=self.headers, url, method="POST", headers=self.headers,
data=data, auth=(self.client_id, ""), fatal=False) data=data, auth=auth, fatal=False)
data = response.json() data = response.json()
if response.status_code != 200: if response.status_code != 200:
@ -501,7 +531,7 @@ class RedditAPI():
return util.bdecode(sid, "0123456789abcdefghijklmnopqrstuvwxyz") return util.bdecode(sid, "0123456789abcdefghijklmnopqrstuvwxyz")
@cache(maxage=100*365*24*3600, keyarg=0) @cache(maxage=36500*86400, keyarg=0)
def _refresh_token_cache(token): def _refresh_token_cache(token):
if token and token[0] == "#": if token and token[0] == "#":
return None return None
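
For reference, the new RedditRedirectExtractor targets the /s/ share links generated by the mobile app. A minimal sketch of the matching it performs (the pattern is restated in simplified form; the subreddit and share id below are made up):

import re

SHARE_URL = re.compile(
    r"(?:https?://)?(?:\w+\.)?reddit\.com/r/([^/?#]+)/s/([A-Za-z0-9]{10})")

match = SHARE_URL.match("https://www.reddit.com/r/SUBREDDIT/s/abc456GHIJ")
if match:
    subreddit, share_id = match.groups()
    # the extractor then resolves this via a HEAD request and queues the
    # submission URL it gets redirected to
    print(subreddit, share_id)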

@ -89,14 +89,20 @@ class RedgifsUserExtractor(RedgifsExtractor):
"""Extractor for redgifs user profiles""" """Extractor for redgifs user profiles"""
subcategory = "user" subcategory = "user"
directory_fmt = ("{category}", "{userName}") directory_fmt = ("{category}", "{userName}")
pattern = r"(?:https?://)?(?:\w+\.)?redgifs\.com/users/([^/?#]+)/?$" pattern = (r"(?:https?://)?(?:\w+\.)?redgifs\.com/users/([^/?#]+)/?"
r"(?:\?([^#]+))?$")
example = "https://www.redgifs.com/users/USER" example = "https://www.redgifs.com/users/USER"
def __init__(self, match):
RedgifsExtractor.__init__(self, match)
self.query = match.group(2)
def metadata(self): def metadata(self):
return {"userName": self.key} return {"userName": self.key}
def gifs(self): def gifs(self):
return self.api.user(self.key) order = text.parse_query(self.query).get("order")
return self.api.user(self.key, order or "new")
class RedgifsCollectionExtractor(RedgifsExtractor): class RedgifsCollectionExtractor(RedgifsExtractor):
@ -140,11 +146,17 @@ class RedgifsCollectionsExtractor(RedgifsExtractor):
class RedgifsNichesExtractor(RedgifsExtractor): class RedgifsNichesExtractor(RedgifsExtractor):
"""Extractor for redgifs niches""" """Extractor for redgifs niches"""
subcategory = "niches" subcategory = "niches"
pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/niches/([^/?#]+)" pattern = (r"(?:https?://)?(?:www\.)?redgifs\.com/niches/([^/?#]+)/?"
r"(?:\?([^#]+))?$")
example = "https://www.redgifs.com/niches/NAME" example = "https://www.redgifs.com/niches/NAME"
def __init__(self, match):
RedgifsExtractor.__init__(self, match)
self.query = match.group(2)
def gifs(self): def gifs(self):
return self.api.niches(self.key) order = text.parse_query(self.query).get("order")
return self.api.niches(self.key, order or "new")
class RedgifsSearchExtractor(RedgifsExtractor): class RedgifsSearchExtractor(RedgifsExtractor):
@ -208,7 +220,7 @@ class RedgifsAPI():
endpoint = "/v2/gallery/" + gallery_id endpoint = "/v2/gallery/" + gallery_id
return self._call(endpoint) return self._call(endpoint)
def user(self, user, order="best"): def user(self, user, order="new"):
endpoint = "/v2/users/{}/search".format(user.lower()) endpoint = "/v2/users/{}/search".format(user.lower())
params = {"order": order} params = {"order": order}
return self._pagination(endpoint, params) return self._pagination(endpoint, params)
@ -226,9 +238,10 @@ class RedgifsAPI():
endpoint = "/v2/users/{}/collections".format(user) endpoint = "/v2/users/{}/collections".format(user)
return self._pagination(endpoint, key="collections") return self._pagination(endpoint, key="collections")
def niches(self, niche): def niches(self, niche, order):
endpoint = "/v2/niches/{}/gifs".format(niche) endpoint = "/v2/niches/{}/gifs".format(niche)
return self._pagination(endpoint) params = {"count": 30, "order": order}
return self._pagination(endpoint, params)
def search(self, params): def search(self, params):
endpoint = "/v2/gifs/search" endpoint = "/v2/gifs/search"

@ -38,7 +38,11 @@ class Rule34usExtractor(BooruExtractor):
"height" : extr(' x ', 'h'), "height" : extr(' x ', 'h'),
"file_url": extr(' src="', '"'), "file_url": extr(' src="', '"'),
} }
post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
url = post["file_url"]
if "//video-cdn1." in url:
post["_fallback"] = (url.replace("//video-cdn1.", "//video."),)
post["md5"] = url.rpartition("/")[2].partition(".")[0]
tags = collections.defaultdict(list) tags = collections.defaultdict(list)
for tag_type, tag_name in self._find_tags(page): for tag_type, tag_name in self._find_tags(page):
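
The rule34.us change above registers an alternative host as a download fallback whenever the primary file URL points at video-cdn1. Sketch with a made-up URL:

url = "https://video-cdn1.rule34.us/images/ab/cd/abcdef12345.mp4"
fallback = (url.replace("//video-cdn1.", "//video."),)  # tried if the first URL fails
print(fallback[0])  # -> https://video.rule34.us/images/ab/cd/abcdef12345.mp4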

@ -87,7 +87,7 @@ class SankakuTagExtractor(SankakuExtractor):
subcategory = "tag" subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}") directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}" archive_fmt = "t_{search_tags}_{id}"
pattern = BASE_PATTERN + r"/?\?([^#]*)" pattern = BASE_PATTERN + r"(?:/posts)?/?\?([^#]*)"
example = "https://sankaku.app/?tags=TAG" example = "https://sankaku.app/?tags=TAG"
def __init__(self, match): def __init__(self, match):
@ -117,7 +117,7 @@ class SankakuPoolExtractor(SankakuExtractor):
subcategory = "pool" subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name_en]}") directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name_en]}")
archive_fmt = "p_{pool}_{id}" archive_fmt = "p_{pool}_{id}"
pattern = BASE_PATTERN + r"/(?:books|pool/show)/(\d+)" pattern = BASE_PATTERN + r"/(?:books|pools?/show)/(\d+)"
example = "https://sankaku.app/books/12345" example = "https://sankaku.app/books/12345"
def __init__(self, match): def __init__(self, match):
@ -143,7 +143,7 @@ class SankakuPostExtractor(SankakuExtractor):
"""Extractor for single posts from sankaku.app""" """Extractor for single posts from sankaku.app"""
subcategory = "post" subcategory = "post"
archive_fmt = "{id}" archive_fmt = "{id}"
pattern = BASE_PATTERN + r"/post/show/([0-9a-f]+)" pattern = BASE_PATTERN + r"/posts?(?:/show)?/(\w+)"
example = "https://sankaku.app/post/show/12345" example = "https://sankaku.app/post/show/12345"
def __init__(self, match): def __init__(self, match):
@ -179,12 +179,16 @@ class SankakuAPI():
def __init__(self, extractor): def __init__(self, extractor):
self.extractor = extractor self.extractor = extractor
self.headers = { self.headers = {
"Accept" : "application/vnd.sankaku.api+json;v=2", "Accept" : "application/vnd.sankaku.api+json;v=2",
"Platform": "web-app", "Platform" : "web-app",
"Origin" : extractor.root, "Api-Version": None,
"Origin" : extractor.root,
} }
self.username, self.password = self.extractor._get_auth_info() if extractor.config("id-format") in ("alnum", "alphanumeric"):
self.headers["Api-Version"] = "2"
self.username, self.password = extractor._get_auth_info()
if not self.username: if not self.username:
self.authenticate = util.noop self.authenticate = util.noop
@ -285,7 +289,7 @@ class SankakuAPI():
return return
@cache(maxage=365*24*3600, keyarg=1) @cache(maxage=365*86400, keyarg=1)
def _authenticate_impl(extr, username, password): def _authenticate_impl(extr, username, password):
extr.log.info("Logging in as %s", username) extr.log.info("Logging in as %s", username)
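
The Sankaku API client now only sends an Api-Version header when the user opts into alphanumeric post IDs via the id-format option; a condensed sketch of that decision:

def sankaku_headers(id_format, root="https://sankaku.app"):
    headers = {
        "Accept"     : "application/vnd.sankaku.api+json;v=2",
        "Platform"   : "web-app",
        "Api-Version": None,   # left unset for the default (numeric) ID format
        "Origin"     : root,
    }
    if id_format in ("alnum", "alphanumeric"):
        headers["Api-Version"] = "2"
    return headers

print(sankaku_headers("alnum")["Api-Version"])  # -> 2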

@ -19,17 +19,12 @@ class Shimmie2Extractor(BaseExtractor):
archive_fmt = "{id}" archive_fmt = "{id}"
def _init(self): def _init(self):
try: cookies = self.config_instance("cookies")
instance = INSTANCES[self.category]
except KeyError:
return
cookies = instance.get("cookies")
if cookies: if cookies:
domain = self.root.rpartition("/")[2] domain = self.root.rpartition("/")[2]
self.cookies_update_dict(cookies, domain=domain) self.cookies_update_dict(cookies, domain=domain)
file_url = instance.get("file_url") file_url = self.config_instance("file_url")
if file_url: if file_url:
self.file_url_fmt = file_url self.file_url_fmt = file_url
@ -41,8 +36,9 @@ class Shimmie2Extractor(BaseExtractor):
for post in self.posts(): for post in self.posts():
for key in ("id", "width", "height"): post["id"] = text.parse_int(post["id"])
post[key] = text.parse_int(post[key]) post["width"] = text.parse_int(post["width"])
post["height"] = text.parse_int(post["height"])
post["tags"] = text.unquote(post["tags"]) post["tags"] = text.unquote(post["tags"])
post.update(data) post.update(data)
@ -64,20 +60,23 @@ class Shimmie2Extractor(BaseExtractor):
"""Return an iterable containing data of all relevant posts""" """Return an iterable containing data of all relevant posts"""
return () return ()
def _quote_type(self, page):
"""Return quoting character used in 'page' (' or ")"""
try:
return page[page.index("<link rel=")+10]
except Exception:
return "'"
INSTANCES = {
"mememuseum": { BASE_PATTERN = Shimmie2Extractor.update({
"root": "https://meme.museum",
"pattern": r"meme\.museum",
},
"loudbooru": { "loudbooru": {
"root": "https://loudbooru.com", "root": "https://loudbooru.com",
"pattern": r"loudbooru\.com", "pattern": r"loudbooru\.com",
"cookies": {"ui-tnc-agreed": "true"}, "cookies": {"ui-tnc-agreed": "true"},
}, },
"giantessbooru": { "giantessbooru": {
"root": "https://giantessbooru.com", "root": "https://sizechangebooru.com",
"pattern": r"giantessbooru\.com", "pattern": r"(?:sizechange|giantess)booru\.com",
"cookies": {"agreed": "true"}, "cookies": {"agreed": "true"},
}, },
"tentaclerape": { "tentaclerape": {
@ -89,9 +88,11 @@ INSTANCES = {
"pattern": r"booru\.cavemanon\.xyz", "pattern": r"booru\.cavemanon\.xyz",
"file_url": "{0}/index.php?q=image/{2}.{4}", "file_url": "{0}/index.php?q=image/{2}.{4}",
}, },
} "rule34hentai": {
"root": "https://rule34hentai.net",
BASE_PATTERN = Shimmie2Extractor.update(INSTANCES) + r"/(?:index\.php\?q=/?)?" "pattern": r"rule34hentai\.net",
},
}) + r"/(?:index\.php\?q=/?)?"
class Shimmie2TagExtractor(Shimmie2Extractor): class Shimmie2TagExtractor(Shimmie2Extractor):
@ -125,21 +126,26 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
if init: if init:
init = False init = False
has_mime = ("data-mime='" in page) quote = self._quote_type(page)
has_pid = ("data-post-id='" in page) has_mime = (" data-mime=" in page)
has_pid = (" data-post-id=" in page)
while True: while True:
if has_mime: if has_mime:
mime = extr("data-mime='", "'") mime = extr(" data-mime="+quote, quote)
if has_pid: if has_pid:
pid = extr("data-post-id='", "'") pid = extr(" data-post-id="+quote, quote)
else: else:
pid = extr("href='/post/view/", "?") pid = extr(" href='/post/view/", quote)
if not pid: if not pid:
break break
tags, dimensions, size = extr("title='", "'").split(" // ") data = extr("title="+quote, quote).split(" // ")
tags = data[0]
dimensions = data[1]
size = data[2]
width, _, height = dimensions.partition("x") width, _, height = dimensions.partition("x")
md5 = extr("/_thumbs/", "/") md5 = extr("/_thumbs/", "/")
@ -170,25 +176,25 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
extr = text.extract_from(self.request(url).text) extr = text.extract_from(self.request(url).text)
while True: while True:
pid = extr('href="./index.php?q=/post/view/', '&') pid = extr("href='./index.php?q=/post/view/", "&")
if not pid: if not pid:
break break
tags, dimensions, size = extr('title="', '"').split(" // ") tags, dimensions, size = extr("title='", "'").split(" // ")
width, _, height = dimensions.partition("x") width, _, height = dimensions.partition("x")
yield { yield {
"file_url": file_url_fmt(pid), "file_url": file_url_fmt(pid),
"id": pid, "id" : pid,
"md5": "", "md5" : "",
"tags": tags, "tags" : tags,
"width": width, "width" : width,
"height": height, "height" : height,
"size": text.parse_bytes(size[:-1]), "size" : text.parse_bytes(size[:-1]),
} }
pnum += 1 pnum += 1
if not extr('/{}">{}<'.format(pnum, pnum), ">"): if not extr("/{0}'>{0}<".format(pnum), ">"):
return return
@ -204,15 +210,17 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
def posts(self): def posts(self):
url = "{}/post/view/{}".format(self.root, self.post_id) url = "{}/post/view/{}".format(self.root, self.post_id)
extr = text.extract_from(self.request(url).text) page = self.request(url).text
extr = text.extract_from(page)
quote = self._quote_type(page)
post = { post = {
"id" : self.post_id, "id" : self.post_id,
"tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"), "tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"),
"md5" : extr("/_thumbs/", "/"), "md5" : extr("/_thumbs/", "/"),
"file_url": self.root + ( "file_url": self.root + (
extr("id='main_image' src='", "'") or extr("id={0}main_image{0} src={0}".format(quote), quote) or
extr("<source src='", "'")).lstrip("."), extr("<source src="+quote, quote)).lstrip("."),
"width" : extr("data-width=", " ").strip("\"'"), "width" : extr("data-width=", " ").strip("\"'"),
"height" : extr("data-height=", ">").partition( "height" : extr("data-height=", ">").partition(
" ")[0].strip("\"'"), " ")[0].strip("\"'"),
@ -233,7 +241,7 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
"id" : self.post_id, "id" : self.post_id,
"tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"), "tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"),
"md5" : "", "md5" : "",
"file_url": self.root + extr('id="main_image" src=".', '"'), "file_url": self.root + extr("id='main_image' src='.", "'"),
"width" : extr("orig_width =", ";"), "width" : extr("orig_width =", ";"),
"height" : 0, "height" : 0,
"size" : 0, "size" : 0,

@ -0,0 +1,211 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://www.steamgriddb.com"""
from .common import Extractor, Message
from .. import text, exception
BASE_PATTERN = r"(?:https?://)?(?:www\.)?steamgriddb\.com"
LANGUAGE_CODES = (
"aa", "ab", "ae", "af", "ak", "am", "an", "ar", "as", "av", "ay", "az",
"ba", "be", "bg", "bh", "bi", "bm", "bn", "bo", "br", "bs", "ca", "ce",
"ch", "co", "cr", "cs", "cu", "cv", "cy", "da", "de", "dv", "dz", "ee",
"el", "en", "eo", "es", "et", "eu", "fa", "ff", "fi", "fj", "fo", "fr",
"fy", "ga", "gd", "gl", "gn", "gu", "gv", "ha", "he", "hi", "ho", "hr",
"ht", "hu", "hy", "hz", "ia", "id", "ie", "ig", "ii", "ik", "io", "is",
"it", "iu", "ja", "jv", "ka", "kg", "ki", "kj", "kk", "kl", "km", "kn",
"ko", "kr", "ks", "ku", "kv", "kw", "ky", "la", "lb", "lg", "li", "ln",
"lo", "lt", "lu", "lv", "mg", "mh", "mi", "mk", "ml", "mn", "mr", "ms",
"mt", "my", "na", "nb", "nd", "ne", "ng", "nl", "nn", "no", "nr", "nv",
"ny", "oc", "oj", "om", "or", "os", "pa", "pi", "pl", "ps", "pt", "qu",
"rm", "rn", "ro", "ru", "rw", "sa", "sc", "sd", "se", "sg", "si", "sk",
"sl", "sm", "sn", "so", "sq", "sr", "ss", "st", "su", "sv", "sw", "ta",
"te", "tg", "th", "ti", "tk", "tl", "tn", "to", "tr", "ts", "tt", "tw",
"ty", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi",
"yo", "za", "zh", "zu",
)
FILE_EXT_TO_MIME = {
"png": "image/png",
"jpeg": "image/jpeg",
"jpg": "image/jpeg",
"webp": "image/webp",
"ico": "image/vnd.microsoft.icon",
"all": "all",
}
class SteamgriddbExtractor(Extractor):
"""Base class for SteamGridDB"""
category = "steamgriddb"
directory_fmt = ("{category}", "{subcategory}", "{game[id]}")
filename_fmt = "{game[id]}_{id}_{num:>02}.{extension}"
archive_fmt = "{filename}"
root = "https://www.steamgriddb.com"
def _init(self):
self.cookies_update({
"userprefs": "%7B%22adult%22%3Afalse%7D",
})
def items(self):
download_fake_png = self.config("download-fake-png", True)
for asset in self.assets():
if download_fake_png and asset.get("fake_png"):
urls = (asset["url"], asset["fake_png"])
else:
urls = (asset["url"],)
asset["count"] = len(urls)
yield Message.Directory, asset
for asset["num"], url in enumerate(urls, 1):
yield Message.Url, url, text.nameext_from_url(url, asset)
def _call(self, endpoint, **kwargs):
data = self.request(self.root + endpoint, **kwargs).json()
if not data["success"]:
raise exception.StopExtraction(data["error"])
return data["data"]
class SteamgriddbAssetsExtractor(SteamgriddbExtractor):
"""Base class for extracting a list of assets"""
def __init__(self, match):
SteamgriddbExtractor.__init__(self, match)
list_type = match.group(1)
id = int(match.group(2))
self.game_id = id if list_type == "game" else None
self.collection_id = id if list_type == "collection" else None
self.page = int(match.group(3) or 1)
def assets(self):
limit = 48
page = min(self.page - 1, 0)
sort = self.config("sort", "score_desc")
if sort not in ("score_desc", "score_asc", "score_old_desc",
"score_old_asc", "age_desc", "age_asc"):
raise exception.StopExtraction("Invalid sort '%s'", sort)
json = {
"static" : self.config("static", True),
"animated": self.config("animated", True),
"humor" : self.config("humor", True),
"nsfw" : self.config("nsfw", True),
"epilepsy": self.config("epilepsy", True),
"untagged": self.config("untagged", True),
"asset_type": self.asset_type,
"limit": limit,
"order": sort,
}
if self.valid_dimensions:
json["dimensions"] = self.config_list(
"dimensions", "dimension", self.valid_dimensions)
json["styles"] = self.config_list("styles", "style", self.valid_styles)
json["languages"] = self.config_list(
"languages", "language", LANGUAGE_CODES)
file_types = self.config_list(
"file-types", "file type", self.valid_file_types)
json["mime"] = [FILE_EXT_TO_MIME[i] for i in file_types]
if self.game_id:
json["game_id"] = [self.game_id]
else:
json["collection_id"] = self.collection_id
while True:
json["page"] = page
data = self._call(
"/api/public/search/assets", method="POST", json=json)
for asset in data["assets"]:
if not asset.get("game"):
asset["game"] = data["game"]
yield asset
if data["total"] <= limit * page:
break
page += 1
def config_list(self, key, type_name, valid_values):
value = self.config(key)
if isinstance(value, str):
value = value.split(",")
if value is None or "all" in value:
return ["all"]
for i in value:
if i not in valid_values:
raise exception.StopExtraction("Invalid %s '%s'", type_name, i)
return value
class SteamgriddbAssetExtractor(SteamgriddbExtractor):
"""Extractor for a single asset"""
subcategory = "asset"
pattern = BASE_PATTERN + r"/(grid|hero|logo|icon)/(\d+)"
example = "https://www.steamgriddb.com/grid/1234"
def __init__(self, match):
SteamgriddbExtractor.__init__(self, match)
self.asset_type = match.group(1)
self.asset_id = match.group(2)
def assets(self):
endpoint = "/api/public/asset/" + self.asset_type + "/" + self.asset_id
asset = self._call(endpoint)["asset"]
return (asset,)
class SteamgriddbGridsExtractor(SteamgriddbAssetsExtractor):
subcategory = "grids"
asset_type = "grid"
pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/grids(?:/(\d+))?"
example = "https://www.steamgriddb.com/game/1234/grids"
valid_dimensions = ("460x215", "920x430", "600x900", "342x482", "660x930",
"512x512", "1024x1024")
valid_styles = ("alternate", "blurred", "no_logo", "material",
"white_logo")
valid_file_types = ("png", "jpeg", "jpg", "webp")
class SteamgriddbHeroesExtractor(SteamgriddbAssetsExtractor):
subcategory = "heroes"
asset_type = "hero"
pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/heroes(?:/(\d+))?"
example = "https://www.steamgriddb.com/game/1234/heroes"
valid_dimensions = ("1920x620", "3840x1240", "1600x650")
valid_styles = ("alternate", "blurred", "material")
valid_file_types = ("png", "jpeg", "jpg", "webp")
class SteamgriddbLogosExtractor(SteamgriddbAssetsExtractor):
subcategory = "logos"
asset_type = "logo"
pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/logos(?:/(\d+))?"
example = "https://www.steamgriddb.com/game/1234/logos"
valid_dimensions = None
valid_styles = ("official", "white", "black", "custom")
valid_file_types = ("png", "webp")
class SteamgriddbIconsExtractor(SteamgriddbAssetsExtractor):
subcategory = "icons"
asset_type = "icon"
pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/icons(?:/(\d+))?"
example = "https://www.steamgriddb.com/game/1234/icons"
valid_dimensions = ["{0}x{0}".format(i) for i in (8, 10, 14, 16, 20, 24,
28, 32, 35, 40, 48, 54, 56, 57, 60, 64, 72, 76, 80, 90,
96, 100, 114, 120, 128, 144, 150, 152, 160, 180, 192,
194, 256, 310, 512, 768, 1024)]
valid_styles = ("official", "custom")
valid_file_types = ("png", "ico")

@ -56,7 +56,7 @@ class SubscribestarExtractor(Extractor):
if username: if username:
self.cookies_update(self._login_impl(username, password)) self.cookies_update(self._login_impl(username, password))
@cache(maxage=28*24*3600, keyarg=1) @cache(maxage=28*86400, keyarg=1)
def _login_impl(self, username, password): def _login_impl(self, username, password):
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)

@ -87,6 +87,10 @@ BASE_PATTERN = SzurubooruExtractor.update({
"root": "https://booru.bcbnsfw.space", "root": "https://booru.bcbnsfw.space",
"pattern": r"booru\.bcbnsfw\.space", "pattern": r"booru\.bcbnsfw\.space",
}, },
"snootbooru": {
"root": "https://snootbooru.com",
"pattern": r"snootbooru\.com",
},
}) })

@ -81,7 +81,7 @@ class TapasExtractor(Extractor):
self.cookies.set( self.cookies.set(
"adjustedBirthDate", "1981-02-03", domain=self.cookies_domain) "adjustedBirthDate", "1981-02-03", domain=self.cookies_domain)
@cache(maxage=14*24*3600, keyarg=1) @cache(maxage=14*86400, keyarg=1)
def _login_impl(self, username, password): def _login_impl(self, username, password):
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)

@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://tmohentai.com/"""
from .common import GalleryExtractor
from .. import text
BASE_PATTERN = r"(?:https?://)?tmohentai\.com"
class TmohentaiGalleryExtractor(GalleryExtractor):
category = "tmohentai"
root = "http://tmohentai.com"
directory_fmt = ("{category}", "{title} ({gallery_id})")
pattern = BASE_PATTERN + r"/(?:contents|reader)/(\w+)"
example = "https://tmohentai.com/contents/12345a67b89c0"
def __init__(self, match):
self.gallery_id = match.group(1)
url = "{}/contents/{}".format(self.root, self.gallery_id)
GalleryExtractor.__init__(self, match, url)
def images(self, page):
fmt = "https://imgrojo.tmohentai.com/contents/{}/{{:>03}}.webp".format(
self.gallery_id).format
cnt = page.count('class="lanzador')
return [(fmt(i), None) for i in range(0, cnt)]
def metadata(self, page):
extr = text.extract_from(page)
return {
"gallery_id": self.gallery_id,
"title" : text.unescape(extr("<h3>", "<").strip()),
"artists" : text.split_html(extr(
"<label>Artists and Artists Groups</label>", "</ul>")),
"genres" : text.split_html(extr(
"<label>Genders</label>", "</ul>")),
"tags" : text.split_html(extr(
"<label>Tags</label>", "</ul>")),
"uploader" : text.remove_html(extr(
"<label>Uploaded By</label>", "</ul>")),
"language" : extr("&nbsp;", "\n"),
}
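
The tmohentai reader serves zero-padded WebP pages from its imgrojo CDN; images() counts the viewer markup to learn how many pages exist and builds the URLs directly. A sketch with made-up gallery id and page count:

gallery_id = "12345a67b89c0"
page_count = 3  # in the extractor: page.count('class="lanzador')

fmt = "https://imgrojo.tmohentai.com/contents/{}/{{:>03}}.webp".format(gallery_id).format
urls = [fmt(i) for i in range(page_count)]
print(urls[0])  # -> https://imgrojo.tmohentai.com/contents/12345a67b89c0/000.webp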

@ -27,7 +27,7 @@ class TsuminoBase():
self.cookies.setdefault( self.cookies.setdefault(
"ASP.NET_SessionId", "x1drgggilez4cpkttneukrc5") "ASP.NET_SessionId", "x1drgggilez4cpkttneukrc5")
@cache(maxage=14*24*3600, keyarg=1) @cache(maxage=14*86400, keyarg=1)
def _login_impl(self, username, password): def _login_impl(self, username, password):
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)
url = "{}/Account/Login".format(self.root) url = "{}/Account/Login".format(self.root)

@ -9,7 +9,7 @@
"""Extractors for https://www.tumblr.com/""" """Extractors for https://www.tumblr.com/"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, oauth, exception from .. import text, util, oauth, exception
from datetime import datetime, date, timedelta from datetime import datetime, date, timedelta
import re import re
@ -262,7 +262,7 @@ class TumblrExtractor(Extractor):
return updated, (resized == updated) return updated, (resized == updated)
def _original_image_fallback(self, url, post_id): def _original_image_fallback(self, url, post_id):
for _ in range(self.fallback_retries): for _ in util.repeat(self.fallback_retries):
self.sleep(self.fallback_delay, "image token") self.sleep(self.fallback_delay, "image token")
yield self._update_image_token(url)[0] yield self._update_image_token(url)[0]
self.log.warning("Unable to fetch higher-resolution " self.log.warning("Unable to fetch higher-resolution "
@ -322,12 +322,15 @@ class TumblrDayExtractor(TumblrExtractor):
def __init__(self, match): def __init__(self, match):
TumblrExtractor.__init__(self, match) TumblrExtractor.__init__(self, match)
year, month, day = match.group(4).split("/") year, month, day = match.group(4).split("/")
self.date_min = ( self.ordinal = date(int(year), int(month), int(day)).toordinal()
# 719163 == date(1970, 1, 1).toordinal()
date(int(year), int(month), int(day)).toordinal() - 719163) * 86400
def _init(self): def _init(self):
TumblrExtractor._init(self) TumblrExtractor._init(self)
self.date_min = (
# 719163 == date(1970, 1, 1).toordinal()
(self.ordinal - 719163) * 86400)
self.api.before = self.date_min + 86400 self.api.before = self.date_min + 86400
def posts(self): def posts(self):
@ -401,66 +404,70 @@ class TumblrAPI(oauth.OAuth1API):
def _call(self, endpoint, params, **kwargs): def _call(self, endpoint, params, **kwargs):
url = self.ROOT + endpoint url = self.ROOT + endpoint
kwargs["params"] = params kwargs["params"] = params
response = self.request(url, **kwargs)
try: while True:
data = response.json() response = self.request(url, **kwargs)
except ValueError:
data = response.text
status = response.status_code
else:
status = data["meta"]["status"]
if 200 <= status < 400:
return data["response"]
self.log.debug(data)
if status == 403:
raise exception.AuthorizationError()
elif status == 404:
try: try:
error = data["errors"][0]["detail"] data = response.json()
board = ("only viewable within the Tumblr dashboard" in error) except ValueError:
except Exception: data = response.text
board = False status = response.status_code
else:
if board: status = data["meta"]["status"]
self.log.info("Run 'gallery-dl oauth:tumblr' " if 200 <= status < 400:
"to access dashboard-only blogs") return data["response"]
raise exception.AuthorizationError(error)
raise exception.NotFoundError("user or post") self.log.debug(data)
elif status == 429: if status == 403:
# daily rate limit raise exception.AuthorizationError()
if response.headers.get("x-ratelimit-perday-remaining") == "0":
self.log.info("Daily API rate limit exceeded") elif status == 404:
reset = response.headers.get("x-ratelimit-perday-reset") try:
error = data["errors"][0]["detail"]
api_key = self.api_key or self.session.auth.consumer_key board = ("only viewable within the Tumblr dashboard"
if api_key == self.API_KEY: in error)
self.log.info("Register your own OAuth application and " except Exception:
"use its credentials to prevent this error: " board = False
"https://github.com/mikf/gallery-dl/blob/mas"
"ter/docs/configuration.rst#extractortumblra" if board:
"pi-key--api-secret") self.log.info("Run 'gallery-dl oauth:tumblr' "
"to access dashboard-only blogs")
if self.extractor.config("ratelimit") == "wait": raise exception.AuthorizationError(error)
raise exception.NotFoundError("user or post")
elif status == 429:
# daily rate limit
if response.headers.get("x-ratelimit-perday-remaining") == "0":
self.log.info("Daily API rate limit exceeded")
reset = response.headers.get("x-ratelimit-perday-reset")
api_key = self.api_key or self.session.auth.consumer_key
if api_key == self.API_KEY:
self.log.info(
"Register your own OAuth application and use its "
"credentials to prevent this error: https://githu"
"b.com/mikf/gallery-dl/blob/master/docs/configurat"
"ion.rst#extractortumblrapi-key--api-secret")
if self.extractor.config("ratelimit") == "wait":
self.extractor.wait(seconds=reset)
continue
t = (datetime.now() + timedelta(0, float(reset))).time()
raise exception.StopExtraction(
"Aborting - Rate limit will reset at %s",
"{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second))
# hourly rate limit
reset = response.headers.get("x-ratelimit-perhour-reset")
if reset:
self.log.info("Hourly API rate limit exceeded")
self.extractor.wait(seconds=reset) self.extractor.wait(seconds=reset)
return self._call(endpoint, params, **kwargs) continue
t = (datetime.now() + timedelta(seconds=float(reset))).time()
raise exception.StopExtraction(
"Aborting - Rate limit will reset at %s",
"{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second))
# hourly rate limit
reset = response.headers.get("x-ratelimit-perhour-reset")
if reset:
self.log.info("Hourly API rate limit exceeded")
self.extractor.wait(seconds=reset)
return self._call(endpoint, params, **kwargs)
raise exception.StopExtraction(data) raise exception.StopExtraction(data)
def _pagination(self, blog, endpoint, params, key="posts", cache=False): def _pagination(self, blog, endpoint, params, key="posts", cache=False):
endpoint = "/v2/blog/{}{}".format(blog, endpoint) endpoint = "/v2/blog/{}{}".format(blog, endpoint)

@ -22,7 +22,7 @@ class TwibooruExtractor(BooruExtractor):
root = "https://twibooru.org" root = "https://twibooru.org"
filename_fmt = "{id}_{filename}.{extension}" filename_fmt = "{id}_{filename}.{extension}"
archive_fmt = "{id}" archive_fmt = "{id}"
request_interval = 6.05 request_interval = (6.0, 6.1)
page_start = 1 page_start = 1
per_page = 50 per_page = 50
@ -44,7 +44,7 @@ class TwibooruExtractor(BooruExtractor):
class TwibooruPostExtractor(TwibooruExtractor): class TwibooruPostExtractor(TwibooruExtractor):
"""Extractor for single twibooru posts""" """Extractor for single twibooru posts"""
subcategory = "post" subcategory = "post"
request_interval = 1.0 request_interval = (0.5, 1.5)
pattern = BASE_PATTERN + r"/(\d+)" pattern = BASE_PATTERN + r"/(\d+)"
example = "https://twibooru.org/12345" example = "https://twibooru.org/12345"
