Merge branch 'mikf:master' into rawkuma

thatDudo 8 months ago committed by GitHub
commit 0f2dc855b1

@ -0,0 +1,56 @@
name: docker
on:
push:
tags:
- v[0-9]+.[0-9]+.[0-9]+
permissions:
packages: write
jobs:
docker:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
# https://github.com/docker/setup-buildx-action
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
# https://github.com/docker/login-action
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GHCR_TOKEN }}
- name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
# https://github.com/docker/metadata-action
- name: Generate Docker tags
uses: docker/metadata-action@v5
id: metadata
with:
images: |
mikf123/gallery-dl
ghcr.io/mikf/gallery-dl
tags: |
type=sha,format=long,prefix=
type=ref,event=tag
# https://github.com/docker/build-push-action
- name: Build image
uses: docker/build-push-action@v5
with:
push: true
tags: ${{ steps.metadata.outputs.tags }}
labels: ${{ steps.metadata.outputs.labels }}
platforms: linux/amd64

@ -11,12 +11,12 @@ jobs:
matrix:
os: ["windows-latest", "macOS-latest"]
architecture: ["x64"]
python-version: ["3.11"]
python-version: ["3.12"]
python-packages: [""]
include:
- os: "ubuntu-latest"
architecture: "x64"
python-version: "3.11"
python-version: "3.12"
python-packages: "secretstorage"
- os: "windows-2019"
architecture: "x86"
@ -24,7 +24,7 @@ jobs:
python-packages: "toml"
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }} ${{ matrix.architecture }}
uses: actions/setup-python@v4

@ -15,10 +15,10 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "pypy3.9"]
python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Check file permissions
run: |
@ -40,7 +40,7 @@ jobs:
3.4|3.5)
# don't install yt-dlp
;;
3.6)
3.6|3.7)
# install from PyPI
pip install yt-dlp
;;

@ -1,5 +1,383 @@
# Changelog
## 1.26.7 - 2024-01-21
### Extractors
#### Additions
- [2ch] add support ([#1009](https://github.com/mikf/gallery-dl/issues/1009), [#3540](https://github.com/mikf/gallery-dl/issues/3540), [#4444](https://github.com/mikf/gallery-dl/issues/4444))
- [deviantart:avatar] add `formats` option ([#4995](https://github.com/mikf/gallery-dl/issues/4995))
- [hatenablog] add support ([#5036](https://github.com/mikf/gallery-dl/issues/5036), [#5037](https://github.com/mikf/gallery-dl/issues/5037))
- [mangadex] add `list` extractor ([#5025](https://github.com/mikf/gallery-dl/issues/5025))
- [steamgriddb] add support ([#5033](https://github.com/mikf/gallery-dl/issues/5033), [#5041](https://github.com/mikf/gallery-dl/issues/5041))
- [wikimedia] add support ([#1443](https://github.com/mikf/gallery-dl/issues/1443), [#2906](https://github.com/mikf/gallery-dl/issues/2906), [#3660](https://github.com/mikf/gallery-dl/issues/3660), [#2340](https://github.com/mikf/gallery-dl/issues/2340))
- [wikimedia] support `fandom` wikis ([#2677](https://github.com/mikf/gallery-dl/issues/2677), [#3378](https://github.com/mikf/gallery-dl/issues/3378))
#### Fixes
- [blogger] fix `lh-*.googleusercontent.com` URLs ([#5091](https://github.com/mikf/gallery-dl/issues/5091))
- [bunkr] update domain ([#5088](https://github.com/mikf/gallery-dl/issues/5088))
- [deviantart] fix AttributeError for URLs without username ([#5065](https://github.com/mikf/gallery-dl/issues/5065))
- [deviantart] fix `KeyError: 'premium_folder_data'` ([#5063](https://github.com/mikf/gallery-dl/issues/5063))
- [deviantart:avatar] fix exception when `comments` are enabled ([#4995](https://github.com/mikf/gallery-dl/issues/4995))
- [fuskator] make metadata extraction non-fatal ([#5039](https://github.com/mikf/gallery-dl/issues/5039))
- [gelbooru] only log "Incomplete API response" for favorites ([#5045](https://github.com/mikf/gallery-dl/issues/5045))
- [giantessbooru] update domain
- [issuu] fix extraction
- [nijie] fix download URLs of single image posts ([#5049](https://github.com/mikf/gallery-dl/issues/5049))
- [patreon] fix `KeyError: 'name'` ([#5048](https://github.com/mikf/gallery-dl/issues/5048), [#5069](https://github.com/mikf/gallery-dl/issues/5069), [#5093](https://github.com/mikf/gallery-dl/issues/5093))
- [pixiv] update API headers ([#5029](https://github.com/mikf/gallery-dl/issues/5029))
- [realbooru] fix download URLs of older posts
- [twitter] revert to using `media` timeline by default ([#4953](https://github.com/mikf/gallery-dl/issues/4953))
- [vk] transform image URLs to non-blurred versions ([#5017](https://github.com/mikf/gallery-dl/issues/5017))
#### Improvements
- [batoto] support more mirror domains ([#5042](https://github.com/mikf/gallery-dl/issues/5042))
- [batoto] improve v2 manga URL pattern
- [gelbooru] support `all` tag and URLs with empty tags ([#5076](https://github.com/mikf/gallery-dl/issues/5076))
- [patreon] download `m3u8` manifests with ytdl
- [sankaku] support post URLs with alphanumeric IDs ([#5073](https://github.com/mikf/gallery-dl/issues/5073))
#### Metadata
- [batoto] improve `manga_id` extraction ([#5042](https://github.com/mikf/gallery-dl/issues/5042))
- [erome] fix `count` metadata
- [kemonoparty] add `revision_hash` metadata ([#4706](https://github.com/mikf/gallery-dl/issues/4706), [#4727](https://github.com/mikf/gallery-dl/issues/4727), [#5013](https://github.com/mikf/gallery-dl/issues/5013))
- [paheal] fix `source` metadata
- [webtoons] extract more metadata ([#5061](https://github.com/mikf/gallery-dl/issues/5061), [#5094](https://github.com/mikf/gallery-dl/issues/5094))
#### Removals
- [chevereto] remove `pixl.li`
- [hbrowse] remove module
- [nitter] remove `nitter.lacontrevoie.fr`
## 1.26.6 - 2024-01-06
### Extractors
#### Additions
- [batoto] add `chapter` and `manga` extractors ([#1434](https://github.com/mikf/gallery-dl/issues/1434), [#2111](https://github.com/mikf/gallery-dl/issues/2111), [#4979](https://github.com/mikf/gallery-dl/issues/4979))
- [deviantart] add `avatar` and `background` extractors ([#4995](https://github.com/mikf/gallery-dl/issues/4995))
- [poringa] add support ([#4675](https://github.com/mikf/gallery-dl/issues/4675), [#4962](https://github.com/mikf/gallery-dl/issues/4962))
- [szurubooru] support `snootbooru.com` ([#5023](https://github.com/mikf/gallery-dl/issues/5023))
- [zzup] add `gallery` extractor ([#4517](https://github.com/mikf/gallery-dl/issues/4517), [#4604](https://github.com/mikf/gallery-dl/issues/4604), [#4659](https://github.com/mikf/gallery-dl/issues/4659), [#4863](https://github.com/mikf/gallery-dl/issues/4863), [#5016](https://github.com/mikf/gallery-dl/issues/5016))
#### Fixes
- [gelbooru] fix `favorite` extractor ([#4903](https://github.com/mikf/gallery-dl/issues/4903))
- [idolcomplex] fix extraction & update URL patterns ([#5002](https://github.com/mikf/gallery-dl/issues/5002))
- [imagechest] fix loading more than 10 images in a gallery ([#4469](https://github.com/mikf/gallery-dl/issues/4469))
- [jpgfish] update domain
- [komikcast] fix `manga` extractor ([#5027](https://github.com/mikf/gallery-dl/issues/5027))
- [komikcast] update domain ([#5027](https://github.com/mikf/gallery-dl/issues/5027))
- [lynxchan] update `bbw-chan` domain ([#4970](https://github.com/mikf/gallery-dl/issues/4970))
- [manganelo] fix extraction & recognize `.to` TLDs ([#5005](https://github.com/mikf/gallery-dl/issues/5005))
- [paheal] restore `extension` metadata ([#4976](https://github.com/mikf/gallery-dl/issues/4976))
- [rule34us] add fallback for `video-cdn1` videos ([#4985](https://github.com/mikf/gallery-dl/issues/4985))
- [weibo] fix AttributeError in `user` extractor ([#5022](https://github.com/mikf/gallery-dl/issues/5022))
#### Improvements
- [gelbooru] show error for invalid API responses ([#4903](https://github.com/mikf/gallery-dl/issues/4903))
- [rule34] recognize URLs with `www` subdomain ([#4984](https://github.com/mikf/gallery-dl/issues/4984))
- [twitter] raise error for invalid `strategy` values ([#4953](https://github.com/mikf/gallery-dl/issues/4953))
#### Metadata
- [fanbox] add `metadata` option ([#4921](https://github.com/mikf/gallery-dl/issues/4921))
- [nijie] add `count` metadata ([#146](https://github.com/mikf/gallery-dl/issues/146))
- [pinterest] add `count` metadata ([#4981](https://github.com/mikf/gallery-dl/issues/4981))
### Miscellaneous
- fix and update zsh completion ([#4972](https://github.com/mikf/gallery-dl/issues/4972))
- fix `--cookies-from-browser` macOS Firefox profile path
## 1.26.5 - 2023-12-23
### Extractors
#### Additions
- [deviantart] add `intermediary` option ([#4955](https://github.com/mikf/gallery-dl/issues/4955))
- [inkbunny] add `unread` extractor ([#4934](https://github.com/mikf/gallery-dl/issues/4934))
- [mastodon] support non-numeric status IDs ([#4936](https://github.com/mikf/gallery-dl/issues/4936))
- [myhentaigallery] recognize `/g/` URLs ([#4920](https://github.com/mikf/gallery-dl/issues/4920))
- [postmill] add support ([#4917](https://github.com/mikf/gallery-dl/issues/4917), [#4919](https://github.com/mikf/gallery-dl/issues/4919))
- [shimmie2] support `rule34hentai.net` ([#861](https://github.com/mikf/gallery-dl/issues/861), [#4789](https://github.com/mikf/gallery-dl/issues/4789), [#4945](https://github.com/mikf/gallery-dl/issues/4945))
#### Fixes
- [deviantart] add workaround for integer `client-id` values ([#4924](https://github.com/mikf/gallery-dl/issues/4924))
- [exhentai] fix error for infinite `fallback-retries` ([#4911](https://github.com/mikf/gallery-dl/issues/4911))
- [inkbunny] stop pagination on empty results
- [patreon] fix bootstrap data extraction again ([#4904](https://github.com/mikf/gallery-dl/issues/4904))
- [tumblr] fix exception after waiting for rate limit ([#4916](https://github.com/mikf/gallery-dl/issues/4916))
#### Improvements
- [exhentai] output continuation URL when interrupted ([#4782](https://github.com/mikf/gallery-dl/issues/4782))
- [inkbunny] improve `/submissionsviewall.php` patterns ([#4934](https://github.com/mikf/gallery-dl/issues/4934))
- [tumblr] support infinite `fallback-retries`
- [twitter] default to `tweets` timeline when `replies` are enabled ([#4953](https://github.com/mikf/gallery-dl/issues/4953))
#### Metadata
- [danbooru] provide `tags` as list ([#4942](https://github.com/mikf/gallery-dl/issues/4942))
- [deviantart] set `is_original` for intermediary URLs to `false`
- [twitter] remove `date_liked` ([#3850](https://github.com/mikf/gallery-dl/issues/3850), [#4108](https://github.com/mikf/gallery-dl/issues/4108), [#4657](https://github.com/mikf/gallery-dl/issues/4657))
### Docker
- add Docker instructions to README ([#4850](https://github.com/mikf/gallery-dl/issues/4850))
- fix auto-generation of `latest` tags
## 1.26.4 - 2023-12-10
### Extractors
#### Additions
- [exhentai] add `fallback-retries` option ([#4792](https://github.com/mikf/gallery-dl/issues/4792))
- [urlgalleries] add `gallery` extractor ([#919](https://github.com/mikf/gallery-dl/issues/919), [#1184](https://github.com/mikf/gallery-dl/issues/1184), [#2905](https://github.com/mikf/gallery-dl/issues/2905), [#4886](https://github.com/mikf/gallery-dl/issues/4886))
#### Fixes
- [nijie] fix image URLs of multi-image posts ([#4876](https://github.com/mikf/gallery-dl/issues/4876))
- [patreon] fix bootstrap data extraction ([#4904](https://github.com/mikf/gallery-dl/issues/4904), [#4906](https://github.com/mikf/gallery-dl/issues/4906))
- [twitter] fix `/media` timelines ([#4898](https://github.com/mikf/gallery-dl/issues/4898), [#4899](https://github.com/mikf/gallery-dl/issues/4899))
- [twitter] retry API requests when response contains incomplete results ([#4811](https://github.com/mikf/gallery-dl/issues/4811))
#### Improvements
- [exhentai] store more cookies when logging in with username & password ([#4881](https://github.com/mikf/gallery-dl/issues/4881))
- [twitter] generalize "Login Required" errors ([#4734](https://github.com/mikf/gallery-dl/issues/4734), [#4324](https://github.com/mikf/gallery-dl/issues/4324))
### Options
- add `-e/--error-file` command-line and `output.errorfile` config option ([#4732](https://github.com/mikf/gallery-dl/issues/4732))
### Miscellaneous
- automatically build and push Docker images
- prompt for passwords on login when necessary
- fix `util.dump_response()` to work with `bytes` header values
## 1.26.3 - 2023-11-27
### Extractors
#### Additions
- [behance] support `text` modules ([#4799](https://github.com/mikf/gallery-dl/issues/4799))
- [behance] add `modules` option ([#4799](https://github.com/mikf/gallery-dl/issues/4799))
- [blogger] support `www.micmicidol.club` ([#4759](https://github.com/mikf/gallery-dl/issues/4759))
- [erome] add `count` metadata ([#4812](https://github.com/mikf/gallery-dl/issues/4812))
- [exhentai] add `gp` option ([#4576](https://github.com/mikf/gallery-dl/issues/4576))
- [fapello] support `.su` TLD ([#4840](https://github.com/mikf/gallery-dl/issues/4840), [#4841](https://github.com/mikf/gallery-dl/issues/4841))
- [pixeldrain] add `file` and `album` extractors ([#4839](https://github.com/mikf/gallery-dl/issues/4839))
- [pixeldrain] add `api-key` option ([#4839](https://github.com/mikf/gallery-dl/issues/4839))
- [tmohentai] add `gallery` extractor ([#4808](https://github.com/mikf/gallery-dl/issues/4808), [#4832](https://github.com/mikf/gallery-dl/issues/4832))
#### Fixes
- [cyberdrop] update to site layout changes
- [exhentai] handle `Downloading … requires GP` errors ([#4576](https://github.com/mikf/gallery-dl/issues/4576), [#4763](https://github.com/mikf/gallery-dl/issues/4763))
- [exhentai] fix empty API URL with `"source": "hitomi"` ([#4829](https://github.com/mikf/gallery-dl/issues/4829))
- [hentaifoundry] check for and update expired sessions ([#4694](https://github.com/mikf/gallery-dl/issues/4694))
- [hiperdex] fix `manga` metadata
- [idolcomplex] update to site layout changes
- [imagefap] fix resolution of single images
- [instagram] fix exception on empty `video_versions` ([#4795](https://github.com/mikf/gallery-dl/issues/4795))
- [mangaread] fix extraction
- [mastodon] fix reblogs ([#4580](https://github.com/mikf/gallery-dl/issues/4580))
- [nitter] fix video extraction ([#4853](https://github.com/mikf/gallery-dl/issues/4853), [#4855](https://github.com/mikf/gallery-dl/issues/4855))
- [pornhub] fix `user` metadata for gifs
- [tumblr] fix `day` extractor
- [wallpapercave] fix extraction
- [warosu] fix file URLs
- [webtoons] fix pagination when receiving an HTTP redirect
- [xvideos] fix metadata extraction
- [zerochan] fix metadata extraction
#### Improvements
- [hentaicosplays] force `https://` for download URLs
- [oauth] warn when cache is enabled but not writeable ([#4771](https://github.com/mikf/gallery-dl/issues/4771))
- [sankaku] update URL patterns
- [twitter] ignore promoted Tweets ([#3894](https://github.com/mikf/gallery-dl/issues/3894), [#4790](https://github.com/mikf/gallery-dl/issues/4790))
- [weibo] detect redirects to login page ([#4773](https://github.com/mikf/gallery-dl/issues/4773))
#### Removals
- [foolslide] remove `powermanga.org`
### Downloaders
#### Changes
- [http] treat files not passing `filesize-min`/`-max` as skipped ([#4821](https://github.com/mikf/gallery-dl/issues/4821))
### Options
#### Additions
- add `metadata-extractor` option ([#4549](https://github.com/mikf/gallery-dl/issues/4549))
- support `metadata-*` names for `*-metadata` options
(for example `url-metadata` is now also recognized as `metadata-url`)
### CLI
#### Additions
- implement `-I/--input-file-comment` and `-x/--input-file-delete` options ([#4732](https://github.com/mikf/gallery-dl/issues/4732))
- add `--ugoira` as a general version of `--ugoira-conv` and co.
- add `--mtime` as a general version of `--mtime-from-date`
- add `--cbz`
#### Fixes
- allow `--mtime-from-date` to work with Weibo's metadata structure
### Miscellaneous
#### Additions
- add a simple Dockerfile ([#4831](https://github.com/mikf/gallery-dl/issues/4831))
## 1.26.2 - 2023-11-04
### Extractors
#### Additions
- [4archive] add `thread` and `board` extractors ([#1262](https://github.com/mikf/gallery-dl/issues/1262), [#2418](https://github.com/mikf/gallery-dl/issues/2418), [#4400](https://github.com/mikf/gallery-dl/issues/4400), [#4710](https://github.com/mikf/gallery-dl/issues/4710), [#4714](https://github.com/mikf/gallery-dl/issues/4714))
- [hitomi] recognize `imageset` gallery URLs ([#4756](https://github.com/mikf/gallery-dl/issues/4756))
- [kemonoparty] add `revision_index` metadata field ([#4727](https://github.com/mikf/gallery-dl/issues/4727))
- [misskey] support `misskey.design` ([#4713](https://github.com/mikf/gallery-dl/issues/4713))
- [reddit] support Reddit Mobile share links ([#4693](https://github.com/mikf/gallery-dl/issues/4693))
- [sankaku] support `/posts/` tag search URLs ([#4740](https://github.com/mikf/gallery-dl/issues/4740))
- [twitter] recognize `fixupx.com` URLs ([#4755](https://github.com/mikf/gallery-dl/issues/4755))
#### Fixes
- [exhentai] update to site layout changes ([#4730](https://github.com/mikf/gallery-dl/issues/4730), [#4754](https://github.com/mikf/gallery-dl/issues/4754))
- [exhentai] provide fallback URLs ([#1021](https://github.com/mikf/gallery-dl/issues/1021), [#4745](https://github.com/mikf/gallery-dl/issues/4745))
- [exhentai] disable `DH` ciphers to avoid `DH_KEY_TOO_SMALL` errors ([#1021](https://github.com/mikf/gallery-dl/issues/1021), [#4593](https://github.com/mikf/gallery-dl/issues/4593))
- [idolcomplex] disable sending Referer headers ([#4726](https://github.com/mikf/gallery-dl/issues/4726))
- [instagram] update API headers
- [kemonoparty] fix parsing of non-standard `date` values ([#4676](https://github.com/mikf/gallery-dl/issues/4676))
- [patreon] fix `campaign_id` extraction ([#4699](https://github.com/mikf/gallery-dl/issues/4699), [#4715](https://github.com/mikf/gallery-dl/issues/4715), [#4736](https://github.com/mikf/gallery-dl/issues/4736), [#4738](https://github.com/mikf/gallery-dl/issues/4738))
- [pixiv] load cookies for non-OAuth URLs ([#4760](https://github.com/mikf/gallery-dl/issues/4760))
- [twitter] fix avatars without `date` information ([#4696](https://github.com/mikf/gallery-dl/issues/4696))
- [twitter] restore truncated retweet texts ([#3430](https://github.com/mikf/gallery-dl/issues/3430), [#4690](https://github.com/mikf/gallery-dl/issues/4690))
- [weibo] fix Sina Visitor requests
#### Improvements
- [behance] unescape embed URLs ([#4742](https://github.com/mikf/gallery-dl/issues/4742))
- [fantia] simplify `tags` to a list of strings ([#4752](https://github.com/mikf/gallery-dl/issues/4752))
- [kemonoparty] limit `title` length ([#4741](https://github.com/mikf/gallery-dl/issues/4741))
- [nijie] set 1-2s delay between requests to avoid 429 errors
- [patreon] provide ways to manually specify a user's campaign_id
- `https://www.patreon.com/id:12345`
- `https://www.patreon.com/USER?c=12345`
- `https://www.patreon.com/USER?campaign_id=12345`
- [twitter] cache `user_by_…` results ([#4719](https://github.com/mikf/gallery-dl/issues/4719))
### Post Processors
#### Fixes
- [metadata] ignore non-string tag values ([#4764](https://github.com/mikf/gallery-dl/issues/4764))
### Miscellaneous
#### Fixes
- prevent crash when `stdout.line_buffering` is not defined ([#642](https://github.com/mikf/gallery-dl/issues/642))
## 1.26.1 - 2023-10-21
### Extractors
#### Additions
- [bunkr] add extractor for media URLs ([#4684](https://github.com/mikf/gallery-dl/issues/4684))
- [chevereto] add generic extractors for `chevereto` sites ([#4664](https://github.com/mikf/gallery-dl/issues/4664))
- `deltaporno.com` ([#1381](https://github.com/mikf/gallery-dl/issues/1381))
- `img.kiwi`
- `jpgfish`
- `pixl.li` ([#3179](https://github.com/mikf/gallery-dl/issues/3179), [#4357](https://github.com/mikf/gallery-dl/issues/4357))
- [deviantart] implement `"group": "skip"` ([#4630](https://github.com/mikf/gallery-dl/issues/4630))
- [fantia] add `content_count` and `content_num` metadata fields ([#4627](https://github.com/mikf/gallery-dl/issues/4627))
- [imgbb] add `displayname` and `user_id` metadata ([#4626](https://github.com/mikf/gallery-dl/issues/4626))
- [kemonoparty] support post revisions; add `revisions` option ([#4498](https://github.com/mikf/gallery-dl/issues/4498), [#4597](https://github.com/mikf/gallery-dl/issues/4597))
- [kemonoparty] support searches ([#3385](https://github.com/mikf/gallery-dl/issues/3385), [#4057](https://github.com/mikf/gallery-dl/issues/4057))
- [kemonoparty] support discord URLs with channel IDs ([#4662](https://github.com/mikf/gallery-dl/issues/4662))
- [moebooru] add `metadata` option ([#4646](https://github.com/mikf/gallery-dl/issues/4646))
- [newgrounds] support multi-image posts ([#4642](https://github.com/mikf/gallery-dl/issues/4642))
- [sankaku] support `/posts/` URLs ([#4688](https://github.com/mikf/gallery-dl/issues/4688))
- [twitter] add `sensitive` metadata field ([#4619](https://github.com/mikf/gallery-dl/issues/4619))
#### Fixes
- [4chanarchives] disable Referer headers by default ([#4686](https://github.com/mikf/gallery-dl/issues/4686))
- [bunkr] fix `/d/` file URLs ([#4685](https://github.com/mikf/gallery-dl/issues/4685))
- [deviantart] expand nested comment replies ([#4653](https://github.com/mikf/gallery-dl/issues/4653))
- [deviantart] disable `jwt` ([#4652](https://github.com/mikf/gallery-dl/issues/4652))
- [hentaifoundry] fix `.swf` file downloads ([#4641](https://github.com/mikf/gallery-dl/issues/4641))
- [imgbb] fix `user` metadata extraction ([#4626](https://github.com/mikf/gallery-dl/issues/4626))
- [imgbb] update pagination end condition ([#4626](https://github.com/mikf/gallery-dl/issues/4626))
- [kemonoparty] update API endpoints ([#4676](https://github.com/mikf/gallery-dl/issues/4676), [#4677](https://github.com/mikf/gallery-dl/issues/4677))
- [patreon] update `campaign_id` path ([#4639](https://github.com/mikf/gallery-dl/issues/4639))
- [reddit] fix wrong previews ([#4649](https://github.com/mikf/gallery-dl/issues/4649))
- [redgifs] fix `niches` extraction ([#4666](https://github.com/mikf/gallery-dl/issues/4666), [#4667](https://github.com/mikf/gallery-dl/issues/4667))
- [twitter] fix crash due to missing `source` ([#4620](https://github.com/mikf/gallery-dl/issues/4620))
- [warosu] fix extraction ([#4634](https://github.com/mikf/gallery-dl/issues/4634))
### Post Processors
#### Additions
- support `{_filename}`, `{_directory}`, and `{_path}` replacement fields for `--exec` ([#4633](https://github.com/mikf/gallery-dl/issues/4633))
### Miscellaneous
#### Improvements
- avoid temporary copies with `--cookies-from-browser` by opening cookie databases in read-only mode
## 1.26.0 - 2023-10-03
### Extractors
#### Additions
- [behance] add `date` metadata field ([#4417](https://github.com/mikf/gallery-dl/issues/4417))
- [danbooru] support `booru.borvar.art` ([#4096](https://github.com/mikf/gallery-dl/issues/4096))
- [danbooru] support `donmai.moe`
- [deviantart] add `is_original` metadata field ([#4559](https://github.com/mikf/gallery-dl/issues/4559))
- [e621] support `e6ai.net` ([#4320](https://github.com/mikf/gallery-dl/issues/4320))
- [exhentai] add `fav` option ([#4409](https://github.com/mikf/gallery-dl/issues/4409))
- [gelbooru_v02] support `xbooru.com` ([#4493](https://github.com/mikf/gallery-dl/issues/4493))
- [instagram] add `following` extractor ([#1848](https://github.com/mikf/gallery-dl/issues/1848))
- [pillowfort] support `/tagged/` URLs ([#4570](https://github.com/mikf/gallery-dl/issues/4570))
- [pornhub] add `gif` support ([#4463](https://github.com/mikf/gallery-dl/issues/4463))
- [reddit] add `previews` option ([#4322](https://github.com/mikf/gallery-dl/issues/4322))
- [redgifs] add `niches` extractor ([#4311](https://github.com/mikf/gallery-dl/issues/4311), [#4312](https://github.com/mikf/gallery-dl/issues/4312))
- [redgifs] support `order` parameter for user URLs ([#4583](https://github.com/mikf/gallery-dl/issues/4583))
- [twitter] add `user` extractor and `include` option ([#4275](https://github.com/mikf/gallery-dl/issues/4275))
- [twitter] add `tweet-endpoint` option ([#4307](https://github.com/mikf/gallery-dl/issues/4307))
- [twitter] add `date_original` metadata for retweets ([#4337](https://github.com/mikf/gallery-dl/issues/4337), [#4443](https://github.com/mikf/gallery-dl/issues/4443))
- [twitter] extract `source` metadata ([#4459](https://github.com/mikf/gallery-dl/issues/4459))
- [twitter] support `x.com` URLs ([#4452](https://github.com/mikf/gallery-dl/issues/4452))
#### Improvements
- include `Referer` header in all HTTP requests ([#4490](https://github.com/mikf/gallery-dl/issues/4490), [#4518](https://github.com/mikf/gallery-dl/issues/4518))
(can be disabled with `referer` option)
- [behance] show errors for mature content ([#4417](https://github.com/mikf/gallery-dl/issues/4417))
- [deviantart] re-add `quality` option and `/intermediary/` transform
- [fantia] improve metadata extraction ([#4126](https://github.com/mikf/gallery-dl/issues/4126))
- [instagram] better error messages for invalid users ([#4606](https://github.com/mikf/gallery-dl/issues/4606))
- [mangadex] support multiple values for `lang` ([#4093](https://github.com/mikf/gallery-dl/issues/4093))
- [mastodon] support `/@USER/following` URLs ([#4608](https://github.com/mikf/gallery-dl/issues/4608))
- [moebooru] match search URLs with empty `tags` ([#4354](https://github.com/mikf/gallery-dl/issues/4354))
- [pillowfort] extract `b2_lg_url` media ([#4570](https://github.com/mikf/gallery-dl/issues/4570))
- [reddit] improve comment metadata ([#4482](https://github.com/mikf/gallery-dl/issues/4482))
- [reddit] ignore `/message/compose` URLs ([#4482](https://github.com/mikf/gallery-dl/issues/4482), [#4581](https://github.com/mikf/gallery-dl/issues/4581))
- [redgifs] provide `collection` metadata as separate field ([#4508](https://github.com/mikf/gallery-dl/issues/4508))
- [redgifs] match `gfycat` image URLs ([#4558](https://github.com/mikf/gallery-dl/issues/4558))
- [twitter] improve error messages for single Tweets ([#4369](https://github.com/mikf/gallery-dl/issues/4369))
#### Fixes
- [acidimg] fix extraction
- [architizer] fix extraction ([#4537](https://github.com/mikf/gallery-dl/issues/4537))
- [behance] fix and update `user` extractor ([#4417](https://github.com/mikf/gallery-dl/issues/4417))
- [behance] fix cookie usage ([#4417](https://github.com/mikf/gallery-dl/issues/4417))
- [behance] handle videos without `renditions` ([#4523](https://github.com/mikf/gallery-dl/issues/4523))
- [bunkr] fix media domain for `cdn9` ([#4386](https://github.com/mikf/gallery-dl/issues/4386), [#4412](https://github.com/mikf/gallery-dl/issues/4412))
- [bunkr] fix extracting `.wmv` files ([#4419](https://github.com/mikf/gallery-dl/issues/4419))
- [bunkr] fix media domain for `cdn-pizza.bunkr.ru` ([#4489](https://github.com/mikf/gallery-dl/issues/4489))
- [bunkr] fix extraction ([#4514](https://github.com/mikf/gallery-dl/issues/4514), [#4532](https://github.com/mikf/gallery-dl/issues/4532), [#4529](https://github.com/mikf/gallery-dl/issues/4529), [#4540](https://github.com/mikf/gallery-dl/issues/4540))
- [deviantart] fix full resolution URLs for non-downloadable images ([#293](https://github.com/mikf/gallery-dl/issues/293), [#4548](https://github.com/mikf/gallery-dl/issues/4548), [#4563](https://github.com/mikf/gallery-dl/issues/4563))
- [deviantart] fix shortened URLs ([#4316](https://github.com/mikf/gallery-dl/issues/4316))
- [deviantart] fix search ([#4384](https://github.com/mikf/gallery-dl/issues/4384))
- [deviantart] update Eclipse API endpoints ([#4553](https://github.com/mikf/gallery-dl/issues/4553), [#4615](https://github.com/mikf/gallery-dl/issues/4615))
- [deviantart] use private tokens for `is_mature` posts ([#4563](https://github.com/mikf/gallery-dl/issues/4563))
- [flickr] update default API credentials ([#4332](https://github.com/mikf/gallery-dl/issues/4332))
- [giantessbooru] fix extraction ([#4373](https://github.com/mikf/gallery-dl/issues/4373))
- [hiperdex] fix crash for titles containing Unicode characters ([#4325](https://github.com/mikf/gallery-dl/issues/4325))
- [hiperdex] fix `manga` metadata
- [imagefap] fix pagination ([#3013](https://github.com/mikf/gallery-dl/issues/3013))
- [imagevenue] fix extraction ([#4473](https://github.com/mikf/gallery-dl/issues/4473))
- [instagram] fix private posts with long shortcodes ([#4362](https://github.com/mikf/gallery-dl/issues/4362))
- [instagram] fix video preview archive IDs ([#2135](https://github.com/mikf/gallery-dl/issues/2135), [#4455](https://github.com/mikf/gallery-dl/issues/4455))
- [instagram] handle exceptions due to missing media ([#4555](https://github.com/mikf/gallery-dl/issues/4555))
- [issuu] fix extraction ([#4420](https://github.com/mikf/gallery-dl/issues/4420))
- [jpgfish] update domain to `jpg1.su` ([#4494](https://github.com/mikf/gallery-dl/issues/4494))
- [kemonoparty] update `favorite` API endpoint ([#4522](https://github.com/mikf/gallery-dl/issues/4522))
- [lensdump] fix extraction ([#4352](https://github.com/mikf/gallery-dl/issues/4352))
- [mangakakalot] update domain
- [reddit] fix `preview.redd.it` URLs ([#4470](https://github.com/mikf/gallery-dl/issues/4470))
- [patreon] fix extraction ([#4547](https://github.com/mikf/gallery-dl/issues/4547))
- [pixiv] handle errors for private novels ([#4481](https://github.com/mikf/gallery-dl/issues/4481))
- [pornhub] fix extraction ([#4301](https://github.com/mikf/gallery-dl/issues/4301))
- [pururin] fix extraction ([#4375](https://github.com/mikf/gallery-dl/issues/4375))
- [subscribestar] fix preview detection ([#4468](https://github.com/mikf/gallery-dl/issues/4468))
- [twitter] fix crash on private user ([#4349](https://github.com/mikf/gallery-dl/issues/4349))
- [twitter] fix `TweetWithVisibilityResults` ([#4369](https://github.com/mikf/gallery-dl/issues/4369))
- [twitter] fix crash when `sortIndex` is undefined ([#4499](https://github.com/mikf/gallery-dl/issues/4499))
- [zerochan] fix `tags` extraction ([#4315](https://github.com/mikf/gallery-dl/issues/4315), [#4319](https://github.com/mikf/gallery-dl/issues/4319))
#### Removals
- [gfycat] remove module
- [shimmie2] remove `meme.museum`
### Post Processors
#### Changes
- update `finalize` events
- add `finalize-error` and `finalize-success` events that trigger
depending on whether error(s) did or did not happen
- change `finalize` to always trigger regardless of error status
#### Additions
- add `python` post processor
- add `prepare-after` event ([#4083](https://github.com/mikf/gallery-dl/issues/4083))
- [ugoira] add `"framerate": "uniform"` ([#4421](https://github.com/mikf/gallery-dl/issues/4421))
#### Improvements
- [ugoira] extend `ffmpeg-output` ([#4421](https://github.com/mikf/gallery-dl/issues/4421))
#### Fixes
- [ugoira] restore `libx264-prevent-odd` ([#4407](https://github.com/mikf/gallery-dl/issues/4407))
- [ugoira] fix high frame rates ([#4421](https://github.com/mikf/gallery-dl/issues/4421))
### Downloaders
#### Fixes
- [http] close connection when file already exists ([#4403](https://github.com/mikf/gallery-dl/issues/4403))
### Options
#### Additions
- support `parent>child` categories for child extractor options,
for example an `imgur` album from a `reddit` thread with `reddit>imgur`
- implement `subconfigs` option ([#4440](https://github.com/mikf/gallery-dl/issues/4440))
- add `"ascii+"` as a special `path-restrict` value ([#4371](https://github.com/mikf/gallery-dl/issues/4371))
#### Removals
- remove `pyopenssl` option
### Tests
#### Improvements
- move extractor results into their own, separate files ([#4504](https://github.com/mikf/gallery-dl/issues/4504))
- include fallback URLs in content tests ([#3163](https://github.com/mikf/gallery-dl/issues/3163))
- various test method improvements
### Miscellaneous
#### Fixes
- [formatter] use value of last alternative ([#4492](https://github.com/mikf/gallery-dl/issues/4492))
- fix imports when running `__main__.py` ([#4581](https://github.com/mikf/gallery-dl/issues/4581))
- fix symlink resolution in `__main__.py`
- fix default Firefox user agent string
## 1.25.8 - 2023-07-15
### Changes
- update default User-Agent header to Firefox 115 ESR

@ -0,0 +1,5 @@
FROM python:alpine
RUN python3 -m pip install -U gallery-dl yt-dlp
RUN apk update
RUN apk add ffmpeg
ENTRYPOINT [ "gallery-dl" ]

@ -72,9 +72,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.8/gallery-dl.exe>`__
- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.7/gallery-dl.exe>`__
(Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.8/gallery-dl.bin>`__
- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.7/gallery-dl.bin>`__
Nightly Builds
@ -132,6 +132,43 @@ For macOS users with MacPorts:
sudo port install gallery-dl
Docker
--------
Using the Dockerfile in the repository:
.. code:: bash
git clone https://github.com/mikf/gallery-dl.git
cd gallery-dl/
docker build -t gallery-dl:latest .
Pulling image from `Docker Hub <https://hub.docker.com/r/mikf123/gallery-dl>`__:
.. code:: bash
docker pull mikf123/gallery-dl
docker tag mikf123/gallery-dl gallery-dl
Pulling image from `GitHub Container Registry <https://github.com/mikf/gallery-dl/pkgs/container/gallery-dl>`__:
.. code:: bash
docker pull ghcr.io/mikf/gallery-dl
docker tag ghcr.io/mikf/gallery-dl gallery-dl
To run the container you will probably want to attach some directories on the host so that the config file and downloads can persist across runs.
Make sure to either download the example config file referenced in the repo and place it in the mounted volume location, or touch an empty file there.
If you gave the container a different tag or are using podman, adjust the command accordingly. Run ``docker image ls`` to check the image name if you are not sure.
The ``--rm`` flag removes the container after every use, so you always get a fresh environment to run in. If you set up a CI/CD pipeline to automatically build the container, you can also add a ``--pull=newer`` flag so that a newer image is checked for and downloaded before running.
.. code:: bash
docker run --rm -v $HOME/Downloads/:/gallery-dl/ -v $HOME/.config/gallery-dl/gallery-dl.conf:/etc/gallery-dl.conf -it gallery-dl:latest
You can also add a shell alias for ``gallery-dl`` or create a simple bash script and drop it somewhere in your ``$PATH`` to act as a shim for this command, as sketched below.
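A minimal wrapper script, assuming the same mount points and image tag as the ``docker run`` example above, could look like this:
.. code:: bash
#!/bin/sh
# Illustrative shim: forward all arguments to a throwaway gallery-dl container,
# mounting the same download directory and config file as in the example above.
exec docker run --rm -it \
    -v "$HOME/Downloads/:/gallery-dl/" \
    -v "$HOME/.config/gallery-dl/gallery-dl.conf:/etc/gallery-dl.conf" \
    gallery-dl:latest "$@"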
Usage
=====

@ -166,6 +166,8 @@ Description
extractor.*.parent-metadata
---------------------------
extractor.*.metadata-parent
---------------------------
Type
* ``bool``
* ``string``
@ -377,7 +379,7 @@ Description
The username and password to use when attempting to log in to
another site.
Specifying a username and password is required for
Specifying username and password is required for
* ``nijie``
@ -413,6 +415,10 @@ Description
(*) The password value for these sites should be
the API key found in your user profile, not the actual account password.
Note: Leave the ``password`` value empty or undefined
to get prompted for a password when performing a login
(see `getpass() <https://docs.python.org/3/library/getpass.html#getpass.getpass>`__).
extractor.*.netrc
-----------------
@ -621,6 +627,20 @@ Description
`ssl.SSLContext.set_ciphers() <https://docs.python.org/3/library/ssl.html#ssl.SSLContext.set_ciphers>`__
extractor.*.tls12
-----------------
Type
``bool``
Default
* ``true``
* ``false`` for ``patreon``, ``pixiv:series``
Description
Allow selecting TLS 1.2 cipher suites.
Can be disabled to alter TLS fingerprints
and potentially bypass Cloudflare blocks.
extractor.*.keywords
--------------------
Type
@ -642,12 +662,12 @@ Description
`format strings`_.
extractor.*.metadata-url
------------------------
extractor.*.url-metadata
------------------------
Type
``string``
Default
``null``
Description
Insert a file's download URL into its metadata dictionary as the given name.
@ -658,12 +678,12 @@ Description
with a ``metadata`` post processor, etc.
extractor.*.metadata-path
-------------------------
extractor.*.path-metadata
-------------------------
Type
``string``
Default
``null``
Description
Insert a reference to the current
`PathFormat <https://github.com/mikf/gallery-dl/blob/v1.24.2/gallery_dl/path.py#L27>`__
@ -673,12 +693,24 @@ Description
to access the current file's filename as ``"{gdl_path.filename}"``.
extractor.*.metadata-extractor
------------------------------
extractor.*.extractor-metadata
------------------------------
Type
``string``
Description
Insert a reference to the current
`Extractor <https://github.com/mikf/gallery-dl/blob/v1.26.2/gallery_dl/extractor/common.py#L26>`__
object into metadata dictionaries as the given name.
extractor.*.metadata-http
-------------------------
extractor.*.http-metadata
-------------------------
Type
``string``
Default
``null``
Description
Insert an ``object`` containing a file's HTTP headers and
``filename``, ``extension``, and ``date`` parsed from them
@ -689,12 +721,12 @@ Description
and its parsed form as ``"{gdl_http[date]}"``.
extractor.*.metadata-version
----------------------------
extractor.*.version-metadata
----------------------------
Type
``string``
Default
``null``
Description
Insert an ``object`` containing gallery-dl's version info into
metadata dictionaries as the given name.
@ -1048,6 +1080,25 @@ Description
after a colon ``:``, for example ``{date:%Y%m%d}``.
extractor.*.write-pages
-----------------------
Type
* ``bool``
* ``string``
Default
``false``
Description
During data extraction,
write received HTTP request data
to enumerated files in the current working directory.
Special values:
* ``"all"``: Include HTTP request and response headers. Hide ``Authorization``, ``Cookie``, and ``Set-Cookie`` values.
* ``"ALL"``: Include all HTTP request and response headers.
Extractor-specific Options
==========================
@ -1110,6 +1161,19 @@ Description
The maximum possible value appears to be ``1920``.
extractor.behance.modules
-------------------------
Type
``list`` of ``strings``
Default
``["image", "video", "mediacollection", "embed"]``
Description
Selects which gallery modules to download from.
Supported module types are
``image``, ``video``, ``mediacollection``, ``embed``, ``text``.
extractor.blogger.videos
------------------------
Type
@ -1306,13 +1370,21 @@ Description
extractor.deviantart.group
--------------------------
Type
``bool``
* ``bool``
* ``string``
Default
``true``
Description
Check whether the profile name in a given URL
belongs to a group or a regular user.
When disabled, assume every given profile name
belongs to a regular user.
Special values:
* ``"skip"``: Skip groups
extractor.deviantart.include
----------------------------
@ -1329,11 +1401,28 @@ Description
when processing a user profile.
Possible values are
``"gallery"``, ``"scraps"``, ``"journal"``, ``"favorite"``, ``"status"``.
``"avatar"``,
``"background"``,
``"gallery"``,
``"scraps"``,
``"journal"``,
``"favorite"``,
``"status"``.
It is possible to use ``"all"`` instead of listing all values separately.
extractor.deviantart.intermediary
---------------------------------
Type
``bool``
Default
``true``
Description
For older non-downloadable images,
download a higher-quality ``/intermediary/`` version.
extractor.deviantart.journals
-----------------------------
Type
@ -1360,7 +1449,7 @@ Description
of otherwise non-downloadable, low-resolution images
to be able to download them in full resolution.
Note: This got patched by DeviantArt on 2023-09-19 and no longer works.
Note: No longer functional as of 2023-10-11
extractor.deviantart.mature
@ -1429,6 +1518,19 @@ Description
when a `refresh token <extractor.deviantart.refresh-token_>`__ is provided.
extractor.deviantart.quality
----------------------------
Type
``integer``
Default
``100``
Description
JPEG quality level of newer images for which
an original file download is not available.
Note: Only has an effect when `deviantart.jwt <extractor.deviantart.jwt_>`__ is disabled.
extractor.deviantart.refresh-token
----------------------------------
Type
@ -1457,6 +1559,19 @@ Description
Minimum wait time in seconds before API requests.
extractor.deviantart.avatar.formats
-----------------------------------
Type
``list`` of ``strings``
Example
``["original.jpg", "big.jpg", "big.gif", ".png"]``
Description
Avatar URL formats to return.
| Each format is parsed as ``SIZE.EXT``.
| Leave ``SIZE`` empty to download the regular, small avatar format.
extractor.[E621].metadata
-------------------------
Type
@ -1467,7 +1582,7 @@ Default
``false``
Example
* ``notes,pools``
* ``["notes", "pools"``
* ``["notes", "pools"]``
Description
Extract additional metadata (notes, pool metadata) if available.
@ -1504,6 +1619,17 @@ Description
* ``"exhentai.org"``: Use ``exhentai.org`` for all URLs
extractor.exhentai.fallback-retries
-----------------------------------
Type
``integer``
Default
``2``
Description
Number of times a failed image gets retried
or ``-1`` for infinite retries.
extractor.exhentai.fav
----------------------
Type
@ -1520,6 +1646,20 @@ Description
to already favorited galleries.
extractor.exhentai.gp
---------------------
Type
``string``
Default
``"resized"``
Description
Selects how to handle "you do not have enough GP" errors.
* `"resized"`: Continue downloading `non-original <extractor.exhentai.original_>`__ images.
* `"stop"`: Stop the current extractor run.
* `"wait"`: Wait for user input before retrying the current image.
extractor.exhentai.limits
-------------------------
Type
@ -1584,6 +1724,21 @@ Description
* ``false``: Ignore embeds.
extractor.fanbox.metadata
-------------------------
Type
* ``bool``
* ``string``
* ``list`` of ``strings``
Default
``false``
Example
* ``user,plan``
* ``["user", "plan"]``
Description
Extract ``plan`` and extended ``user`` metadata.
extractor.flickr.access-token & .access-token-secret
----------------------------------------------------
Type
@ -2051,7 +2206,22 @@ Type
Default
``false``
Description
Extract ``username`` metadata
Extract ``username`` metadata.
extractor.kemonoparty.revisions
-------------------------------
Type
* ``bool``
* ``string``
Default
``false``
Description
Extract post revisions.
Set this to ``"unique"`` to filter out duplicate revisions.
Note: This requires 1 additional HTTP request per post.
extractor.khinsider.format
@ -2236,6 +2406,18 @@ Description
Fetch media from replies to other notes.
extractor.[moebooru].pool.metadata
----------------------------------
Type
``bool``
Default
``false``
Description
Extract extended ``pool`` metadata.
Note: Not supported by all ``moebooru`` instances.
extractor.newgrounds.flash
--------------------------
Type
@ -2481,6 +2663,14 @@ Description
Download from video pins.
extractor.pixeldrain.api-key
----------------------------
Type
``string``
Description
Your account's `API key <https://pixeldrain.com/user/api_keys>`__
extractor.pixiv.include
-----------------------
Type
@ -2625,6 +2815,16 @@ Description
Also search Plurk comments for URLs.
extractor.[postmill].save-link-post-body
----------------------------------------
Type
``bool``
Default
``false``
Description
Whether or not to save the body for link/image posts.
extractor.reactor.gif
---------------------
Type
@ -2809,6 +3009,19 @@ Description
restrict it to only one possible format.
extractor.sankaku.id-format
---------------------------
Type
``string``
Default
``"numeric"``
Description
Format of ``id`` metadata fields.
* ``"alphanumeric"`` or ``"alnum"``: 11-character alphanumeric IDs (``y0abGlDOr2o``)
* ``"numeric"`` or ``"legacy"``: numeric IDs (``360451``)
extractor.sankaku.refresh
-------------------------
Type
@ -2892,6 +3105,176 @@ Description
Download video files.
extractor.steamgriddb.animated
------------------------------
Type
``bool``
Default
``true``
Description
Include animated assets when downloading from a list of assets.
extractor.steamgriddb.epilepsy
------------------------------
Type
``bool``
Default
``true``
Description
Include assets tagged with epilepsy when downloading from a list of assets.
extractor.steamgriddb.dimensions
--------------------------------
Type
* ``string``
* ``list`` of ``strings``
Default
``"all"``
Examples
* ``"1024x512,512x512"``
* ``["460x215", "920x430"]``
Description
Only include assets that are in the specified dimensions. ``all`` can be
used to specify all dimensions. Valid values are:
* Grids: ``460x215``, ``920x430``, ``600x900``, ``342x482``, ``660x930``,
``512x512``, ``1024x1024``
* Heroes: ``1920x620``, ``3840x1240``, ``1600x650``
* Logos: N/A (will be ignored)
* Icons: ``8x8``, ``10x10``, ``14x14``, ``16x16``, ``20x20``, ``24x24``,
``28x28``, ``32x32``, ``35x35``, ``40x40``, ``48x48``, ``54x54``,
``56x56``, ``57x57``, ``60x60``, ``64x64``, ``72x72``, ``76x76``,
``80x80``, ``90x90``, ``96x96``, ``100x100``, ``114x114``, ``120x120``,
``128x128``, ``144x144``, ``150x150``, ``152x152``, ``160x160``,
``180x180``, ``192x192``, ``194x194``, ``256x256``, ``310x310``,
``512x512``, ``768x768``, ``1024x1024``
extractor.steamgriddb.file-types
--------------------------------
Type
* ``string``
* ``list`` of ``strings``
Default
``"all"``
Examples
* ``"png,jpeg"``
* ``["jpeg", "webp"]``
Description
Only include assets that are in the specified file types. ``all`` can be
used to specify all file types. Valid values are:
* Grids: ``png``, ``jpeg``, ``jpg``, ``webp``
* Heroes: ``png``, ``jpeg``, ``jpg``, ``webp``
* Logos: ``png``, ``webp``
* Icons: ``png``, ``ico``
extractor.steamgriddb.download-fake-png
---------------------------------------
Type
``bool``
Default
``true``
Description
Download fake PNGs alongside the real file.
extractor.steamgriddb.humor
---------------------------
Type
``bool``
Default
``true``
Description
Include assets tagged with humor when downloading from a list of assets.
extractor.steamgriddb.languages
-------------------------------
Type
* ``string``
* ``list`` of ``strings``
Default
``"all"``
Examples
* ``"en,km"``
* ``["fr", "it"]``
Description
Only include assets that are in the specified languages. ``all`` can be
used to specify all languages. Valid values are `ISO 639-1 <https://en.wikipedia.org/wiki/ISO_639-1>`__
language codes.
extractor.steamgriddb.nsfw
--------------------------
Type
``bool``
Default
``true``
Description
Include assets tagged with adult content when downloading from a list of assets.
extractor.steamgriddb.sort
--------------------------
Type
``string``
Default
``score_desc``
Description
Set the chosen sorting method when downloading from a list of assets. Can be one of:
* ``score_desc`` (Highest Score (Beta))
* ``score_asc`` (Lowest Score (Beta))
* ``score_old_desc`` (Highest Score (Old))
* ``score_old_asc`` (Lowest Score (Old))
* ``age_desc`` (Newest First)
* ``age_asc`` (Oldest First)
extractor.steamgriddb.static
----------------------------
Type
``bool``
Default
``true``
Description
Include static assets when downloading from a list of assets.
extractor.steamgriddb.styles
----------------------------
Type
* ``string``
* ``list`` of ``strings``
Default
``"all"``
Examples
* ``"white,black"``
* ``["no_logo", "white_logo"]``
Description
Only include assets that are in the specified styles. ``all`` can be used
to specify all styles. Valid values are:
* Grids: ``alternate``, ``blurred``, ``no_logo``, ``material``, ``white_logo``
* Heroes: ``alternate``, ``blurred``, ``material``
* Logos: ``official``, ``white``, ``black``, ``custom``
* Icons: ``official``, ``custom``
extractor.steamgriddb.untagged
------------------------------
Type
``bool``
Default
``true``
Description
Include untagged assets when downloading from a list of assets.
extractor.[szurubooru].username & .token
----------------------------------------
Type
@ -3035,7 +3418,8 @@ Type
Default
``2``
Description
Number of retries for fetching full-resolution images.
Number of retries for fetching full-resolution images
or ``-1`` for infinite retries.
extractor.twibooru.api-key
@ -3064,6 +3448,16 @@ Description
See `Filters <https://twibooru.org/filters>`__ for details.
extractor.twitter.ads
---------------------
Type
``bool``
Default
``false``
Description
Fetch media from promoted Tweets.
extractor.twitter.cards
-----------------------
Type
@ -3142,8 +3536,6 @@ Description
for each Tweet in said timeline.
Note: This requires at least 1 additional API call per initial Tweet.
Age-restricted replies cannot be expanded when using the
`syndication <extractor.twitter.syndication_>`__ API.
extractor.twitter.include
@ -3211,30 +3603,6 @@ Description
``4096x4096``, ``orig``, ``large``, ``medium``, and ``small``.
extractor.twitter.syndication
-----------------------------
Type
* ``bool``
* ``string``
Default
``false``
Description
Controls how to retrieve age-restricted content when not logged in.
* ``false``: Skip age-restricted Tweets.
* ``true``: Download using Twitter's syndication API.
* ``"extended"``: Try to fetch Tweet metadata using the normal API
in addition to the syndication API. This requires additional HTTP
requests in some cases (e.g. when `retweets <extractor.twitter.retweets_>`_
are enabled).
Note: This does not apply to search results (including
`timeline strategies <extractor.twitter.timeline.strategy_>`__).
To retrieve such content from search results, you must log in and
disable "Hide sensitive content" in your `search settings
<https://twitter.com/settings/search>`__.
extractor.twitter.logout
------------------------
Type
@ -4300,6 +4668,24 @@ Description
The default format string here is ``"{message}"``.
output.errorfile
----------------
Type
* |Path|_
* |Logging Configuration|_
Description
File to write input URLs which returned an error to.
The default format string here is also ``"{message}"``.
When combined with
``-I``/``--input-file-comment`` or
``-x``/``--input-file-delete``,
this option will cause *all* input URLs from these files
to be commented/deleted after processing them
and not just successful ones.
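As a hypothetical usage example (file names are placeholders), errored URLs can be collected in one file and retried separately later:
.. code:: bash
# process urls.txt, deleting each entry after it has been handled,
# and write URLs that returned an error to errors.txt
gallery-dl -x urls.txt -e errors.txt
# later, retry only the URLs that failed
gallery-dl -i errors.txt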
output.num-to-str
-----------------
Type
@ -5234,9 +5620,14 @@ How To
* login and visit the `apps <https://www.reddit.com/prefs/apps/>`__
section of your account's preferences
* click the "are you a developer? create an app..." button
* fill out the form, choose "installed app", preferably set
"http://localhost:6414/" as "redirect uri" and finally click
"create app"
* fill out the form:
* choose a name
* select "installed app"
* set ``http://localhost:6414/`` as "redirect uri"
* solve the "I'm not a rebot" reCATCHA if needed
* click "create app"
* copy the client id (third line, under your application's name and
"installed app") and put it in your configuration file
as ``"client-id"``

@ -176,16 +176,15 @@
"imgur":
{
"#": "use different directory and filename formats when coming from a reddit post",
"directory":
{
"'_reddit' in locals()": []
},
"filename":
{
"'_reddit' in locals()": "{_reddit[id]} {id}.{extension}",
"" : "{id}.{extension}"
}
"#": "general imgur settings",
"filename": "{id}.{extension}"
},
"reddit>imgur":
{
"#": "special settings for imgur URLs found in reddit posts",
"directory": [],
"filename": "{_reddit[id]} {_reddit[title]} {id}.{extension}"
},
"tumblr":

@ -75,6 +75,7 @@
"client-id": null,
"client-secret": null,
"refresh-token": null,
"auto-watch": false,
"auto-unwatch": false,
"comments": false,
@ -84,11 +85,13 @@
"group": true,
"include": "gallery",
"journals": "html",
"jwt": false,
"mature": true,
"metadata": false,
"original": true,
"pagination": "api",
"public": true,
"quality": 100,
"wait-min": 0
},
"e621":

@ -0,0 +1,9 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>gallery-dl</title>
</head>
<body>
</body>
</html>

@ -0,0 +1,12 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>gallery-dl - OAuth Redirect</title>
<script>
window.location.href = "http://localhost:6414/" + window.location.search;
</script>
</head>
<body>
</body>
</html>

@ -6,8 +6,6 @@
## General Options:
-h, --help Print this help message and exit
--version Print program version and exit
-i, --input-file FILE Download URLs found in FILE ('-' for stdin).
More than one --input-file can be specified
-f, --filename FORMAT Filename format string for downloaded files
('/O' for "original" filenames)
-d, --destination PATH Target location for file downloads
@ -19,6 +17,16 @@
--clear-cache MODULE Delete cached login sessions, cookies, etc. for
MODULE (ALL to delete everything)
## Input Options:
-i, --input-file FILE Download URLs found in FILE ('-' for stdin).
More than one --input-file can be specified
-I, --input-file-comment FILE
Download URLs found in FILE. Comment them out
after they were downloaded successfully.
-x, --input-file-delete FILE
Download URLs found in FILE. Delete them after
they were downloaded successfully.
## Output Options:
-q, --quiet Activate quiet mode
-v, --verbose Print various debugging information
@ -31,6 +39,7 @@
-E, --extractor-info Print extractor defaults and settings
-K, --list-keywords Print a list of available keywords and example
values for the given URLs
-e, --error-file FILE Add input URLs which returned an error to FILE
--list-modules Print a list of available extractor modules
--list-extractors Print a list of extractor classes with
description, (sub)category and example URL
@ -43,7 +52,8 @@
## Downloader Options:
-r, --limit-rate RATE Maximum download rate (e.g. 500k or 2.5M)
-R, --retries N Maximum number of retries for failed HTTP
requests or -1 for infinite retries (default: 4)
requests or -1 for infinite retries (default:
4)
--http-timeout SECONDS Timeout for HTTP connections (default: 30.0)
--sleep SECONDS Number of seconds to wait before each download.
This can be either a constant value or a range
@ -110,23 +120,24 @@
and other delegated URLs
## Post-processing Options:
--zip Store downloaded files in a ZIP archive
--ugoira-conv Convert Pixiv Ugoira to WebM (requires FFmpeg)
--ugoira-conv-lossless Convert Pixiv Ugoira to WebM in VP9 lossless
mode
--ugoira-conv-copy Convert Pixiv Ugoira to MKV without re-encoding
any frames
-P, --postprocessor NAME Activate the specified post processor
-O, --postprocessor-option KEY=VALUE
Additional post processor options
--write-metadata Write metadata to separate JSON files
--write-info-json Write gallery metadata to a info.json file
--write-tags Write image tags to separate text files
--mtime-from-date Set file modification times according to 'date'
metadata
--exec CMD Execute CMD for each downloaded file. Example:
--exec "convert {} {}.png && rm {}"
--exec-after CMD Execute CMD after all files were downloaded
successfully. Example: --exec-after "cd {} &&
--zip Store downloaded files in a ZIP archive
--cbz Store downloaded files in a CBZ archive
--mtime NAME Set file modification times according to
metadata selected by NAME. Examples: 'date' or
'status[date]'
--ugoira FORMAT Convert Pixiv Ugoira to FORMAT using FFmpeg.
Supported formats are 'webm', 'mp4', 'gif',
'vp8', 'vp9', 'vp9-lossless', 'copy'.
--exec CMD Execute CMD for each downloaded file. Supported
replacement fields are {} or {_path},
{_directory}, {_filename}. Example: --exec
"convert {} {}.png && rm {}"
--exec-after CMD Execute CMD after all files were downloaded.
Example: --exec-after "cd {_directory} &&
convert * ../doc.pdf"
-P, --postprocessor NAME Activate the specified post processor
-O, --postprocessor-option OPT
Additional '<key>=<value>' post processor
options

@ -1,7 +1,7 @@
# Supported Sites
<!-- auto-generated by scripts/supportedsites.py -->
Consider all sites to be NSFW unless otherwise known.
Consider all listed sites to potentially be NSFW.
<table>
<thead valign="bottom">
@ -13,6 +13,12 @@ Consider all sites to be NSFW unless otherwise known.
</tr>
</thead>
<tbody valign="top">
<tr>
<td>2ch</td>
<td>https://2ch.hk/</td>
<td>Boards, Threads</td>
<td></td>
</tr>
<tr>
<td>2chen</td>
<td>https://sturdychan.help/</td>
@ -31,6 +37,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Pools, Popular Images, Posts, Tag Searches</td>
<td></td>
</tr>
<tr>
<td>4archive</td>
<td>https://4archive.org/</td>
<td>Boards, Threads</td>
<td></td>
</tr>
<tr>
<td>4chan</td>
<td>https://www.4chan.org/</td>
@ -91,6 +103,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Albums, Artwork Listings, Challenges, Followed Users, individual Images, Likes, Search Results, User Profiles</td>
<td></td>
</tr>
<tr>
<td>BATO.TO</td>
<td>https://bato.to/</td>
<td>Chapters, Manga</td>
<td></td>
</tr>
<tr>
<td>BBC</td>
<td>https://bbc.co.uk/</td>
@ -103,16 +121,10 @@ Consider all sites to be NSFW unless otherwise known.
<td>Collections, Galleries, User Profiles</td>
<td></td>
</tr>
<tr>
<td>Blogger</td>
<td>https://www.blogger.com/</td>
<td>Blogs, Labels, Posts, Search Results</td>
<td></td>
</tr>
<tr>
<td>Bunkr</td>
<td>https://bunkrr.su/</td>
<td>Albums</td>
<td>https://bunkr.sk/</td>
<td>Albums, Media Files</td>
<td></td>
</tr>
<tr>
@ -148,7 +160,7 @@ Consider all sites to be NSFW unless otherwise known.
<tr>
<td>DeviantArt</td>
<td>https://www.deviantart.com/</td>
<td>Collections, Deviations, Favorites, Folders, Followed Users, Galleries, Gallery Searches, Journals, Popular Images, Scraps, Search Results, Sta.sh, Status Updates, Tag Searches, User Profiles, Watches</td>
<td>Avatars, Backgrounds, Collections, Deviations, Favorites, Folders, Followed Users, Galleries, Gallery Searches, Journals, Popular Images, Scraps, Search Results, Sta.sh, Status Updates, Tag Searches, User Profiles, Watches</td>
<td><a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a></td>
</tr>
<tr>
@ -254,9 +266,9 @@ Consider all sites to be NSFW unless otherwise known.
<td></td>
</tr>
<tr>
<td>HBrowse</td>
<td>https://www.hbrowse.com/</td>
<td>Chapters, Manga</td>
<td>HatenaBlog</td>
<td>https://hatenablog.com</td>
<td>Archive, Individual Posts, Home Feed, Search Results</td>
<td></td>
</tr>
<tr>
@ -400,7 +412,7 @@ Consider all sites to be NSFW unless otherwise known.
<tr>
<td>Inkbunny</td>
<td>https://inkbunny.net/</td>
<td>Favorites, Followed Users, Pools, Posts, Search Results, User Profiles</td>
<td>Favorites, Followed Users, Pools, Posts, Search Results, Unread Submissions, User Profiles</td>
<td>Supported</td>
</tr>
<tr>
@ -427,12 +439,6 @@ Consider all sites to be NSFW unless otherwise known.
<td>Games</td>
<td></td>
</tr>
<tr>
<td>JPG Fish</td>
<td>https://jpg1.su/</td>
<td>Albums, individual Images, User Profiles</td>
<td></td>
</tr>
<tr>
<td>Keenspot</td>
<td>http://www.keenspot.com/</td>
@ -453,7 +459,7 @@ Consider all sites to be NSFW unless otherwise known.
</tr>
<tr>
<td>Komikcast</td>
<td>https://komikcast.site/</td>
<td>https://komikcast.lol/</td>
<td>Chapters, Manga</td>
<td></td>
</tr>
@ -502,7 +508,7 @@ Consider all sites to be NSFW unless otherwise known.
<tr>
<td>MangaDex</td>
<td>https://mangadex.org/</td>
<td>Chapters, Followed Feed, Manga</td>
<td>Chapters, Followed Feed, Lists, Manga</td>
<td>Supported</td>
</tr>
<tr>
@ -595,12 +601,6 @@ Consider all sites to be NSFW unless otherwise known.
<td>Albums</td>
<td></td>
</tr>
<tr>
<td>Nudecollect</td>
<td>https://nudecollect.com/</td>
<td>Albums, individual Images</td>
<td></td>
</tr>
<tr>
<td>Patreon</td>
<td>https://www.patreon.com/</td>
@ -643,6 +643,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>All Pins, Created Pins, Pins, pin.it Links, related Pins, Search Results, Sections, User Profiles</td>
<td><a href="https://github.com/mikf/gallery-dl#cookies">Cookies</a></td>
</tr>
<tr>
<td>pixeldrain</td>
<td>https://pixeldrain.com/</td>
<td>Albums, Files</td>
<td></td>
</tr>
<tr>
<td>Pixhost</td>
<td>https://pixhost.to/</td>
@ -679,6 +685,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Posts, User Profiles</td>
<td></td>
</tr>
<tr>
<td>Poringa</td>
<td>http://www.poringa.net/</td>
<td>Posts Images, Search Results, User Profiles</td>
<td></td>
</tr>
<tr>
<td>Porn Image</td>
<td>https://porn-images-xxx.com/</td>
@ -718,7 +730,7 @@ Consider all sites to be NSFW unless otherwise known.
<tr>
<td>Reddit</td>
<td>https://www.reddit.com/</td>
<td>Home Feed, individual Images, Submissions, Subreddits, User Profiles</td>
<td>Home Feed, individual Images, Redirects, Submissions, Subreddits, User Profiles</td>
<td><a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a></td>
</tr>
<tr>
@ -805,6 +817,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Presentations</td>
<td></td>
</tr>
<tr>
<td>SteamGridDB</td>
<td>https://www.steamgriddb.com</td>
<td>Individual Assets, Grids, Heroes, Icons, Logos</td>
<td></td>
</tr>
<tr>
<td>SubscribeStar</td>
<td>https://www.subscribestar.com/</td>
@ -829,6 +847,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Galleries</td>
<td></td>
</tr>
<tr>
<td>TMOHentai</td>
<td>https://tmohentai.com/</td>
<td>Galleries</td>
<td></td>
</tr>
<tr>
<td>Toyhouse</td>
<td>https://toyhou.se/</td>
@ -883,6 +907,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Files</td>
<td></td>
</tr>
<tr>
<td>Urlgalleries</td>
<td>https://urlgalleries.net/</td>
<td>Galleries</td>
<td></td>
</tr>
<tr>
<td>Vipergirls</td>
<td>https://vipergirls.to/</td>
@ -985,6 +1015,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>individual Images, Tag Searches</td>
<td>Supported</td>
</tr>
<tr>
<td>Zzup</td>
<td>https://zzup.com/</td>
<td>Galleries</td>
<td></td>
</tr>
<tr>
<td>かべうち</td>
<td>https://kabe-uchiroom.com/</td>
@ -998,6 +1034,44 @@ Consider all sites to be NSFW unless otherwise known.
<td></td>
</tr>
<tr>
<td colspan="4"><strong>Blogger Instances</strong></td>
</tr>
<tr>
<td>Blogspot</td>
<td>https://www.blogger.com/</td>
<td>Blogs, Labels, Posts, Search Results</td>
<td></td>
</tr>
<tr>
<td>MIC MIC IDOL</td>
<td>https://www.micmicidol.club/</td>
<td>Blogs, Labels, Posts, Search Results</td>
<td></td>
</tr>
<tr>
<td colspan="4"><strong>Chevereto Instances</strong></td>
</tr>
<tr>
<td>JPG Fish</td>
<td>https://jpg4.su/</td>
<td>Albums, individual Images, User Profiles</td>
<td></td>
</tr>
<tr>
<td>IMG.Kiwi</td>
<td>https://img.kiwi/</td>
<td>Albums, individual Images, User Profiles</td>
<td></td>
</tr>
<tr>
<td>DeltaPorno</td>
<td>https://gallery.deltaporno.com/</td>
<td>Albums, individual Images, User Profiles</td>
<td></td>
</tr>
<tr>
<td colspan="4"><strong>Danbooru Instances</strong></td>
</tr>
@ -1137,7 +1211,7 @@ Consider all sites to be NSFW unless otherwise known.
</tr>
<tr>
<td>Bbw-chan</td>
<td>https://bbw-chan.nl/</td>
<td>https://bbw-chan.link/</td>
<td>Boards, Threads</td>
<td></td>
</tr>
@ -1163,6 +1237,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Favorites, Followed Users, Images from Notes, User Profiles</td>
<td></td>
</tr>
<tr>
<td>Misskey.design</td>
<td>https://misskey.design/</td>
<td>Favorites, Followed Users, Images from Notes, User Profiles</td>
<td></td>
</tr>
<tr>
<td>Lesbian.energy</td>
<td>https://lesbian.energy/</td>
@ -1201,12 +1281,6 @@ Consider all sites to be NSFW unless otherwise known.
<td>Media Files, Replies, Search Results, Tweets</td>
<td></td>
</tr>
<tr>
<td>Nitter.lacontrevoie.fr</td>
<td>https://nitter.lacontrevoie.fr/</td>
<td>Media Files, Replies, Search Results, Tweets</td>
<td></td>
</tr>
<tr>
<td>Nitter.1d4.us</td>
<td>https://nitter.1d4.us/</td>
@ -1254,6 +1328,16 @@ Consider all sites to be NSFW unless otherwise known.
<td></td>
</tr>
<tr>
<td colspan="4"><strong>Postmill Instances</strong></td>
</tr>
<tr>
<td>Raddle</td>
<td>https://raddle.me/</td>
<td>Forums, Home Feed, Individual Posts, Search Results, Tag Searches, User Profiles</td>
<td></td>
</tr>
<tr>
<td colspan="4"><strong>Reactor Instances</strong></td>
</tr>
@ -1285,12 +1369,6 @@ Consider all sites to be NSFW unless otherwise known.
<tr>
<td colspan="4"><strong>Shimmie2 Instances</strong></td>
</tr>
<tr>
<td>meme.museum</td>
<td>https://meme.museum/</td>
<td>Posts, Tag Searches</td>
<td></td>
</tr>
<tr>
<td>Loudbooru</td>
<td>https://loudbooru.com/</td>
@ -1299,7 +1377,7 @@ Consider all sites to be NSFW unless otherwise known.
</tr>
<tr>
<td>Giantessbooru</td>
<td>https://giantessbooru.com/</td>
<td>https://sizechangebooru.com/</td>
<td>Posts, Tag Searches</td>
<td></td>
</tr>
@ -1315,6 +1393,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Posts, Tag Searches</td>
<td></td>
</tr>
<tr>
<td>Rule34Hentai</td>
<td>https://rule34hentai.net/</td>
<td>Posts, Tag Searches</td>
<td></td>
</tr>
<tr>
<td colspan="4"><strong>szurubooru Instances</strong></td>
@ -1331,6 +1415,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Posts, Tag Searches</td>
<td></td>
</tr>
<tr>
<td>Snootbooru</td>
<td>https://snootbooru.com/</td>
<td>Posts, Tag Searches</td>
<td></td>
</tr>
<tr>
<td colspan="4"><strong>URL Shorteners</strong></td>
@ -1370,6 +1460,82 @@ Consider all sites to be NSFW unless otherwise known.
<td></td>
</tr>
<tr>
<td colspan="4"><strong>Wikimedia Instances</strong></td>
</tr>
<tr>
<td>Wikipedia</td>
<td>https://www.wikipedia.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wiktionary</td>
<td>https://www.wiktionary.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikiquote</td>
<td>https://www.wikiquote.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikibooks</td>
<td>https://www.wikibooks.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikisource</td>
<td>https://www.wikisource.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikinews</td>
<td>https://www.wikinews.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikiversity</td>
<td>https://www.wikiversity.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikispecies</td>
<td>https://species.wikimedia.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Wikimedia Commons</td>
<td>https://commons.wikimedia.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>MediaWiki</td>
<td>https://www.mediawiki.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Fandom</td>
<td>https://www.fandom.com/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Super Mario Wiki</td>
<td>https://www.mariowiki.com/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td colspan="4"><strong>Moebooru and MyImouto</strong></td>
</tr>
@ -1456,16 +1622,6 @@ Consider all sites to be NSFW unless otherwise known.
<td></td>
</tr>
<tr>
<td colspan="4"><strong>FoOlSlide Instances</strong></td>
</tr>
<tr>
<td>PowerManga</td>
<td>https://read.powermanga.org/</td>
<td>Chapters, Manga</td>
<td></td>
</tr>
<tr>
<td colspan="4"><strong>Mastodon Instances</strong></td>
</tr>

@ -18,19 +18,6 @@ __email__ = "mike_faehrmann@web.de"
__version__ = version.__version__
def progress(urls, pformat):
"""Wrapper around urls to output a simple progress indicator"""
if pformat is True:
pformat = "[{current}/{total}] {url}\n"
else:
pformat += "\n"
pinfo = {"total": len(urls)}
for pinfo["current"], pinfo["url"] in enumerate(urls, 1):
output.stderr_write(pformat.format_map(pinfo))
yield pinfo["url"]
def main():
try:
parser = option.build_parser()
@ -58,7 +45,7 @@ def main():
elif filename.startswith("\\f"):
filename = "\f" + filename[2:]
config.set((), "filename", filename)
if args.directory:
if args.directory is not None:
config.set((), "base-directory", args.directory)
config.set((), "directory", ())
if args.postprocessors:
@ -128,6 +115,7 @@ def main():
output.configure_logging(args.loglevel)
if args.loglevel >= logging.ERROR:
config.set(("output",), "mode", "null")
config.set(("downloader",), "progress", None)
elif args.loglevel <= logging.DEBUG:
import platform
import requests
@ -224,7 +212,7 @@ def main():
return config.initialize()
else:
if not args.urls and not args.inputfiles:
if not args.urls and not args.input_files:
parser.error(
"The following arguments are required: URL\n"
"Use 'gallery-dl --help' to get a list of all options.")
@ -238,50 +226,62 @@ def main():
else:
jobtype = args.jobtype or job.DownloadJob
urls = args.urls
if args.inputfiles:
for inputfile in args.inputfiles:
try:
if inputfile == "-":
if sys.stdin:
urls += util.parse_inputfile(sys.stdin, log)
else:
log.warning(
"input file: stdin is not readable")
else:
with open(inputfile, encoding="utf-8") as file:
urls += util.parse_inputfile(file, log)
except OSError as exc:
log.warning("input file: %s", exc)
input_manager = InputManager()
input_manager.log = input_log = logging.getLogger("inputfile")
# unsupported file logging handler
handler = output.setup_logging_handler(
"unsupportedfile", fmt="{message}")
if handler:
ulog = logging.getLogger("unsupported")
ulog = job.Job.ulog = logging.getLogger("unsupported")
ulog.addHandler(handler)
ulog.propagate = False
job.Job.ulog = ulog
# error file logging handler
handler = output.setup_logging_handler(
"errorfile", fmt="{message}", mode="a")
if handler:
elog = input_manager.err = logging.getLogger("errorfile")
elog.addHandler(handler)
elog.propagate = False
# collect input URLs
input_manager.add_list(args.urls)
if args.input_files:
for input_file, action in args.input_files:
try:
path = util.expand_path(input_file)
input_manager.add_file(path, action)
except Exception as exc:
input_log.error(exc)
return getattr(exc, "code", 128)
pformat = config.get(("output",), "progress", True)
if pformat and len(urls) > 1 and args.loglevel < logging.ERROR:
urls = progress(urls, pformat)
else:
urls = iter(urls)
if pformat and len(input_manager.urls) > 1 and \
args.loglevel < logging.ERROR:
input_manager.progress(pformat)
# process input URLs
retval = 0
url = next(urls, None)
while url is not None:
for url in input_manager:
try:
log.debug("Starting %s for '%s'", jobtype.__name__, url)
if isinstance(url, util.ExtendedUrl):
if isinstance(url, ExtendedUrl):
for opts in url.gconfig:
config.set(*opts)
with config.apply(url.lconfig):
retval |= jobtype(url.value).run()
status = jobtype(url.value).run()
else:
retval |= jobtype(url).run()
status = jobtype(url).run()
if status:
retval |= status
input_manager.error()
else:
input_manager.success()
except exception.TerminateExtraction:
pass
except exception.RestartExtraction:
@ -290,9 +290,9 @@ def main():
except exception.NoExtractorError:
log.error("Unsupported URL '%s'", url)
retval |= 64
input_manager.error()
url = next(urls, None)
input_manager.next()
return retval
except KeyboardInterrupt:
@ -304,3 +304,226 @@ def main():
if exc.errno != errno.EPIPE:
raise
return 1
class InputManager():
def __init__(self):
self.urls = []
self.files = ()
self.log = self.err = None
self._url = ""
self._item = None
self._index = 0
self._pformat = None
def add_url(self, url):
self.urls.append(url)
def add_list(self, urls):
self.urls += urls
def add_file(self, path, action=None):
"""Process an input file.
Lines starting with '#' and empty lines will be ignored.
Lines starting with '-' will be interpreted as a key-value pair
separated by an '=', where
'key' is a dot-separated option name and
'value' is a JSON-parsable string.
These configuration options will be applied
while processing the next URL only.
Lines starting with '-G' are the same as above, except these options
will be applied for *all* following URLs, i.e. they are Global.
Everything else will be used as a potential URL.
Example input file:
# setting global options
-G base-directory = "/tmp/"
-G skip = false
# setting local options for the next URL
-filename="spaces_are_optional.jpg"
-skip = true
https://example.org/
# next URL uses default filename and 'skip' is false.
https://example.com/index.htm # comment1
https://example.com/404.htm # comment2
"""
if path == "-" and not action:
try:
lines = sys.stdin.readlines()
except Exception:
raise exception.InputFileError("stdin is not readable")
path = None
else:
try:
with open(path, encoding="utf-8") as fp:
lines = fp.readlines()
except Exception as exc:
raise exception.InputFileError(str(exc))
if self.files:
self.files[path] = lines
else:
self.files = {path: lines}
if action == "c":
action = self._action_comment
elif action == "d":
action = self._action_delete
else:
action = None
gconf = []
lconf = []
indicies = []
strip_comment = None
append = self.urls.append
for n, line in enumerate(lines):
line = line.strip()
if not line or line[0] == "#":
# empty line or comment
continue
elif line[0] == "-":
# config spec
if len(line) >= 2 and line[1] == "G":
conf = gconf
line = line[2:]
else:
conf = lconf
line = line[1:]
if action:
indicies.append(n)
key, sep, value = line.partition("=")
if not sep:
raise exception.InputFileError(
"Invalid KEY=VALUE pair '%s' on line %s in %s",
line, n+1, path)
try:
value = util.json_loads(value.strip())
except ValueError as exc:
self.log.debug("%s: %s", exc.__class__.__name__, exc)
raise exception.InputFileError(
"Unable to parse '%s' on line %s in %s",
value, n+1, path)
key = key.strip().split(".")
conf.append((key[:-1], key[-1], value))
else:
# url
if " #" in line or "\t#" in line:
if strip_comment is None:
import re
strip_comment = re.compile(r"\s+#.*").sub
line = strip_comment("", line)
if gconf or lconf:
url = ExtendedUrl(line, gconf, lconf)
gconf = []
lconf = []
else:
url = line
if action:
indicies.append(n)
append((url, path, action, indicies))
indicies = []
else:
append(url)
def progress(self, pformat=True):
if pformat is True:
pformat = "[{current}/{total}] {url}\n"
else:
pformat += "\n"
self._pformat = pformat.format_map
def next(self):
self._index += 1
def success(self):
if self._item:
self._rewrite()
def error(self):
if self.err:
if self._item:
url, path, action, indicies = self._item
lines = self.files[path]
out = "".join(lines[i] for i in indicies)
if out and out[-1] == "\n":
out = out[:-1]
self._rewrite()
else:
out = str(self._url)
self.err.info(out)
def _rewrite(self):
url, path, action, indicies = self._item
lines = self.files[path]
action(lines, indicies)
try:
with open(path, "w", encoding="utf-8") as fp:
fp.writelines(lines)
except Exception as exc:
self.log.warning(
"Unable to update '%s' (%s: %s)",
path, exc.__class__.__name__, exc)
@staticmethod
def _action_comment(lines, indicies):
for i in indicies:
lines[i] = "# " + lines[i]
@staticmethod
def _action_delete(lines, indicies):
for i in indicies:
lines[i] = ""
def __iter__(self):
self._index = 0
return self
def __next__(self):
try:
url = self.urls[self._index]
except IndexError:
raise StopIteration
if isinstance(url, tuple):
self._item = url
url = url[0]
else:
self._item = None
self._url = url
if self._pformat:
output.stderr_write(self._pformat({
"total" : len(self.urls),
"current": self._index + 1,
"url" : url,
}))
return url
class ExtendedUrl():
"""URL with attached config key-value pairs"""
__slots__ = ("value", "gconfig", "lconfig")
def __init__(self, url, gconf, lconf):
self.value = url
self.gconfig = gconf
self.lconfig = lconf
def __str__(self):
return self.value
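A minimal, self-contained sketch of the option-line parsing described in the add_file() docstring above. It mirrors the key.strip().split(".") logic, but uses the standard json module instead of util.json_loads, and the helper name is made up for illustration:

```python
import json

def parse_option_line(line):
    """Parse an input-file option line, already stripped of its '-'/'-G' prefix."""
    key, sep, value = line.partition("=")
    if not sep:
        raise ValueError("Invalid KEY=VALUE pair: %r" % line)
    path = key.strip().split(".")
    # (config path, option name, JSON-decoded value)
    return path[:-1], path[-1], json.loads(value.strip())

print(parse_option_line('base-directory = "/tmp/"'))  # ([], 'base-directory', '/tmp/')
print(parse_option_line('extractor.skip = true'))     # (['extractor'], 'skip', True)
```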

@ -9,10 +9,10 @@
import sys
if __package__ is None and not hasattr(sys, "frozen"):
if not __package__ and not hasattr(sys, "frozen"):
import os.path
path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.realpath(path))
path = os.path.realpath(os.path.abspath(__file__))
sys.path.insert(0, os.path.dirname(os.path.dirname(path)))
import gallery_dl

@ -47,7 +47,7 @@ def load_cookies(cookiejar, browser_specification):
def load_cookies_firefox(cookiejar, profile=None, container=None, domain=None):
path, container_id = _firefox_cookies_database(profile, container)
with DatabaseCopy(path) as db:
with DatabaseConnection(path) as db:
sql = ("SELECT name, value, host, path, isSecure, expiry "
"FROM moz_cookies")
@ -100,7 +100,7 @@ def load_cookies_chrome(cookiejar, browser_name, profile=None,
path = _chrome_cookies_database(profile, config)
_log_debug("Extracting cookies from %s", path)
with DatabaseCopy(path) as db:
with DatabaseConnection(path) as db:
db.text_factory = bytes
decryptor = get_cookie_decryptor(
config["directory"], config["keyring"], keyring)
@ -215,9 +215,11 @@ def _firefox_cookies_database(profile=None, container=None):
def _firefox_browser_directory():
if sys.platform in ("win32", "cygwin"):
return os.path.expandvars(r"%APPDATA%\Mozilla\Firefox\Profiles")
return os.path.expandvars(
r"%APPDATA%\Mozilla\Firefox\Profiles")
if sys.platform == "darwin":
return os.path.expanduser("~/Library/Application Support/Firefox")
return os.path.expanduser(
"~/Library/Application Support/Firefox/Profiles")
return os.path.expanduser("~/.mozilla/firefox")
@ -814,7 +816,7 @@ class DataParser:
self.skip_to(len(self._data), description)
class DatabaseCopy():
class DatabaseConnection():
def __init__(self, path):
self.path = path
@ -822,13 +824,27 @@ class DatabaseCopy():
self.directory = None
def __enter__(self):
try:
# https://www.sqlite.org/uri.html#the_uri_path
path = self.path.replace("?", "%3f").replace("#", "%23")
if util.WINDOWS:
path = "/" + os.path.abspath(path)
uri = "file:{}?mode=ro&immutable=1".format(path)
self.database = sqlite3.connect(
uri, uri=True, isolation_level=None, check_same_thread=False)
return self.database
except Exception as exc:
_log_debug("Falling back to temporary database copy (%s: %s)",
exc.__class__.__name__, exc)
try:
self.directory = tempfile.TemporaryDirectory(prefix="gallery-dl-")
path_copy = os.path.join(self.directory.name, "copy.sqlite")
shutil.copyfile(self.path, path_copy)
self.database = db = sqlite3.connect(
self.database = sqlite3.connect(
path_copy, isolation_level=None, check_same_thread=False)
return db
return self.database
except BaseException:
if self.directory:
self.directory.cleanup()
@ -836,7 +852,8 @@ class DatabaseCopy():
def __exit__(self, exc, value, tb):
self.database.close()
self.directory.cleanup()
if self.directory:
self.directory.cleanup()
def Popen_communicate(*args):
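The DatabaseCopy to DatabaseConnection change above first tries to open the browser's cookie database directly through a read-only, immutable SQLite URI and only falls back to copying it into a temporary directory. A simplified standalone sketch of the same approach (logging and cleanup handling reduced; the function name is illustrative):

```python
import os
import shutil
import sqlite3
import sys
import tempfile

def open_sqlite_readonly(path):
    """Return (connection, tempdir); tempdir is None unless a copy was made."""
    try:
        # https://www.sqlite.org/uri.html#the_uri_path
        uri_path = path.replace("?", "%3f").replace("#", "%23")
        if sys.platform in ("win32", "cygwin"):
            uri_path = "/" + os.path.abspath(uri_path)
        uri = "file:{}?mode=ro&immutable=1".format(uri_path)
        return sqlite3.connect(uri, uri=True), None
    except Exception:
        # URI access failed; copy the file so the live database
        # used by the browser is never touched
        tempdir = tempfile.TemporaryDirectory(prefix="gallery-dl-")
        copy = os.path.join(tempdir.name, "copy.sqlite")
        shutil.copyfile(path, copy)
        return sqlite3.connect(copy), tempdir
```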

@ -200,13 +200,15 @@ class HttpDownloader(DownloaderBase):
self.log.warning(
"File size smaller than allowed minimum (%s < %s)",
size, self.minsize)
return False
pathfmt.temppath = ""
return True
if self.maxsize and size > self.maxsize:
self.release_conn(response)
self.log.warning(
"File size larger than allowed maximum (%s > %s)",
size, self.maxsize)
return False
pathfmt.temppath = ""
return True
build_path = False

@ -21,6 +21,7 @@ Exception
| +-- FilenameFormatError
| +-- DirectoryFormatError
+-- FilterError
+-- InputFileError
+-- NoExtractorError
+-- StopExtraction
+-- TerminateExtraction
@ -99,6 +100,15 @@ class FilterError(GalleryDLException):
code = 32
class InputFileError(GalleryDLException):
"""Error when parsing input file"""
code = 32
def __init__(self, message, *args):
GalleryDLException.__init__(
self, message % args if args else message)
class NoExtractorError(GalleryDLException):
"""No extractor can handle the given URL"""
code = 64

@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://2ch.hk/"""
from .common import Extractor, Message
from .. import text, util
class _2chThreadExtractor(Extractor):
"""Extractor for 2ch threads"""
category = "2ch"
subcategory = "thread"
root = "https://2ch.hk"
directory_fmt = ("{category}", "{board}", "{thread} {title}")
filename_fmt = "{tim}{filename:? //}.{extension}"
archive_fmt = "{board}_{thread}_{tim}"
pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"
example = "https://2ch.hk/a/res/12345.html"
def __init__(self, match):
Extractor.__init__(self, match)
self.board, self.thread = match.groups()
def items(self):
url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
posts = self.request(url).json()["threads"][0]["posts"]
op = posts[0]
title = op.get("subject") or text.remove_html(op["comment"])
thread = {
"board" : self.board,
"thread": self.thread,
"title" : text.unescape(title)[:50],
}
yield Message.Directory, thread
for post in posts:
files = post.get("files")
if files:
post["post_name"] = post["name"]
post["date"] = text.parse_timestamp(post["timestamp"])
del post["files"]
del post["name"]
for file in files:
file.update(thread)
file.update(post)
file["filename"] = file["fullname"].rpartition(".")[0]
file["tim"], _, file["extension"] = \
file["name"].rpartition(".")
yield Message.Url, self.root + file["path"], file
class _2chBoardExtractor(Extractor):
"""Extractor for 2ch boards"""
category = "2ch"
subcategory = "board"
root = "https://2ch.hk"
pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"
example = "https://2ch.hk/a/"
def __init__(self, match):
Extractor.__init__(self, match)
self.board = match.group(1)
def items(self):
# index page
url = "{}/{}/index.json".format(self.root, self.board)
index = self.request(url).json()
index["_extractor"] = _2chThreadExtractor
for thread in index["threads"]:
url = "{}/{}/res/{}.html".format(
self.root, self.board, thread["thread_num"])
yield Message.Queue, url, index
# pages 1..n
for n in util.advance(index["pages"], 1):
url = "{}/{}/{}.json".format(self.root, self.board, n)
page = self.request(url).json()
page["_extractor"] = _2chThreadExtractor
for thread in page["threads"]:
url = "{}/{}/res/{}.html".format(
self.root, self.board, thread["thread_num"])
yield Message.Queue, url, page

@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://4archive.org/"""
from .common import Extractor, Message
from .. import text, util
class _4archiveThreadExtractor(Extractor):
"""Extractor for 4archive threads"""
category = "4archive"
subcategory = "thread"
directory_fmt = ("{category}", "{board}", "{thread} {title}")
filename_fmt = "{no} {filename}.{extension}"
archive_fmt = "{board}_{thread}_{no}"
root = "https://4archive.org"
referer = False
pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)/thread/(\d+)"
example = "https://4archive.org/board/a/thread/12345/"
def __init__(self, match):
Extractor.__init__(self, match)
self.board, self.thread = match.groups()
def items(self):
url = "{}/board/{}/thread/{}".format(
self.root, self.board, self.thread)
page = self.request(url).text
data = self.metadata(page)
posts = self.posts(page)
if not data["title"]:
data["title"] = posts[0]["com"][:50]
for post in posts:
post.update(data)
post["time"] = int(util.datetime_to_timestamp(post["date"]))
yield Message.Directory, post
if "url" in post:
yield Message.Url, post["url"], text.nameext_from_url(
post["filename"], post)
def metadata(self, page):
return {
"board" : self.board,
"thread": text.parse_int(self.thread),
"title" : text.unescape(text.extr(
page, 'class="subject">', "</span>"))
}
def posts(self, page):
return [
self.parse(post)
for post in page.split('class="postContainer')[1:]
]
@staticmethod
def parse(post):
extr = text.extract_from(post)
data = {
"name": extr('class="name">', "</span>"),
"date": text.parse_datetime(
extr('class="dateTime postNum" >', "<").strip(),
"%Y-%m-%d %H:%M:%S"),
"no" : text.parse_int(extr('href="#p', '"')),
}
if 'class="file"' in post:
extr('class="fileText"', ">File: <a")
data.update({
"url" : extr('href="', '"'),
"filename": extr(
'rel="noreferrer noopener"', "</a>").strip()[1:],
"size" : text.parse_bytes(extr(" (", ", ")[:-1]),
"width" : text.parse_int(extr("", "x")),
"height" : text.parse_int(extr("", "px")),
})
extr("<blockquote ", "")
data["com"] = text.unescape(text.remove_html(
extr(">", "</blockquote>")))
return data
class _4archiveBoardExtractor(Extractor):
"""Extractor for 4archive boards"""
category = "4archive"
subcategory = "board"
root = "https://4archive.org"
pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)(?:/(\d+))?/?$"
example = "https://4archive.org/board/a/"
def __init__(self, match):
Extractor.__init__(self, match)
self.board = match.group(1)
self.num = text.parse_int(match.group(2), 1)
def items(self):
data = {"_extractor": _4archiveThreadExtractor}
while True:
url = "{}/board/{}/{}".format(self.root, self.board, self.num)
page = self.request(url).text
if 'class="thread"' not in page:
return
for thread in text.extract_iter(page, 'class="thread" id="t', '"'):
url = "{}/board/{}/thread/{}".format(
self.root, self.board, thread)
yield Message.Queue, url, data
self.num += 1

@ -20,6 +20,7 @@ class _4chanarchivesThreadExtractor(Extractor):
directory_fmt = ("{category}", "{board}", "{thread} - {title}")
filename_fmt = "{no}-{filename}.{extension}"
archive_fmt = "{board}_{thread}_{no}"
referer = False
pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)/thread/(\d+)"
example = "https://4chanarchives.com/board/a/thread/12345/"

@ -10,11 +10,13 @@ import sys
import re
modules = [
"2ch",
"2chan",
"2chen",
"35photo",
"3dbooru",
"4chan",
"4archive",
"4chanarchives",
"500px",
"8chan",
@ -23,11 +25,13 @@ modules = [
"architizer",
"artstation",
"aryion",
"batoto",
"bbc",
"behance",
"blogger",
"bunkr",
"catbox",
"chevereto",
"comicvine",
"cyberdrop",
"danbooru",
@ -50,7 +54,7 @@ modules = [
"gelbooru_v01",
"gelbooru_v02",
"gofile",
"hbrowse",
"hatenablog",
"hentai2read",
"hentaicosplays",
"hentaifoundry",
@ -73,7 +77,6 @@ modules = [
"issuu",
"itaku",
"itchio",
"jpgfish",
"jschan",
"kabeuchi",
"keenspot",
@ -106,7 +109,6 @@ modules = [
"nitter",
"nozomi",
"nsfwalbum",
"nudecollect",
"paheal",
"patreon",
"philomena",
@ -116,12 +118,15 @@ modules = [
"piczel",
"pillowfort",
"pinterest",
"pixeldrain",
"pixiv",
"pixnet",
"plurk",
"poipiku",
"poringa",
"pornhub",
"pornpics",
"postmill",
"pururin",
"rawkuma",
"reactor",
@ -142,17 +147,20 @@ modules = [
"smugmug",
"soundgasm",
"speakerdeck",
"steamgriddb",
"subscribestar",
"szurubooru",
"tapas",
"tcbscans",
"telegraph",
"tmohentai",
"toyhouse",
"tsumino",
"tumblr",
"tumblrgallery",
"twibooru",
"twitter",
"urlgalleries",
"unsplash",
"uploadir",
"urlshortener",
@ -170,9 +178,11 @@ modules = [
"weibo",
"wikiart",
"wikifeet",
"wikimedia",
"xhamster",
"xvideos",
"zerochan",
"zzup",
"booru",
"moebooru",
"foolfuuka",

@ -40,7 +40,7 @@ class AryionExtractor(Extractor):
if username:
self.cookies_update(self._login_impl(username, password))
@cache(maxage=14*24*3600, keyarg=1)
@cache(maxage=14*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)

@ -0,0 +1,123 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://bato.to/"""
from .common import Extractor, ChapterExtractor, MangaExtractor
from .. import text, exception
import re
BASE_PATTERN = (r"(?:https?://)?(?:"
r"(?:ba|d|h|m|w)to\.to|"
r"(?:(?:manga|read)toto|batocomic|[xz]bato)\.(?:com|net|org)|"
r"comiko\.(?:net|org)|"
r"bat(?:otoo|o?two)\.com)")
class BatotoBase():
"""Base class for batoto extractors"""
category = "batoto"
root = "https://bato.to"
def request(self, url, **kwargs):
kwargs["encoding"] = "utf-8"
return Extractor.request(self, url, **kwargs)
class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
"""Extractor for bato.to manga chapters"""
pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)"
example = "https://bato.to/title/12345-MANGA/54321"
def __init__(self, match):
self.root = text.root_from_url(match.group(0))
self.chapter_id = match.group(1)
url = "{}/title/0/{}".format(self.root, self.chapter_id)
ChapterExtractor.__init__(self, match, url)
def metadata(self, page):
extr = text.extract_from(page)
manga, info, _ = extr("<title>", "<").rsplit(" - ", 3)
manga_id = text.extr(
extr('rel="canonical" href="', '"'), "/title/", "/")
match = re.match(
r"(?:Volume\s+(\d+) )?"
r"\w+\s+(\d+)(.*)", info)
if match:
volume, chapter, minor = match.groups()
title = text.remove_html(extr(
"selected>", "</option")).partition(" : ")[2]
else:
volume = chapter = 0
minor = ""
title = info
return {
"manga" : text.unescape(manga),
"manga_id" : text.parse_int(manga_id),
"title" : text.unescape(title),
"volume" : text.parse_int(volume),
"chapter" : text.parse_int(chapter),
"chapter_minor": minor,
"chapter_id" : text.parse_int(self.chapter_id),
"date" : text.parse_timestamp(extr(' time="', '"')[:-3]),
}
def images(self, page):
images_container = text.extr(page, 'pageOpts', ':[0,0]}"')
images_container = text.unescape(images_container)
return [
(url, None)
for url in text.extract_iter(images_container, r"\"", r"\"")
]
class BatotoMangaExtractor(BatotoBase, MangaExtractor):
"""Extractor for bato.to manga"""
reverse = False
chapterclass = BatotoChapterExtractor
pattern = (BASE_PATTERN +
r"/(?:title/(\d+)[^/?#]*|series/(\d+)(?:/[^/?#]*)?)/?$")
example = "https://bato.to/title/12345-MANGA/"
def __init__(self, match):
self.root = text.root_from_url(match.group(0))
self.manga_id = match.group(1) or match.group(2)
url = "{}/title/{}".format(self.root, self.manga_id)
MangaExtractor.__init__(self, match, url)
def chapters(self, page):
extr = text.extract_from(page)
warning = extr(' class="alert alert-warning">', "</div><")
if warning:
raise exception.StopExtraction("'%s'", text.remove_html(warning))
data = {
"manga_id": text.parse_int(self.manga_id),
"manga" : text.unescape(extr(
"<title>", "<").rpartition(" - ")[0]),
}
extr('<div data-hk="0-0-0-0"', "")
results = []
while True:
href = extr('<a href="/title/', '"')
if not href:
break
chapter = href.rpartition("-ch_")[2]
chapter, sep, minor = chapter.partition(".")
data["chapter"] = text.parse_int(chapter)
data["chapter_minor"] = sep + minor
data["date"] = text.parse_datetime(
extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ")
url = "{}/title/{}".format(self.root, href)
results.append((url, data.copy()))
return results
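The BASE_PATTERN defined at the top of this new batoto module covers several mirror domains. A quick, standalone way to sanity-check which hostnames it accepts (the host list is illustrative, not exhaustive):

```python
import re

BASE_PATTERN = (r"(?:https?://)?(?:"
                r"(?:ba|d|h|m|w)to\.to|"
                r"(?:(?:manga|read)toto|batocomic|[xz]bato)\.(?:com|net|org)|"
                r"comiko\.(?:net|org)|"
                r"bat(?:otoo|o?two)\.com)")

for host in ("bato.to", "wto.to", "readtoto.org", "xbato.com",
             "comiko.net", "batotwo.com", "batotoo.com", "example.com"):
    print(host, bool(re.match(BASE_PATTERN + r"$", "https://" + host)))
# every host except example.com matches
```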

@ -89,6 +89,17 @@ class BehanceGalleryExtractor(BehanceExtractor):
BehanceExtractor.__init__(self, match)
self.gallery_id = match.group(1)
def _init(self):
BehanceExtractor._init(self)
modules = self.config("modules")
if modules:
if isinstance(modules, str):
modules = modules.split(",")
self.modules = set(modules)
else:
self.modules = {"image", "video", "mediacollection", "embed"}
def items(self):
data = self.get_gallery_data()
imgs = self.get_images(data)
@ -97,7 +108,8 @@ class BehanceGalleryExtractor(BehanceExtractor):
yield Message.Directory, data
for data["num"], (url, module) in enumerate(imgs, 1):
data["module"] = module
data["extension"] = text.ext_from_url(url)
data["extension"] = (module.get("extension") or
text.ext_from_url(url))
yield Message.Url, url, data
def get_gallery_data(self):
@ -133,13 +145,17 @@ class BehanceGalleryExtractor(BehanceExtractor):
append = result.append
for module in data["modules"]:
mtype = module["__typename"]
mtype = module["__typename"][:-6].lower()
if mtype == "ImageModule":
if mtype not in self.modules:
self.log.debug("Skipping '%s' module", mtype)
continue
if mtype == "image":
url = module["imageSizes"]["size_original"]["url"]
append((url, module))
elif mtype == "VideoModule":
elif mtype == "video":
try:
renditions = module["videoData"]["renditions"]
except Exception:
@ -158,7 +174,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
append((url, module))
elif mtype == "MediaCollectionModule":
elif mtype == "mediacollection":
for component in module["components"]:
for size in component["imageSizes"].values():
if size:
@ -167,10 +183,16 @@ class BehanceGalleryExtractor(BehanceExtractor):
append(("/".join(parts), module))
break
elif mtype == "EmbedModule":
elif mtype == "embed":
embed = module.get("originalEmbed") or module.get("fluidEmbed")
if embed:
append(("ytdl:" + text.extr(embed, 'src="', '"'), module))
embed = text.unescape(text.extr(embed, 'src="', '"'))
module["extension"] = "mp4"
append(("ytdl:" + embed, module))
elif mtype == "text":
module["extension"] = "txt"
append(("text:" + module["text"], module))
return result

@ -8,30 +8,22 @@
"""Extractors for Blogger blogs"""
from .common import Extractor, Message
from .common import BaseExtractor, Message
from .. import text, util
import re
BASE_PATTERN = (
r"(?:blogger:(?:https?://)?([^/]+)|"
r"(?:https?://)?([\w-]+\.blogspot\.com))")
class BloggerExtractor(Extractor):
class BloggerExtractor(BaseExtractor):
"""Base class for blogger extractors"""
category = "blogger"
directory_fmt = ("{category}", "{blog[name]}",
basecategory = "blogger"
directory_fmt = ("blogger", "{blog[name]}",
"{post[date]:%Y-%m-%d} {post[title]}")
filename_fmt = "{num:>03}.{extension}"
archive_fmt = "{post[id]}_{num}"
root = "https://www.blogger.com"
def __init__(self, match):
Extractor.__init__(self, match)
self.blog = match.group(1) or match.group(2)
def _init(self):
self.api = BloggerAPI(self)
self.blog = self.root.rpartition("/")[2]
self.videos = self.config("videos", True)
def items(self):
@ -45,7 +37,7 @@ class BloggerExtractor(Extractor):
findall_image = re.compile(
r'src="(https?://(?:'
r'blogger\.googleusercontent\.com/img|'
r'lh\d+\.googleusercontent\.com/|'
r'lh\d+(?:-\w+)?\.googleusercontent\.com|'
r'\d+\.bp\.blogspot\.com)/[^"]+)').findall
findall_video = re.compile(
r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
@ -92,6 +84,18 @@ class BloggerExtractor(Extractor):
"""Return additional metadata"""
BASE_PATTERN = BloggerExtractor.update({
"blogspot": {
"root": None,
"pattern": r"[\w-]+\.blogspot\.com",
},
"micmicidol": {
"root": "https://www.micmicidol.club",
"pattern": r"(?:www\.)?micmicidol\.club",
},
})
class BloggerPostExtractor(BloggerExtractor):
"""Extractor for a single blog post"""
subcategory = "post"
@ -100,7 +104,7 @@ class BloggerPostExtractor(BloggerExtractor):
def __init__(self, match):
BloggerExtractor.__init__(self, match)
self.path = match.group(3)
self.path = match.group(match.lastindex)
def posts(self, blog):
return (self.api.post_by_path(blog["id"], self.path),)
@ -124,7 +128,7 @@ class BloggerSearchExtractor(BloggerExtractor):
def __init__(self, match):
BloggerExtractor.__init__(self, match)
self.query = text.unquote(match.group(3))
self.query = text.unquote(match.group(match.lastindex))
def posts(self, blog):
return self.api.blog_search(blog["id"], self.query)
@ -141,7 +145,7 @@ class BloggerLabelExtractor(BloggerExtractor):
def __init__(self, match):
BloggerExtractor.__init__(self, match)
self.label = text.unquote(match.group(3))
self.label = text.unquote(match.group(match.lastindex))
def posts(self, blog):
return self.api.blog_posts(blog["id"], self.label)

@ -6,12 +6,14 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://bunkrr.su/"""
"""Extractors for https://bunkr.sk/"""
from .lolisafe import LolisafeAlbumExtractor
from .. import text
from urllib.parse import urlsplit, urlunsplit
BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:sk|[rs]u|la|is|to)"
MEDIA_DOMAIN_OVERRIDES = {
"cdn9.bunkr.ru" : "c9.bunkr.ru",
"cdn12.bunkr.ru": "media-files12.bunkr.la",
@ -25,11 +27,11 @@ CDN_HOSTED_EXTENSIONS = (
class BunkrAlbumExtractor(LolisafeAlbumExtractor):
"""Extractor for bunkrr.su albums"""
"""Extractor for bunkr.sk albums"""
category = "bunkr"
root = "https://bunkrr.su"
pattern = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)/a/([^/?#]+)"
example = "https://bunkrr.su/a/ID"
root = "https://bunkr.sk"
pattern = BASE_PATTERN + r"/a/([^/?#]+)"
example = "https://bunkr.sk/a/ID"
def fetch_album(self, album_id):
# album metadata
@ -38,36 +40,67 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
page, "<h1", "</div>").partition(">")[2])
count, _, size = info[1].split(None, 2)
# files
cdn = None
files = []
append = files.append
pos = page.index('class="grid-images')
for url in text.extract_iter(page, '<a href="', '"', pos):
if url.startswith("/"):
if not cdn:
# fetch cdn root from download page
durl = "{}/d/{}".format(self.root, url[3:])
cdn = text.extr(self.request(
durl).text, 'link.href = "', '"')
cdn = cdn[:cdn.index("/", 8)]
url = cdn + url[2:]
url = text.unescape(url)
if url.lower().endswith(CDN_HOSTED_EXTENSIONS):
scheme, domain, path, query, fragment = urlsplit(url)
if domain in MEDIA_DOMAIN_OVERRIDES:
domain = MEDIA_DOMAIN_OVERRIDES[domain]
else:
domain = domain.replace("cdn", "media-files", 1)
url = urlunsplit((scheme, domain, path, query, fragment))
append({"file": url})
return files, {
urls = list(text.extract_iter(page, '<a href="', '"', pos))
return self._extract_files(urls), {
"album_id" : self.album_id,
"album_name" : text.unescape(info[0]),
"album_size" : size[1:-1],
"description": text.unescape(info[2]) if len(info) > 2 else "",
"count" : len(files),
"count" : len(urls),
}
def _extract_files(self, urls):
for url in urls:
if url.startswith("/"):
try:
url = self._extract_file(text.unescape(url))
except Exception as exc:
self.log.error("%s: %s", exc.__class__.__name__, exc)
continue
else:
if url.lower().endswith(CDN_HOSTED_EXTENSIONS):
scheme, domain, path, query, fragment = urlsplit(url)
if domain in MEDIA_DOMAIN_OVERRIDES:
domain = MEDIA_DOMAIN_OVERRIDES[domain]
else:
domain = domain.replace("cdn", "media-files", 1)
url = urlunsplit((scheme, domain, path, query, fragment))
yield {"file": text.unescape(url)}
def _extract_file(self, path):
page = self.request(self.root + path).text
if path[1] == "v":
url = text.extr(page, '<source src="', '"')
else:
url = text.extr(page, '<img src="', '"')
if not url:
url = text.rextract(
page, ' href="', '"', page.rindex("Download"))[0]
return url
class BunkrMediaExtractor(BunkrAlbumExtractor):
"""Extractor for bunkr.sk media links"""
subcategory = "media"
directory_fmt = ("{category}",)
pattern = BASE_PATTERN + r"/[vid]/([^/?#]+)"
example = "https://bunkr.sk/v/FILENAME"
def fetch_album(self, album_id):
try:
url = self._extract_file(urlsplit(self.url).path)
except Exception as exc:
self.log.error("%s: %s", exc.__class__.__name__, exc)
return (), {}
return ({"file": text.unescape(url)},), {
"album_id" : "",
"album_name" : "",
"album_size" : -1,
"description": "",
"count" : 1,
}

@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for Chevereto galleries"""
from .common import BaseExtractor, Message
from .. import text
class CheveretoExtractor(BaseExtractor):
"""Base class for chevereto extractors"""
basecategory = "chevereto"
directory_fmt = ("{category}", "{user}", "{album}",)
archive_fmt = "{id}"
def __init__(self, match):
BaseExtractor.__init__(self, match)
self.path = match.group(match.lastindex)
def _pagination(self, url):
while url:
page = self.request(url).text
for item in text.extract_iter(
page, '<div class="list-item-image ', 'image-container'):
yield text.extr(item, '<a href="', '"')
url = text.extr(page, '<a data-pagination="next" href="', '" ><')
BASE_PATTERN = CheveretoExtractor.update({
"jpgfish": {
"root": "https://jpg4.su",
"pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
},
"imgkiwi": {
"root": "https://img.kiwi",
"pattern": r"img\.kiwi",
},
"deltaporno": {
"root": "https://gallery.deltaporno.com",
"pattern": r"gallery\.deltaporno\.com",
},
})
class CheveretoImageExtractor(CheveretoExtractor):
"""Extractor for chevereto Images"""
subcategory = "image"
pattern = BASE_PATTERN + r"(/im(?:g|age)/[^/?#]+)"
example = "https://jpg2.su/img/TITLE.ID"
def items(self):
url = self.root + self.path
extr = text.extract_from(self.request(url).text)
image = {
"id" : self.path.rpartition(".")[2],
"url" : extr('<meta property="og:image" content="', '"'),
"album": text.extr(extr("Added to <a", "/a>"), ">", "<"),
"user" : extr('username: "', '"'),
}
text.nameext_from_url(image["url"], image)
yield Message.Directory, image
yield Message.Url, image["url"], image
class CheveretoAlbumExtractor(CheveretoExtractor):
"""Extractor for chevereto Albums"""
subcategory = "album"
pattern = BASE_PATTERN + r"(/a(?:lbum)?/[^/?#]+(?:/sub)?)"
example = "https://jpg2.su/album/TITLE.ID"
def items(self):
url = self.root + self.path
data = {"_extractor": CheveretoImageExtractor}
if self.path.endswith("/sub"):
albums = self._pagination(url)
else:
albums = (url,)
for album in albums:
for image in self._pagination(album):
yield Message.Queue, image, data
class CheveretoUserExtractor(CheveretoExtractor):
"""Extractor for chevereto Users"""
subcategory = "user"
pattern = BASE_PATTERN + r"(/(?!img|image|a(?:lbum)?)[^/?#]+(?:/albums)?)"
example = "https://jpg2.su/USER"
def items(self):
url = self.root + self.path
if self.path.endswith("/albums"):
data = {"_extractor": CheveretoAlbumExtractor}
else:
data = {"_extractor": CheveretoImageExtractor}
for url in self._pagination(url):
yield Message.Queue, url, data

@ -32,13 +32,15 @@ class Extractor():
directory_fmt = ("{category}",)
filename_fmt = "{filename}.{extension}"
archive_fmt = ""
root = ""
cookies_domain = ""
referer = True
ciphers = None
tls12 = True
browser = None
root = ""
request_interval = 0.0
request_interval_min = 0.0
request_timestamp = 0.0
tls12 = True
def __init__(self, match):
self.log = logging.getLogger(self.category)
@ -76,6 +78,12 @@ class Extractor():
def config(self, key, default=None):
return config.interpolate(self._cfgpath, key, default)
def config2(self, key, key2, default=None, sentinel=util.SENTINEL):
value = self.config(key, sentinel)
if value is not sentinel:
return value
return self.config(key2, default)
def config_deprecated(self, key, deprecated, default=None,
sentinel=util.SENTINEL, history=set()):
value = self.config(deprecated, sentinel)
@ -94,6 +102,9 @@ class Extractor():
def config_accumulate(self, key):
return config.accumulate(self._cfgpath, key)
def config_instance(self, key, default=None):
return default
def _config_shared(self, key, default=None):
return config.interpolate_common(
("extractor",), self._cfgpath, key, default)
@ -128,6 +139,18 @@ class Extractor():
kwargs["timeout"] = self._timeout
if "verify" not in kwargs:
kwargs["verify"] = self._verify
if "json" in kwargs:
json = kwargs["json"]
if json is not None:
kwargs["data"] = util.json_dumps(json).encode()
del kwargs["json"]
headers = kwargs.get("headers")
if headers:
headers["Content-Type"] = "application/json"
else:
kwargs["headers"] = {"Content-Type": "application/json"}
response = None
tries = 1
@ -225,7 +248,7 @@ class Extractor():
password = None
if username:
password = self.config("password")
password = self.config("password") or util.LazyPrompt()
elif self.config("netrc", False):
try:
info = netrc.netrc().authenticators(self.category)
@ -304,16 +327,17 @@ class Extractor():
headers["User-Agent"] = useragent
headers["Accept"] = "*/*"
headers["Accept-Language"] = "en-US,en;q=0.5"
ssl_ciphers = self.ciphers
if BROTLI:
headers["Accept-Encoding"] = "gzip, deflate, br"
else:
headers["Accept-Encoding"] = "gzip, deflate"
custom_referer = self.config("referer", True)
if custom_referer:
if isinstance(custom_referer, str):
headers["Referer"] = custom_referer
referer = self.config("referer", self.referer)
if referer:
if isinstance(referer, str):
headers["Referer"] = referer
elif self.root:
headers["Referer"] = self.root + "/"
@ -505,12 +529,15 @@ class Extractor():
if include == "all":
include = extractors
elif isinstance(include, str):
include = include.split(",")
include = include.replace(" ", "").split(",")
result = [(Message.Version, 1)]
for category in include:
if category in extractors:
try:
extr, url = extractors[category]
except KeyError:
self.log.warning("Invalid include '%s'", category)
else:
result.append((Message.Queue, url, {"_extractor": extr}))
return iter(result)
@ -711,9 +738,10 @@ class BaseExtractor(Extractor):
for index, group in enumerate(match.groups()):
if group is not None:
if index:
self.category, self.root = self.instances[index-1]
self.category, self.root, info = self.instances[index-1]
if not self.root:
self.root = text.root_from_url(match.group(0))
self.config_instance = info.get
else:
self.root = group
self.category = group.partition("://")[2]
@ -733,7 +761,7 @@ class BaseExtractor(Extractor):
root = info["root"]
if root:
root = root.rstrip("/")
instance_list.append((category, root))
instance_list.append((category, root, info))
pattern = info.get("pattern")
if not pattern:
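Among the Extractor changes above, request() now accepts a json= keyword whose payload is serialized manually so that the project's own JSON encoder and the Content-Type header can be applied. A standalone sketch of that conversion, using the stdlib json module instead of util.json_dumps (helper name is illustrative):

```python
import json

def prepare_json_kwargs(kwargs):
    payload = kwargs.pop("json", None)
    if payload is not None:
        kwargs["data"] = json.dumps(payload).encode()
        kwargs.setdefault("headers", {})["Content-Type"] = "application/json"
    return kwargs

print(prepare_json_kwargs({"json": {"page": 1}}))
# {'data': b'{"page": 1}', 'headers': {'Content-Type': 'application/json'}}
```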

@ -7,6 +7,7 @@
"""Extractors for https://cyberdrop.me/"""
from . import lolisafe
from .common import Message
from .. import text
@ -16,24 +17,43 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)/a/([^/?#]+)"
example = "https://cyberdrop.me/a/ID"
def items(self):
files, data = self.fetch_album(self.album_id)
yield Message.Directory, data
for data["num"], file in enumerate(files, 1):
file.update(data)
text.nameext_from_url(file["name"], file)
file["name"], sep, file["id"] = file["filename"].rpartition("-")
yield Message.Url, file["url"], file
def fetch_album(self, album_id):
url = self.root + "/a/" + self.album_id
extr = text.extract_from(self.request(url).text)
files = []
append = files.append
while True:
url = text.unescape(extr('id="file" href="', '"'))
if not url:
break
append({"file": url,
"_fallback": (self.root + url[url.find("/", 8):],)})
return files, {
url = "{}/a/{}".format(self.root, album_id)
page = self.request(url).text
extr = text.extract_from(page)
desc = extr('property="og:description" content="', '"')
if desc.startswith("A privacy-focused censorship-resistant file "
"sharing platform free for everyone."):
desc = ""
extr('id="title"', "")
album = {
"album_id" : self.album_id,
"album_name" : extr("name: '", "'"),
"date" : text.parse_timestamp(extr("timestamp: ", ",")),
"album_size" : text.parse_int(extr("totalSize: ", ",")),
"description": extr("description: `", "`"),
"count" : len(files),
"album_name" : text.unescape(extr('title="', '"')),
"album_size" : text.parse_bytes(extr(
'<p class="title">', "B")),
"date" : text.parse_datetime(extr(
'<p class="title">', '<'), "%d.%m.%Y"),
"description": text.unescape(text.unescape( # double
desc.rpartition(" [R")[0])),
}
file_ids = list(text.extract_iter(page, 'id="file" href="/f/', '"'))
album["count"] = len(file_ids)
return self._extract_files(file_ids), album
def _extract_files(self, file_ids):
for file_id in file_ids:
url = "{}/api/f/{}".format(self.root, file_id)
yield self.request(url).json()

@ -20,7 +20,7 @@ class DanbooruExtractor(BaseExtractor):
page_limit = 1000
page_start = None
per_page = 200
request_interval = 1.0
request_interval = (0.5, 1.5)
def _init(self):
self.ugoira = self.config("ugoira", False)
@ -36,7 +36,7 @@ class DanbooruExtractor(BaseExtractor):
username, api_key = self._get_auth_info()
if username:
self.log.debug("Using HTTP Basic Auth for user '%s'", username)
self.session.auth = (username, api_key)
self.session.auth = util.HTTPBasicAuth(username, api_key)
def skip(self, num):
pages = num // self.per_page
@ -72,6 +72,25 @@ class DanbooruExtractor(BaseExtractor):
post["date"] = text.parse_datetime(
post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
post["tags"] = (
post["tag_string"].split(" ")
if post["tag_string"] else ())
post["tags_artist"] = (
post["tag_string_artist"].split(" ")
if post["tag_string_artist"] else ())
post["tags_character"] = (
post["tag_string_character"].split(" ")
if post["tag_string_character"] else ())
post["tags_copyright"] = (
post["tag_string_copyright"].split(" ")
if post["tag_string_copyright"] else ())
post["tags_general"] = (
post["tag_string_general"].split(" ")
if post["tag_string_general"] else ())
post["tags_meta"] = (
post["tag_string_meta"].split(" ")
if post["tag_string_meta"] else ())
if post["extension"] == "zip":
if self.ugoira:
post["frames"] = self._ugoira_frames(post)
@ -150,7 +169,8 @@ class DanbooruExtractor(BaseExtractor):
BASE_PATTERN = DanbooruExtractor.update({
"danbooru": {
"root": None,
"pattern": r"(?:danbooru|hijiribe|sonohara|safebooru)\.donmai\.us",
"pattern": r"(?:(?:danbooru|hijiribe|sonohara|safebooru)\.donmai\.us"
r"|donmai\.moe)",
},
"atfbooru": {
"root": "https://booru.allthefallen.moe",
@ -158,7 +178,7 @@ BASE_PATTERN = DanbooruExtractor.update({
},
"aibooru": {
"root": None,
"pattern": r"(?:safe.)?aibooru\.online",
"pattern": r"(?:safe\.)?aibooru\.online",
},
"booruvar": {
"root": "https://booru.borvar.art",

@ -38,15 +38,17 @@ class DeviantartExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
self.user = match.group(1) or match.group(2)
self.user = (match.group(1) or match.group(2) or "").lower()
self.offset = 0
def _init(self):
self.jwt = self.config("jwt", False)
self.flat = self.config("flat", True)
self.extra = self.config("extra", False)
self.quality = self.config("quality", "100")
self.original = self.config("original", True)
self.comments = self.config("comments", False)
self.intermediary = self.config("intermediary", True)
self.api = DeviantartOAuthAPI(self)
self.group = False
@ -59,6 +61,9 @@ class DeviantartExtractor(Extractor):
else:
self.unwatch = None
if self.quality:
self.quality = ",q_{}".format(self.quality)
if self.original != "image":
self._update_content = self._update_content_default
else:
@ -87,14 +92,19 @@ class DeviantartExtractor(Extractor):
return True
def items(self):
if self.user and self.config("group", True):
profile = self.api.user_profile(self.user)
self.group = not profile
if self.group:
self.subcategory = "group-" + self.subcategory
self.user = self.user.lower()
else:
self.user = profile["user"]["username"]
if self.user:
group = self.config("group", True)
if group:
profile = self.api.user_profile(self.user)
if profile:
self.user = profile["user"]["username"]
self.group = False
elif group == "skip":
self.log.info("Skipping group '%s'", self.user)
raise exception.StopExtraction()
else:
self.subcategory = "group-" + self.subcategory
self.group = True
for deviation in self.deviations():
if isinstance(deviation, tuple):
@ -125,6 +135,19 @@ class DeviantartExtractor(Extractor):
self._update_content(deviation, content)
elif self.jwt:
self._update_token(deviation, content)
elif content["src"].startswith("https://images-wixmp-"):
if self.intermediary and deviation["index"] <= 790677560:
# https://github.com/r888888888/danbooru/issues/4069
intermediary, count = re.subn(
r"(/f/[^/]+/[^/]+)/v\d+/.*",
r"/intermediary\1", content["src"], 1)
if count:
deviation["is_original"] = False
deviation["_fallback"] = (content["src"],)
content["src"] = intermediary
if self.quality:
content["src"] = re.sub(
r",q_\d+", self.quality, content["src"], 1)
yield self.commit(deviation, content)
@ -212,7 +235,7 @@ class DeviantartExtractor(Extractor):
if self.comments:
deviation["comments"] = (
self.api.comments(deviation["deviationid"], target="deviation")
self._extract_comments(deviation["deviationid"], "deviation")
if deviation["stats"]["comments"] else ()
)
@ -332,7 +355,11 @@ class DeviantartExtractor(Extractor):
yield url, folder
def _update_content_default(self, deviation, content):
public = False if "premium_folder_data" in deviation else None
if "premium_folder_data" in deviation or deviation.get("is_mature"):
public = False
else:
public = None
data = self.api.deviation_download(deviation["deviationid"], public)
content.update(data)
deviation["is_original"] = True
@ -355,6 +382,9 @@ class DeviantartExtractor(Extractor):
if not sep:
return
# 'images-wixmp' returns 401 errors, but just 'wixmp' still works
url = url.replace("//images-wixmp", "//wixmp", 1)
# header = b'{"typ":"JWT","alg":"none"}'
payload = (
b'{"sub":"urn:app:","iss":"urn:app:","obj":[[{"path":"/f/' +
@ -363,14 +393,37 @@ class DeviantartExtractor(Extractor):
)
deviation["_fallback"] = (content["src"],)
deviation["is_original"] = True
content["src"] = (
"{}?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.{}.".format(
url,
# base64 of 'header' is precomputed as 'eyJ0eX...'
# binascii.a2b_base64(header).rstrip(b"=\n").decode(),
# binascii.b2a_base64(header).rstrip(b"=\n").decode(),
binascii.b2a_base64(payload).rstrip(b"=\n").decode())
)
def _extract_comments(self, target_id, target_type="deviation"):
results = None
comment_ids = [None]
while comment_ids:
comments = self.api.comments(
target_id, target_type, comment_ids.pop())
if results:
results.extend(comments)
else:
results = comments
# parent comments, i.e. nodes with at least one child
parents = {c["parentid"] for c in comments}
# comments with more than one reply
replies = {c["commentid"] for c in comments if c["replies"]}
# add comment UUIDs with replies that are not parent to any node
comment_ids.extend(replies - parents)
return results
def _limited_request(self, url, **kwargs):
"""Limits HTTP requests to one every 2 seconds"""
kwargs["fatal"] = None
@ -399,9 +452,11 @@ class DeviantartExtractor(Extractor):
return None
dev = self.api.deviation(deviation["deviationid"], False)
folder = dev["premium_folder_data"]
folder = deviation["premium_folder_data"]
username = dev["author"]["username"]
has_access = folder["has_access"]
# premium_folder_data is no longer present when user has access (#5063)
has_access = ("premium_folder_data" not in dev) or folder["has_access"]
if not has_access and folder["type"] == "watchers" and \
self.config("auto-watch"):
@ -459,11 +514,13 @@ class DeviantartUserExtractor(DeviantartExtractor):
def items(self):
base = "{}/{}/".format(self.root, self.user)
return self._dispatch_extractors((
(DeviantartGalleryExtractor , base + "gallery"),
(DeviantartScrapsExtractor , base + "gallery/scraps"),
(DeviantartJournalExtractor , base + "posts"),
(DeviantartStatusExtractor , base + "posts/statuses"),
(DeviantartFavoriteExtractor, base + "favourites"),
(DeviantartAvatarExtractor , base + "avatar"),
(DeviantartBackgroundExtractor, base + "banner"),
(DeviantartGalleryExtractor , base + "gallery"),
(DeviantartScrapsExtractor , base + "gallery/scraps"),
(DeviantartJournalExtractor , base + "posts"),
(DeviantartStatusExtractor , base + "posts/statuses"),
(DeviantartFavoriteExtractor , base + "favourites"),
), ("gallery",))
@ -484,6 +541,70 @@ class DeviantartGalleryExtractor(DeviantartExtractor):
return self._folder_urls(folders, "gallery", DeviantartFolderExtractor)
class DeviantartAvatarExtractor(DeviantartExtractor):
"""Extractor for an artist's avatar"""
subcategory = "avatar"
archive_fmt = "a_{_username}_{index}"
pattern = BASE_PATTERN + r"/avatar"
example = "https://www.deviantart.com/USER/avatar/"
def deviations(self):
name = self.user.lower()
profile = self.api.user_profile(name)
if not profile:
return ()
user = profile["user"]
icon = user["usericon"]
index = icon.rpartition("?")[2]
formats = self.config("formats")
if not formats:
url = icon.replace("/avatars/", "/avatars-big/", 1)
return (self._make_deviation(url, user, index, ""),)
if isinstance(formats, str):
formats = formats.replace(" ", "").split(",")
results = []
for fmt in formats:
fmt, _, ext = fmt.rpartition(".")
if fmt:
fmt = "-" + fmt
url = "https://a.deviantart.net/avatars{}/{}/{}/{}.{}?{}".format(
fmt, name[0], name[1], name, ext, index)
results.append(self._make_deviation(url, user, index, fmt))
return results
def _make_deviation(self, url, user, index, fmt):
return {
"author" : user,
"category" : "avatar",
"index" : text.parse_int(index),
"is_deleted" : False,
"is_downloadable": False,
"published_time" : 0,
"title" : "avatar" + fmt,
"stats" : {"comments": 0},
"content" : {"src": url},
}
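To show how the new 'formats' option maps onto avatar URLs, here is a standalone sketch that follows the same path scheme as the extractor above; the user name, index, and format list are made up:

def avatar_urls(name, index, formats):
    name = name.lower()
    for fmt in formats.replace(" ", "").split(","):
        fmt, _, ext = fmt.rpartition(".")    # "original.jpg" -> ("original", "jpg")
        size = "-" + fmt if fmt else ""      # a bare extension keeps the default size
        yield "https://a.deviantart.net/avatars{}/{}/{}/{}.{}?{}".format(
            size, name[0], name[1], name, ext, index)

for url in avatar_urls("SomeArtist", 3, "original.jpg,big.gif"):
    print(url)
# https://a.deviantart.net/avatars-original/s/o/someartist.jpg?3
# https://a.deviantart.net/avatars-big/s/o/someartist.gif?3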
class DeviantartBackgroundExtractor(DeviantartExtractor):
"""Extractor for an artist's banner"""
subcategory = "background"
archive_fmt = "b_{index}"
pattern = BASE_PATTERN + r"/ba(?:nner|ckground)"
example = "https://www.deviantart.com/USER/banner/"
def deviations(self):
try:
return (self.api.user_profile(self.user.lower())
["cover_deviation"]["cover_deviation"],)
except Exception:
return ()
class DeviantartFolderExtractor(DeviantartExtractor):
"""Extractor for deviations inside an artist's gallery folder"""
subcategory = "folder"
@ -674,7 +795,7 @@ class DeviantartStatusExtractor(DeviantartExtractor):
deviation["stats"] = {"comments": comments_count}
if self.comments:
deviation["comments"] = (
self.api.comments(deviation["statusid"], target="status")
self._extract_comments(deviation["statusid"], "status")
if comments_count else ()
)
@ -951,8 +1072,9 @@ class DeviantartOAuthAPI():
self.strategy = extractor.config("pagination")
self.public = extractor.config("public", True)
self.client_id = extractor.config("client-id")
if self.client_id:
client_id = extractor.config("client-id")
if client_id:
self.client_id = str(client_id)
self.client_secret = extractor.config("client-secret")
else:
self.client_id = self.CLIENT_ID
@ -960,7 +1082,7 @@ class DeviantartOAuthAPI():
token = extractor.config("refresh-token")
if token is None or token == "cache":
token = "#" + str(self.client_id)
token = "#" + self.client_id
if not _refresh_token_cache(token):
token = None
self.refresh_token_key = token
@ -1048,17 +1170,28 @@ class DeviantartOAuthAPI():
"mature_content": self.mature}
return self._pagination_list(endpoint, params)
def comments(self, id, target, offset=0):
def comments(self, target_id, target_type="deviation",
comment_id=None, offset=0):
"""Fetch comments posted on a target"""
endpoint = "/comments/{}/{}".format(target, id)
params = {"maxdepth": "5", "offset": offset, "limit": 50,
"mature_content": self.mature}
endpoint = "/comments/{}/{}".format(target_type, target_id)
params = {
"commentid" : comment_id,
"maxdepth" : "5",
"offset" : offset,
"limit" : 50,
"mature_content": self.mature,
}
return self._pagination_list(endpoint, params=params, key="thread")
def deviation(self, deviation_id, public=None):
"""Query and return info about a single Deviation"""
endpoint = "/deviation/" + deviation_id
deviation = self._call(endpoint, public=public)
if deviation.get("is_mature") and public is None and \
self.refresh_token_key:
deviation = self._call(endpoint, public=False)
if self.metadata:
self._metadata((deviation,))
if self.folders:
@ -1176,7 +1309,7 @@ class DeviantartOAuthAPI():
self.log.info("Requesting public access token")
data = {"grant_type": "client_credentials"}
auth = (self.client_id, self.client_secret)
auth = util.HTTPBasicAuth(self.client_id, self.client_secret)
response = self.extractor.request(
url, method="POST", data=data, auth=auth, fatal=False)
data = response.json()
@ -1214,8 +1347,12 @@ class DeviantartOAuthAPI():
return data
if not fatal and status != 429:
return None
if data.get("error_description") == "User not found.":
error = data.get("error_description")
if error == "User not found.":
raise exception.NotFoundError("user or group")
if error == "Deviation not downloadable.":
raise exception.AuthorizationError()
self.log.debug(response.text)
msg = "API responded with {} {}".format(
@ -1239,6 +1376,17 @@ class DeviantartOAuthAPI():
self.log.error(msg)
return data
def _switch_tokens(self, results, params):
if len(results) < params["limit"]:
return True
if not self.extractor.jwt:
for item in results:
if item.get("is_mature"):
return True
return False
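_switch_tokens() above decides when a page fetched with the public token should be refetched with the private one: a short page means items were withheld, and even a full page triggers a retry when it contains mature items and JWT URL handling is disabled. Restated as a standalone helper with made-up pages (names are illustrative only):

def needs_private_token(results, limit, handles_jwt):
    if len(results) < limit:
        return True                   # some items were filtered from this page
    if not handles_jwt:
        return any(r.get("is_mature") for r in results)
    return False

print(needs_private_token([{}] * 10, 24, handles_jwt=False))          # True
print(needs_private_token([{"is_mature": True}] * 24, 24, False))     # True
print(needs_private_token([{"is_mature": False}] * 24, 24, False))    # False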
def _pagination(self, endpoint, params,
extend=True, public=None, unpack=False, key="results"):
warn = True
@ -1257,7 +1405,7 @@ class DeviantartOAuthAPI():
results = [item["journal"] for item in results
if "journal" in item]
if extend:
if public and len(results) < params["limit"]:
if public and self._switch_tokens(results, params):
if self.refresh_token_key:
self.log.debug("Switching to private access token")
public = False
@ -1265,9 +1413,10 @@ class DeviantartOAuthAPI():
elif data["has_more"] and warn:
warn = False
self.log.warning(
"Private deviations detected! Run 'gallery-dl "
"oauth:deviantart' and follow the instructions to "
"be able to access them.")
"Private or mature deviations detected! "
"Run 'gallery-dl oauth:deviantart' and follow the "
"instructions to be able to access them.")
# "statusid" cannot be used instead
if results and "deviationid" in results[0]:
if self.metadata:
@ -1377,12 +1526,14 @@ class DeviantartEclipseAPI():
self.csrf_token = None
def deviation_extended_fetch(self, deviation_id, user, kind=None):
endpoint = "/_napi/da-browse/shared_api/deviation/extended_fetch"
endpoint = "/_puppy/dadeviation/init"
params = {
"deviationid" : deviation_id,
"username" : user,
"type" : kind,
"include_session": "false",
"deviationid" : deviation_id,
"username" : user,
"type" : kind,
"include_session" : "false",
"expand" : "deviation.related",
"da_minor_version": "20230710",
}
return self._call(endpoint, params)
@ -1410,7 +1561,7 @@ class DeviantartEclipseAPI():
return self._pagination(endpoint, params)
def search_deviations(self, params):
endpoint = "/_napi/da-browse/api/networkbar/search/deviations"
endpoint = "/_puppy/dabrowse/search/deviations"
return self._pagination(endpoint, params, key="deviations")
def user_info(self, user, expand=False):
@ -1497,7 +1648,7 @@ class DeviantartEclipseAPI():
return token
@cache(maxage=100*365*86400, keyarg=0)
@cache(maxage=36500*86400, keyarg=0)
def _refresh_token_cache(token):
if token and token[0] == "#":
return None

@ -44,20 +44,26 @@ class EromeExtractor(Extractor):
pos = page.index('<div class="user-profile', pos)
user, pos = text.extract(
page, 'href="https://www.erome.com/', '"', pos)
urls = []
groups = page.split('<div class="media-group"')
for group in util.advance(groups, 1):
url = (text.extr(group, '<source src="', '"') or
text.extr(group, 'data-src="', '"'))
if url:
urls.append(url)
data = {
"album_id" : album_id,
"title" : text.unescape(title),
"user" : text.unquote(user),
"count" : len(urls),
"_http_headers": {"Referer": url},
}
yield Message.Directory, data
groups = page.split('<div class="media-group"')
for data["num"], group in enumerate(util.advance(groups, 1), 1):
url = (text.extr(group, '<source src="', '"') or
text.extr(group, 'data-src="', '"'))
if url:
yield Message.Url, url, text.nameext_from_url(url, data)
for data["num"], url in enumerate(urls, 1):
yield Message.Url, url, text.nameext_from_url(url, data)
def albums(self):
return ()

@ -26,7 +26,8 @@ class ExhentaiExtractor(Extractor):
cookies_domain = ".exhentai.org"
cookies_names = ("ipb_member_id", "ipb_pass_hash")
root = "https://exhentai.org"
request_interval = 5.0
request_interval = (3.0, 6.0)
ciphers = "DEFAULT:!DH"
LIMIT = False
@ -39,20 +40,13 @@ class ExhentaiExtractor(Extractor):
if domain == "auto":
domain = ("ex" if self.version == "ex" else "e-") + "hentai.org"
self.root = "https://" + domain
self.api_url = self.root + "/api.php"
self.cookies_domain = "." + domain
Extractor.initialize(self)
if self.version != "ex":
self.cookies.set("nw", "1", domain=self.cookies_domain)
self.original = self.config("original", True)
limits = self.config("limits", False)
if limits and limits.__class__ is int:
self.limits = limits
self._remaining = 0
else:
self.limits = False
def request(self, url, **kwargs):
response = Extractor.request(self, url, **kwargs)
@ -73,16 +67,18 @@ class ExhentaiExtractor(Extractor):
if username:
return self.cookies_update(self._login_impl(username, password))
self.log.info("no username given; using e-hentai.org")
self.root = "https://e-hentai.org"
self.cookies_domain = ".e-hentai.org"
self.cookies.set("nw", "1", domain=self.cookies_domain)
if self.version == "ex":
self.log.info("No username or cookies given; using e-hentai.org")
self.root = "https://e-hentai.org"
self.cookies_domain = ".e-hentai.org"
self.cookies.set("nw", "1", domain=self.cookies_domain)
self.original = False
self.limits = False
@cache(maxage=90*24*3600, keyarg=1)
@cache(maxage=90*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01"
headers = {
"Referer": "https://e-hentai.org/bounce_login.php?b=d&bt=1-1",
@ -96,10 +92,19 @@ class ExhentaiExtractor(Extractor):
"ipb_login_submit": "Login!",
}
self.cookies.clear()
response = self.request(url, method="POST", headers=headers, data=data)
if b"You are now logged in as:" not in response.content:
raise exception.AuthenticationError()
return {c: response.cookies[c] for c in self.cookies_names}
# collect more cookies
url = self.root + "/favorites.php"
response = self.request(url)
if response.history:
self.request(url)
return self.cookies
class ExhentaiGalleryExtractor(ExhentaiExtractor):
@ -112,18 +117,38 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def __init__(self, match):
ExhentaiExtractor.__init__(self, match)
self.key = {}
self.count = 0
self.gallery_id = text.parse_int(match.group(2) or match.group(5))
self.gallery_token = match.group(3)
self.image_token = match.group(4)
self.image_num = text.parse_int(match.group(6), 1)
self.key_start = None
self.key_show = None
self.key_next = None
self.count = 0
self.data = None
def _init(self):
source = self.config("source")
if source == "hitomi":
self.items = self._items_hitomi
limits = self.config("limits", False)
if limits and limits.__class__ is int:
self.limits = limits
self._remaining = 0
else:
self.limits = False
self.fallback_retries = self.config("fallback-retries", 2)
self.original = self.config("original", True)
def finalize(self):
if self.data:
self.log.info("Use '%s/s/%s/%s-%s' as input URL "
"to continue downloading from the current position",
self.root, self.data["image_token"],
self.gallery_id, self.data["num"])
def favorite(self, slot="0"):
url = self.root + "/gallerypopups.php"
params = {
@ -145,39 +170,32 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
gpage = self._gallery_page()
self.image_token = text.extr(gpage, 'hentai.org/s/', '"')
if not self.image_token:
self.log.error("Failed to extract initial image token")
self.log.debug("Page content:\n%s", gpage)
return
raise exception.StopExtraction(
"Failed to extract initial image token")
ipage = self._image_page()
else:
ipage = self._image_page()
part = text.extr(ipage, 'hentai.org/g/', '"')
if not part:
self.log.error("Failed to extract gallery token")
self.log.debug("Page content:\n%s", ipage)
return
raise exception.StopExtraction(
"Failed to extract gallery token")
self.gallery_token = part.split("/")[1]
gpage = self._gallery_page()
data = self.get_metadata(gpage)
self.data = data = self.get_metadata(gpage)
self.count = text.parse_int(data["filecount"])
yield Message.Directory, data
def _validate_response(response):
# declared inside 'items()' to be able to access 'data'
if not response.history and response.headers.get(
"content-type", "").startswith("text/html"):
self._report_limits(data)
return True
images = itertools.chain(
(self.image_from_page(ipage),), self.images_from_api())
for url, image in images:
data.update(image)
if self.limits:
self._check_limits(data)
if "/fullimg.php" in url:
data["_http_validate"] = _validate_response
if "/fullimg" in url:
data["_http_validate"] = self._validate_response
else:
data["_http_validate"] = None
yield Message.Url, url, data
@ -185,6 +203,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
fav = self.config("fav")
if fav is not None:
self.favorite(fav)
self.data = None
def _items_hitomi(self):
if self.config("metadata", False):
@ -208,6 +227,11 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def metadata_from_page(self, page):
extr = text.extract_from(page)
api_url = extr('var api_url = "', '"')
if api_url:
self.api_url = api_url
data = {
"gid" : self.gallery_id,
"token" : self.gallery_token,
@ -225,7 +249,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
'>Visible:</td><td class="gdt2">', '<'),
"language" : extr('>Language:</td><td class="gdt2">', ' '),
"filesize" : text.parse_bytes(extr(
'>File Size:</td><td class="gdt2">', '<').rstrip("Bb")),
'>File Size:</td><td class="gdt2">', '<').rstrip("Bbi")),
"filecount" : extr('>Length:</td><td class="gdt2">', ' '),
"favorites" : extr('id="favcount">', ' '),
"rating" : extr(">Average: ", "<"),
@ -251,14 +275,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
return data
def metadata_from_api(self):
url = self.root + "/api.php"
data = {
"method": "gdata",
"gidlist": ((self.gallery_id, self.gallery_token),),
"method" : "gdata",
"gidlist" : ((self.gallery_id, self.gallery_token),),
"namespace": 1,
}
data = self.request(url, method="POST", json=data).json()
data = self.request(self.api_url, method="POST", json=data).json()
if "error" in data:
raise exception.StopExtraction(data["error"])
@ -269,54 +292,71 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
pos = page.index('<div id="i3"><a onclick="return load_image(') + 26
extr = text.extract_from(page, pos)
self.key["next"] = extr("'", "'")
self.key_next = extr("'", "'")
iurl = extr('<img id="img" src="', '"')
orig = extr('hentai.org/fullimg.php', '"')
nl = extr(" nl(", ")").strip("\"'")
orig = extr('hentai.org/fullimg', '"')
try:
if self.original and orig:
url = self.root + "/fullimg.php" + text.unescape(orig)
url = self.root + "/fullimg" + text.unescape(orig)
data = self._parse_original_info(extr('ownload original', '<'))
data["_fallback"] = self._fallback_original(nl, url)
else:
url = iurl
data = self._parse_image_info(url)
data["_fallback"] = self._fallback_1280(nl, self.image_num)
except IndexError:
self.log.debug("Page content:\n%s", page)
raise exception.StopExtraction(
"Unable to parse image info for '%s'", url)
data["num"] = self.image_num
data["image_token"] = self.key["start"] = extr('var startkey="', '";')
self.key["show"] = extr('var showkey="', '";')
data["image_token"] = self.key_start = extr('var startkey="', '";')
data["_url_1280"] = iurl
data["_nl"] = nl
self.key_show = extr('var showkey="', '";')
self._check_509(iurl, data)
return url, text.nameext_from_url(iurl, data)
self._check_509(iurl)
return url, text.nameext_from_url(url, data)
def images_from_api(self):
"""Get image url and data from api calls"""
api_url = self.root + "/api.php"
nextkey = self.key["next"]
api_url = self.api_url
nextkey = self.key_next
request = {
"method" : "showpage",
"gid" : self.gallery_id,
"page" : 0,
"imgkey" : nextkey,
"showkey": self.key["show"],
"showkey": self.key_show,
}
for request["page"] in range(self.image_num + 1, self.count + 1):
page = self.request(api_url, method="POST", json=request).json()
i3 = page["i3"]
i6 = page["i6"]
imgkey = nextkey
nextkey, pos = text.extract(page["i3"], "'", "'")
imgurl , pos = text.extract(page["i3"], 'id="img" src="', '"', pos)
origurl, pos = text.extract(page["i7"], '<a href="', '"')
nextkey, pos = text.extract(i3, "'", "'")
imgurl , pos = text.extract(i3, 'id="img" src="', '"', pos)
nl , pos = text.extract(i3, " nl(", ")", pos)
nl = (nl or "").strip("\"'")
try:
if self.original and origurl:
pos = i6.find("hentai.org/fullimg")
if self.original and pos >= 0:
origurl, pos = text.rextract(i6, '"', '"', pos)
url = text.unescape(origurl)
data = self._parse_original_info(text.extract(
page["i7"], "ownload original", "<", pos)[0])
i6, "ownload original", "<", pos)[0])
data["_fallback"] = self._fallback_original(nl, url)
else:
url = imgurl
data = self._parse_image_info(url)
data["_fallback"] = self._fallback_1280(
nl, request["page"], imgkey)
except IndexError:
self.log.debug("Page content:\n%s", page)
raise exception.StopExtraction(
@ -324,34 +364,54 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data["num"] = request["page"]
data["image_token"] = imgkey
data["_url_1280"] = imgurl
data["_nl"] = nl
self._check_509(imgurl, data)
yield url, text.nameext_from_url(imgurl, data)
self._check_509(imgurl)
yield url, text.nameext_from_url(url, data)
request["imgkey"] = nextkey
def _report_limits(self, data):
def _validate_response(self, response):
if not response.history and response.headers.get(
"content-type", "").startswith("text/html"):
page = response.text
self.log.warning("'%s'", page)
if " requires GP" in page:
gp = self.config("gp")
if gp == "stop":
raise exception.StopExtraction("Not enough GP")
elif gp == "wait":
input("Press ENTER to continue.")
return response.url
self.log.info("Falling back to non-original downloads")
self.original = False
return self.data["_url_1280"]
self._report_limits()
return True
def _report_limits(self):
ExhentaiExtractor.LIMIT = True
raise exception.StopExtraction(
"Image limit reached! "
"Continue with '%s/s/%s/%s-%s' as URL after resetting it.",
self.root, data["image_token"], self.gallery_id, data["num"])
raise exception.StopExtraction("Image limit reached!")
def _check_limits(self, data):
if not self._remaining or data["num"] % 25 == 0:
self._update_limits()
self._remaining -= data["cost"]
if self._remaining <= 0:
self._report_limits(data)
self._report_limits()
def _check_509(self, url, data):
def _check_509(self, url):
# full 509.gif URLs
# - https://exhentai.org/img/509.gif
# - https://ehgt.org/g/509.gif
if url.endswith(("hentai.org/img/509.gif",
"ehgt.org/g/509.gif")):
self.log.debug(url)
self._report_limits(data)
self._report_limits()
def _update_limits(self):
url = "https://e-hentai.org/home.php"
@ -390,6 +450,27 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
raise exception.NotFoundError("image page")
return page
def _fallback_original(self, nl, fullimg):
url = "{}?nl={}".format(fullimg, nl)
for _ in util.repeat(self.fallback_retries):
yield url
def _fallback_1280(self, nl, num, token=None):
if not token:
token = self.key_start
for _ in util.repeat(self.fallback_retries):
url = "{}/s/{}/{}-{}?nl={}".format(
self.root, token, self.gallery_id, num, nl)
page = self.request(url, fatal=False).text
if page.startswith(("Invalid page", "Keep trying")):
return
url, data = self.image_from_page(page)
yield url
nl = data["_nl"]
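The two generators above drive download retries: _fallback_original() re-yields the full-size URL with an nl parameter appended, while _fallback_1280() re-requests the image page (each response carries a fresh nl value) until the retry budget runs out or the server answers with an error page. A small sketch of the URL shapes involved; every id, token, and nl value below is made up:

root, gid, token, num, nl = "https://e-hentai.org", 1234567, "abcdef1234", 7, "41363-436842"

sample_retry = "{}/s/{}/{}-{}?nl={}".format(root, token, gid, num, nl)
print(sample_retry)        # https://e-hentai.org/s/abcdef1234/1234567-7?nl=41363-436842

fullimg_url = root + "/fullimg/..."          # whatever the gallery page linked to
print("{}?nl={}".format(fullimg_url, nl))    # the same URL with "?nl=<value>" appended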
@staticmethod
def _parse_image_info(url):
for part in url.split("/")[4:]:

@ -8,6 +8,7 @@
from .common import Extractor, Message
from .. import text
from ..cache import memcache
import re
BASE_PATTERN = (
@ -27,8 +28,20 @@ class FanboxExtractor(Extractor):
_warning = True
def _init(self):
self.headers = {"Origin": self.root}
self.embeds = self.config("embeds", True)
includes = self.config("metadata")
if includes:
if isinstance(includes, str):
includes = includes.split(",")
elif not isinstance(includes, (list, tuple)):
includes = ("user", "plan")
self._meta_user = ("user" in includes)
self._meta_plan = ("plan" in includes)
else:
self._meta_user = self._meta_plan = False
if self._warning:
if not self.cookies_check(("FANBOXSESSID",)):
self.log.warning("no 'FANBOXSESSID' cookie set")
@ -43,11 +56,9 @@ class FanboxExtractor(Extractor):
"""Return all relevant post objects"""
def _pagination(self, url):
headers = {"Origin": self.root}
while url:
url = text.ensure_http_scheme(url)
body = self.request(url, headers=headers).json()["body"]
body = self.request(url, headers=self.headers).json()["body"]
for item in body["items"]:
try:
yield self._get_post_data(item["id"])
@ -58,9 +69,8 @@ class FanboxExtractor(Extractor):
def _get_post_data(self, post_id):
"""Fetch and process post data"""
headers = {"Origin": self.root}
url = "https://api.fanbox.cc/post.info?postId="+post_id
post = self.request(url, headers=headers).json()["body"]
post = self.request(url, headers=self.headers).json()["body"]
content_body = post.pop("body", None)
if content_body:
@ -98,8 +108,47 @@ class FanboxExtractor(Extractor):
post["text"] = content_body.get("text") if content_body else None
post["isCoverImage"] = False
if self._meta_user:
post["user"] = self._get_user_data(post["creatorId"])
if self._meta_plan:
plans = self._get_plan_data(post["creatorId"])
post["plan"] = plans[post["feeRequired"]]
return content_body, post
@memcache(keyarg=1)
def _get_user_data(self, creator_id):
url = "https://api.fanbox.cc/creator.get"
params = {"creatorId": creator_id}
data = self.request(url, params=params, headers=self.headers).json()
user = data["body"]
user.update(user.pop("user"))
return user
@memcache(keyarg=1)
def _get_plan_data(self, creator_id):
url = "https://api.fanbox.cc/plan.listCreator"
params = {"creatorId": creator_id}
data = self.request(url, params=params, headers=self.headers).json()
plans = {0: {
"id" : "",
"title" : "",
"fee" : 0,
"description" : "",
"coverImageUrl" : "",
"creatorId" : creator_id,
"hasAdultContent": None,
"paymentMethod" : None,
}}
for plan in data["body"]:
del plan["user"]
plans[plan["fee"]] = plan
return plans
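With 'plan' metadata enabled, _get_plan_data() above returns the creator's plans keyed by fee, padded with a synthetic fee-0 entry, so a post's plan becomes a plain lookup on its feeRequired value. A sketch with a trimmed, made-up plan table:

plans = {
    0:   {"id": "",     "title": "",      "fee": 0},     # synthetic entry for public posts
    500: {"id": "p500", "title": "Basic", "fee": 500},
}
post = {"feeRequired": 500}
print(plans[post["feeRequired"]]["title"])   # Basic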
def _get_urls_from_post(self, content_body, post):
num = 0
cover_image = post.get("coverImageUrl")

@ -42,7 +42,11 @@ class FantiaExtractor(Extractor):
post = self._get_post_data(post_id)
post["num"] = 0
for content in self._get_post_contents(post):
contents = self._get_post_contents(post)
post["content_count"] = len(contents)
post["content_num"] = 0
for content in contents:
files = self._process_content(post, content)
yield Message.Directory, post
@ -59,6 +63,8 @@ class FantiaExtractor(Extractor):
post["content_filename"] or file["file_url"], post)
yield Message.Url, file["file_url"], post
post["content_num"] += 1
def posts(self):
"""Return post IDs"""
@ -102,7 +108,7 @@ class FantiaExtractor(Extractor):
"fanclub_user_name": resp["fanclub"]["user"]["name"],
"fanclub_name": resp["fanclub"]["name"],
"fanclub_url": self.root+"/fanclubs/"+str(resp["fanclub"]["id"]),
"tags": resp["tags"],
"tags": [t["name"] for t in resp["tags"]],
"_data": resp,
}
@ -131,6 +137,7 @@ class FantiaExtractor(Extractor):
post["content_filename"] = content.get("filename") or ""
post["content_id"] = content["id"]
post["content_comment"] = content.get("comment") or ""
post["content_num"] += 1
post["plan"] = content["plan"] or self._empty_plan
files = []

@ -10,6 +10,9 @@ from .common import Extractor, Message
from .. import text, exception
BASE_PATTERN = r"(?:https?://)?(?:www\.)?fapello\.(?:com|su)"
class FapelloPostExtractor(Extractor):
"""Extractor for individual posts on fapello.com"""
category = "fapello"
@ -17,16 +20,16 @@ class FapelloPostExtractor(Extractor):
directory_fmt = ("{category}", "{model}")
filename_fmt = "{model}_{id}.{extension}"
archive_fmt = "{type}_{model}_{id}"
pattern = (r"(?:https?://)?(?:www\.)?fapello\.com"
r"/(?!search/|popular_videos/)([^/?#]+)/(\d+)")
pattern = BASE_PATTERN + r"/(?!search/|popular_videos/)([^/?#]+)/(\d+)"
example = "https://fapello.com/MODEL/12345/"
def __init__(self, match):
Extractor.__init__(self, match)
self.root = text.root_from_url(match.group(0))
self.model, self.id = match.groups()
def items(self):
url = "https://fapello.com/{}/{}/".format(self.model, self.id)
url = "{}/{}/{}/".format(self.root, self.model, self.id)
page = text.extr(
self.request(url, allow_redirects=False).text,
'class="uk-align-center"', "</div>", None)
@ -48,27 +51,29 @@ class FapelloModelExtractor(Extractor):
"""Extractor for all posts from a fapello model"""
category = "fapello"
subcategory = "model"
pattern = (r"(?:https?://)?(?:www\.)?fapello\.com"
r"/(?!top-(?:likes|followers)|popular_videos"
pattern = (BASE_PATTERN + r"/(?!top-(?:likes|followers)|popular_videos"
r"|videos|trending|search/?$)"
r"([^/?#]+)/?$")
example = "https://fapello.com/model/"
def __init__(self, match):
Extractor.__init__(self, match)
self.root = text.root_from_url(match.group(0))
self.model = match.group(1)
def items(self):
num = 1
data = {"_extractor": FapelloPostExtractor}
while True:
url = "https://fapello.com/ajax/model/{}/page-{}/".format(
self.model, num)
url = "{}/ajax/model/{}/page-{}/".format(
self.root, self.model, num)
page = self.request(url).text
if not page:
return
for url in text.extract_iter(page, '<a href="', '"'):
if url == "javascript:void(0);":
continue
yield Message.Queue, url, data
num += 1
@ -77,13 +82,14 @@ class FapelloPathExtractor(Extractor):
"""Extractor for models and posts from fapello.com paths"""
category = "fapello"
subcategory = "path"
pattern = (r"(?:https?://)?(?:www\.)?fapello\.com"
pattern = (BASE_PATTERN +
r"/(?!search/?$)(top-(?:likes|followers)|videos|trending"
r"|popular_videos/[^/?#]+)/?$")
example = "https://fapello.com/trending/"
def __init__(self, match):
Extractor.__init__(self, match)
self.root = text.root_from_url(match.group(0))
self.path = match.group(1)
def items(self):
@ -93,9 +99,14 @@ class FapelloPathExtractor(Extractor):
else:
data = {"_extractor": FapelloPostExtractor}
if "fapello.su" in self.root:
self.path = self.path.replace("-", "/")
if self.path == "trending":
data = {"_extractor": FapelloModelExtractor}
while True:
page = self.request("https://fapello.com/ajax/{}/page-{}/".format(
self.path, num)).text
page = self.request("{}/ajax/{}/page-{}/".format(
self.root, self.path, num)).text
if not page:
return

@ -24,6 +24,8 @@ class FoolfuukaExtractor(BaseExtractor):
BaseExtractor.__init__(self, match)
if self.category == "b4k":
self.remote = self._remote_direct
elif self.category == "archivedmoe":
self.referer = False
def items(self):
yield Message.Directory, self.metadata()
@ -53,9 +55,12 @@ class FoolfuukaExtractor(BaseExtractor):
def remote(self, media):
"""Resolve a remote media link"""
needle = '<meta http-equiv="Refresh" content="0; url='
page = self.request(media["remote_media_link"]).text
return text.extr(page, needle, '"')
url = text.extr(page, 'http-equiv="Refresh" content="0; url=', '"')
if url.endswith(".webm") and \
url.startswith("https://thebarchive.com/"):
return url[:-1]
return url
@staticmethod
def _remote_direct(media):
@ -169,7 +174,7 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
directory_fmt = ("{category}", "search", "{search}")
pattern = BASE_PATTERN + r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)"
example = "https://archived.moe/_/search/text/QUERY/"
request_interval = 1.0
request_interval = (0.5, 1.5)
def __init__(self, match):
FoolfuukaExtractor.__init__(self, match)

@ -38,10 +38,6 @@ class FoolslideExtractor(BaseExtractor):
BASE_PATTERN = FoolslideExtractor.update({
"powermanga": {
"root": "https://read.powermanga.org",
"pattern": r"read(?:er)?\.powermanga\.org",
},
})

@ -22,7 +22,7 @@ class FuskatorGalleryExtractor(GalleryExtractor):
def __init__(self, match):
self.gallery_hash = match.group(1)
url = "{}/thumbs/{}/".format(self.root, self.gallery_hash)
url = "{}/thumbs/{}/index.html".format(self.root, self.gallery_hash)
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@ -50,15 +50,16 @@ class FuskatorGalleryExtractor(GalleryExtractor):
"gallery_id" : text.parse_int(gallery_id),
"gallery_hash": self.gallery_hash,
"title" : text.unescape(title[:-15]),
"views" : data["hits"],
"score" : data["rating"],
"tags" : data["tags"].split(","),
"count" : len(data["images"]),
"views" : data.get("hits"),
"score" : data.get("rating"),
"tags" : (data.get("tags") or "").split(","),
}
def images(self, page):
for image in self.data["images"]:
yield "https:" + image["imageUrl"], image
return [
("https:" + image["imageUrl"], image)
for image in self.data["images"]
]
class FuskatorSearchExtractor(Extractor):

@ -23,7 +23,7 @@ class GelbooruBase():
root = "https://gelbooru.com"
offset = 0
def _api_request(self, params, key="post"):
def _api_request(self, params, key="post", log=False):
if "s" not in params:
params["s"] = "post"
params["api_key"] = self.api_key
@ -32,10 +32,14 @@ class GelbooruBase():
url = self.root + "/index.php?page=dapi&q=index&json=1"
data = self.request(url, params=params).json()
if key not in data:
return ()
try:
posts = data[key]
except KeyError:
if log:
self.log.error("Incomplete API response (missing '%s')", key)
self.log.debug("%s", data)
return []
posts = data[key]
if not isinstance(posts, list):
return (posts,)
return posts
@ -114,7 +118,7 @@ class GelbooruBase():
class GelbooruTagExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02TagExtractor):
"""Extractor for images from gelbooru.com based on search-tags"""
pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]+)"
pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]*)"
example = "https://gelbooru.com/index.php?page=post&s=list&tags=TAG"
@ -165,15 +169,16 @@ class GelbooruFavoriteExtractor(GelbooruBase,
"id" : self.favorite_id,
"limit": "1",
}
count = self._api_request(params, "@attributes")[0]["count"]
count = self._api_request(params, "@attributes", True)[0]["count"]
if count <= self.offset:
return
pnum, last = divmod(count + 1, self.per_page)
if self.offset >= last:
pnum, last = divmod(count-1, self.per_page)
if self.offset > last:
# page number change
self.offset -= last
diff, self.offset = divmod(self.offset, self.per_page)
diff, self.offset = divmod(self.offset-1, self.per_page)
pnum -= diff + 1
skip = self.offset
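A worked example of the corrected page arithmetic above, with made-up numbers (per_page = 50, 77 favourites, offset 30):

count, per_page, offset = 77, 50, 30

pnum, last = divmod(count - 1, per_page)         # pnum=1, last=26
if offset > last:                                # the offset does not fit on the final page
    diff, offset = divmod(offset - 1, per_page)  # diff=0, offset=29
    pnum -= diff + 1                             # pnum=0: start one page earlier
skip = offset
print(pnum, skip)                                # 0 29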
@ -182,9 +187,9 @@ class GelbooruFavoriteExtractor(GelbooruBase,
params["limit"] = self.per_page
while True:
favs = self._api_request(params, "favorite")
favs = self._api_request(params, "favorite", True)
favs.reverse()
if skip:
favs = favs[skip:]
skip = 0

@ -22,14 +22,10 @@ class GelbooruV02Extractor(booru.BooruExtractor):
def _init(self):
self.api_key = self.config("api-key")
self.user_id = self.config("user-id")
try:
self.api_root = INSTANCES[self.category]["api_root"]
except KeyError:
self.api_root = self.root
self.api_root = self.config_instance("api_root") or self.root
if self.category == "realbooru":
self.items = self._items_realbooru
self._file_url = self._file_url_realbooru
self._tags = self._tags_realbooru
def _api_request(self, params):
@ -128,28 +124,6 @@ class GelbooruV02Extractor(booru.BooruExtractor):
self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
return url
def _items_realbooru(self):
from .common import Message
data = self.metadata()
for post in self.posts():
try:
html = self._html(post)
url = post["file_url"] = text.rextract(
html, 'href="', '"', html.index(">Original<"))[0]
except Exception:
self.log.debug("Unable to fetch download URL for post %s "
"(md5: %s)", post.get("id"), post.get("md5"))
continue
text.nameext_from_url(url, post)
post.update(data)
self._prepare(post)
self._tags(post, html)
yield Message.Directory, post
yield Message.Url, url, post
def _tags_realbooru(self, post, page):
tag_container = text.extr(page, 'id="tagLink"', '</div>')
tags = collections.defaultdict(list)
@ -161,14 +135,14 @@ class GelbooruV02Extractor(booru.BooruExtractor):
post["tags_" + key] = " ".join(value)
INSTANCES = {
BASE_PATTERN = GelbooruV02Extractor.update({
"realbooru": {
"root": "https://realbooru.com",
"pattern": r"realbooru\.com",
},
"rule34": {
"root": "https://rule34.xxx",
"pattern": r"rule34\.xxx",
"pattern": r"(?:www\.)?rule34\.xxx",
"api_root": "https://api.rule34.xxx",
},
"safebooru": {
@ -187,16 +161,14 @@ INSTANCES = {
"root": "https://xbooru.com",
"pattern": r"xbooru\.com",
},
}
BASE_PATTERN = GelbooruV02Extractor.update(INSTANCES)
})
class GelbooruV02TagExtractor(GelbooruV02Extractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)"
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]*)"
example = "https://safebooru.org/index.php?page=post&s=list&tags=TAG"
def __init__(self, match):
@ -208,6 +180,8 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor):
return {"search_tags": self.tags}
def posts(self):
if self.tags == "all":
self.tags = ""
return self._pagination({"tags": self.tags})

@ -73,7 +73,7 @@ class GofileFolderExtractor(Extractor):
def _get_website_token(self):
self.log.debug("Fetching website token")
page = self.request(self.root + "/dist/js/alljs.js").text
return text.extr(page, 'fetchData.websiteToken = "', '"')
return text.extr(page, 'fetchData.wt = "', '"')
def _get_content(self, content_id, password=None):
if password is not None:
@ -81,7 +81,7 @@ class GofileFolderExtractor(Extractor):
return self._api_request("getContent", {
"contentId" : content_id,
"token" : self.api_token,
"websiteToken": self.website_token,
"wt" : self.website_token,
"password" : password,
})

@ -0,0 +1,167 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://hatenablog.com"""
import re
from .common import Extractor, Message
from .. import text
BASE_PATTERN = (
r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?"
r"([\w-]+\.(?:hatenablog\.(?:com|jp)"
r"|hatenadiary\.com|hateblo\.jp)))"
)
QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$"
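For reference, how the capture groups of these patterns line up when an entry URL is matched (the blog name and path below are made up): group 1 or 2 holds the domain, group 3 the path part consumed by the subclass pattern, and group 4 the query string.

import re

BASE_PATTERN = (
    r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?"
    r"([\w-]+\.(?:hatenablog\.(?:com|jp)"
    r"|hatenadiary\.com|hateblo\.jp)))"
)
QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$"

entry = re.compile(BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE)
m = entry.match("https://someblog.hatenablog.com/entry/2024/01/21/000000?page=2")
print(m.group(2), m.group(3), m.group(4))
# someblog.hatenablog.com 2024/01/21/000000 page=2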
class HatenablogExtractor(Extractor):
"""Base class for HatenaBlog extractors"""
category = "hatenablog"
directory_fmt = ("{category}", "{domain}")
filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}"
archive_fmt = "{filename}"
def __init__(self, match):
Extractor.__init__(self, match)
self.domain = match.group(1) or match.group(2)
def _init(self):
self._find_img = re.compile(r'<img +([^>]+)').finditer
def _handle_article(self, article: str):
extr = text.extract_from(article)
date = text.parse_datetime(extr('<time datetime="', '"'))
entry_link = text.unescape(extr('<a href="', '"'))
entry = entry_link.partition("/entry/")[2]
title = text.unescape(extr('>', '<'))
content = extr(
'<div class="entry-content hatenablog-entry">', '</div>')
images = []
for i in self._find_img(content):
attributes = i.group(1)
if 'class="hatena-fotolife"' not in attributes:
continue
image = text.unescape(text.extr(attributes, 'src="', '"'))
images.append(image)
data = {
"domain": self.domain,
"date": date,
"entry": entry,
"title": title,
"count": len(images),
}
yield Message.Directory, data
for data["num"], url in enumerate(images, 1):
yield Message.Url, url, text.nameext_from_url(url, data)
class HatenablogEntriesExtractor(HatenablogExtractor):
"""Base class for a list of entries"""
allowed_parameters = ()
def __init__(self, match):
HatenablogExtractor.__init__(self, match)
self.path = match.group(3)
self.query = {key: value for key, value in text.parse_query(
match.group(4)).items() if self._acceptable_query(key)}
def _init(self):
HatenablogExtractor._init(self)
self._find_pager_url = re.compile(
r' class="pager-next">\s*<a href="([^"]+)').search
def items(self):
url = "https://" + self.domain + self.path
query = self.query
while url:
page = self.request(url, params=query).text
extr = text.extract_from(page)
attributes = extr('<body ', '>')
if "page-archive" in attributes:
yield from self._handle_partial_articles(extr)
else:
yield from self._handle_full_articles(extr)
match = self._find_pager_url(page)
url = text.unescape(match.group(1)) if match else None
query = None
def _handle_partial_articles(self, extr):
while True:
section = extr('<section class="archive-entry', '</section>')
if not section:
break
url = "hatenablog:" + text.unescape(text.extr(
section, '<a class="entry-title-link" href="', '"'))
data = {"_extractor": HatenablogEntryExtractor}
yield Message.Queue, url, data
def _handle_full_articles(self, extr):
while True:
attributes = extr('<article ', '>')
if not attributes:
break
if "no-entry" in attributes:
continue
article = extr('', '</article>')
yield from self._handle_article(article)
def _acceptable_query(self, key):
return key == "page" or key in self.allowed_parameters
class HatenablogEntryExtractor(HatenablogExtractor):
"""Extractor for a single entry URL"""
subcategory = "entry"
pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE
example = "https://BLOG.hatenablog.com/entry/PATH"
def __init__(self, match):
HatenablogExtractor.__init__(self, match)
self.path = match.group(3)
def items(self):
url = "https://" + self.domain + "/entry/" + self.path
page = self.request(url).text
extr = text.extract_from(page)
while True:
attributes = extr('<article ', '>')
if "no-entry" in attributes:
continue
article = extr('', '</article>')
return self._handle_article(article)
class HatenablogHomeExtractor(HatenablogEntriesExtractor):
"""Extractor for a blog's home page"""
subcategory = "home"
pattern = BASE_PATTERN + r"(/?)" + QUERY_RE
example = "https://BLOG.hatenablog.com"
class HatenablogArchiveExtractor(HatenablogEntriesExtractor):
"""Extractor for a blog's archive page"""
subcategory = "archive"
pattern = (BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?"
r"|/category/[^?#]+)?)" + QUERY_RE)
example = "https://BLOG.hatenablog.com/archive/2024"
class HatenablogSearchExtractor(HatenablogEntriesExtractor):
"""Extractor for a blog's search results"""
subcategory = "search"
pattern = BASE_PATTERN + r"(/search)" + QUERY_RE
example = "https://BLOG.hatenablog.com/search?q=QUERY"
allowed_parameters = ("q",)

@ -1,92 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://www.hbrowse.com/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text, util, exception
class HbrowseBase():
"""Base class for hbrowse extractors"""
category = "hbrowse"
root = "https://www.hbrowse.com"
def parse_page(self, page, data):
"""Parse metadata on 'page' and add it to 'data'"""
data, pos = text.extract_all(page, (
('manga' , '<td class="listLong">', '</td>'),
('artist', '<td class="listLong">', '</td>'),
('total' , '<td class="listLong">', ' '),
('origin', '<td class="listLong">', '</td>'),
), values=data)
if not data["manga"] and "<b>Warning</b>" in page:
msg = page.rpartition(">")[2].strip()
raise exception.StopExtraction("Site is not accessible: '%s'", msg)
tags = text.extract(page, 'class="listTable"', '</table>', pos)[0]
data["manga"] = text.unescape(data["manga"])
data["total"] = text.parse_int(data["total"])
data["artist"] = text.remove_html(data["artist"])
data["origin"] = text.remove_html(data["origin"])
data["tags"] = list(text.extract_iter(tags, 'href="/browse/', '"'))
return data
class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor):
"""Extractor for manga-chapters from hbrowse.com"""
directory_fmt = ("{category}", "{manga_id} {manga}", "c{chapter:>05}")
filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
"{page:>03}.{extension}")
archive_fmt = "{manga_id}_{chapter}_{page}"
pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))"
example = "https://www.hbrowse.com/12345/c00000"
def __init__(self, match):
self.path, self.gid, self.chapter = match.groups()
self.path += "/"
ChapterExtractor.__init__(self, match)
def metadata(self, page):
return self.parse_page(page, {
"manga_id": text.parse_int(self.gid),
"chapter": text.parse_int(self.chapter)
})
def images(self, page):
base = self.root + "/data" + self.path
json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
return [(base + name, None) for name in util.json_loads(json_data)]
class HbrowseMangaExtractor(HbrowseBase, MangaExtractor):
"""Extractor for manga from hbrowse.com"""
chapterclass = HbrowseChapterExtractor
reverse = False
pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$"
example = "https://www.hbrowse.com/12345"
def chapters(self, page):
results = []
data = self.parse_page(page, {
"manga_id": text.parse_int(
self.manga_url.rstrip("/").rpartition("/")[2])
})
pos = 0
needle = '<td class="listMiddle">\n<a class="listLink" href="'
while True:
url, pos = text.extract(page, needle, '"', pos)
if not url:
return results
title, pos = text.extract(page, '>View ', '<', pos)
data["chapter"] = text.parse_int(url.rpartition("/")[2][1:])
data["title"] = title
results.append((text.urljoin(self.root, url), data.copy()))

@ -42,7 +42,7 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor):
def images(self, page):
return [
(url, None)
(url.replace("http:", "https:", 1), None)
for url in text.extract_iter(
page, '<amp-img class="auto-style" src="', '"')
]

@ -72,13 +72,11 @@ class HentaifoundryExtractor(Extractor):
extr = text.extract_from(page, page.index('id="picBox"'))
data = {
"index" : text.parse_int(path.rsplit("/", 2)[1]),
"title" : text.unescape(extr('class="imageTitle">', '<')),
"artist" : text.unescape(extr('/profile">', '<')),
"width" : text.parse_int(extr('width="', '"')),
"height" : text.parse_int(extr('height="', '"')),
"index" : text.parse_int(path.rsplit("/", 2)[1]),
"src" : text.urljoin(self.root, text.unescape(extr(
'src="', '"'))),
"_body" : extr(
'<div class="boxbody"', '<div class="boxfooter"'),
"description": text.unescape(text.remove_html(extr(
'>Description</div>', '</section>')
.replace("\r\n", "\n"), "", "")),
@ -92,6 +90,20 @@ class HentaifoundryExtractor(Extractor):
">Tags </span>", "</div>")),
}
body = data["_body"]
if "<object " in body:
data["src"] = text.urljoin(self.root, text.unescape(text.extr(
body, 'name="movie" value="', '"')))
data["width"] = text.parse_int(text.extr(
body, "name='width' value='", "'"))
data["height"] = text.parse_int(text.extr(
body, "name='height' value='", "'"))
else:
data["src"] = text.urljoin(self.root, text.unescape(text.extr(
body, 'src="', '"')))
data["width"] = text.parse_int(text.extr(body, 'width="', '"'))
data["height"] = text.parse_int(text.extr(body, 'height="', '"'))
return text.nameext_from_url(data["src"], data)
def _parse_story(self, html):
@ -121,9 +133,25 @@ class HentaifoundryExtractor(Extractor):
return text.nameext_from_url(data["src"], data)
def _init_site_filters(self):
def _request_check(self, url, **kwargs):
self.request = self._request_original
# check for Enter button / front page
# and update PHPSESSID and content filters if necessary
response = self.request(url, **kwargs)
content = response.content
if len(content) < 5000 and \
b'<div id="entryButtonContainer"' in content:
self._init_site_filters(False)
response = self.request(url, **kwargs)
return response
def _init_site_filters(self, check_cookies=True):
"""Set site-internal filters to show all images"""
if self.cookies.get("PHPSESSID", domain=self.cookies_domain):
if check_cookies and self.cookies.get(
"PHPSESSID", domain=self.cookies_domain):
self._request_original = self.request
self.request = self._request_check
return
url = self.root + "/?enterAgree=1"

@ -30,10 +30,10 @@ class HiperdexBase():
extr = text.extract_from(page)
return {
"manga" : text.unescape(extr(
"<title>", "<").rpartition(" Manga - ")[0].strip()),
"url" : text.unescape(extr(
'property="og:url" content="', '"')),
"manga" : text.unescape(extr(
'"headline": "', '"')),
"score" : text.parse_float(extr(
'id="averagerate">', '<')),
"author" : text.remove_html(extr(

@ -21,7 +21,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
category = "hitomi"
root = "https://hitomi.la"
pattern = (r"(?:https?://)?hitomi\.la"
r"/(?:manga|doujinshi|cg|gamecg|galleries|reader)"
r"/(?:manga|doujinshi|cg|gamecg|imageset|galleries|reader)"
r"/(?:[^/?#]+-)?(\d+)")
example = "https://hitomi.la/manga/TITLE-867789.html"

@ -15,14 +15,17 @@ from .. import text, util, exception
import collections
import re
BASE_PATTERN = r"(?:https?://)?idol\.sankakucomplex\.com(?:/[a-z]{2})?"
class IdolcomplexExtractor(SankakuExtractor):
"""Base class for idolcomplex extractors"""
category = "idolcomplex"
root = "https://idol.sankakucomplex.com"
cookies_domain = "idol.sankakucomplex.com"
cookies_names = ("login", "pass_hash")
root = "https://" + cookies_domain
request_interval = 5.0
cookies_names = ("_idolcomplex_session",)
referer = False
request_interval = (3.0, 6.0)
def __init__(self, match):
SankakuExtractor.__init__(self, match)
@ -31,14 +34,19 @@ class IdolcomplexExtractor(SankakuExtractor):
self.start_post = 0
def _init(self):
self.extags = self.config("tags", False)
self.find_pids = re.compile(
r" href=[\"#]/\w\w/posts/([0-9a-f]+)"
).findall
self.find_tags = re.compile(
r'tag-type-([^"]+)">\s*<a [^>]*?href="/[^?]*\?tags=([^"]+)'
).findall
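A quick look at what the precompiled find_pids pattern above extracts from a listing page; the HTML snippet is made up, and idol.sankakucomplex.com post IDs are now hexadecimal strings:

import re

find_pids = re.compile(r" href=[\"#]/\w\w/posts/([0-9a-f]+)").findall

html = '<a href="/en/posts/0123456789abcdef"><img src="/thumb.jpg"></a>'
print(find_pids(html))   # ['0123456789abcdef']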
def items(self):
self.login()
data = self.metadata()
for post_id in util.advance(self.post_ids(), self.start_post):
post = self._parse_post(post_id)
post = self._extract_post(post_id)
url = post["file_url"]
post.update(data)
text.nameext_from_url(url, post)
@ -62,67 +70,79 @@ class IdolcomplexExtractor(SankakuExtractor):
self.logged_in = False
@cache(maxage=90*24*3600, keyarg=1)
@cache(maxage=90*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
url = self.root + "/user/authenticate"
url = self.root + "/users/login"
page = self.request(url).text
headers = {
"Referer": url,
}
url = self.root + (text.extr(page, '<form action="', '"') or
"/en/user/authenticate")
data = {
"authenticity_token": text.unescape(text.extr(
page, 'name="authenticity_token" value="', '"')),
"url" : "",
"user[name]" : username,
"user[password]": password,
"commit" : "Login",
}
response = self.request(url, method="POST", data=data)
response = self.request(url, method="POST", headers=headers, data=data)
if not response.history or response.url != self.root + "/user/home":
if not response.history or response.url.endswith("/user/home"):
raise exception.AuthenticationError()
cookies = response.history[0].cookies
return {c: cookies[c] for c in self.cookies_names}
return {c.name: c.value for c in response.history[0].cookies}
def _parse_post(self, post_id):
"""Extract metadata of a single post"""
url = self.root + "/post/show/" + post_id
def _extract_post(self, post_id):
url = self.root + "/posts/" + post_id
page = self.request(url, retries=10).text
extr = text.extract
extr = text.extract_from(page)
tags , pos = extr(page, "<title>", " | ")
vavg , pos = extr(page, "itemprop=ratingValue>", "<", pos)
vcnt , pos = extr(page, "itemprop=reviewCount>", "<", pos)
_ , pos = extr(page, "Posted: <", "", pos)
created, pos = extr(page, ' title="', '"', pos)
rating = extr(page, "<li>Rating: ", "<", pos)[0]
tags = extr("<title>", " | ")
vavg = extr('itemprop="ratingValue">', "<")
vcnt = extr('itemprop="reviewCount">', "<")
pid = extr(">Post ID:", "<")
created = extr(' title="', '"')
file_url, pos = extr(page, '<li>Original: <a href="', '"', pos)
file_url = extr('>Original:', 'id=')
if file_url:
width , pos = extr(page, '>', 'x', pos)
height, pos = extr(page, '', ' ', pos)
file_url = extr(' href="', '"')
width = extr(">", "x")
height = extr("", " ")
else:
width , pos = extr(page, '<object width=', ' ', pos)
height, pos = extr(page, 'height=', '>', pos)
file_url = extr(page, '<embed src="', '"', pos)[0]
width = extr('<object width=', ' ')
height = extr('height=', '>')
file_url = extr('<embed src="', '"')
rating = extr(">Rating:", "<br")
data = {
"id": text.parse_int(post_id),
"md5": file_url.rpartition("/")[2].partition(".")[0],
"tags": text.unescape(tags),
"id" : text.parse_int(pid),
"md5" : file_url.rpartition("/")[2].partition(".")[0],
"tags" : text.unescape(tags),
"vote_average": text.parse_float(vavg),
"vote_count": text.parse_int(vcnt),
"created_at": created,
"rating": (rating or "?")[0].lower(),
"file_url": "https:" + text.unescape(file_url),
"width": text.parse_int(width),
"height": text.parse_int(height),
"vote_count" : text.parse_int(vcnt),
"created_at" : created,
"date" : text.parse_datetime(
created, "%Y-%m-%d %H:%M:%S.%f"),
"rating" : text.remove_html(rating).lower(),
"file_url" : "https:" + text.unescape(file_url),
"width" : text.parse_int(width),
"height" : text.parse_int(height),
}
if self.extags:
tags = collections.defaultdict(list)
tags_html = text.extr(page, '<ul id=tag-sidebar>', '</ul>')
pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)')
for tag_type, tag_name in pattern.findall(tags_html or ""):
tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items():
data["tags_" + key] = " ".join(value)
tags = collections.defaultdict(list)
tags_list = []
tags_html = text.extr(page, '<ul id="tag-sidebar"', '</ul>')
for tag_type, tag_name in self.find_tags(tags_html or ""):
tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items():
data["tags_" + key] = " ".join(value)
tags_list += value
data["tags"] = " ".join(tags_list)
return data
@ -132,8 +152,8 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)"
example = "https://idol.sankakucomplex.com/?tags=TAGS"
pattern = BASE_PATTERN + r"/(?:posts/?)?\?([^#]*)"
example = "https://idol.sankakucomplex.com/en/posts?tags=TAGS"
per_page = 20
def __init__(self, match):
@ -177,15 +197,17 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
while True:
page = self.request(self.root, params=params, retries=10).text
pos = page.find("<div id=more-popular-posts-link>") + 1
yield from text.extract_iter(page, '" id=p', '>', pos)
pos = ((page.find('id="more-popular-posts-link"') + 1) or
(page.find('<span class="thumb') + 1))
yield from self.find_pids(page, pos)
next_url = text.extract(page, 'next-page-url="', '"', pos)[0]
if not next_url:
return
next_params = text.parse_query(text.unescape(
next_url).lstrip("?/"))
next_params = text.parse_query(text.unescape(text.unescape(
next_url).lstrip("?/")))
if "next" in next_params:
# stop if the same "next" value occurs twice in a row (#265)
@ -200,8 +222,8 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor):
subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool}")
archive_fmt = "p_{pool}_{id}"
pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pool/show/(\d+)"
example = "https://idol.sankakucomplex.com/pool/show/12345"
pattern = BASE_PATTERN + r"/pools?/show/(\d+)"
example = "https://idol.sankakucomplex.com/pools/show/12345"
per_page = 24
def __init__(self, match):
@ -218,15 +240,16 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor):
return {"pool": self.pool_id}
def post_ids(self):
url = self.root + "/pool/show/" + self.pool_id
url = self.root + "/pools/show/" + self.pool_id
params = {"page": self.start_page}
while True:
page = self.request(url, params=params, retries=10).text
ids = list(text.extract_iter(page, '" id=p', '>'))
pos = page.find('id="pool-show"') + 1
post_ids = self.find_pids(page, pos)
yield from ids
if len(ids) < self.per_page:
yield from post_ids
if len(post_ids) < self.per_page:
return
params["page"] += 1
@ -235,8 +258,8 @@ class IdolcomplexPostExtractor(IdolcomplexExtractor):
"""Extractor for single images from idol.sankakucomplex.com"""
subcategory = "post"
archive_fmt = "{id}"
pattern = r"(?:https?://)?idol\.sankakucomplex\.com/post/show/(\d+)"
example = "https://idol.sankakucomplex.com/post/show/12345"
pattern = BASE_PATTERN + r"/posts?/(?:show/)?([0-9a-f]+)"
example = "https://idol.sankakucomplex.com/posts/0123456789abcdef"
def __init__(self, match):
IdolcomplexExtractor.__init__(self, match)

@ -44,7 +44,7 @@ class ImagechestGalleryExtractor(GalleryExtractor):
}
def images(self, page):
if " More Files</button>" in page:
if ' load-all">' in page:
url = "{}/p/{}/loadAll".format(self.root, self.gallery_id)
headers = {
"X-Requested-With": "XMLHttpRequest",

@ -126,14 +126,15 @@ class ImagefapImageExtractor(ImagefapExtractor):
url = "{}/photo/{}/".format(self.root, self.image_id)
page = self.request(url).text
url, pos = text.extract(
page, 'original="', '"')
info, pos = text.extract(
page, '<script type="application/ld+json">', '</script>')
page, '<script type="application/ld+json">', '</script>', pos)
image_id, pos = text.extract(
page, 'id="imageid_input" value="', '"', pos)
gallery_id, pos = text.extract(
page, 'id="galleryid_input" value="', '"', pos)
info = util.json_loads(info)
url = info["contentUrl"]
return url, text.nameext_from_url(url, {
"title": text.unescape(info["name"]),

@ -64,7 +64,7 @@ class ImgbbExtractor(Extractor):
if username:
self.cookies_update(self._login_impl(username, password))
@cache(maxage=360*24*3600, keyarg=1)
@cache(maxage=365*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
@ -84,6 +84,13 @@ class ImgbbExtractor(Extractor):
raise exception.AuthenticationError()
return self.cookies
def _extract_resource(self, page):
return util.json_loads(text.extr(
page, "CHV.obj.resource=", "};") + "}")
def _extract_user(self, page):
return self._extract_resource(page).get("user") or {}
def _pagination(self, page, endpoint, params):
data = None
seek, pos = text.extract(page, 'data-seek="', '"')
@ -99,7 +106,7 @@ class ImgbbExtractor(Extractor):
for img in text.extract_iter(page, "data-object='", "'"):
yield util.json_loads(text.unquote(img))
if data:
if params["seek"] == data["seekEnd"]:
if not data["seekEnd"] or params["seek"] == data["seekEnd"]:
return
params["seek"] = data["seekEnd"]
params["page"] += 1
@ -124,12 +131,14 @@ class ImgbbAlbumExtractor(ImgbbExtractor):
self.page_url = "https://ibb.co/album/" + self.album_id
def metadata(self, page):
album, pos = text.extract(page, '"og:title" content="', '"')
user , pos = text.extract(page, 'rel="author">', '<', pos)
album = text.extr(page, '"og:title" content="', '"')
user = self._extract_user(page)
return {
"album_id" : self.album_id,
"album_name": text.unescape(album),
"user" : user.lower() if user else "",
"album_id" : self.album_id,
"album_name" : text.unescape(album),
"user" : user.get("username") or "",
"user_id" : user.get("id") or "",
"displayname": user.get("name") or "",
}
def images(self, page):
@ -158,7 +167,12 @@ class ImgbbUserExtractor(ImgbbExtractor):
self.page_url = "https://{}.imgbb.com/".format(self.user)
def metadata(self, page):
return {"user": self.user}
user = self._extract_user(page)
return {
"user" : user.get("username") or self.user,
"user_id" : user.get("id") or "",
"displayname": user.get("name") or "",
}
def images(self, page):
user = text.extr(page, '.obj.resource={"id":"', '"')
@ -181,15 +195,20 @@ class ImgbbImageExtractor(ImgbbExtractor):
def items(self):
url = "https://ibb.co/" + self.image_id
extr = text.extract_from(self.request(url).text)
page = self.request(url).text
extr = text.extract_from(page)
user = self._extract_user(page)
image = {
"id" : self.image_id,
"title" : text.unescape(extr('"og:title" content="', '"')),
"title" : text.unescape(extr(
'"og:title" content="', ' hosted at ImgBB"')),
"url" : extr('"og:image" content="', '"'),
"width" : text.parse_int(extr('"og:image:width" content="', '"')),
"height": text.parse_int(extr('"og:image:height" content="', '"')),
"user" : extr('rel="author">', '<').lower(),
"user" : user.get("username") or "",
"user_id" : user.get("id") or "",
"displayname": user.get("name") or "",
}
image["extension"] = text.ext_from_url(image["url"])

@ -103,7 +103,8 @@ class InkbunnyPoolExtractor(InkbunnyExtractor):
subcategory = "pool"
pattern = (BASE_PATTERN + r"/(?:"
r"poolview_process\.php\?pool_id=(\d+)|"
r"submissionsviewall\.php\?([^#]+&mode=pool&[^#]+))")
r"submissionsviewall\.php"
r"\?((?:[^#]+&)?mode=pool(?:&[^#]+)?))")
example = "https://inkbunny.net/poolview_process.php?pool_id=12345"
def __init__(self, match):
@ -133,7 +134,8 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor):
subcategory = "favorite"
pattern = (BASE_PATTERN + r"/(?:"
r"userfavorites_process\.php\?favs_user_id=(\d+)|"
r"submissionsviewall\.php\?([^#]+&mode=userfavs&[^#]+))")
r"submissionsviewall\.php"
r"\?((?:[^#]+&)?mode=userfavs(?:&[^#]+)?))")
example = ("https://inkbunny.net/userfavorites_process.php"
"?favs_user_id=12345")
@ -161,11 +163,31 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor):
return self.api.search(params)
class InkbunnyUnreadExtractor(InkbunnyExtractor):
"""Extractor for unread inkbunny submissions"""
subcategory = "unread"
pattern = (BASE_PATTERN + r"/submissionsviewall\.php"
r"\?((?:[^#]+&)?mode=unreadsubs(?:&[^#]+)?)")
example = ("https://inkbunny.net/submissionsviewall.php"
"?text=&mode=unreadsubs&type=")
def __init__(self, match):
InkbunnyExtractor.__init__(self, match)
self.params = text.parse_query(match.group(1))
def posts(self):
params = self.params.copy()
params.pop("rid", None)
params.pop("mode", None)
params["unread_submissions"] = "yes"
return self.api.search(params)
class InkbunnySearchExtractor(InkbunnyExtractor):
"""Extractor for inkbunny search results"""
subcategory = "search"
pattern = (BASE_PATTERN +
r"/submissionsviewall\.php\?([^#]+&mode=search&[^#]+)")
pattern = (BASE_PATTERN + r"/submissionsviewall\.php"
r"\?((?:[^#]+&)?mode=search(?:&[^#]+)?)")
example = ("https://inkbunny.net/submissionsviewall.php"
"?text=TAG&mode=search&type=")
@ -201,7 +223,8 @@ class InkbunnyFollowingExtractor(InkbunnyExtractor):
subcategory = "following"
pattern = (BASE_PATTERN + r"/(?:"
r"watchlist_process\.php\?mode=watching&user_id=(\d+)|"
r"usersviewall\.php\?([^#]+&mode=watching&[^#]+))")
r"usersviewall\.php"
r"\?((?:[^#]+&)?mode=watching(?:&[^#]+)?))")
example = ("https://inkbunny.net/watchlist_process.php"
"?mode=watching&user_id=12345")
@ -324,6 +347,9 @@ class InkbunnyAPI():
while True:
data = self._call("search", params)
if not data["submissions"]:
return
yield from self.detail(data["submissions"])
if data["page"] >= data["pages_count"]:
@ -334,7 +360,7 @@ class InkbunnyAPI():
params["page"] += 1
@cache(maxage=360*24*3600, keyarg=1)
@cache(maxage=365*86400, keyarg=1)
def _authenticate_impl(api, username, password):
api.extractor.log.info("Logging in as %s", username)
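
Note: the recurring `@cache(maxage=...)` edits in this commit rewrite the lifetime as days times 86400 seconds; in the imgbb, inkbunny and newgrounds cases the window also grows from 360 to 365 days. A trivial check of the arithmetic, plain Python with nothing gallery-dl specific:

assert 24 * 3600 == 86400              # seconds per day
assert 360 * 24 * 3600 == 31_104_000   # old value: 360 days
assert 365 * 86400 == 31_536_000       # new value: 365 days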

@ -217,9 +217,10 @@ class InstagramExtractor(Extractor):
data["post_shortcode"])
continue
if "video_versions" in item:
video_versions = item.get("video_versions")
if video_versions:
video = max(
item["video_versions"],
video_versions,
key=lambda x: (x["width"], x["height"], x["type"]),
)
media = video
@ -710,7 +711,8 @@ class InstagramRestAPI():
def user_by_name(self, screen_name):
endpoint = "/v1/users/web_profile_info/"
params = {"username": screen_name}
return self._call(endpoint, params=params)["data"]["user"]
return self._call(
endpoint, params=params, notfound="user")["data"]["user"]
@memcache(keyarg=1)
def user_by_id(self, user_id):
@ -777,13 +779,15 @@ class InstagramRestAPI():
kwargs["headers"] = {
"Accept" : "*/*",
"X-CSRFToken" : extr.csrf_token,
"X-Instagram-AJAX": "1006242110",
"X-IG-App-ID" : "936619743392459",
"X-ASBD-ID" : "198387",
"X-ASBD-ID" : "129477",
"X-IG-WWW-Claim" : extr.www_claim,
"X-Requested-With": "XMLHttpRequest",
"Alt-Used" : "www.instagram.com",
"Connection" : "keep-alive",
"Referer" : extr.root + "/",
"Sec-Fetch-Dest" : "empty",
"Sec-Fetch-Mode" : "cors",
"Sec-Fetch-Site" : "same-origin",
}
return extr.request(url, **kwargs).json()
@ -973,7 +977,7 @@ class InstagramGraphqlAPI():
variables["after"] = extr._update_cursor(info["end_cursor"])
@cache(maxage=90*24*3600, keyarg=1)
@cache(maxage=90*86400, keyarg=1)
def _login_impl(extr, username, password):
extr.log.error("Login with username & password is no longer supported. "
"Use browser cookies instead.")

@ -29,8 +29,9 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
example = "https://issuu.com/issuu/docs/TITLE/"
def metadata(self, page):
pos = page.rindex('id="initial-data"')
data = util.json_loads(text.rextract(
page, '<script data-json="', '"')[0].replace("&quot;", '"'))
page, '<script data-json="', '"', pos)[0].replace("&quot;", '"'))
doc = data["initialDocumentData"]["document"]
doc["date"] = text.parse_datetime(

@ -1,105 +0,0 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://jpg1.su/"""
from .common import Extractor, Message
from .. import text
BASE_PATTERN = r"(?:https?://)?jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)"
class JpgfishExtractor(Extractor):
"""Base class for jpgfish extractors"""
category = "jpgfish"
root = "https://jpg1.su"
directory_fmt = ("{category}", "{user}", "{album}",)
archive_fmt = "{id}"
def _pagination(self, url):
while url:
page = self.request(url).text
for item in text.extract_iter(
page, '<div class="list-item-image ', 'image-container'):
yield text.extract(item, '<a href="', '"')[0]
url = text.extract(
page, '<a data-pagination="next" href="', '" ><')[0]
class JpgfishImageExtractor(JpgfishExtractor):
"""Extractor for jpgfish Images"""
subcategory = "image"
pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))"
example = "https://jpg1.su/img/TITLE.ID"
def __init__(self, match):
JpgfishExtractor.__init__(self, match)
self.path, self.image_id = match.groups()
def items(self):
url = "{}/img/{}".format(self.root, self.path)
extr = text.extract_from(self.request(url).text)
image = {
"id" : self.image_id,
"url" : extr('<meta property="og:image" content="', '"'),
"album": text.extract(extr(
"Added to <a", "/a>"), ">", "<")[0] or "",
"user" : extr('username: "', '"'),
}
text.nameext_from_url(image["url"], image)
yield Message.Directory, image
yield Message.Url, image["url"], image
class JpgfishAlbumExtractor(JpgfishExtractor):
"""Extractor for jpgfish Albums"""
subcategory = "album"
pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?"
example = "https://jpg1.su/album/TITLE.ID"
def __init__(self, match):
JpgfishExtractor.__init__(self, match)
self.album, self.sub_albums = match.groups()
def items(self):
url = "{}/a/{}".format(self.root, self.album)
data = {"_extractor": JpgfishImageExtractor}
if self.sub_albums:
albums = self._pagination(url + "/sub")
else:
albums = (url,)
for album in albums:
for image in self._pagination(album):
yield Message.Queue, image, data
class JpgfishUserExtractor(JpgfishExtractor):
"""Extractor for jpgfish Users"""
subcategory = "user"
pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?"
example = "https://jpg1.su/USER"
def __init__(self, match):
JpgfishExtractor.__init__(self, match)
self.user, self.albums = match.groups()
def items(self):
url = "{}/{}".format(self.root, self.user)
if self.albums:
url += "/albums"
data = {"_extractor": JpgfishAlbumExtractor}
else:
data = {"_extractor": JpgfishImageExtractor}
for url in self._pagination(url):
yield Message.Queue, url, data

@ -9,9 +9,10 @@
"""Extractors for https://kemono.party/"""
from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
from .. import text, util, exception
from ..cache import cache, memcache
import itertools
import json
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)"
@ -24,7 +25,7 @@ class KemonopartyExtractor(Extractor):
category = "kemonoparty"
root = "https://kemono.party"
directory_fmt = ("{category}", "{service}", "{user}")
filename_fmt = "{id}_{title}_{num:>02}_{filename[:180]}.{extension}"
filename_fmt = "{id}_{title[:180]}_{num:>02}_{filename[:180]}.{extension}"
archive_fmt = "{service}_{user}_{id}_{num}"
cookies_domain = ".kemono.party"
@ -37,10 +38,16 @@ class KemonopartyExtractor(Extractor):
Extractor.__init__(self, match)
def _init(self):
self.revisions = self.config("revisions")
if self.revisions:
self.revisions_unique = (self.revisions == "unique")
self._prepare_ddosguard_cookies()
self._find_inline = re.compile(
r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
self._json_dumps = json.JSONEncoder(
ensure_ascii=False, check_circular=False,
sort_keys=True, separators=(",", ":")).encode
def items(self):
find_hash = re.compile(HASH_PATTERN).match
@ -69,9 +76,9 @@ class KemonopartyExtractor(Extractor):
headers["Referer"] = "{}/{}/user/{}/post/{}".format(
self.root, post["service"], post["user"], post["id"])
post["_http_headers"] = headers
post["date"] = text.parse_datetime(
post["published"] or post["added"],
"%a, %d %b %Y %H:%M:%S %Z")
post["date"] = self._parse_datetime(
post["published"] or post["added"])
if username:
post["username"] = username
if comments:
@ -129,7 +136,7 @@ class KemonopartyExtractor(Extractor):
self.cookies_update(self._login_impl(
(username, self.cookies_domain), password))
@cache(maxage=28*24*3600, keyarg=1)
@cache(maxage=28*86400, keyarg=1)
def _login_impl(self, username, password):
username = username[0]
self.log.info("Logging in as %s", username)
@ -197,14 +204,80 @@ class KemonopartyExtractor(Extractor):
dms = []
for dm in text.extract_iter(page, "<article", "</article>"):
footer = text.extr(dm, "<footer", "</footer>")
dms.append({
"body": text.unescape(text.extract(
"body": text.unescape(text.extr(
dm, "<pre>", "</pre></",
)[0].strip()),
"date": text.extr(dm, 'datetime="', '"'),
).strip()),
"date": text.extr(footer, 'Published: ', '\n'),
})
return dms
def _parse_datetime(self, date_string):
if len(date_string) > 19:
date_string = date_string[:19]
return text.parse_datetime(date_string, "%Y-%m-%dT%H:%M:%S")
@memcache(keyarg=1)
def _discord_channels(self, server):
url = "{}/api/v1/discord/channel/lookup/{}".format(
self.root, server)
return self.request(url).json()
def _revisions_post(self, post, url):
post["revision_id"] = 0
try:
revs = self.request(url + "/revisions").json()
except exception.HttpError:
post["revision_hash"] = self._revision_hash(post)
post["revision_index"] = 1
return (post,)
revs.insert(0, post)
for rev in revs:
rev["revision_hash"] = self._revision_hash(rev)
if self.revisions_unique:
uniq = []
last = None
for rev in revs:
if last != rev["revision_hash"]:
last = rev["revision_hash"]
uniq.append(rev)
revs = uniq
idx = len(revs)
for rev in revs:
rev["revision_index"] = idx
idx -= 1
return revs
def _revisions_all(self, url):
revs = self.request(url + "/revisions").json()
idx = len(revs)
for rev in revs:
rev["revision_hash"] = self._revision_hash(rev)
rev["revision_index"] = idx
idx -= 1
return revs
def _revision_hash(self, revision):
rev = revision.copy()
rev.pop("revision_id", None)
rev.pop("added", None)
rev.pop("next", None)
rev.pop("prev", None)
rev["file"] = rev["file"].copy()
rev["file"].pop("name", None)
rev["attachments"] = [a.copy() for a in rev["attachments"]]
for a in rev["attachments"]:
a.pop("name", None)
return util.sha1(self._json_dumps(rev))
def _validate(response):
return (response.headers["content-length"] != "9" or
@ -214,48 +287,68 @@ def _validate(response):
class KemonopartyUserExtractor(KemonopartyExtractor):
"""Extractor for all posts from a kemono.party user listing"""
subcategory = "user"
pattern = USER_PATTERN + r"/?(?:\?o=(\d+))?(?:$|[?#])"
pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|[?#])"
example = "https://kemono.party/SERVICE/user/12345"
def __init__(self, match):
_, _, service, user_id, offset = match.groups()
_, _, service, user_id, self.query = match.groups()
self.subcategory = service
KemonopartyExtractor.__init__(self, match)
self.api_url = "{}/api/{}/user/{}".format(self.root, service, user_id)
self.api_url = "{}/api/v1/{}/user/{}".format(
self.root, service, user_id)
self.user_url = "{}/{}/user/{}".format(self.root, service, user_id)
self.offset = text.parse_int(offset)
def posts(self):
url = self.api_url
params = {"o": self.offset}
params = text.parse_query(self.query)
params["o"] = text.parse_int(params.get("o"))
while True:
posts = self.request(url, params=params).json()
yield from posts
cnt = len(posts)
if cnt < 25:
return
params["o"] += cnt
if self.revisions:
for post in posts:
post_url = "{}/post/{}".format(self.api_url, post["id"])
yield from self._revisions_post(post, post_url)
else:
yield from posts
if len(posts) < 50:
break
params["o"] += 50
class KemonopartyPostExtractor(KemonopartyExtractor):
"""Extractor for a single kemono.party post"""
subcategory = "post"
pattern = USER_PATTERN + r"/post/([^/?#]+)"
pattern = USER_PATTERN + r"/post/([^/?#]+)(/revisions?(?:/(\d*))?)?"
example = "https://kemono.party/SERVICE/user/12345/post/12345"
def __init__(self, match):
_, _, service, user_id, post_id = match.groups()
_, _, service, user_id, post_id, self.revision, self.revision_id = \
match.groups()
self.subcategory = service
KemonopartyExtractor.__init__(self, match)
self.api_url = "{}/api/{}/user/{}/post/{}".format(
self.api_url = "{}/api/v1/{}/user/{}/post/{}".format(
self.root, service, user_id, post_id)
self.user_url = "{}/{}/user/{}".format(self.root, service, user_id)
def posts(self):
posts = self.request(self.api_url).json()
return (posts[0],) if len(posts) > 1 else posts
if not self.revision:
post = self.request(self.api_url).json()
if self.revisions:
return self._revisions_post(post, self.api_url)
return (post,)
revs = self._revisions_all(self.api_url)
if not self.revision_id:
return revs
for rev in revs:
if str(rev["revision_id"]) == self.revision_id:
return (rev,)
raise exception.NotFoundError("revision")
class KemonopartyDiscordExtractor(KemonopartyExtractor):
@ -270,11 +363,29 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
def __init__(self, match):
KemonopartyExtractor.__init__(self, match)
_, _, self.server, self.channel, self.channel_name = match.groups()
_, _, self.server, self.channel_id, self.channel = match.groups()
self.channel_name = ""
def items(self):
self._prepare_ddosguard_cookies()
if self.channel_id:
self.channel_name = self.channel
else:
if self.channel.isdecimal() and len(self.channel) >= 16:
key = "id"
else:
key = "name"
for channel in self._discord_channels(self.server):
if channel[key] == self.channel:
break
else:
raise exception.NotFoundError("channel")
self.channel_id = channel["id"]
self.channel_name = channel["name"]
find_inline = re.compile(
r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)"
r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall
@ -298,8 +409,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
"name": path, "type": "inline", "hash": ""})
post["channel_name"] = self.channel_name
post["date"] = text.parse_datetime(
post["published"], "%a, %d %b %Y %H:%M:%S %Z")
post["date"] = self._parse_datetime(post["published"])
post["count"] = len(files)
yield Message.Directory, post
@ -319,27 +429,17 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
yield Message.Url, url, post
def posts(self):
if self.channel is None:
url = "{}/api/discord/channels/lookup?q={}".format(
self.root, self.server)
for channel in self.request(url).json():
if channel["name"] == self.channel_name:
self.channel = channel["id"]
break
else:
raise exception.NotFoundError("channel")
url = "{}/api/discord/channel/{}".format(self.root, self.channel)
params = {"skip": 0}
url = "{}/api/v1/discord/channel/{}".format(
self.root, self.channel_id)
params = {"o": 0}
while True:
posts = self.request(url, params=params).json()
yield from posts
cnt = len(posts)
if cnt < 25:
if len(posts) < 150:
break
params["skip"] += cnt
params["o"] += 150
class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
@ -352,11 +452,7 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
self.server = match.group(3)
def items(self):
url = "{}/api/discord/channels/lookup?q={}".format(
self.root, self.server)
channels = self.request(url).json()
for channel in channels:
for channel in self._discord_channels(self.server):
url = "{}/discord/server/{}/channel/{}#{}".format(
self.root, self.server, channel["id"], channel["name"])
channel["_extractor"] = KemonopartyDiscordExtractor

@ -6,19 +6,19 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://komikcast.site/"""
"""Extractors for https://komikcast.lol/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:site|me|com)"
BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:lol|site|me|com)"
class KomikcastBase():
"""Base class for komikcast extractors"""
category = "komikcast"
root = "https://komikcast.site"
root = "https://komikcast.lol"
@staticmethod
def parse_chapter_string(chapter_string, data=None):
@ -46,9 +46,9 @@ class KomikcastBase():
class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
"""Extractor for manga-chapters from komikcast.site"""
"""Extractor for manga-chapters from komikcast.lol"""
pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)"
example = "https://komikcast.site/chapter/TITLE/"
example = "https://komikcast.lol/chapter/TITLE/"
def metadata(self, page):
info = text.extr(page, "<title>", " - Komikcast<")
@ -65,10 +65,10 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
"""Extractor for manga from komikcast.site"""
"""Extractor for manga from komikcast.lol"""
chapterclass = KomikcastChapterExtractor
pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$"
example = "https://komikcast.site/komik/TITLE"
example = "https://komikcast.lol/komik/TITLE"
def chapters(self, page):
results = []
@ -76,8 +76,10 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
for item in text.extract_iter(
page, '<a class="chapter-link-item" href="', '</a'):
url, _, chapter_string = item.rpartition('">Chapter ')
self.parse_chapter_string(chapter_string, data)
url, _, chapter = item.rpartition('">Chapter')
chapter, sep, minor = chapter.strip().partition(".")
data["chapter"] = text.parse_int(chapter)
data["chapter_minor"] = sep + minor
results.append((url, data.copy()))
return results

@ -18,8 +18,8 @@ class LynxchanExtractor(BaseExtractor):
BASE_PATTERN = LynxchanExtractor.update({
"bbw-chan": {
"root": "https://bbw-chan.nl",
"pattern": r"bbw-chan\.nl",
"root": "https://bbw-chan.link",
"pattern": r"bbw-chan\.(?:link|nl)",
},
"kohlchan": {
"root": "https://kohlchan.net",
@ -40,7 +40,7 @@ class LynxchanThreadExtractor(LynxchanExtractor):
filename_fmt = "{postId}{num:?-//} {filename}.{extension}"
archive_fmt = "{boardUri}_{postId}_{num}"
pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
example = "https://bbw-chan.nl/a/res/12345.html"
example = "https://endchan.org/a/res/12345.html"
def __init__(self, match):
LynxchanExtractor.__init__(self, match)
@ -71,7 +71,7 @@ class LynxchanBoardExtractor(LynxchanExtractor):
"""Extractor for LynxChan boards"""
subcategory = "board"
pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)"
example = "https://bbw-chan.nl/a/"
example = "https://endchan.org/a/"
def __init__(self, match):
LynxchanExtractor.__init__(self, match)

@ -148,6 +148,32 @@ class MangadexFeedExtractor(MangadexExtractor):
return self.api.user_follows_manga_feed()
class MangadexListExtractor(MangadexExtractor):
"""Extractor for mangadex lists"""
subcategory = "list"
pattern = (BASE_PATTERN +
r"/list/([0-9a-f-]+)(?:/[^/?#]*)?(?:\?tab=(\w+))?")
example = ("https://mangadex.org/list"
"/01234567-89ab-cdef-0123-456789abcdef/NAME")
def __init__(self, match):
MangadexExtractor.__init__(self, match)
if match.group(2) == "feed":
self.subcategory = "list-feed"
else:
self.items = self._items_titles
def chapters(self):
return self.api.list_feed(self.uuid)
def _items_titles(self):
data = {"_extractor": MangadexMangaExtractor}
for item in self.api.list(self.uuid)["relationships"]:
if item["type"] == "manga":
url = "{}/title/{}".format(self.root, item["id"])
yield Message.Queue, url, data
class MangadexAPI():
"""Interface for the MangaDex API v5
@ -173,6 +199,12 @@ class MangadexAPI():
params = {"includes[]": ("scanlation_group",)}
return self._call("/chapter/" + uuid, params)["data"]
def list(self, uuid):
return self._call("/list/" + uuid)["data"]
def list_feed(self, uuid):
return self._pagination("/list/" + uuid + "/feed")
@memcache(keyarg=1)
def manga(self, uuid):
params = {"includes[]": ("artist", "author")}
@ -266,6 +298,6 @@ class MangadexAPI():
return
@cache(maxage=28*24*3600, keyarg=0)
@cache(maxage=28*86400, keyarg=0)
def _refresh_token_cache(username):
return None
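
Note: the new list extractor either walks the list's `/feed` for chapters or queues every title the list references. A rough sketch of the title lookup against the public MangaDex API, assuming the list is publicly visible; the endpoint base and response shape follow api.mangadex.org, not gallery-dl internals:

import requests

API = "https://api.mangadex.org"

def list_manga_ids(list_uuid):
    # Titles show up as relationships of type "manga" on the list resource.
    data = requests.get("{}/list/{}".format(API, list_uuid)).json()["data"]
    return [rel["id"] for rel in data["relationships"] if rel["type"] == "manga"]

# print(list_manga_ids("01234567-89ab-cdef-0123-456789abcdef"))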

@ -10,7 +10,11 @@ from .common import ChapterExtractor, MangaExtractor
from .. import text
import re
BASE_PATTERN = r"(?:https?://)?((?:chap|read|www\.|m\.)?mangan(?:at|el)o\.com)"
BASE_PATTERN = (
r"(?:https?://)?"
r"((?:chap|read|www\.|m\.)?mangan(?:at|el)o"
r"\.(?:to|com))"
)
class ManganeloBase():
@ -67,10 +71,11 @@ class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor):
def images(self, page):
page = text.extr(
page, 'class="container-chapter-reader', '\n<div')
page, 'class="container-chapter-reader', 'class="container')
return [
(url, None)
for url in text.extract_iter(page, '<img src="', '"')
if not url.endswith("/gohome.png")
] or [
(url, None)
for url in text.extract_iter(

@ -50,8 +50,8 @@ class MangareadChapterExtractor(MangareadBase, ChapterExtractor):
page = text.extr(
page, '<div class="reading-content">', '<div class="entry-header')
return [
(url.strip(), None)
for url in text.extract_iter(page, 'data-src="', '"')
(text.extr(img, 'src="', '"').strip(), None)
for img in text.extract_iter(page, '<img id="image-', '>')
]

@ -45,6 +45,9 @@ class MastodonExtractor(BaseExtractor):
attachments = status["media_attachments"]
del status["media_attachments"]
if status["reblog"]:
attachments.extend(status["reblog"]["media_attachments"])
status["instance"] = self.instance
acct = status["account"]["acct"]
status["instance_remote"] = \
@ -72,7 +75,7 @@ class MastodonExtractor(BaseExtractor):
account["acct"], account["moved"]["acct"])
INSTANCES = {
BASE_PATTERN = MastodonExtractor.update({
"mastodon.social": {
"root" : "https://mastodon.social",
"pattern" : r"mastodon\.social",
@ -97,9 +100,7 @@ INSTANCES = {
"client-id" : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o",
"client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY",
}
}
BASE_PATTERN = MastodonExtractor.update(INSTANCES) + "(?:/web)?"
}) + "(?:/web)?"
class MastodonUserExtractor(MastodonExtractor):
@ -113,7 +114,10 @@ class MastodonUserExtractor(MastodonExtractor):
return api.account_statuses(
api.account_id_by_username(self.item),
only_media=not self.config("text-posts", False),
only_media=(
not self.reblogs and
not self.config("text-posts", False)
),
exclude_replies=not self.replies,
)
@ -146,7 +150,7 @@ class MastodonFollowingExtractor(MastodonExtractor):
class MastodonStatusExtractor(MastodonExtractor):
"""Extractor for images from a status"""
subcategory = "status"
pattern = BASE_PATTERN + r"/@[^/?#]+/(\d+)"
pattern = BASE_PATTERN + r"/@[^/?#]+/(?!following)([^/?#]+)"
example = "https://mastodon.social/@USER/12345"
def statuses(self):
@ -168,10 +172,8 @@ class MastodonAPI():
if access_token is None or access_token == "cache":
access_token = _access_token_cache(extractor.instance)
if not access_token:
try:
access_token = INSTANCES[extractor.category]["access-token"]
except (KeyError, TypeError):
pass
access_token = extractor.config_instance("access-token")
if access_token:
self.headers = {"Authorization": "Bearer " + access_token}
else:
@ -271,6 +273,6 @@ class MastodonAPI():
params = None
@cache(maxage=100*365*24*3600, keyarg=0)
@cache(maxage=36500*86400, keyarg=0)
def _access_token_cache(instance):
return None

@ -70,6 +70,10 @@ BASE_PATTERN = MisskeyExtractor.update({
"root": "https://misskey.io",
"pattern": r"misskey\.io",
},
"misskey.design": {
"root": "https://misskey.design",
"pattern": r"misskey\.design",
},
"lesbian.energy": {
"root": "https://lesbian.energy",
"pattern": r"lesbian\.energy",

@ -124,6 +124,11 @@ class MoebooruPoolExtractor(MoebooruExtractor):
self.pool_id = match.group(match.lastindex)
def metadata(self):
if self.config("metadata"):
url = "{}/pool/show/{}.json".format(self.root, self.pool_id)
pool = self.request(url).json()
pool.pop("posts", None)
return {"pool": pool}
return {"pool": text.parse_int(self.pool_id)}
def posts(self):
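
Note: with the `metadata` option enabled, the pool extractor now fetches the pool's JSON description and strips the embedded post list before attaching it. Roughly the same call outside gallery-dl, where `root` is whichever Moebooru instance is being used and the field names are the usual pool attributes, not guaranteed here:

import requests

def pool_metadata(root, pool_id):
    # /pool/show/{id}.json includes the pool's posts; drop them and keep
    # only the descriptive fields (name, description, post_count, ...).
    pool = requests.get("{}/pool/show/{}.json".format(root, pool_id)).json()
    pool.pop("posts", None)
    return pool

# pool_metadata("https://yande.re", 12345)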

@ -16,12 +16,12 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
root = "https://myhentaigallery.com"
directory_fmt = ("{category}", "{gallery_id} {artist:?[/] /J, }{title}")
pattern = (r"(?:https?://)?myhentaigallery\.com"
r"/gallery/(?:thumbnails|show)/(\d+)")
example = "https://myhentaigallery.com/gallery/thumbnails/12345"
r"/g(?:allery/(?:thumbnails|show))?/(\d+)")
example = "https://myhentaigallery.com/g/12345"
def __init__(self, match):
self.gallery_id = match.group(1)
url = "{}/gallery/thumbnails/{}".format(self.root, self.gallery_id)
url = "{}/g/{}".format(self.root, self.gallery_id)
GalleryExtractor.__init__(self, match, url)
def _init(self):

@ -46,7 +46,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
"episode" : self.episode,
"comic" : extr('titleName: "', '"'),
"tags" : [t.strip() for t in text.extract_iter(
extr("tagList: [", "}],"), '"tagName":"', '"')],
extr("tagList: [", "],"), '"tagName":"', '"')],
"title" : extr('"subtitle":"', '"'),
"author" : [a.strip() for a in text.extract_iter(
extr('"writers":[', ']'), '"name":"', '"')],

@ -23,7 +23,7 @@ class NewgroundsExtractor(Extractor):
root = "https://www.newgrounds.com"
cookies_domain = ".newgrounds.com"
cookies_names = ("NG_GG_username", "vmk1du5I8m")
request_interval = 1.0
request_interval = (0.5, 1.5)
def __init__(self, match):
Extractor.__init__(self, match)
@ -54,14 +54,31 @@ class NewgroundsExtractor(Extractor):
if metadata:
post.update(metadata)
yield Message.Directory, post
post["num"] = 0
yield Message.Url, url, text.nameext_from_url(url, post)
for num, url in enumerate(text.extract_iter(
post["_comment"], 'data-smartload-src="', '"'), 1):
post["num"] = num
post["_index"] = "{}_{:>02}".format(post["index"], num)
if "_multi" in post:
for data in post["_multi"]:
post["num"] += 1
post["_index"] = "{}_{:>02}".format(
post["index"], post["num"])
post.update(data)
url = data["image"]
text.nameext_from_url(url, post)
yield Message.Url, url, post
if "_fallback" in post:
del post["_fallback"]
for url in text.extract_iter(
post["_comment"], 'data-smartload-src="', '"'):
post["num"] += 1
post["_index"] = "{}_{:>02}".format(
post["index"], post["num"])
url = text.ensure_http_scheme(url)
yield Message.Url, url, text.nameext_from_url(url, post)
text.nameext_from_url(url, post)
yield Message.Url, url, post
else:
self.log.warning(
"Unable to get download URL for '%s'", post_url)
@ -81,7 +98,7 @@ class NewgroundsExtractor(Extractor):
if username:
self.cookies_update(self._login_impl(username, password))
@cache(maxage=360*24*3600, keyarg=1)
@cache(maxage=365*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
@ -153,8 +170,7 @@ class NewgroundsExtractor(Extractor):
data["post_url"] = post_url
return data
@staticmethod
def _extract_image_data(extr, url):
def _extract_image_data(self, extr, url):
full = text.extract_from(util.json_loads(extr(
'"full_image_text":', '});')))
data = {
@ -172,8 +188,34 @@ class NewgroundsExtractor(Extractor):
index = data["url"].rpartition("/")[2].partition("_")[0]
data["index"] = text.parse_int(index)
data["_index"] = index
image_data = extr("let imageData =", "\n];")
if image_data:
data["_multi"] = self._extract_images_multi(image_data)
else:
art_images = extr('<div class="art-images', '\n</div>')
if art_images:
data["_multi"] = self._extract_images_art(art_images, data)
return data
def _extract_images_multi(self, html):
data = util.json_loads(html + "]")
yield from data[1:]
def _extract_images_art(self, html, data):
ext = text.ext_from_url(data["url"])
for url in text.extract_iter(html, 'data-smartload-src="', '"'):
url = text.ensure_http_scheme(url)
url = url.replace("/medium_views/", "/images/", 1)
if text.ext_from_url(url) == "webp":
yield {
"image" : url.replace(".webp", "." + ext),
"_fallback": (url,),
}
else:
yield {"image": url}
@staticmethod
def _extract_audio_data(extr, url):
index = url.split("/")[5]
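
Note: `_extract_images_art` above maps each `data-smartload-src` thumbnail from the `medium_views` directory to `images` and, for `.webp` thumbnails, requests the post's original extension first while keeping the webp URL as a fallback. The URL rewriting in isolation; the example URL is invented, not a real Newgrounds path:

def art_image_candidates(thumb_url, original_ext):
    # Swap the thumbnail directory for the full-size one; prefer the original
    # extension and keep the .webp URL around as a fallback.
    url = thumb_url.replace("/medium_views/", "/images/", 1)
    if url.endswith(".webp"):
        return {"image": url[:-5] + "." + original_ext, "_fallback": (url,)}
    return {"image": url}

thumb = "https://art.example.net/medium_views/123/picture.webp"
print(art_image_candidates(thumb, "png"))
# {'image': 'https://art.example.net/images/123/picture.png',
#  '_fallback': ('https://art.example.net/images/123/picture.webp',)}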

@ -19,6 +19,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
directory_fmt = ("{category}", "{user_id}")
filename_fmt = "{image_id}_p{num}.{extension}"
archive_fmt = "{image_id}_{num}"
request_interval = (1.0, 2.0)
def __init__(self, match):
BaseExtractor.__init__(self, match)
@ -54,9 +55,16 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
else:
data["user_id"] = data["artist_id"]
data["user_name"] = data["artist_name"]
yield Message.Directory, data
for image in self._extract_images(page):
urls = list(self._extract_images(image_id, page))
data["count"] = len(urls)
yield Message.Directory, data
for num, url in enumerate(urls):
image = text.nameext_from_url(url, {
"num": num,
"url": "https:" + url,
})
image.update(data)
if not image["extension"]:
image["extension"] = "jpg"
@ -71,7 +79,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
extr = text.extract_from(page)
keywords = text.unescape(extr(
'name="keywords" content="', '" />')).split(",")
data = {
return {
"title" : keywords[0].strip(),
"description": text.unescape(extr(
'"description": "', '"').replace("&amp;", "&")),
@ -81,7 +89,6 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
"artist_name": keywords[1],
"tags" : keywords[2:-1],
}
return data
@staticmethod
def _extract_data_horne(page):
@ -89,7 +96,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
extr = text.extract_from(page)
keywords = text.unescape(extr(
'name="keywords" content="', '" />')).split(",")
data = {
return {
"title" : keywords[0].strip(),
"description": text.unescape(extr(
'property="og:description" content="', '"')),
@ -100,21 +107,17 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
"itemprop='datePublished' content=", "<").rpartition(">")[2],
"%Y-%m-%d %H:%M:%S", 9),
}
return data
@staticmethod
def _extract_images(page):
"""Extract image URLs from 'page'"""
images = text.extract_iter(page, "/view_popup.php", "</a>")
for num, image in enumerate(images):
src = text.extr(image, 'src="', '"')
if not src:
continue
url = ("https:" + src).replace("/__rs_l120x120/", "/")
yield text.nameext_from_url(url, {
"num": num,
"url": url,
})
def _extract_images(self, image_id, page):
if '&#diff_1" ' in page:
# multiple images
url = "{}/view_popup.php?id={}".format(self.root, image_id)
page = self.request(url).text
yield from text.extract_iter(
page, 'href="javascript:void(0);"><img src="', '"')
else:
pos = page.find('id="view-center"') + 1
yield text.extract(page, 'itemprop="image" src="', '"', pos)[0]
@staticmethod
def _extract_user_name(page):
@ -125,15 +128,15 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
return
username, password = self._get_auth_info()
self.cookies_update(self._login_impl(username, password))
if username:
return self.cookies_update(self._login_impl(username, password))
@cache(maxage=90*24*3600, keyarg=1)
def _login_impl(self, username, password):
if not username or not password:
raise exception.AuthenticationError(
"Username and password required")
raise exception.AuthenticationError("Username and password required")
@cache(maxage=90*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
url = "{}/login_int.php".format(self.root)
data = {"email": username, "password": password, "save": "on"}

@ -96,6 +96,8 @@ class NitterExtractor(BaseExtractor):
for url in text.extract_iter(
attachments, '<source src="', '"'):
if url[0] == "/":
url = self.root + url
append(text.nameext_from_url(url, {"url": url}))
else:
@ -233,10 +235,6 @@ BASE_PATTERN = NitterExtractor.update({
"root": "https://nitter.net",
"pattern": r"nitter\.net",
},
"nitter.lacontrevoie.fr": {
"root": "https://nitter.lacontrevoie.fr",
"pattern": r"nitter\.lacontrevoie\.fr",
},
"nitter.1d4.us": {
"root": "https://nitter.1d4.us",
"pattern": r"nitter\.1d4\.us",

@ -20,6 +20,7 @@ class NsfwalbumAlbumExtractor(GalleryExtractor):
filename_fmt = "{album_id}_{num:>03}_{id}.{extension}"
directory_fmt = ("{category}", "{album_id} {title}")
archive_fmt = "{id}"
referer = False
pattern = r"(?:https?://)?(?:www\.)?nsfwalbum\.com(/album/(\d+))"
example = "https://nsfwalbum.com/album/12345"
@ -71,8 +72,8 @@ class NsfwalbumAlbumExtractor(GalleryExtractor):
@staticmethod
def _validate_response(response):
return not response.request.url.endswith(
("/no_image.jpg", "/placeholder.png"))
return not response.url.endswith(
("/no_image.jpg", "/placeholder.png", "/error.jpg"))
@staticmethod
def _annihilate(value, base=6):

@ -1,87 +0,0 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://nudecollect.com/"""
from .common import GalleryExtractor
from .. import text
class NudecollectExtractor(GalleryExtractor):
"""Base class for Nudecollect extractors"""
category = "nudecollect"
directory_fmt = ("{category}", "{title}")
filename_fmt = "{slug}_{num:>03}.{extension}"
archive_fmt = "{slug}_{num}"
root = "https://www.nudecollect.com"
def request(self, url, **kwargs):
kwargs["allow_redirects"] = False
return GalleryExtractor.request(self, url, **kwargs)
@staticmethod
def get_title(page):
return text.unescape(text.extr(page, "<title>", "</title>"))[31:]
@staticmethod
def get_image(page):
return text.extr(page, '<img src="', '"')
class NudecollectImageExtractor(NudecollectExtractor):
"""Extractor for individual images from nudecollect.com"""
subcategory = "image"
pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com"
r"(/content/([^/?#]+)/image-(\d+)-pics-(\d+)"
r"-mirror-(\d+)\.html)")
example = ("https://www.nudecollect.com/content/12345_TITLE"
"/image-1-pics-108-mirror-1.html")
def __init__(self, match):
NudecollectExtractor.__init__(self, match)
_, self.slug, self.num, self.count, self.mirror = match.groups()
def metadata(self, page):
return {
"slug" : self.slug,
"title" : self.get_title(page),
"count" : text.parse_int(self.count),
"mirror": text.parse_int(self.mirror),
}
def images(self, page):
return ((self.get_image(page), {"num": text.parse_int(self.num)}),)
class NudecollectAlbumExtractor(NudecollectExtractor):
"""Extractor for image albums on nudecollect.com"""
subcategory = "album"
pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com"
r"/content/([^/?#]+)/(?:index-mirror-(\d+)-(\d+)"
r"|page-\d+-pics-(\d+)-mirror-(\d+))\.html")
example = ("https://www.nudecollect.com/content/12345_TITLE"
"/index-mirror-01-123.html")
def __init__(self, match):
self.slug = match.group(1)
self.mirror = match.group(2) or match.group(5)
self.count = text.parse_int(match.group(3) or match.group(4))
url = "{}/content/{}/image-1-pics-{}-mirror-{}.html".format(
self.root, self.slug, self.count, self.mirror)
NudecollectExtractor.__init__(self, match, url)
def metadata(self, page):
return {
"slug" : self.slug,
"title" : self.get_title(page),
"mirror": text.parse_int(self.mirror),
}
def images(self, page):
url = self.get_image(page)
p1, _, p2 = url.partition("/image0")
ufmt = p1 + "/image{:>05}" + p2[4:]
return [(ufmt.format(num), None) for num in range(1, self.count + 1)]

@ -11,7 +11,7 @@
from .common import Extractor, Message
from .. import text, oauth, util, config, exception
from ..output import stdout_write
from ..cache import cache
from ..cache import cache, memcache
import urllib.parse
import binascii
import hashlib
@ -31,6 +31,9 @@ class OAuthBase(Extractor):
def _init(self):
self.cache = config.get(("extractor", self.category), "cache", True)
if self.cache and cache is memcache:
self.log.warning("cache file is not writeable")
self.cache = False
def oauth_config(self, key, default=None):
value = config.interpolate(("extractor", self.subcategory), key)
@ -180,7 +183,7 @@ class OAuthBase(Extractor):
}
if auth:
auth = (client_id, client_secret)
auth = util.HTTPBasicAuth(client_id, client_secret)
else:
auth = None
data["client_id"] = client_id
@ -355,8 +358,8 @@ class OAuthMastodon(OAuthBase):
yield Message.Version, 1
from . import mastodon
for application in mastodon.INSTANCES.values():
if self.instance == application["root"].partition("://")[2]:
for _, root, application in mastodon.MastodonExtractor.instances:
if self.instance == root.partition("://")[2]:
break
else:
application = self._register(self.instance)
@ -373,7 +376,7 @@ class OAuthMastodon(OAuthBase):
cache=mastodon._access_token_cache,
)
@cache(maxage=10*365*24*3600, keyarg=1)
@cache(maxage=36500*86400, keyarg=1)
def _register(self, instance):
self.log.info("Registering application for '%s'", instance)

@ -32,7 +32,7 @@ class PahealExtractor(Extractor):
post["tags"] = text.unquote(post["tags"])
post.update(data)
yield Message.Directory, post
yield Message.Url, url, text.nameext_from_url(url, post)
yield Message.Url, url, post
def get_metadata(self):
"""Return general metadata"""
@ -56,14 +56,16 @@ class PahealExtractor(Extractor):
"date" : text.parse_datetime(
extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"),
"source" : text.unescape(text.extr(
extr(">Source&nbsp;Link<", "</td>"), "href='", "'")),
extr(">Source Link<", "</td>"), "href='", "'")),
}
dimensions, size, ext = extr("Info</th><td>", ">").split(" // ")
post["width"], _, height = dimensions.partition("x")
dimensions, size, ext = extr("Info</th><td>", "<").split(" // ")
post["size"] = text.parse_bytes(size[:-1])
post["width"], _, height = dimensions.partition("x")
post["height"], _, duration = height.partition(", ")
post["duration"] = text.parse_float(duration[:-1])
post["filename"] = "{} - {}".format(post_id, post["tags"])
post["extension"] = ext
return post
@ -112,6 +114,7 @@ class PahealTagExtractor(PahealExtractor):
tags, data, date = data.split("\n")
dimensions, size, ext = data.split(" // ")
tags = text.unescape(tags)
width, _, height = dimensions.partition("x")
height, _, duration = height.partition(", ")
@ -119,9 +122,11 @@ class PahealTagExtractor(PahealExtractor):
"id": pid, "md5": md5, "file_url": url,
"width": width, "height": height,
"duration": text.parse_float(duration[:-1]),
"tags": text.unescape(tags),
"tags": tags,
"size": text.parse_bytes(size[:-1]),
"date": text.parse_datetime(date, "%B %d, %Y; %H:%M"),
"filename" : "{} - {}".format(pid, tags),
"extension": ext,
}
def _extract_data_ex(self, post):
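
Note: the reworked Paheal metadata code splits the `Info` cell into `dimensions // size // extension`, pulls width, height and an optional duration out of the first part, and names the file `"<id> - <tags>"`. The string handling on its own; the sample value is made up but follows that shape, and size parsing is left as the raw string here:

def parse_info(info, post_id, tags):
    # e.g. "800x600, 5.2s // 312.5KB // webm"  or  "800x600 // 312.5KB // jpg"
    dimensions, size, ext = info.split(" // ")
    width, _, rest = dimensions.partition("x")
    height, _, duration = rest.partition(", ")
    return {
        "width"    : int(width),
        "height"   : int(height),
        "duration" : float(duration[:-1]) if duration else 0.0,
        "size"     : size,
        "filename" : "{} - {}".format(post_id, tags),
        "extension": ext,
    }

print(parse_info("800x600, 5.2s // 312.5KB // webm", 12345, "tag_a tag_b"))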

@ -52,19 +52,29 @@ class PatreonExtractor(Extractor):
post["hash"] = fhash
post["type"] = kind
post["num"] += 1
yield Message.Url, url, text.nameext_from_url(name, post)
text.nameext_from_url(name, post)
if text.ext_from_url(url) == "m3u8":
url = "ytdl:" + url
post["extension"] = "mp4"
yield Message.Url, url, post
else:
self.log.debug("skipping %s (%s %s)", url, fhash, kind)
@staticmethod
def _postfile(post):
def _postfile(self, post):
postfile = post.get("post_file")
if postfile:
return (("postfile", postfile["url"], postfile["name"]),)
url = postfile["url"]
name = postfile.get("name")
if not name:
if url.startswith("https://stream.mux.com/"):
name = url
else:
name = self._filename(url) or url
return (("postfile", url, name),)
return ()
def _images(self, post):
for image in post["images"]:
for image in post.get("images") or ():
url = image.get("download_url")
if url:
name = image.get("file_name") or self._filename(url) or url
@ -80,7 +90,7 @@ class PatreonExtractor(Extractor):
return ()
def _attachments(self, post):
for attachment in post["attachments"]:
for attachment in post.get("attachments") or ():
url = self.request(
attachment["url"], method="HEAD",
allow_redirects=False, fatal=False,
@ -249,8 +259,39 @@ class PatreonExtractor(Extractor):
return [genmap[ft] for ft in filetypes]
def _extract_bootstrap(self, page):
return util.json_loads(text.extr(
page, "window.patreon.bootstrap,", "});") + "}")
data = text.extr(
page, 'id="__NEXT_DATA__" type="application/json">', '</script')
if data:
try:
return (util.json_loads(data)["props"]["pageProps"]
["bootstrapEnvelope"]["bootstrap"])
except Exception as exc:
self.log.debug("%s: %s", exc.__class__.__name__, exc)
bootstrap = text.extr(
page, 'window.patreon = {"bootstrap":', '},"apiServer"')
if bootstrap:
return util.json_loads(bootstrap + "}")
bootstrap = text.extr(
page,
'window.patreon = wrapInProxy({"bootstrap":',
'},"apiServer"')
if bootstrap:
return util.json_loads(bootstrap + "}")
bootstrap = text.extr(page, "window.patreon.bootstrap,", "});")
if bootstrap:
return util.json_loads(bootstrap + "}")
data = text.extr(page, "window.patreon = {", "};\n")
if data:
try:
return util.json_loads("{" + data + "}")["bootstrap"]
except Exception:
pass
raise exception.StopExtraction("Unable to extract bootstrap data")
class PatreonCreatorExtractor(PatreonExtractor):
@ -267,34 +308,52 @@ class PatreonCreatorExtractor(PatreonExtractor):
def posts(self):
query = text.parse_query(self.query)
campaign_id = self._get_campaign_id(query)
filters = self._get_filters(query)
self.log.debug("campaign_id: %s", campaign_id)
url = self._build_url("posts", (
"&filter[campaign_id]=" + campaign_id +
"&filter[contains_exclusive_posts]=true"
"&filter[is_draft]=false" + filters +
"&sort=" + query.get("sort", "-published_at")
))
return self._pagination(url)
creator_id = query.get("u")
if creator_id:
url = "{}/user/posts?u={}".format(self.root, creator_id)
def _get_campaign_id(self, query):
if self.creator.startswith("id:"):
return self.creator[3:]
campaign_id = query.get("c") or query.get("campaign_id")
if campaign_id:
return campaign_id
user_id = query.get("u")
if user_id:
url = "{}/user/posts?u={}".format(self.root, user_id)
else:
url = "{}/{}/posts".format(self.root, self.creator)
page = self.request(url, notfound="creator").text
try:
data = None
data = self._extract_bootstrap(page)
campaign_id = data["creator"]["data"]["id"]
except (KeyError, ValueError):
raise exception.NotFoundError("creator")
filters = "".join(
return data["campaign"]["data"]["id"]
except (KeyError, ValueError) as exc:
if data:
self.log.debug(data)
raise exception.StopExtraction(
"Unable to extract campaign ID (%s: %s)",
exc.__class__.__name__, exc)
def _get_filters(self, query):
return "".join(
"&filter[{}={}".format(key[8:], text.escape(value))
for key, value in query.items()
if key.startswith("filters[")
)
url = self._build_url("posts", (
"&filter[campaign_id]=" + campaign_id +
"&filter[contains_exclusive_posts]=true"
"&filter[is_draft]=false" + filters +
"&sort=" + query.get("sort", "-published_at")
))
return self._pagination(url)
class PatreonUserExtractor(PatreonExtractor):
"""Extractor for media from creators supported by you"""

@ -18,7 +18,7 @@ class PhilomenaExtractor(BooruExtractor):
basecategory = "philomena"
filename_fmt = "{filename}.{extension}"
archive_fmt = "{id}"
request_interval = 1.0
request_interval = (0.5, 1.5)
page_start = 1
per_page = 50
@ -32,7 +32,7 @@ class PhilomenaExtractor(BooruExtractor):
post["date"] = text.parse_datetime(post["created_at"])
INSTANCES = {
BASE_PATTERN = PhilomenaExtractor.update({
"derpibooru": {
"root": "https://derpibooru.org",
"pattern": r"(?:www\.)?derpibooru\.org",
@ -48,9 +48,7 @@ INSTANCES = {
"pattern": r"furbooru\.org",
"filter_id": "2",
},
}
BASE_PATTERN = PhilomenaExtractor.update(INSTANCES)
})
class PhilomenaPostExtractor(PhilomenaExtractor):
@ -176,10 +174,7 @@ class PhilomenaAPI():
if filter_id:
params["filter_id"] = filter_id
elif not api_key:
try:
params["filter_id"] = INSTANCES[extr.category]["filter_id"]
except (KeyError, TypeError):
params["filter_id"] = "2"
params["filter_id"] = extr.config_instance("filter_id") or "2"
params["page"] = extr.page_start
params["per_page"] = extr.per_page

@ -56,7 +56,7 @@ class PillowfortExtractor(Extractor):
post["num"] = 0
for file in files:
url = file["url"]
url = file["url"] or file.get("b2_lg_url")
if not url:
continue
@ -91,7 +91,7 @@ class PillowfortExtractor(Extractor):
if username:
self.cookies_update(self._login_impl(username, password))
@cache(maxage=14*24*3600, keyarg=1)
@cache(maxage=14*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
@ -132,7 +132,7 @@ class PillowfortPostExtractor(PillowfortExtractor):
class PillowfortUserExtractor(PillowfortExtractor):
"""Extractor for all posts of a pillowfort user"""
subcategory = "user"
pattern = BASE_PATTERN + r"/(?!posts/)([^/?#]+)"
pattern = BASE_PATTERN + r"/(?!posts/)([^/?#]+(?:/tagged/[^/?#]+)?)"
example = "https://www.pillowfort.social/USER"
def posts(self):

@ -10,7 +10,6 @@
from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
import itertools
BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+"
@ -33,7 +32,6 @@ class PinterestExtractor(Extractor):
self.api = PinterestAPI(self)
def items(self):
self.api.login()
data = self.metadata()
videos = self.config("videos", True)
@ -49,6 +47,7 @@ class PinterestExtractor(Extractor):
carousel_data = pin.get("carousel_data")
if carousel_data:
pin["count"] = len(carousel_data["carousel_slots"])
for num, slot in enumerate(carousel_data["carousel_slots"], 1):
slot["media_id"] = slot.pop("id")
pin.update(slot)
@ -67,7 +66,7 @@ class PinterestExtractor(Extractor):
if videos or media.get("duration") is None:
pin.update(media)
pin["num"] = 0
pin["num"] = pin["count"] = 1
pin["media_id"] = ""
url = media["url"]
@ -416,41 +415,6 @@ class PinterestAPI():
options = {"query": query, "scope": "pins", "rs": "typed"}
return self._pagination("BaseSearch", options)
def login(self):
"""Login and obtain session cookies"""
username, password = self.extractor._get_auth_info()
if username:
self.cookies.update(self._login_impl(username, password))
@cache(maxage=180*24*3600, keyarg=1)
def _login_impl(self, username, password):
self.extractor.log.info("Logging in as %s", username)
url = self.root + "/resource/UserSessionResource/create/"
options = {
"username_or_email": username,
"password" : password,
}
data = {
"data" : util.json_dumps({"options": options}),
"source_url": "",
}
try:
response = self.extractor.request(
url, method="POST", headers=self.headers,
cookies=self.cookies, data=data)
resource = response.json()["resource_response"]
except (exception.HttpError, ValueError, KeyError):
raise exception.AuthenticationError()
if resource["status"] != "success":
raise exception.AuthenticationError()
return {
cookie.name: cookie.value
for cookie in response.cookies
}
def _call(self, resource, options):
url = "{}/resource/{}Resource/get/".format(self.root, resource)
params = {

@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://pixeldrain.com/"""
from .common import Extractor, Message
from .. import text, util
BASE_PATTERN = r"(?:https?://)?pixeldrain\.com"
class PixeldrainExtractor(Extractor):
"""Base class for pixeldrain extractors"""
category = "pixeldrain"
root = "https://pixeldrain.com"
archive_fmt = "{id}"
def _init(self):
api_key = self.config("api-key")
if api_key:
self.session.auth = util.HTTPBasicAuth("", api_key)
def parse_datetime(self, date_string):
return text.parse_datetime(
date_string, "%Y-%m-%dT%H:%M:%S.%fZ")
class PixeldrainFileExtractor(PixeldrainExtractor):
"""Extractor for pixeldrain files"""
subcategory = "file"
filename_fmt = "{filename[:230]} ({id}).{extension}"
pattern = BASE_PATTERN + r"/(?:u|api/file)/(\w+)"
example = "https://pixeldrain.com/u/abcdefgh"
def __init__(self, match):
Extractor.__init__(self, match)
self.file_id = match.group(1)
def items(self):
url = "{}/api/file/{}".format(self.root, self.file_id)
file = self.request(url + "/info").json()
file["url"] = url + "?download"
file["date"] = self.parse_datetime(file["date_upload"])
text.nameext_from_url(file["name"], file)
yield Message.Directory, file
yield Message.Url, file["url"], file
class PixeldrainAlbumExtractor(PixeldrainExtractor):
"""Extractor for pixeldrain albums"""
subcategory = "album"
directory_fmt = ("{category}",
"{album[date]:%Y-%m-%d} {album[title]} ({album[id]})")
filename_fmt = "{num:>03} {filename[:230]} ({id}).{extension}"
pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)"
example = "https://pixeldrain.com/l/abcdefgh"
def __init__(self, match):
Extractor.__init__(self, match)
self.album_id = match.group(1)
def items(self):
url = "{}/api/list/{}".format(self.root, self.album_id)
album = self.request(url).json()
files = album["files"]
album["count"] = album["file_count"]
album["date"] = self.parse_datetime(album["date_created"])
del album["files"]
del album["file_count"]
yield Message.Directory, {"album": album}
for num, file in enumerate(files, 1):
file["album"] = album
file["num"] = num
file["url"] = url = "{}/api/file/{}?download".format(
self.root, file["id"])
file["date"] = self.parse_datetime(file["date_upload"])
text.nameext_from_url(file["name"], file)
yield Message.Url, url, file
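
Note: the new pixeldrain extractor authenticates by passing the API key as the password of an empty-username HTTP Basic auth pair and reads file metadata from `/api/file/{id}/info` (lists from `/api/list/{id}`). A quick requests-based sketch of the same call; the endpoint paths come from the code above, while the file id and key handling are illustrative:

import requests

API_ROOT = "https://pixeldrain.com/api"

def file_info(file_id, api_key=None):
    # Pixeldrain expects the API key as the Basic-auth password with an
    # empty username (mirrors util.HTTPBasicAuth("", api_key) above).
    auth = ("", api_key) if api_key else None
    resp = requests.get("{}/file/{}/info".format(API_ROOT, file_id), auth=auth)
    resp.raise_for_status()
    info = resp.json()
    info["url"] = "{}/file/{}?download".format(API_ROOT, file_id)
    return info

# info = file_info("abcdefgh")
# print(info["name"], info["date_upload"], info["url"])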

@ -517,6 +517,7 @@ class PixivPixivisionExtractor(PixivExtractor):
directory_fmt = ("{category}", "pixivision",
"{pixivision_id} {pixivision_title}")
archive_fmt = "V{pixivision_id}_{id}{suffix}.{extension}"
cookies_domain = ".pixiv.net"
pattern = r"(?:https?://)?(?:www\.)?pixivision\.net/(?:en/)?a/(\d+)"
example = "https://www.pixivision.net/en/a/12345"
@ -549,6 +550,9 @@ class PixivSeriesExtractor(PixivExtractor):
directory_fmt = ("{category}", "{user[id]} {user[account]}",
"{series[id]} {series[title]}")
filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}"
cookies_domain = ".pixiv.net"
browser = "firefox"
tls12 = False
pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)"
example = "https://www.pixiv.net/user/12345/series/12345"
@ -590,7 +594,7 @@ class PixivSeriesExtractor(PixivExtractor):
class PixivNovelExtractor(PixivExtractor):
"""Extractor for pixiv novels"""
subcategory = "novel"
request_interval = 1.0
request_interval = (0.5, 1.5)
pattern = BASE_PATTERN + r"/n(?:ovel/show\.php\?id=|/)(\d+)"
example = "https://www.pixiv.net/novel/show.php?id=12345"
@ -822,9 +826,9 @@ class PixivAppAPI():
extractor.session.headers.update({
"App-OS" : "ios",
"App-OS-Version": "13.1.2",
"App-Version" : "7.7.6",
"User-Agent" : "PixivIOSApp/7.7.6 (iOS 13.1.2; iPhone11,8)",
"App-OS-Version": "16.7.2",
"App-Version" : "7.19.1",
"User-Agent" : "PixivIOSApp/7.19.1 (iOS 16.7.2; iPhone12,8)",
"Referer" : "https://app-api.pixiv.net/",
})
@ -992,6 +996,6 @@ class PixivAppAPI():
params = text.parse_query(query)
@cache(maxage=10*365*24*3600, keyarg=0)
@cache(maxage=36500*86400, keyarg=0)
def _refresh_token_cache(username):
return None

@ -18,7 +18,7 @@ class PlurkExtractor(Extractor):
"""Base class for plurk extractors"""
category = "plurk"
root = "https://www.plurk.com"
request_interval = 1.0
request_interval = (0.5, 1.5)
def items(self):
urls = self._urls_ex if self.config("comments", False) else self._urls

@ -0,0 +1,138 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for http://www.poringa.net/"""
from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
import itertools
BASE_PATTERN = r"(?:https?://)?(?:www\.)?poringa\.net"
class PoringaExtractor(Extractor):
category = "poringa"
directory_fmt = ("{category}", "{user}", "{post_id}")
filename_fmt = "{post_id}_{title}_{num:>03}_{filename}.{extension}"
archive_fmt = "{post_id}_{num}"
root = "http://www.poringa.net"
def __init__(self, match):
Extractor.__init__(self, match)
self.item = match.group(1)
self.__cookies = True
def items(self):
for post_id in self.posts():
url = "{}/posts/imagenes/{}".format(self.root, post_id)
try:
response = self.request(url)
except exception.HttpError as exc:
self.log.warning(
"Unable to fetch posts for '%s' (%s)", post_id, exc)
continue
if "/registro-login?" in response.url:
self.log.warning("Private post '%s'", post_id)
continue
page = response.text
title, pos = text.extract(
page, 'property="og:title" content="', '"')
try:
pos = page.index('<div class="main-info', pos)
user, pos = text.extract(
page, 'href="http://www.poringa.net/', '"', pos)
except ValueError:
user = None
if not user:
user = "poringa"
data = {
"post_id" : post_id,
"title" : text.unescape(title),
"user" : text.unquote(user),
"_http_headers": {"Referer": url},
}
main_post = text.extr(
page, 'property="dc:content" role="main">', '</div>')
urls = list(text.extract_iter(
main_post, '<img class="imagen" border="0" src="', '"'))
data["count"] = len(urls)
yield Message.Directory, data
for data["num"], url in enumerate(urls, 1):
yield Message.Url, url, text.nameext_from_url(url, data)
def posts(self):
return ()
def request(self, url, **kwargs):
if self.__cookies:
self.__cookies = False
self.cookies_update(_cookie_cache())
for _ in range(5):
response = Extractor.request(self, url, **kwargs)
if response.cookies:
_cookie_cache.update("", response.cookies)
if response.content.find(
b"<title>Please wait a few moments</title>", 0, 600) < 0:
return response
self.sleep(5.0, "check")
def _pagination(self, url, params):
for params["p"] in itertools.count(1):
page = self.request(url, params=params).text
posts_ids = PoringaPostExtractor.pattern.findall(page)
posts_ids = list(dict.fromkeys(posts_ids))
yield from posts_ids
if len(posts_ids) < 19:
return
class PoringaPostExtractor(PoringaExtractor):
"""Extractor for posts on poringa.net"""
subcategory = "post"
pattern = BASE_PATTERN + r"/posts/imagenes/(\d+)"
example = "http://www.poringa.net/posts/imagenes/12345/TITLE.html"
def posts(self):
return (self.item,)
class PoringaUserExtractor(PoringaExtractor):
subcategory = "user"
pattern = BASE_PATTERN + r"/(\w+)$"
example = "http://www.poringa.net/USER"
def posts(self):
url = self.root + "/buscar/"
params = {"q": self.item}
return self._pagination(url, params)
class PoringaSearchExtractor(PoringaExtractor):
subcategory = "search"
pattern = BASE_PATTERN + r"/buscar/\?&?q=([^&#]+)"
example = "http://www.poringa.net/buscar/?q=QUERY"
def posts(self):
url = self.root + "/buscar/"
params = {"q": self.item}
return self._pagination(url, params)
@cache()
def _cookie_cache():
return ()
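
Note: the poringa `request()` override retries up to five times while the response still serves the "Please wait a few moments" interstitial, keeping any cookies it receives so a later attempt can pass. A simplified sketch of that retry loop with requests; the check string comes from the code above, while the session and cookie-cache plumbing is reduced to a plain Session:

import time
import requests

session = requests.Session()
CHECK = b"<title>Please wait a few moments</title>"

def fetch(url, retries=5, wait=5.0):
    # Retry while the anti-bot interstitial is served; any cookies it sets
    # stay on the session, so a later attempt can get through.
    response = None
    for _ in range(retries):
        response = session.get(url)
        if response.content.find(CHECK, 0, 600) < 0:
            break
        time.sleep(wait)
    return response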

@ -143,7 +143,7 @@ class PornhubGifExtractor(PornhubExtractor):
"url" : extr('"contentUrl": "', '"'),
"date" : text.parse_datetime(
extr('"uploadDate": "', '"'), "%Y-%m-%d"),
"user" : extr('data-mxptext="', '"'),
"user" : text.remove_html(extr("Created by:", "</div>")),
}
yield Message.Directory, gif

@ -0,0 +1,203 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for Postmill instances"""
import re
from .common import BaseExtractor, Message
from .. import text, exception
class PostmillExtractor(BaseExtractor):
"""Base class for Postmill extractors"""
basecategory = "postmill"
directory_fmt = ("{category}", "{instance}", "{forum}")
filename_fmt = "{id}_{title[:220]}.{extension}"
archive_fmt = "{filename}"
def _init(self):
self.instance = self.root.partition("://")[2]
self.save_link_post_body = self.config("save-link-post-body", False)
self._search_canonical_url = re.compile(r"/f/([\w\d_]+)/(\d+)/").search
self._search_image_tag = re.compile(
r'<a href="[^"]+"\n +class="submission__image-link"').search
def items(self):
for post_url in self.post_urls():
page = self.request(post_url).text
extr = text.extract_from(page)
title = text.unescape(extr(
'<meta property="og:title" content="', '">'))
date = text.parse_datetime(extr(
'<meta property="og:article:published_time" content="', '">'))
username = extr(
'<meta property="og:article:author" content="', '">')
post_canonical_url = text.unescape(extr(
'<link rel="canonical" href="', '">'))
url = text.unescape(extr(
'<h1 class="submission__title unheaderize inline"><a href="',
'"'))
body = extr(
'<div class="submission__body break-text text-flow">',
'</div>')
match = self._search_canonical_url(post_canonical_url)
forum = match.group(1)
id = int(match.group(2))
is_text_post = url.startswith("/")
is_image_post = self._search_image_tag(page) is not None
data = {
"title": title,
"date": date,
"username": username,
"forum": forum,
"id": id,
"flair": [text.unescape(i) for i in text.extract_iter(
page, '<span class="flair__label">', '</span>')],
"instance": self.instance,
}
urls = []
if is_text_post or self.save_link_post_body:
urls.append((Message.Url, "text:" + body))
if is_image_post:
urls.append((Message.Url, url))
elif not is_text_post:
urls.append((Message.Queue, url))
data["count"] = len(urls)
yield Message.Directory, data
for data["num"], (msg, url) in enumerate(urls, 1):
if url.startswith("text:"):
data["filename"], data["extension"] = "", "htm"
else:
data = text.nameext_from_url(url, data)
yield msg, url, data
class PostmillSubmissionsExtractor(PostmillExtractor):
"""Base class for Postmill submissions extractors"""
whitelisted_parameters = ()
def __init__(self, match):
PostmillExtractor.__init__(self, match)
groups = match.groups()
self.base = groups[-3]
self.sorting_path = groups[-2] or ""
self.query = {key: value for key, value in text.parse_query(
groups[-1]).items() if self.acceptable_query(key)}
def items(self):
url = self.root + self.base + self.sorting_path
while url:
response = self.request(url, params=self.query)
if response.history:
redirect_url = response.url
if redirect_url == self.root + "/login":
raise exception.StopExtraction(
"HTTP redirect to login page (%s)", redirect_url)
page = response.text
for nav in text.extract_iter(page,
'<nav class="submission__nav">',
'</nav>'):
post_url = text.unescape(text.extr(nav, '<a href="', '"'))
yield Message.Queue, text.urljoin(url, post_url), \
{"_extractor": PostmillPostExtractor}
url = text.unescape(text.extr(page,
'<link rel="next" href="', '">'))
def acceptable_query(self, key):
return key in self.whitelisted_parameters or key == "t" or \
(key.startswith("next[") and key.endswith("]"))
BASE_PATTERN = PostmillExtractor.update({
"raddle": {
"root" : None,
"pattern": (r"(?:raddle\.me|"
r"c32zjeghcp5tj3kb72pltz56piei66drc63vkhn5yixiyk4cmerrjtid"
r"\.onion)"),
}
})
QUERY_RE = r"(?:\?([^#]+))?$"
SORTING_RE = r"(/(?:hot|new|active|top|controversial|most_commented))?" + \
QUERY_RE
class PostmillPostExtractor(PostmillExtractor):
"""Extractor for a single submission URL"""
subcategory = "post"
pattern = BASE_PATTERN + r"/f/(\w+)/(\d+)"
example = "https://raddle.me/f/FORUM/123/TITLE"
def __init__(self, match):
PostmillExtractor.__init__(self, match)
self.forum = match.group(3)
self.post_id = match.group(4)
def post_urls(self):
return (self.root + "/f/" + self.forum + "/" + self.post_id,)
class PostmillShortURLExtractor(PostmillExtractor):
"""Extractor for short submission URLs"""
subcategory = "shorturl"
pattern = BASE_PATTERN + r"/(\d+)$"
example = "https://raddle.me/123"
def __init__(self, match):
PostmillExtractor.__init__(self, match)
self.post_id = match.group(3)
def items(self):
url = self.root + "/" + self.post_id
response = self.request(url, method="HEAD", allow_redirects=False)
full_url = text.urljoin(url, response.headers["Location"])
yield Message.Queue, full_url, {"_extractor": PostmillPostExtractor}
class PostmillHomeExtractor(PostmillSubmissionsExtractor):
"""Extractor for the home page"""
subcategory = "home"
pattern = BASE_PATTERN + r"(/(?:featured|subscribed|all)?)" + SORTING_RE
example = "https://raddle.me/"
class PostmillForumExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions on a forum"""
subcategory = "forum"
pattern = BASE_PATTERN + r"(/f/\w+)" + SORTING_RE
example = "https://raddle.me/f/FORUM"
class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions made by a user"""
subcategory = "usersubmissions"
pattern = BASE_PATTERN + r"(/user/\w+/submissions)()" + QUERY_RE
example = "https://raddle.me/user/USER/submissions"
class PostmillTagExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions on a forum with a specific tag"""
subcategory = "tag"
pattern = BASE_PATTERN + r"(/tag/\w+)" + SORTING_RE
example = "https://raddle.me/tag/TAG"
class PostmillSearchExtractor(PostmillSubmissionsExtractor):
"""Extractor for search results"""
subcategory = "search"
pattern = BASE_PATTERN + r"(/search)()\?(q=[^#]+)$"
example = "https://raddle.me/search?q=QUERY"
whitelisted_parameters = ("q",)
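
A minimal sketch of how the canonical-URL regex compiled in PostmillExtractor._init() splits a submission URL into forum name and numeric id; the URL below is hypothetical.

import re

# Sketch only: hypothetical canonical URL.
search_canonical_url = re.compile(r"/f/([\w\d_]+)/(\d+)/").search
match = search_canonical_url("https://raddle.me/f/Python/12345/example-title")
forum, post_id = match.group(1), int(match.group(2))  # -> "Python", 12345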

@ -18,7 +18,7 @@ class ReactorExtractor(BaseExtractor):
basecategory = "reactor"
filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}"
archive_fmt = "{post_id}_{num}"
request_interval = 5.0
request_interval = (3.0, 6.0)
def __init__(self, match):
BaseExtractor.__init__(self, match)

@ -23,7 +23,7 @@ class ReadcomiconlineBase():
filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
archive_fmt = "{issue_id}_{page}"
root = "https://readcomiconline.li"
request_interval = (3.0, 7.0)
request_interval = (3.0, 6.0)
def request(self, url, **kwargs):
"""Detect and handle redirects to CAPTCHA pages"""

@ -115,12 +115,18 @@ class RedditExtractor(Extractor):
continue
if url[0] == "/":
url = "https://www.reddit.com" + url
if url.startswith((
"https://www.reddit.com/message/compose",
"https://reddit.com/message/compose",
)):
continue
match = match_submission(url)
if match:
extra.append(match.group(1))
elif not match_user(url) and not match_subreddit(url):
if previews and "preview" in data:
if previews and "comment" not in data and \
"preview" in data:
data["_fallback"] = self._previews(data)
yield Message.Queue, text.unescape(url), data
if "_fallback" in data:
@ -153,7 +159,7 @@ class RedditExtractor(Extractor):
data = meta[item["media_id"]]
if data["status"] != "valid" or "s" not in data:
self.log.warning(
"gallery %s: skipping item %s ('status: %s')",
"gallery %s: skipping item %s (status: %s)",
submission["id"], item["media_id"], data.get("status"))
continue
src = data["s"]
@ -286,6 +292,29 @@ class RedditImageExtractor(Extractor):
yield Message.Url, url, data
class RedditRedirectExtractor(Extractor):
"""Extractor for personalized share URLs produced by the mobile app"""
category = "reddit"
subcategory = "redirect"
pattern = (r"(?:https?://)?(?:"
r"(?:\w+\.)?reddit\.com/(?:(?:r)/([^/?#]+)))"
r"/s/([a-zA-Z0-9]{10})")
example = "https://www.reddit.com/r/SUBREDDIT/s/abc456GHIJ"
def __init__(self, match):
Extractor.__init__(self, match)
self.subreddit = match.group(1)
self.share_url = match.group(2)
def items(self):
url = "https://www.reddit.com/r/" + self.subreddit + "/s/" + \
self.share_url
data = {"_extractor": RedditSubmissionExtractor}
response = self.request(url, method="HEAD", allow_redirects=False,
notfound="submission")
yield Message.Queue, response.headers["Location"], data
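
Outside the extractor framework, the same share-link resolution amounts to following a single redirect without downloading the body; a minimal sketch using requests as a stand-in, with a hypothetical share URL.

import requests

# Sketch only: one HEAD request with redirects disabled; the Location header
# then points at the canonical /comments/... submission URL.
url = "https://www.reddit.com/r/SUBREDDIT/s/abc456GHIJ"
response = requests.head(url, allow_redirects=False)
full_url = response.headers["Location"]
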
class RedditAPI():
"""Interface for the Reddit API
@ -394,9 +423,10 @@ class RedditAPI():
"grants/installed_client"),
"device_id": "DO_NOT_TRACK_THIS_DEVICE"}
auth = util.HTTPBasicAuth(self.client_id, "")
response = self.extractor.request(
url, method="POST", headers=self.headers,
data=data, auth=(self.client_id, ""), fatal=False)
data=data, auth=auth, fatal=False)
data = response.json()
if response.status_code != 200:
@ -501,7 +531,7 @@ class RedditAPI():
return util.bdecode(sid, "0123456789abcdefghijklmnopqrstuvwxyz")
@cache(maxage=100*365*24*3600, keyarg=0)
@cache(maxage=36500*86400, keyarg=0)
def _refresh_token_cache(token):
if token and token[0] == "#":
return None
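
The rewritten maxage above (and the similar 24*3600 -> 86400 changes further down) only changes notation; a quick arithmetic check:

# One day is 24*3600 == 86400 seconds, so the old and new durations match.
assert 24 * 3600 == 86400
assert 100 * 365 * 24 * 3600 == 36500 * 86400 == 3_153_600_000
assert 28 * 24 * 3600 == 28 * 86400 == 2_419_200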

@ -89,14 +89,20 @@ class RedgifsUserExtractor(RedgifsExtractor):
"""Extractor for redgifs user profiles"""
subcategory = "user"
directory_fmt = ("{category}", "{userName}")
pattern = r"(?:https?://)?(?:\w+\.)?redgifs\.com/users/([^/?#]+)/?$"
pattern = (r"(?:https?://)?(?:\w+\.)?redgifs\.com/users/([^/?#]+)/?"
r"(?:\?([^#]+))?$")
example = "https://www.redgifs.com/users/USER"
def __init__(self, match):
RedgifsExtractor.__init__(self, match)
self.query = match.group(2)
def metadata(self):
return {"userName": self.key}
def gifs(self):
return self.api.user(self.key)
order = text.parse_query(self.query).get("order")
return self.api.user(self.key, order or "new")
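
The order value is taken directly from the URL's query string; a minimal sketch of the parsing, with urllib's parse_qs standing in for gallery-dl's text.parse_query and a hypothetical query string.

from urllib.parse import parse_qs

# Sketch only: parse_qs stands in for text.parse_query.
query = "order=top&verified=1"
order = parse_qs(query).get("order", ["new"])[0]  # -> "top"; defaults to "new"
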
class RedgifsCollectionExtractor(RedgifsExtractor):
@ -140,11 +146,17 @@ class RedgifsCollectionsExtractor(RedgifsExtractor):
class RedgifsNichesExtractor(RedgifsExtractor):
"""Extractor for redgifs niches"""
subcategory = "niches"
pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/niches/([^/?#]+)"
pattern = (r"(?:https?://)?(?:www\.)?redgifs\.com/niches/([^/?#]+)/?"
r"(?:\?([^#]+))?$")
example = "https://www.redgifs.com/niches/NAME"
def __init__(self, match):
RedgifsExtractor.__init__(self, match)
self.query = match.group(2)
def gifs(self):
return self.api.niches(self.key)
order = text.parse_query(self.query).get("order")
return self.api.niches(self.key, order or "new")
class RedgifsSearchExtractor(RedgifsExtractor):
@ -208,7 +220,7 @@ class RedgifsAPI():
endpoint = "/v2/gallery/" + gallery_id
return self._call(endpoint)
def user(self, user, order="best"):
def user(self, user, order="new"):
endpoint = "/v2/users/{}/search".format(user.lower())
params = {"order": order}
return self._pagination(endpoint, params)
@ -226,9 +238,10 @@ class RedgifsAPI():
endpoint = "/v2/users/{}/collections".format(user)
return self._pagination(endpoint, key="collections")
def niches(self, niche):
def niches(self, niche, order):
endpoint = "/v2/niches/{}/gifs".format(niche)
return self._pagination(endpoint)
params = {"count": 30, "order": order}
return self._pagination(endpoint, params)
def search(self, params):
endpoint = "/v2/gifs/search"

@ -38,7 +38,11 @@ class Rule34usExtractor(BooruExtractor):
"height" : extr(' x ', 'h'),
"file_url": extr(' src="', '"'),
}
post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
url = post["file_url"]
if "//video-cdn1." in url:
post["_fallback"] = (url.replace("//video-cdn1.", "//video."),)
post["md5"] = url.rpartition("/")[2].partition(".")[0]
tags = collections.defaultdict(list)
for tag_type, tag_name in self._find_tags(page):
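
A small worked example of the fallback/md5 handling added above, with a hypothetical CDN URL:

# Sketch only: hypothetical file URL.
url = "https://video-cdn1.rule34.us/images/ab/cd/0123456789abcdef.mp4"
fallback = url.replace("//video-cdn1.", "//video.")   # alternate CDN host
md5 = url.rpartition("/")[2].partition(".")[0]        # -> "0123456789abcdef"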

@ -87,7 +87,7 @@ class SankakuTagExtractor(SankakuExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
pattern = BASE_PATTERN + r"/?\?([^#]*)"
pattern = BASE_PATTERN + r"(?:/posts)?/?\?([^#]*)"
example = "https://sankaku.app/?tags=TAG"
def __init__(self, match):
@ -117,7 +117,7 @@ class SankakuPoolExtractor(SankakuExtractor):
subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name_en]}")
archive_fmt = "p_{pool}_{id}"
pattern = BASE_PATTERN + r"/(?:books|pool/show)/(\d+)"
pattern = BASE_PATTERN + r"/(?:books|pools?/show)/(\d+)"
example = "https://sankaku.app/books/12345"
def __init__(self, match):
@ -143,7 +143,7 @@ class SankakuPostExtractor(SankakuExtractor):
"""Extractor for single posts from sankaku.app"""
subcategory = "post"
archive_fmt = "{id}"
pattern = BASE_PATTERN + r"/post/show/([0-9a-f]+)"
pattern = BASE_PATTERN + r"/posts?(?:/show)?/(\w+)"
example = "https://sankaku.app/post/show/12345"
def __init__(self, match):
@ -179,12 +179,16 @@ class SankakuAPI():
def __init__(self, extractor):
self.extractor = extractor
self.headers = {
"Accept" : "application/vnd.sankaku.api+json;v=2",
"Platform": "web-app",
"Origin" : extractor.root,
"Accept" : "application/vnd.sankaku.api+json;v=2",
"Platform" : "web-app",
"Api-Version": None,
"Origin" : extractor.root,
}
self.username, self.password = self.extractor._get_auth_info()
if extractor.config("id-format") in ("alnum", "alphanumeric"):
self.headers["Api-Version"] = "2"
self.username, self.password = extractor._get_auth_info()
if not self.username:
self.authenticate = util.noop
@ -285,7 +289,7 @@ class SankakuAPI():
return
@cache(maxage=365*24*3600, keyarg=1)
@cache(maxage=365*86400, keyarg=1)
def _authenticate_impl(extr, username, password):
extr.log.info("Logging in as %s", username)

@ -19,17 +19,12 @@ class Shimmie2Extractor(BaseExtractor):
archive_fmt = "{id}"
def _init(self):
try:
instance = INSTANCES[self.category]
except KeyError:
return
cookies = instance.get("cookies")
cookies = self.config_instance("cookies")
if cookies:
domain = self.root.rpartition("/")[2]
self.cookies_update_dict(cookies, domain=domain)
file_url = instance.get("file_url")
file_url = self.config_instance("file_url")
if file_url:
self.file_url_fmt = file_url
@ -41,8 +36,9 @@ class Shimmie2Extractor(BaseExtractor):
for post in self.posts():
for key in ("id", "width", "height"):
post[key] = text.parse_int(post[key])
post["id"] = text.parse_int(post["id"])
post["width"] = text.parse_int(post["width"])
post["height"] = text.parse_int(post["height"])
post["tags"] = text.unquote(post["tags"])
post.update(data)
@ -64,20 +60,23 @@ class Shimmie2Extractor(BaseExtractor):
"""Return an iterable containing data of all relevant posts"""
return ()
def _quote_type(self, page):
"""Return quoting character used in 'page' (' or ")"""
try:
return page[page.index("<link rel=")+10]
except Exception:
return "'"
INSTANCES = {
"mememuseum": {
"root": "https://meme.museum",
"pattern": r"meme\.museum",
},
BASE_PATTERN = Shimmie2Extractor.update({
"loudbooru": {
"root": "https://loudbooru.com",
"pattern": r"loudbooru\.com",
"cookies": {"ui-tnc-agreed": "true"},
},
"giantessbooru": {
"root": "https://giantessbooru.com",
"pattern": r"giantessbooru\.com",
"root": "https://sizechangebooru.com",
"pattern": r"(?:sizechange|giantess)booru\.com",
"cookies": {"agreed": "true"},
},
"tentaclerape": {
@ -89,9 +88,11 @@ INSTANCES = {
"pattern": r"booru\.cavemanon\.xyz",
"file_url": "{0}/index.php?q=image/{2}.{4}",
},
}
BASE_PATTERN = Shimmie2Extractor.update(INSTANCES) + r"/(?:index\.php\?q=/?)?"
"rule34hentai": {
"root": "https://rule34hentai.net",
"pattern": r"rule34hentai\.net",
},
}) + r"/(?:index\.php\?q=/?)?"
class Shimmie2TagExtractor(Shimmie2Extractor):
@ -125,21 +126,26 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
if init:
init = False
has_mime = ("data-mime='" in page)
has_pid = ("data-post-id='" in page)
quote = self._quote_type(page)
has_mime = (" data-mime=" in page)
has_pid = (" data-post-id=" in page)
while True:
if has_mime:
mime = extr("data-mime='", "'")
mime = extr(" data-mime="+quote, quote)
if has_pid:
pid = extr("data-post-id='", "'")
pid = extr(" data-post-id="+quote, quote)
else:
pid = extr("href='/post/view/", "?")
pid = extr(" href='/post/view/", quote)
if not pid:
break
tags, dimensions, size = extr("title='", "'").split(" // ")
data = extr("title="+quote, quote).split(" // ")
tags = data[0]
dimensions = data[1]
size = data[2]
width, _, height = dimensions.partition("x")
md5 = extr("/_thumbs/", "/")
@ -170,25 +176,25 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
extr = text.extract_from(self.request(url).text)
while True:
pid = extr('href="./index.php?q=/post/view/', '&')
pid = extr("href='./index.php?q=/post/view/", "&")
if not pid:
break
tags, dimensions, size = extr('title="', '"').split(" // ")
tags, dimensions, size = extr("title='", "'").split(" // ")
width, _, height = dimensions.partition("x")
yield {
"file_url": file_url_fmt(pid),
"id": pid,
"md5": "",
"tags": tags,
"width": width,
"height": height,
"size": text.parse_bytes(size[:-1]),
"id" : pid,
"md5" : "",
"tags" : tags,
"width" : width,
"height" : height,
"size" : text.parse_bytes(size[:-1]),
}
pnum += 1
if not extr('/{}">{}<'.format(pnum, pnum), ">"):
if not extr("/{0}'>{0}<".format(pnum), ">"):
return
@ -204,15 +210,17 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
def posts(self):
url = "{}/post/view/{}".format(self.root, self.post_id)
extr = text.extract_from(self.request(url).text)
page = self.request(url).text
extr = text.extract_from(page)
quote = self._quote_type(page)
post = {
"id" : self.post_id,
"tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"),
"md5" : extr("/_thumbs/", "/"),
"file_url": self.root + (
extr("id='main_image' src='", "'") or
extr("<source src='", "'")).lstrip("."),
extr("id={0}main_image{0} src={0}".format(quote), quote) or
extr("<source src="+quote, quote)).lstrip("."),
"width" : extr("data-width=", " ").strip("\"'"),
"height" : extr("data-height=", ">").partition(
" ")[0].strip("\"'"),
@ -233,7 +241,7 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
"id" : self.post_id,
"tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"),
"md5" : "",
"file_url": self.root + extr('id="main_image" src=".', '"'),
"file_url": self.root + extr("id='main_image' src='.", "'"),
"width" : extr("orig_width =", ";"),
"height" : 0,
"size" : 0,

@ -0,0 +1,211 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://www.steamgriddb.com"""
from .common import Extractor, Message
from .. import text, exception
BASE_PATTERN = r"(?:https?://)?(?:www\.)?steamgriddb\.com"
LANGUAGE_CODES = (
"aa", "ab", "ae", "af", "ak", "am", "an", "ar", "as", "av", "ay", "az",
"ba", "be", "bg", "bh", "bi", "bm", "bn", "bo", "br", "bs", "ca", "ce",
"ch", "co", "cr", "cs", "cu", "cv", "cy", "da", "de", "dv", "dz", "ee",
"el", "en", "eo", "es", "et", "eu", "fa", "ff", "fi", "fj", "fo", "fr",
"fy", "ga", "gd", "gl", "gn", "gu", "gv", "ha", "he", "hi", "ho", "hr",
"ht", "hu", "hy", "hz", "ia", "id", "ie", "ig", "ii", "ik", "io", "is",
"it", "iu", "ja", "jv", "ka", "kg", "ki", "kj", "kk", "kl", "km", "kn",
"ko", "kr", "ks", "ku", "kv", "kw", "ky", "la", "lb", "lg", "li", "ln",
"lo", "lt", "lu", "lv", "mg", "mh", "mi", "mk", "ml", "mn", "mr", "ms",
"mt", "my", "na", "nb", "nd", "ne", "ng", "nl", "nn", "no", "nr", "nv",
"ny", "oc", "oj", "om", "or", "os", "pa", "pi", "pl", "ps", "pt", "qu",
"rm", "rn", "ro", "ru", "rw", "sa", "sc", "sd", "se", "sg", "si", "sk",
"sl", "sm", "sn", "so", "sq", "sr", "ss", "st", "su", "sv", "sw", "ta",
"te", "tg", "th", "ti", "tk", "tl", "tn", "to", "tr", "ts", "tt", "tw",
"ty", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi",
"yo", "za", "zh", "zu",
)
FILE_EXT_TO_MIME = {
"png": "image/png",
"jpeg": "image/jpeg",
"jpg": "image/jpeg",
"webp": "image/webp",
"ico": "image/vnd.microsoft.icon",
"all": "all",
}
class SteamgriddbExtractor(Extractor):
"""Base class for SteamGridDB"""
category = "steamgriddb"
directory_fmt = ("{category}", "{subcategory}", "{game[id]}")
filename_fmt = "{game[id]}_{id}_{num:>02}.{extension}"
archive_fmt = "{filename}"
root = "https://www.steamgriddb.com"
def _init(self):
self.cookies_update({
"userprefs": "%7B%22adult%22%3Afalse%7D",
})
def items(self):
download_fake_png = self.config("download-fake-png", True)
for asset in self.assets():
if download_fake_png and asset.get("fake_png"):
urls = (asset["url"], asset["fake_png"])
else:
urls = (asset["url"],)
asset["count"] = len(urls)
yield Message.Directory, asset
for asset["num"], url in enumerate(urls, 1):
yield Message.Url, url, text.nameext_from_url(url, asset)
def _call(self, endpoint, **kwargs):
data = self.request(self.root + endpoint, **kwargs).json()
if not data["success"]:
raise exception.StopExtraction(data["error"])
return data["data"]
class SteamgriddbAssetsExtractor(SteamgriddbExtractor):
"""Base class for extracting a list of assets"""
def __init__(self, match):
SteamgriddbExtractor.__init__(self, match)
list_type = match.group(1)
id = int(match.group(2))
self.game_id = id if list_type == "game" else None
self.collection_id = id if list_type == "collection" else None
self.page = int(match.group(3) or 1)
def assets(self):
limit = 48
page = max(self.page - 1, 0)
sort = self.config("sort", "score_desc")
if sort not in ("score_desc", "score_asc", "score_old_desc",
"score_old_asc", "age_desc", "age_asc"):
raise exception.StopExtraction("Invalid sort '%s'", sort)
json = {
"static" : self.config("static", True),
"animated": self.config("animated", True),
"humor" : self.config("humor", True),
"nsfw" : self.config("nsfw", True),
"epilepsy": self.config("epilepsy", True),
"untagged": self.config("untagged", True),
"asset_type": self.asset_type,
"limit": limit,
"order": sort,
}
if self.valid_dimensions:
json["dimensions"] = self.config_list(
"dimensions", "dimension", self.valid_dimensions)
json["styles"] = self.config_list("styles", "style", self.valid_styles)
json["languages"] = self.config_list(
"languages", "language", LANGUAGE_CODES)
file_types = self.config_list(
"file-types", "file type", self.valid_file_types)
json["mime"] = [FILE_EXT_TO_MIME[i] for i in file_types]
if self.game_id:
json["game_id"] = [self.game_id]
else:
json["collection_id"] = self.collection_id
while True:
json["page"] = page
data = self._call(
"/api/public/search/assets", method="POST", json=json)
for asset in data["assets"]:
if not asset.get("game"):
asset["game"] = data["game"]
yield asset
if data["total"] <= limit * page:
break
page += 1
def config_list(self, key, type_name, valid_values):
value = self.config(key)
if isinstance(value, str):
value = value.split(",")
if value is None or "all" in value:
return ["all"]
for i in value:
if i not in valid_values:
raise exception.StopExtraction("Invalid %s '%s'", type_name, i)
return value
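
A short sketch of how config_list() above normalizes a user-supplied option, with a hypothetical "styles" value; None or "all" selects everything.

# Sketch only: hypothetical configuration value.
valid_styles = ("alternate", "blurred", "no_logo", "material", "white_logo")
value = "blurred,material"
if isinstance(value, str):
    value = value.split(",")
styles = ["all"] if value is None or "all" in value else value
# -> ["blurred", "material"]
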
class SteamgriddbAssetExtractor(SteamgriddbExtractor):
"""Extractor for a single asset"""
subcategory = "asset"
pattern = BASE_PATTERN + r"/(grid|hero|logo|icon)/(\d+)"
example = "https://www.steamgriddb.com/grid/1234"
def __init__(self, match):
SteamgriddbExtractor.__init__(self, match)
self.asset_type = match.group(1)
self.asset_id = match.group(2)
def assets(self):
endpoint = "/api/public/asset/" + self.asset_type + "/" + self.asset_id
asset = self._call(endpoint)["asset"]
return (asset,)
class SteamgriddbGridsExtractor(SteamgriddbAssetsExtractor):
subcategory = "grids"
asset_type = "grid"
pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/grids(?:/(\d+))?"
example = "https://www.steamgriddb.com/game/1234/grids"
valid_dimensions = ("460x215", "920x430", "600x900", "342x482", "660x930",
"512x512", "1024x1024")
valid_styles = ("alternate", "blurred", "no_logo", "material",
"white_logo")
valid_file_types = ("png", "jpeg", "jpg", "webp")
class SteamgriddbHeroesExtractor(SteamgriddbAssetsExtractor):
subcategory = "heroes"
asset_type = "hero"
pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/heroes(?:/(\d+))?"
example = "https://www.steamgriddb.com/game/1234/heroes"
valid_dimensions = ("1920x620", "3840x1240", "1600x650")
valid_styles = ("alternate", "blurred", "material")
valid_file_types = ("png", "jpeg", "jpg", "webp")
class SteamgriddbLogosExtractor(SteamgriddbAssetsExtractor):
subcategory = "logos"
asset_type = "logo"
pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/logos(?:/(\d+))?"
example = "https://www.steamgriddb.com/game/1234/logos"
valid_dimensions = None
valid_styles = ("official", "white", "black", "custom")
valid_file_types = ("png", "webp")
class SteamgriddbIconsExtractor(SteamgriddbAssetsExtractor):
subcategory = "icons"
asset_type = "icon"
pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/icons(?:/(\d+))?"
example = "https://www.steamgriddb.com/game/1234/icons"
valid_dimensions = ["{0}x{0}".format(i) for i in (8, 10, 14, 16, 20, 24,
28, 32, 35, 40, 48, 54, 56, 57, 60, 64, 72, 76, 80, 90,
96, 100, 114, 120, 128, 144, 150, 152, 160, 180, 192,
194, 256, 310, 512, 768, 1024)]
valid_styles = ("official", "custom")
valid_file_types = ("png", "ico")

@ -56,7 +56,7 @@ class SubscribestarExtractor(Extractor):
if username:
self.cookies_update(self._login_impl(username, password))
@cache(maxage=28*24*3600, keyarg=1)
@cache(maxage=28*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)

@ -87,6 +87,10 @@ BASE_PATTERN = SzurubooruExtractor.update({
"root": "https://booru.bcbnsfw.space",
"pattern": r"booru\.bcbnsfw\.space",
},
"snootbooru": {
"root": "https://snootbooru.com",
"pattern": r"snootbooru\.com",
},
})

@ -81,7 +81,7 @@ class TapasExtractor(Extractor):
self.cookies.set(
"adjustedBirthDate", "1981-02-03", domain=self.cookies_domain)
@cache(maxage=14*24*3600, keyarg=1)
@cache(maxage=14*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)

@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://tmohentai.com/"""
from .common import GalleryExtractor
from .. import text
BASE_PATTERN = r"(?:https?://)?tmohentai\.com"
class TmohentaiGalleryExtractor(GalleryExtractor):
category = "tmohentai"
root = "http://tmohentai.com"
directory_fmt = ("{category}", "{title} ({gallery_id})")
pattern = BASE_PATTERN + r"/(?:contents|reader)/(\w+)"
example = "https://tmohentai.com/contents/12345a67b89c0"
def __init__(self, match):
self.gallery_id = match.group(1)
url = "{}/contents/{}".format(self.root, self.gallery_id)
GalleryExtractor.__init__(self, match, url)
def images(self, page):
fmt = "https://imgrojo.tmohentai.com/contents/{}/{{:>03}}.webp".format(
self.gallery_id).format
cnt = page.count('class="lanzador')
return [(fmt(i), None) for i in range(0, cnt)]
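
The page URLs are simply zero-padded indices under the gallery's content path; a small sketch with a hypothetical gallery id.

# Sketch only: hypothetical gallery id.
gallery_id = "12345a67b89c0"
fmt = "https://imgrojo.tmohentai.com/contents/{}/{{:>03}}.webp".format(
    gallery_id).format
urls = [fmt(i) for i in range(3)]
# -> .../contents/12345a67b89c0/000.webp, 001.webp, 002.webp
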
def metadata(self, page):
extr = text.extract_from(page)
return {
"gallery_id": self.gallery_id,
"title" : text.unescape(extr("<h3>", "<").strip()),
"artists" : text.split_html(extr(
"<label>Artists and Artists Groups</label>", "</ul>")),
"genres" : text.split_html(extr(
"<label>Genders</label>", "</ul>")),
"tags" : text.split_html(extr(
"<label>Tags</label>", "</ul>")),
"uploader" : text.remove_html(extr(
"<label>Uploaded By</label>", "</ul>")),
"language" : extr("&nbsp;", "\n"),
}

@ -27,7 +27,7 @@ class TsuminoBase():
self.cookies.setdefault(
"ASP.NET_SessionId", "x1drgggilez4cpkttneukrc5")
@cache(maxage=14*24*3600, keyarg=1)
@cache(maxage=14*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
url = "{}/Account/Login".format(self.root)

@ -9,7 +9,7 @@
"""Extractors for https://www.tumblr.com/"""
from .common import Extractor, Message
from .. import text, oauth, exception
from .. import text, util, oauth, exception
from datetime import datetime, date, timedelta
import re
@ -262,7 +262,7 @@ class TumblrExtractor(Extractor):
return updated, (resized == updated)
def _original_image_fallback(self, url, post_id):
for _ in range(self.fallback_retries):
for _ in util.repeat(self.fallback_retries):
self.sleep(self.fallback_delay, "image token")
yield self._update_image_token(url)[0]
self.log.warning("Unable to fetch higher-resolution "
@ -322,12 +322,15 @@ class TumblrDayExtractor(TumblrExtractor):
def __init__(self, match):
TumblrExtractor.__init__(self, match)
year, month, day = match.group(4).split("/")
self.date_min = (
# 719163 == date(1970, 1, 1).toordinal()
date(int(year), int(month), int(day)).toordinal() - 719163) * 86400
self.ordinal = date(int(year), int(month), int(day)).toordinal()
def _init(self):
TumblrExtractor._init(self)
self.date_min = (
# 719163 == date(1970, 1, 1).toordinal()
(self.ordinal - 719163) * 86400)
self.api.before = self.date_min + 86400
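
A quick check of the ordinal arithmetic above: 719163 is the proleptic Gregorian ordinal of the Unix epoch, so the expression yields the timestamp of midnight UTC on the requested day (the date below is only an example).

from datetime import date

assert date(1970, 1, 1).toordinal() == 719163
date_min = (date(2024, 1, 21).toordinal() - 719163) * 86400
# -> 1705795200, i.e. 2024-01-21 00:00:00 UTC
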
def posts(self):
@ -401,66 +404,70 @@ class TumblrAPI(oauth.OAuth1API):
def _call(self, endpoint, params, **kwargs):
url = self.ROOT + endpoint
kwargs["params"] = params
response = self.request(url, **kwargs)
try:
data = response.json()
except ValueError:
data = response.text
status = response.status_code
else:
status = data["meta"]["status"]
if 200 <= status < 400:
return data["response"]
self.log.debug(data)
if status == 403:
raise exception.AuthorizationError()
while True:
response = self.request(url, **kwargs)
elif status == 404:
try:
error = data["errors"][0]["detail"]
board = ("only viewable within the Tumblr dashboard" in error)
except Exception:
board = False
if board:
self.log.info("Run 'gallery-dl oauth:tumblr' "
"to access dashboard-only blogs")
raise exception.AuthorizationError(error)
raise exception.NotFoundError("user or post")
elif status == 429:
# daily rate limit
if response.headers.get("x-ratelimit-perday-remaining") == "0":
self.log.info("Daily API rate limit exceeded")
reset = response.headers.get("x-ratelimit-perday-reset")
api_key = self.api_key or self.session.auth.consumer_key
if api_key == self.API_KEY:
self.log.info("Register your own OAuth application and "
"use its credentials to prevent this error: "
"https://github.com/mikf/gallery-dl/blob/mas"
"ter/docs/configuration.rst#extractortumblra"
"pi-key--api-secret")
if self.extractor.config("ratelimit") == "wait":
data = response.json()
except ValueError:
data = response.text
status = response.status_code
else:
status = data["meta"]["status"]
if 200 <= status < 400:
return data["response"]
self.log.debug(data)
if status == 403:
raise exception.AuthorizationError()
elif status == 404:
try:
error = data["errors"][0]["detail"]
board = ("only viewable within the Tumblr dashboard"
in error)
except Exception:
board = False
if board:
self.log.info("Run 'gallery-dl oauth:tumblr' "
"to access dashboard-only blogs")
raise exception.AuthorizationError(error)
raise exception.NotFoundError("user or post")
elif status == 429:
# daily rate limit
if response.headers.get("x-ratelimit-perday-remaining") == "0":
self.log.info("Daily API rate limit exceeded")
reset = response.headers.get("x-ratelimit-perday-reset")
api_key = self.api_key or self.session.auth.consumer_key
if api_key == self.API_KEY:
self.log.info(
"Register your own OAuth application and use its "
"credentials to prevent this error: https://githu"
"b.com/mikf/gallery-dl/blob/master/docs/configurat"
"ion.rst#extractortumblrapi-key--api-secret")
if self.extractor.config("ratelimit") == "wait":
self.extractor.wait(seconds=reset)
continue
t = (datetime.now() + timedelta(0, float(reset))).time()
raise exception.StopExtraction(
"Aborting - Rate limit will reset at %s",
"{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second))
# hourly rate limit
reset = response.headers.get("x-ratelimit-perhour-reset")
if reset:
self.log.info("Hourly API rate limit exceeded")
self.extractor.wait(seconds=reset)
return self._call(endpoint, params, **kwargs)
t = (datetime.now() + timedelta(seconds=float(reset))).time()
raise exception.StopExtraction(
"Aborting - Rate limit will reset at %s",
"{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second))
# hourly rate limit
reset = response.headers.get("x-ratelimit-perhour-reset")
if reset:
self.log.info("Hourly API rate limit exceeded")
self.extractor.wait(seconds=reset)
return self._call(endpoint, params, **kwargs)
continue
raise exception.StopExtraction(data)
raise exception.StopExtraction(data)
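
A minimal sketch of how the abort message above turns a "seconds until reset" header into a clock time, with a hypothetical header value.

from datetime import datetime, timedelta

# Sketch only: hypothetical x-ratelimit-perday-reset value.
reset = "1800"
t = (datetime.now() + timedelta(seconds=float(reset))).time()
print("{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second))  # e.g. 14:52:07
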
def _pagination(self, blog, endpoint, params, key="posts", cache=False):
endpoint = "/v2/blog/{}{}".format(blog, endpoint)

@ -22,7 +22,7 @@ class TwibooruExtractor(BooruExtractor):
root = "https://twibooru.org"
filename_fmt = "{id}_{filename}.{extension}"
archive_fmt = "{id}"
request_interval = 6.05
request_interval = (6.0, 6.1)
page_start = 1
per_page = 50
@ -44,7 +44,7 @@ class TwibooruExtractor(BooruExtractor):
class TwibooruPostExtractor(TwibooruExtractor):
"""Extractor for single twibooru posts"""
subcategory = "post"
request_interval = 1.0
request_interval = (0.5, 1.5)
pattern = BASE_PATTERN + r"/(\d+)"
example = "https://twibooru.org/12345"
