diff --git a/docs/configuration.rst b/docs/configuration.rst index 175f8690..05c75798 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -353,6 +353,20 @@ Description Additional key-value pairs to be added to each metadata dictionary. =========== ===== +extractor.*.archive +------------------- +=========== ===== +Type ``string`` +Default ``null`` +Description File to store IDs of downloaded files in. Downloads of files + already recorded in this archive file will be skipped_. + + The resulting archive file is not a plain text file but an SQLite3 + database, as lookup operations are significantly faster when the + number of stored IDs gets reasonably large. +=========== ===== + + Extractor-specific Options ========================== @@ -788,6 +802,7 @@ How To - login and visit Tumblr's Applications_ section .. |datetime.max| replace:: ``datetime.max`` .. |strptime| replace:: strftime() and strptime() Behavior +.. _skipped: `extractor.*.skip`_ .. _`date-min and date-max`: `extractor.reddit.date-min & .date-max`_ ..
_date-format: extractor.reddit.date-format_ diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index f0a7baff..22f7276d 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -16,6 +16,7 @@ }, "extractor": { + "archive": null, "skip": true, "sleep": 0, diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py index 28c518c1..b831bdcc 100644 --- a/gallery_dl/extractor/2chan.py +++ b/gallery_dl/extractor/2chan.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 Mike Fährmann +# Copyright 2017-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,8 +17,10 @@ class FutabaThreadExtractor(Extractor): category = "2chan" subcategory = "thread" directory_fmt = ["{category}", "{board_name}", "{thread}"] - pattern = [r"(?:https?://)?(([^.]+)\.2chan\.net/([^/]+)/res/(\d+))"] + filename_fmt = "{tim}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" urlfmt = "https://{server}.2chan.net/{board}/src/{filename}" + pattern = [r"(?:https?://)?(([^.]+)\.2chan\.net/([^/]+)/res/(\d+))"] test = [("http://dec.2chan.net/70/res/947.htm", { "url": "c5c12b80b290e224b6758507b3bb952044f4595b", "keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0", diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 30bb24ec..9caaaa30 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -20,6 +20,7 @@ class BooruExtractor(SharedConfigExtractor): """Base class for all booru extractors""" basecategory = "booru" filename_fmt = "{category}_{id}_{md5}.{extension}" + archive_fmt = "{id}" api_url = "" per_page = 50 page_start = 1 diff --git a/gallery_dl/extractor/chan.py b/gallery_dl/extractor/chan.py index 65e942d8..2e4f308b 100644 --- a/gallery_dl/extractor/chan.py +++ b/gallery_dl/extractor/chan.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2017 Mike Fährmann +# Copyright 2015-2018 Mike 
Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -19,7 +19,8 @@ class ChanThreadExtractor(Extractor): category = "chan" subcategory = "thread" directory_fmt = ["{category}", "{board}", "{thread} - {title}"] - filename_fmt = "{tim}-{filename}{ext}" + filename_fmt = "{tim}-{filename}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" api_url = "" file_url = "" @@ -69,6 +70,7 @@ class FoolfuukaThreadExtractor(SharedConfigExtractor): directory_fmt = ["{category}", "{board[shortname]}", "{thread_num}{title:? - //}"] filename_fmt = "{media[media]}" + archive_fmt = "{{board[shortname]}}_{num}_{timestamp}" root = "" referer = True diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 5914aa67..2246f008 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -28,6 +28,7 @@ class Extractor(): categorytransfer = False directory_fmt = ["{category}"] filename_fmt = "{name}.{extension}" + archive_fmt = "" cookiedomain = "" def __init__(self): diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 01d914a3..801e9e8d 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2017 Mike Fährmann +# Copyright 2015-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -20,8 +20,9 @@ import re class DeviantartExtractor(Extractor): """Base class for deviantart extractors""" category = "deviantart" - filename_fmt = "{category}_{index}_{title}.{extension}" directory_fmt = ["{category}", "{author[username]!l}"] + filename_fmt = "{category}_{index}_{title}.{extension}" + archive_fmt = "{index}.{extension}" def __init__(self, match=None): Extractor.__init__(self) diff --git 
a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 9c79385c..7a64f1d3 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2017 Mike Fährmann +# Copyright 2014-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,6 +22,7 @@ class ExhentaiGalleryExtractor(Extractor): subcategory = "gallery" directory_fmt = ["{category}", "{gallery_id}"] filename_fmt = "{gallery_id}_{num:>04}_{image_token}_{name}.{extension}" + archive_fmt = "{gallery_id}_{num}" pattern = [r"(?:https?://)?(g\.e-|e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"] test = [ ("https://exhentai.org/g/960460/4f0e369d82/", { diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index 723b19a2..f818d756 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 Mike Fährmann +# Copyright 2017-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -16,6 +16,7 @@ class FlickrExtractor(Extractor): """Base class for flickr extractors""" category = "flickr" filename_fmt = "{category}_{id}.{extension}" + archive_fmt = "{id}" def __init__(self, match): Extractor.__init__(self) diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index 2aef5326..117c3bdc 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -62,6 +62,7 @@ class FoolslideChapterExtractor(FoolslideExtractor): directory_fmt = ["{category}", "{manga}", "{chapter_string}"] filename_fmt = ( "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}") + archive_fmt = "{id}" method = "default" def __init__(self, match, url=None): diff --git 
a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index 1ca0b3b0..f9a30e7c 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2017 Mike Fährmann +# Copyright 2014-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -18,6 +18,7 @@ class GelbooruExtractor(SharedConfigExtractor): basecategory = "booru" category = "gelbooru" filename_fmt = "{category}_{id}_{md5}.{extension}" + archive_fmt = "{id}" api_url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index" def __init__(self): diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py index 4130b41d..ba48770d 100644 --- a/gallery_dl/extractor/gfycat.py +++ b/gallery_dl/extractor/gfycat.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 Mike Fährmann +# Copyright 2017-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -15,6 +15,7 @@ from .. 
import exception class GfycatExtractor(Extractor): """Base class for gfycat extractors""" category = "gfycat" + archive_fmt = "{gfyName}" def __init__(self, match): Extractor.__init__(self) diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py index 4f3f84fb..03232799 100644 --- a/gallery_dl/extractor/hbrowse.py +++ b/gallery_dl/extractor/hbrowse.py @@ -69,6 +69,7 @@ class HbrowseChapterExtractor(HbrowseExtractor, ChapterExtractor): directory_fmt = ["{category}", "{manga_id} {manga}", "c{chapter:>05}"] filename_fmt = ("{category}_{manga_id}_{chapter:>05}_" "{page:>03}.{extension}") + archive_fmt = "{manga_id}_{chapter}_{page}" pattern = [r"(?:https?://)?(?:www\.)?hbrowse\.com/(\d+)/c(\d+)"] test = [("http://www.hbrowse.com/10363/c00000", { "url": "634f4800858913f097bc3b62a8fedaf74b5254bd", diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index f9c334d7..ba74ca80 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2017 Mike Fährmann +# Copyright 2015-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -18,6 +18,7 @@ class HentaifoundryUserExtractor(Extractor): subcategory = "user" directory_fmt = ["{category}", "{artist}"] filename_fmt = "{category}_{index}_{title}.{extension}" + archive_fmt = "{index}" pattern = [r"(?:https?://)?(?:www\.)?hentai-foundry\.com/" r"(?:pictures/user/([^/]+)/?$|user/([^/]+)/profile)"] test = [ diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index aee62a01..e72dad9a 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2017 Mike Fährmann +# Copyright 2015-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it 
under the terms of the GNU General Public License version 2 as @@ -19,6 +19,7 @@ class HitomiGalleryExtractor(Extractor): subcategory = "gallery" directory_fmt = ["{category}", "{gallery_id} {title}"] filename_fmt = "{category}_{gallery_id}_{num:>03}_{name}.{extension}" + archive_fmt = "{gallery_id}_{num}" pattern = [r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)\.html"] test = [("https://hitomi.la/galleries/867789.html", { "url": "e42a47dfadda93e4bf37e82b1dc9ad29edfa9130", diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index e88aa8c7..b365484f 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2017 Mike Fährmann +# Copyright 2014-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,7 +17,8 @@ class ImagebamGalleryExtractor(AsynchronousExtractor): category = "imagebam" subcategory = "gallery" directory_fmt = ["{category}", "{title} - {gallery_key}"] - filename_fmt = "{num:>03}-{filename}" + filename_fmt = "{num:>03}-{name}.{extension}" + archive_fmt = "{image_id}" pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+)"] test = [(("http://www.imagebam.com/" "gallery/adz2y0f9574bjpmonaismyrhtjgvey4o"), { @@ -76,6 +77,7 @@ class ImagebamImageExtractor(Extractor): """Extractor for single images from imagebam.com""" category = "imagebam" subcategory = "image" + archive_fmt = "{token}" pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/image/([0-9a-f]{15})"] test = [("http://www.imagebam.com/image/94d56c502511890", { "url": "b384893c35a01a09c58018db71ddc4cf2480be95", diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 367baea5..97d8cb68 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2017 
Mike Fährmann +# Copyright 2016-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -13,12 +13,17 @@ from .. import text, util import json -class ImagefapGalleryExtractor(Extractor): - """Extractor for image galleries from imagefap.com""" +class ImagefapExtractor(Extractor): + """Base class for imagefap extractors""" category = "imagefap" - subcategory = "gallery" directory_fmt = ["{category}", "{gallery_id} {title}"] filename_fmt = "{category}_{gallery_id}_{name}.{extension}" + archive_fmt = "{gallery_id}_{image_id}" + + +class ImagefapGalleryExtractor(ImagefapExtractor): + """Extractor for image galleries from imagefap.com""" + subcategory = "gallery" pattern = [(r"(?:https?://)?(?:www\.)?imagefap\.com/" r"(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)")] test = [ @@ -35,7 +40,7 @@ class ImagefapGalleryExtractor(Extractor): ] def __init__(self, match): - Extractor.__init__(self) + ImagefapExtractor.__init__(self) self.gid = match.group(1) self.image_id = "" @@ -80,12 +85,9 @@ class ImagefapGalleryExtractor(Extractor): params["idx"] += 24 -class ImagefapImageExtractor(Extractor): +class ImagefapImageExtractor(ImagefapExtractor): """Extractor for single images from imagefap.com""" - category = "imagefap" subcategory = "image" - directory_fmt = ["{category}", "{gallery_id} {title}"] - filename_fmt = "{category}_{gallery_id}_{name}.{extension}" pattern = [r"(?:https?://)?(?:www\.)?imagefap\.com/photo/(\d+)"] test = [("http://www.imagefap.com/photo/1369341772/", { "url": "24cc4312e4a5084f39f1e35af5ba92e5f7c1ad3c", @@ -93,7 +95,7 @@ class ImagefapImageExtractor(Extractor): })] def __init__(self, match): - Extractor.__init__(self) + ImagefapExtractor.__init__(self) self.image_id = match.group(1) def items(self): @@ -132,9 +134,8 @@ class ImagefapImageExtractor(Extractor): return json_dict -class ImagefapUserExtractor(Extractor): +class 
ImagefapUserExtractor(ImagefapExtractor): """Extractor for all galleries from a user at imagefap.com""" - category = "imagefap" subcategory = "user" categorytransfer = True pattern = [(r"(?:https?://)?(?:www\.)?imagefap\.com/" @@ -146,7 +147,7 @@ class ImagefapUserExtractor(Extractor): })] def __init__(self, match): - Extractor.__init__(self) + ImagefapExtractor.__init__(self) try: self.user_id = int(match.group(1)) self.user = None diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 8bd2ec39..9d14b2ea 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2017 Mike Fährmann +# Copyright 2016-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -18,6 +18,7 @@ from urllib.parse import urljoin class ImagehostImageExtractor(Extractor): """Base class for single-image extractors for various imagehosts""" subcategory = "image" + archive_fmt = "{token}" https = False method = "post" params = "simple" diff --git a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py index e4c12a0a..07e7564d 100644 --- a/gallery_dl/extractor/imgbox.py +++ b/gallery_dl/extractor/imgbox.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2017 Mike Fährmann +# Copyright 2014-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -16,6 +16,7 @@ import re class ImgboxExtractor(Extractor): """Base class for imgbox extractors""" category = "imgbox" + archive_fmt = "{image_key}" root = "https://imgbox.com" def items(self): @@ -62,7 +63,7 @@ class ImgboxGalleryExtractor(AsynchronousExtractor, ImgboxExtractor): """Extractor for image galleries from imgbox.com""" subcategory = "gallery" directory_fmt = ["{category}", "{title} - 
{gallery_key}"] - filename_fmt = "{num:>03}-{filename}" + filename_fmt = "{num:>03}-{name}.{extension}" pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/g/([A-Za-z0-9]{10})"] test = [ ("https://imgbox.com/g/JaX5V5HX7g", { diff --git a/gallery_dl/extractor/imgchili.py b/gallery_dl/extractor/imgchili.py index 30689fd8..bdb0434c 100644 --- a/gallery_dl/extractor/imgchili.py +++ b/gallery_dl/extractor/imgchili.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2017 Mike Fährmann +# Copyright 2014-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -15,6 +15,7 @@ from .. import text class ImgchiliExtractor(Extractor): """Base class for imgchili extractors""" category = "imgchili" + archive_fmt = "{image_id}" root = "https://imgchili.net" def __init__(self, match): diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py index 689c3740..4c583a46 100644 --- a/gallery_dl/extractor/imgth.py +++ b/gallery_dl/extractor/imgth.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2017 Mike Fährmann +# Copyright 2015-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -18,6 +18,7 @@ class ImgthGalleryExtractor(Extractor): subcategory = "gallery" directory_fmt = ["{category}", "{gallery_id} {title}"] filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" + archive_fmt = "{gallery_id}_{num}" pattern = [r"(?:https?://)?imgth\.com/gallery/(\d+)"] test = [("http://imgth.com/gallery/37/wallpaper-anime", { "url": "4ae1d281ca2b48952cf5cca57e9914402ad72748", diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 13622917..bd52eee4 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -16,6 +16,7 @@ import json class ImgurExtractor(Extractor): """Base class for imgur 
extractors""" category = "imgur" + archive_fmt = "{hash}" def __init__(self, match): Extractor.__init__(self) diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py index 77a9bf51..61f1a3f0 100644 --- a/gallery_dl/extractor/khinsider.py +++ b/gallery_dl/extractor/khinsider.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2017 Mike Fährmann +# Copyright 2016-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -18,6 +18,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousExtractor): category = "khinsider" subcategory = "soundtrack" directory_fmt = ["{category}", "{album}"] + archive_fmt = "{album}_{name}" pattern = [r"(?:https?://)?downloads\.khinsider\.com/" r"game-soundtracks/album/([^/?&#]+)"] test = [(("https://downloads.khinsider.com/game-soundtracks/" diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index af6e2e81..9325da6a 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2017 Mike Fährmann +# Copyright 2016-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -18,6 +18,7 @@ class LusciousAlbumExtractor(AsynchronousExtractor): subcategory = "album" directory_fmt = ["{category}", "{gallery_id} {title}"] filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" + archive_fmt = "{gallery_id}_{image_id}" pattern = [(r"(?:https?://)?(?:www\.|members\.)?luscious\.net/" r"(?:c/[^/?&#]+/)?(?:pictures/album|albums)/([^/?&#]+_(\d+))")] test = [ diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py index 38ee21ac..669d3cae 100644 --- a/gallery_dl/extractor/mangareader.py +++ b/gallery_dl/extractor/mangareader.py @@ -59,6 +59,7 @@ class 
MangareaderMangaExtractor(MangareaderBase, MangaExtractor): class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): """Extractor for manga-chapters from mangareader.net""" + archive_fmt = "{manga}_{chapter}_{page}" pattern = [ (r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"), (r"(?:https?://)?(?:www\.)?mangareader\.net" diff --git a/gallery_dl/extractor/mangastream.py b/gallery_dl/extractor/mangastream.py index 6f21dc3b..14e48553 100644 --- a/gallery_dl/extractor/mangastream.py +++ b/gallery_dl/extractor/mangastream.py @@ -16,6 +16,7 @@ from urllib.parse import urljoin class MangastreamChapterExtractor(ChapterExtractor): """Extractor for manga-chapters from mangastream.com""" category = "mangastream" + archive_fmt = "{chapter_id}_{page}" pattern = [(r"(?:https?://)?(?:www\.)?(?:readms|mangastream)\.(?:com|net)/" r"r(?:ead)?/([^/]*/([^/]+)/(\d+))")] test = [("https://readms.net/r/onepunch_man/087/4874/1", None)] diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py index ec0a65ac..82020442 100644 --- a/gallery_dl/extractor/nhentai.py +++ b/gallery_dl/extractor/nhentai.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2017 Mike Fährmann +# Copyright 2015-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -19,6 +19,7 @@ class NhentaiGalleryExtractor(Extractor): subcategory = "gallery" directory_fmt = ["{category}", "{gallery_id} {title}"] filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" + archive_fmt = "{gallery_id}_{num}" pattern = [r"(?:https?://)?(?:www\.)?nhentai\.net/g/(\d+)"] test = [("http://nhentai.net/g/147850/", { "url": "5179dbf0f96af44005a0ff705a0ad64ac26547d0", diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 3839ad37..6165eb88 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -18,6 +18,7 @@ class 
NijieExtractor(AsynchronousExtractor): category = "nijie" directory_fmt = ["{category}", "{artist_id}"] filename_fmt = "{category}_{artist_id}_{image_id}_p{index:>02}.{extension}" + archive_fmt = "{image_id}_{index}" cookiedomain = "nijie.info" popup_url = "https://nijie.info/view_popup.php?id=" diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index 1d0d03a5..4f27f598 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -17,6 +17,7 @@ class PahealExtractor(SharedConfigExtractor): basecategory = "booru" category = "paheal" filename_fmt = "{category}_{id}_{md5}.{extension}" + archive_fmt = "{id}" root = "http://rule34.paheal.net" def items(self): diff --git a/gallery_dl/extractor/pawoo.py b/gallery_dl/extractor/pawoo.py index ec8561ca..a87366e8 100644 --- a/gallery_dl/extractor/pawoo.py +++ b/gallery_dl/extractor/pawoo.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 Mike Fährmann +# Copyright 2017-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,6 +17,7 @@ class PawooExtractor(Extractor): category = "pawoo" directory_fmt = ["{category}", "{account[username]}"] filename_fmt = "{category}_{id}_{media[id]}.{extension}" + archive_fmt = "{media[id]}" def __init__(self): Extractor.__init__(self) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 8fa1aecc..be629e60 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2017 Mike Fährmann +# Copyright 2016-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -16,6 +16,7 @@ class PinterestExtractor(Extractor): """Base class for pinterest extractors""" category = "pinterest" filename_fmt = 
"{category}_{pin_id}.{extension}" + archive_fmt = "{pin_id}" def __init__(self): Extractor.__init__(self) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index b44d8579..7c8bb5e6 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2017 Mike Fährmann +# Copyright 2014-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -19,6 +19,7 @@ class PixivExtractor(Extractor): category = "pixiv" directory_fmt = ["{category}", "{user[id]} {user[account]}"] filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}" + archive_fmt = "{id}{num}" illust_url = "https://www.pixiv.net/member_illust.php?mode=medium" def __init__(self): diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 12766f39..fb626db6 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -18,6 +18,7 @@ class ReadcomiconlineBase(): category = "readcomiconline" directory_fmt = ["{category}", "{comic}", "{issue:>03}"] filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}" + archive_fmt = "{comic}_{issue}_{page}" root = "http://readcomiconline.to" useragent = "Wget/1.19.2 (linux-gnu)" diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index ce175d1f..55f15f1d 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 Mike Fährmann +# Copyright 2017-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -118,6 +118,7 @@ class RedditImageExtractor(Extractor): """Extractor for reddit-hosted images""" category = "reddit" subcategory = "image" + archive_fmt = "{name}" pattern = 
[r"(?:https?://)?i\.redd(?:\.it|ituploads\.com)" r"/[^/?&#]+(?:\?[^#]*)?"] test = [ diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 86e626ed..a7f58b11 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2017 Mike Fährmann +# Copyright 2014-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -20,6 +20,7 @@ class SankakuExtractor(SharedConfigExtractor): basecategory = "booru" category = "sankaku" filename_fmt = "{category}_{id}_{md5}.{extension}" + archive_fmt = "{id}" cookienames = ("login", "pass_hash") cookiedomain = "chan.sankakucomplex.com" subdomain = "chan" diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py index ad3eea01..89f93d5c 100644 --- a/gallery_dl/extractor/seiga.py +++ b/gallery_dl/extractor/seiga.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2017 Mike Fährmann +# Copyright 2016-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -16,6 +16,7 @@ from ..cache import cache class SeigaExtractor(Extractor): """Base class for seiga extractors""" category = "seiga" + archive_fmt = "{image_id}" cookiedomain = ".nicovideo.jp" def __init__(self): diff --git a/gallery_dl/extractor/senmanga.py b/gallery_dl/extractor/senmanga.py index 3f6bc11a..d46d5e7c 100644 --- a/gallery_dl/extractor/senmanga.py +++ b/gallery_dl/extractor/senmanga.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2017 Mike Fährmann +# Copyright 2016-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -18,6 +18,7 @@ class SenmangaChapterExtractor(Extractor): subcategory = "chapter" directory_fmt 
= ["{category}", "{manga}", "{chapter_string}"] filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}" + archive_fmt = "{manga}_{chapter_string}_{page}" pattern = [r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)"] test = [ ("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", { diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py index c12d241d..1a680d34 100644 --- a/gallery_dl/extractor/slideshare.py +++ b/gallery_dl/extractor/slideshare.py @@ -18,6 +18,7 @@ class SlideshareExtractor(Extractor): subcategory = "presentation" directory_fmt = ["{category}", "{user}"] filename_fmt = "{presentation}-{num:>02}.{extension}" + archive_fmt = "{presentation}_{num}" pattern = [r"(?:https?://)?(?:www\.)?slideshare\.net" r"/(?:mobile/)?([^/?&#]+)/([^/?&#]+)"] test = [ diff --git a/gallery_dl/extractor/spectrumnexus.py b/gallery_dl/extractor/spectrumnexus.py index 93630196..8bc2aa1c 100644 --- a/gallery_dl/extractor/spectrumnexus.py +++ b/gallery_dl/extractor/spectrumnexus.py @@ -38,7 +38,7 @@ class SpectrumnexusChapterExtractor(ChapterExtractor): category = "spectrumnexus" directory_fmt = ["{category}", "{manga}", "{chapter_string}"] filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}" - + archive_fmt = "{manga}_{chapter_string}_{page}" pattern = [r"(?:https?://)?view\.thespectrum\.net/series/" r"([^\.]+\.html)\?ch=(Chapter\+(\d+)|Volume\+(\d+))"] test = [(("http://view.thespectrum.net/series/" diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index d02cde9e..2752ec32 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -17,7 +17,7 @@ import re def _original_image(url): match = re.match( r"https?://\d+\.media\.tumblr\.com" - r"((/[0-9a-f]+)?/tumblr_[^/?&#.]+)_\d+\.([0-9a-z]+)", + r"((/[0-9a-f]+)?/tumblr_[^/?&#.]+_)\d+(\.[0-9a-z]+)", url) if not match: @@ -26,8 +26,8 @@ def _original_image(url): path, key, ext = match.groups() return ( - "".join((root, path, 
"_raw." if key else "_1280.", ext)), - "".join((root, path, "_500.", ext)), + "".join((root, path, "raw" if key else "1280", ext)), + "".join((root, path, "500", ext)), url, ) @@ -53,6 +53,7 @@ class TumblrExtractor(Extractor): category = "tumblr" directory_fmt = ["{category}", "{name}"] filename_fmt = "{category}_{blog_name}_{id}o{offset}.{extension}" + archive_fmt = "{id}_{offset}" def __init__(self, match): Extractor.__init__(self) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 0d13077a..36779000 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2017 Mike Fährmann +# Copyright 2016-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -18,6 +18,7 @@ class TwitterTweetExtractor(Extractor): subcategory = "tweet" directory_fmt = ["{category}", "{user}"] filename_fmt = "{tweet_id}_{num}.{extension}" + archive_fmt = "{tweet_id}_{num}" pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/" r"(([^/]+)/status/(\d+))"] test = [ diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index 843d62db..2804cef2 100644 --- a/gallery_dl/extractor/warosu.py +++ b/gallery_dl/extractor/warosu.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 Mike Fährmann +# Copyright 2017-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -18,6 +18,7 @@ class WarosuThreadExtractor(Extractor): subcategory = "thread" directory_fmt = ["{category}", "{board}", "{thread} - {title}"] filename_fmt = "{tim}-{filename}{ext}" + archive_fmt = "{board}_{thread}_{tim}" pattern = [r"(?:https?://)?(?:www\.)?warosu\.org/([^/]+)/thread/(\d+)"] test = [ ("https://warosu.org/jp/thread/16656025", { diff --git 
a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py index a11d5108..d3575389 100644 --- a/gallery_dl/extractor/xvideos.py +++ b/gallery_dl/extractor/xvideos.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 Mike Fährmann +# Copyright 2017-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -29,6 +29,7 @@ class XvideosGalleryExtractor(XvideosExtractor): subcategory = "gallery" directory_fmt = ["{category}", "{user[name]}", "{title}"] filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" + archive_fmt = "{gallery_id}_{num}" pattern = [r"(?:https?://)?(?:www\.)?xvideos\.com" r"/profiles/([^/?&#]+)/photos/(\d+)"] test = [ diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 5dd7839e..902288f1 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -10,6 +10,7 @@ import sys import time import json import hashlib +import logging from . import extractor, downloader, config, util, output, exception from .extractor.message import Message @@ -152,34 +153,57 @@ class DownloadJob(Job): def __init__(self, url, parent=None): Job.__init__(self, url, parent) + self.log = logging.getLogger("download") self.pathfmt = None + self.archive = None self.sleep = None self.downloaders = {} self.out = output.select() - def handle_url(self, url, keywords): + def handle_url(self, url, keywords, fallback=None): """Download the resource specified in 'url'""" - if self._prepare_download(keywords): - dlobj = self.get_downloader(url) - if not dlobj.download(url, self.pathfmt): - self._report_failure(dlobj) + # prepare download + self.pathfmt.set_keywords(keywords) + + if self.pathfmt.exists(self.archive): + self.out.skip(self.pathfmt.path) + return + + if self.sleep: + time.sleep(self.sleep) + + # download from URL + if not self.get_downloader(url).download(url, self.pathfmt): + + # use fallback URLs if available + for num, url in 
enumerate(fallback or (), 1): + self.log.info("Trying fallback URL #%d", num) + if self.get_downloader(url).download(url, self.pathfmt): + break + else: + # download failed + self.log.error( + "Failed to download %s", self.pathfmt.filename) + return + + # download succeeded + if self.archive: + self.archive.add() def handle_urllist(self, urls, keywords): """Download the resource specified in 'url'""" - if self._prepare_download(keywords): - for num, url in enumerate(urls): - dlobj = self.get_downloader(url) - if num: - dlobj.log.info("Trying fallback URL #%d", num) - if dlobj.download(url, self.pathfmt): - return - self._report_failure(dlobj) + fallback = iter(urls) + url = next(fallback) + self.handle_url(url, keywords, fallback) def handle_directory(self, keywords): """Set and create the target directory for downloads""" if not self.pathfmt: self.pathfmt = util.PathFormat(self.extractor) self.sleep = self.extractor.config("sleep") + archive = self.extractor.config("archive") + if archive: + self.archive = util.DownloadArchive(self.extractor, archive) self.pathfmt.set_directory(keywords) def handle_queue(self, url, keywords): @@ -201,18 +225,6 @@ class DownloadJob(Job): self.downloaders[scheme] = instance return instance - def _prepare_download(self, keywords): - self.pathfmt.set_keywords(keywords) - if self.pathfmt.exists(): - self.out.skip(self.pathfmt.path) - return False - if self.sleep: - time.sleep(self.sleep) - return True - - def _report_failure(self, dlobj): - dlobj.log.error("Failed to download %s", self.pathfmt.filename) - class KeywordJob(Job): """Print available keywords""" diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 3379ca75..b464a28a 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -206,6 +206,12 @@ def build_parser(): ) selection = parser.add_argument_group("Selection Options") + selection.add_argument( + "--download-archive", + metavar="FILE", dest="archive", action=ConfigAction, + help=("Record all downloaded 
files in the archive file and " + "skip downloading any file already in it.") + ) selection.add_argument( "--range", metavar="RANGE", dest="image_range", diff --git a/gallery_dl/util.py b/gallery_dl/util.py index a52daae2..9c54cbf9 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -19,6 +19,7 @@ import shutil import string import _string import hashlib +import sqlite3 import datetime import itertools import urllib.parse @@ -373,22 +374,31 @@ class PathFormat(): if os.altsep: self.basedirectory = self.basedirectory.replace(os.altsep, os.sep) - skipmode = extractor.config("skip", True) - if skipmode == "abort": - self.exists = self._exists_abort - elif skipmode == "exit": - self.exists = self._exists_exit - elif not skipmode: - self.exists = lambda: False + skip = extractor.config("skip", True) + if skip: + if skip == "abort": + self._skipexc = exception.StopExtraction + elif skip == "exit": + self._skipexc = exit + else: + self._skipexc = None + else: + self.exists = lambda x=None: False def open(self, mode="wb"): """Open file and return a corresponding file object""" return open(self.partpath or self.realpath, mode) - def exists(self): - """Return True if 'path' is complete and refers to an existing path""" - if self.has_extension: - return os.path.exists(self.realpath) + def exists(self, archive=None): + if (self.has_extension and os.path.exists(self.realpath) or + archive and archive.check(self.keywords)): + if self._skipexc: + raise self._skipexc() + if not self.has_extension: + self.set_extension("") + if self.path[-1] == ".": + self.path = self.path[:-1] + return True return False def set_directory(self, keywords): @@ -473,16 +483,6 @@ class PathFormat(): shutil.copyfile(self.partpath, self.realpath) os.unlink(self.partpath) - def _exists_abort(self): - if self.has_extension and os.path.exists(self.realpath): - raise exception.StopExtraction() - return False - - def _exists_exit(self): - if self.has_extension and os.path.exists(self.realpath): - 
exit() - return False - @staticmethod def adjust_path(path): """Enable longer-than-260-character paths on windows""" @@ -535,3 +535,30 @@ class OAuthSession(): OAuthSession.quote(str(key)) + "=" + OAuthSession.quote(str(value)) for key, value in sorted(params.items()) if value ) + + +class DownloadArchive(): + + def __init__(self, extractor, path): + con = sqlite3.connect(path) + con.isolation_level = None + self.cursor = con.cursor() + self.cursor.execute("CREATE TABLE IF NOT EXISTS archive " + "(entry PRIMARY KEY) WITHOUT ROWID") + self.keygen = ( + extractor.category + + (extractor.archive_fmt or extractor.filename_fmt) + ).format_map + self._key = None + + def check(self, kwdict): + """Return True if item described by 'kwdict' exists in archive""" + self._key = self.keygen(kwdict) + self.cursor.execute( + "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (self._key,)) + return self.cursor.fetchone() + + def add(self): + """Add last item used in 'check()' to archive""" + self.cursor.execute( + "INSERT OR IGNORE INTO archive VALUES (?)", (self._key,))