Merge branch 'archive'

pull/79/head
Mike Fährmann 7 years ago
commit 3cec533c28
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -353,6 +353,20 @@ Description Additional key-value pairs to be added to each metadata dictionary.
=========== ===== =========== =====
extractor.*.archive
-------------------
=========== =====
Type ``string``
Default ``null``
Description File to store IDs of downloaded files in. Downloads of files
already recorded in this archive file will be skipped_.
The resulting archive file is not a plain text file but an SQLite3
database, as lookup operations are significantly faster when the
amount of stored IDs gets reasonably large.
=========== =====
Extractor-specific Options Extractor-specific Options
========================== ==========================
@ -788,6 +802,7 @@ How To - login and visit Tumblr's Applications_ section
.. |datetime.max| replace:: ``datetime.max`` .. |datetime.max| replace:: ``datetime.max``
.. |strptime| replace:: strftime() and strptime() Behavior .. |strptime| replace:: strftime() and strptime() Behavior
.. _skipped: `extractor.*.skip`_
.. _`date-min and date-max`: `extractor.reddit.date-min & .date-max`_ .. _`date-min and date-max`: `extractor.reddit.date-min & .date-max`_
.. _date-format: extractor.reddit.date-format_ .. _date-format: extractor.reddit.date-format_

@ -16,6 +16,7 @@
}, },
"extractor": "extractor":
{ {
"archive": null,
"skip": true, "skip": true,
"sleep": 0, "sleep": 0,

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann # Copyright 2017-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -17,8 +17,10 @@ class FutabaThreadExtractor(Extractor):
category = "2chan" category = "2chan"
subcategory = "thread" subcategory = "thread"
directory_fmt = ["{category}", "{board_name}", "{thread}"] directory_fmt = ["{category}", "{board_name}", "{thread}"]
pattern = [r"(?:https?://)?(([^.]+)\.2chan\.net/([^/]+)/res/(\d+))"] filename_fmt = "{tim}.{extension}"
archive_fmt = "{board}_{thread}_{tim}"
urlfmt = "https://{server}.2chan.net/{board}/src/{filename}" urlfmt = "https://{server}.2chan.net/{board}/src/{filename}"
pattern = [r"(?:https?://)?(([^.]+)\.2chan\.net/([^/]+)/res/(\d+))"]
test = [("http://dec.2chan.net/70/res/947.htm", { test = [("http://dec.2chan.net/70/res/947.htm", {
"url": "c5c12b80b290e224b6758507b3bb952044f4595b", "url": "c5c12b80b290e224b6758507b3bb952044f4595b",
"keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0", "keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0",

@ -20,6 +20,7 @@ class BooruExtractor(SharedConfigExtractor):
"""Base class for all booru extractors""" """Base class for all booru extractors"""
basecategory = "booru" basecategory = "booru"
filename_fmt = "{category}_{id}_{md5}.{extension}" filename_fmt = "{category}_{id}_{md5}.{extension}"
archive_fmt = "{id}"
api_url = "" api_url = ""
per_page = 50 per_page = 50
page_start = 1 page_start = 1

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2015-2017 Mike Fährmann # Copyright 2015-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -19,7 +19,8 @@ class ChanThreadExtractor(Extractor):
category = "chan" category = "chan"
subcategory = "thread" subcategory = "thread"
directory_fmt = ["{category}", "{board}", "{thread} - {title}"] directory_fmt = ["{category}", "{board}", "{thread} - {title}"]
filename_fmt = "{tim}-{filename}{ext}" filename_fmt = "{tim}-{filename}.{extension}"
archive_fmt = "{board}_{thread}_{tim}"
api_url = "" api_url = ""
file_url = "" file_url = ""
@ -69,6 +70,7 @@ class FoolfuukaThreadExtractor(SharedConfigExtractor):
directory_fmt = ["{category}", "{board[shortname]}", directory_fmt = ["{category}", "{board[shortname]}",
"{thread_num}{title:? - //}"] "{thread_num}{title:? - //}"]
filename_fmt = "{media[media]}" filename_fmt = "{media[media]}"
# Archive ID for a post: board short name, post number and timestamp.
# The board field must use single braces; doubled braces are format-string
# escapes and would emit the literal text "{board[shortname]}" instead of the
# board's name, making IDs from different boards indistinguishable.
archive_fmt = "{board[shortname]}_{num}_{timestamp}"
root = "" root = ""
referer = True referer = True

@ -28,6 +28,7 @@ class Extractor():
categorytransfer = False categorytransfer = False
directory_fmt = ["{category}"] directory_fmt = ["{category}"]
filename_fmt = "{name}.{extension}" filename_fmt = "{name}.{extension}"
archive_fmt = ""
cookiedomain = "" cookiedomain = ""
def __init__(self): def __init__(self):

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2015-2017 Mike Fährmann # Copyright 2015-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -20,8 +20,9 @@ import re
class DeviantartExtractor(Extractor): class DeviantartExtractor(Extractor):
"""Base class for deviantart extractors""" """Base class for deviantart extractors"""
category = "deviantart" category = "deviantart"
filename_fmt = "{category}_{index}_{title}.{extension}"
directory_fmt = ["{category}", "{author[username]!l}"] directory_fmt = ["{category}", "{author[username]!l}"]
filename_fmt = "{category}_{index}_{title}.{extension}"
archive_fmt = "{index}.{extension}"
def __init__(self, match=None): def __init__(self, match=None):
Extractor.__init__(self) Extractor.__init__(self)

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2014-2017 Mike Fährmann # Copyright 2014-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -22,6 +22,7 @@ class ExhentaiGalleryExtractor(Extractor):
subcategory = "gallery" subcategory = "gallery"
directory_fmt = ["{category}", "{gallery_id}"] directory_fmt = ["{category}", "{gallery_id}"]
filename_fmt = "{gallery_id}_{num:>04}_{image_token}_{name}.{extension}" filename_fmt = "{gallery_id}_{num:>04}_{image_token}_{name}.{extension}"
archive_fmt = "{gallery_id}_{num}"
pattern = [r"(?:https?://)?(g\.e-|e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"] pattern = [r"(?:https?://)?(g\.e-|e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"]
test = [ test = [
("https://exhentai.org/g/960460/4f0e369d82/", { ("https://exhentai.org/g/960460/4f0e369d82/", {

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann # Copyright 2017-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -16,6 +16,7 @@ class FlickrExtractor(Extractor):
"""Base class for flickr extractors""" """Base class for flickr extractors"""
category = "flickr" category = "flickr"
filename_fmt = "{category}_{id}.{extension}" filename_fmt = "{category}_{id}.{extension}"
archive_fmt = "{id}"
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) Extractor.__init__(self)

@ -62,6 +62,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
directory_fmt = ["{category}", "{manga}", "{chapter_string}"] directory_fmt = ["{category}", "{manga}", "{chapter_string}"]
filename_fmt = ( filename_fmt = (
"{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}") "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
archive_fmt = "{id}"
method = "default" method = "default"
def __init__(self, match, url=None): def __init__(self, match, url=None):

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2014-2017 Mike Fährmann # Copyright 2014-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -18,6 +18,7 @@ class GelbooruExtractor(SharedConfigExtractor):
basecategory = "booru" basecategory = "booru"
category = "gelbooru" category = "gelbooru"
filename_fmt = "{category}_{id}_{md5}.{extension}" filename_fmt = "{category}_{id}_{md5}.{extension}"
archive_fmt = "{id}"
api_url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index" api_url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index"
def __init__(self): def __init__(self):

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann # Copyright 2017-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -15,6 +15,7 @@ from .. import exception
class GfycatExtractor(Extractor): class GfycatExtractor(Extractor):
"""Base class for gfycat extractors""" """Base class for gfycat extractors"""
category = "gfycat" category = "gfycat"
archive_fmt = "{gfyName}"
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) Extractor.__init__(self)

@ -69,6 +69,7 @@ class HbrowseChapterExtractor(HbrowseExtractor, ChapterExtractor):
directory_fmt = ["{category}", "{manga_id} {manga}", "c{chapter:>05}"] directory_fmt = ["{category}", "{manga_id} {manga}", "c{chapter:>05}"]
filename_fmt = ("{category}_{manga_id}_{chapter:>05}_" filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
"{page:>03}.{extension}") "{page:>03}.{extension}")
archive_fmt = "{manga_id}_{chapter}_{page}"
pattern = [r"(?:https?://)?(?:www\.)?hbrowse\.com/(\d+)/c(\d+)"] pattern = [r"(?:https?://)?(?:www\.)?hbrowse\.com/(\d+)/c(\d+)"]
test = [("http://www.hbrowse.com/10363/c00000", { test = [("http://www.hbrowse.com/10363/c00000", {
"url": "634f4800858913f097bc3b62a8fedaf74b5254bd", "url": "634f4800858913f097bc3b62a8fedaf74b5254bd",

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2015-2017 Mike Fährmann # Copyright 2015-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -18,6 +18,7 @@ class HentaifoundryUserExtractor(Extractor):
subcategory = "user" subcategory = "user"
directory_fmt = ["{category}", "{artist}"] directory_fmt = ["{category}", "{artist}"]
filename_fmt = "{category}_{index}_{title}.{extension}" filename_fmt = "{category}_{index}_{title}.{extension}"
archive_fmt = "{index}"
pattern = [r"(?:https?://)?(?:www\.)?hentai-foundry\.com/" pattern = [r"(?:https?://)?(?:www\.)?hentai-foundry\.com/"
r"(?:pictures/user/([^/]+)/?$|user/([^/]+)/profile)"] r"(?:pictures/user/([^/]+)/?$|user/([^/]+)/profile)"]
test = [ test = [

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2015-2017 Mike Fährmann # Copyright 2015-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -19,6 +19,7 @@ class HitomiGalleryExtractor(Extractor):
subcategory = "gallery" subcategory = "gallery"
directory_fmt = ["{category}", "{gallery_id} {title}"] directory_fmt = ["{category}", "{gallery_id} {title}"]
filename_fmt = "{category}_{gallery_id}_{num:>03}_{name}.{extension}" filename_fmt = "{category}_{gallery_id}_{num:>03}_{name}.{extension}"
archive_fmt = "{gallery_id}_{num}"
pattern = [r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)\.html"] pattern = [r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)\.html"]
test = [("https://hitomi.la/galleries/867789.html", { test = [("https://hitomi.la/galleries/867789.html", {
"url": "e42a47dfadda93e4bf37e82b1dc9ad29edfa9130", "url": "e42a47dfadda93e4bf37e82b1dc9ad29edfa9130",

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2014-2017 Mike Fährmann # Copyright 2014-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -17,7 +17,8 @@ class ImagebamGalleryExtractor(AsynchronousExtractor):
category = "imagebam" category = "imagebam"
subcategory = "gallery" subcategory = "gallery"
directory_fmt = ["{category}", "{title} - {gallery_key}"] directory_fmt = ["{category}", "{title} - {gallery_key}"]
filename_fmt = "{num:>03}-{filename}" filename_fmt = "{num:>03}-{name}.{extension}"
archive_fmt = "{image_id}"
pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+)"] pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+)"]
test = [(("http://www.imagebam.com/" test = [(("http://www.imagebam.com/"
"gallery/adz2y0f9574bjpmonaismyrhtjgvey4o"), { "gallery/adz2y0f9574bjpmonaismyrhtjgvey4o"), {
@ -76,6 +77,7 @@ class ImagebamImageExtractor(Extractor):
"""Extractor for single images from imagebam.com""" """Extractor for single images from imagebam.com"""
category = "imagebam" category = "imagebam"
subcategory = "image" subcategory = "image"
archive_fmt = "{token}"
pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/image/([0-9a-f]{15})"] pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/image/([0-9a-f]{15})"]
test = [("http://www.imagebam.com/image/94d56c502511890", { test = [("http://www.imagebam.com/image/94d56c502511890", {
"url": "b384893c35a01a09c58018db71ddc4cf2480be95", "url": "b384893c35a01a09c58018db71ddc4cf2480be95",

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2016-2017 Mike Fährmann # Copyright 2016-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -13,12 +13,17 @@ from .. import text, util
import json import json
class ImagefapGalleryExtractor(Extractor): class ImagefapExtractor(Extractor):
"""Extractor for image galleries from imagefap.com""" """Base class for imagefap extractors"""
category = "imagefap" category = "imagefap"
subcategory = "gallery"
directory_fmt = ["{category}", "{gallery_id} {title}"] directory_fmt = ["{category}", "{gallery_id} {title}"]
filename_fmt = "{category}_{gallery_id}_{name}.{extension}" filename_fmt = "{category}_{gallery_id}_{name}.{extension}"
archive_fmt = "{gallery_id}_{image_id}"
class ImagefapGalleryExtractor(ImagefapExtractor):
"""Extractor for image galleries from imagefap.com"""
subcategory = "gallery"
pattern = [(r"(?:https?://)?(?:www\.)?imagefap\.com/" pattern = [(r"(?:https?://)?(?:www\.)?imagefap\.com/"
r"(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)")] r"(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)")]
test = [ test = [
@ -35,7 +40,7 @@ class ImagefapGalleryExtractor(Extractor):
] ]
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) ImagefapExtractor.__init__(self)
self.gid = match.group(1) self.gid = match.group(1)
self.image_id = "" self.image_id = ""
@ -80,12 +85,9 @@ class ImagefapGalleryExtractor(Extractor):
params["idx"] += 24 params["idx"] += 24
class ImagefapImageExtractor(Extractor): class ImagefapImageExtractor(ImagefapExtractor):
"""Extractor for single images from imagefap.com""" """Extractor for single images from imagefap.com"""
category = "imagefap"
subcategory = "image" subcategory = "image"
directory_fmt = ["{category}", "{gallery_id} {title}"]
filename_fmt = "{category}_{gallery_id}_{name}.{extension}"
pattern = [r"(?:https?://)?(?:www\.)?imagefap\.com/photo/(\d+)"] pattern = [r"(?:https?://)?(?:www\.)?imagefap\.com/photo/(\d+)"]
test = [("http://www.imagefap.com/photo/1369341772/", { test = [("http://www.imagefap.com/photo/1369341772/", {
"url": "24cc4312e4a5084f39f1e35af5ba92e5f7c1ad3c", "url": "24cc4312e4a5084f39f1e35af5ba92e5f7c1ad3c",
@ -93,7 +95,7 @@ class ImagefapImageExtractor(Extractor):
})] })]
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) ImagefapExtractor.__init__(self)
self.image_id = match.group(1) self.image_id = match.group(1)
def items(self): def items(self):
@ -132,9 +134,8 @@ class ImagefapImageExtractor(Extractor):
return json_dict return json_dict
class ImagefapUserExtractor(Extractor): class ImagefapUserExtractor(ImagefapExtractor):
"""Extractor for all galleries from a user at imagefap.com""" """Extractor for all galleries from a user at imagefap.com"""
category = "imagefap"
subcategory = "user" subcategory = "user"
categorytransfer = True categorytransfer = True
pattern = [(r"(?:https?://)?(?:www\.)?imagefap\.com/" pattern = [(r"(?:https?://)?(?:www\.)?imagefap\.com/"
@ -146,7 +147,7 @@ class ImagefapUserExtractor(Extractor):
})] })]
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) ImagefapExtractor.__init__(self)
try: try:
self.user_id = int(match.group(1)) self.user_id = int(match.group(1))
self.user = None self.user = None

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2016-2017 Mike Fährmann # Copyright 2016-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -18,6 +18,7 @@ from urllib.parse import urljoin
class ImagehostImageExtractor(Extractor): class ImagehostImageExtractor(Extractor):
"""Base class for single-image extractors for various imagehosts""" """Base class for single-image extractors for various imagehosts"""
subcategory = "image" subcategory = "image"
archive_fmt = "{token}"
https = False https = False
method = "post" method = "post"
params = "simple" params = "simple"

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2014-2017 Mike Fährmann # Copyright 2014-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -16,6 +16,7 @@ import re
class ImgboxExtractor(Extractor): class ImgboxExtractor(Extractor):
"""Base class for imgbox extractors""" """Base class for imgbox extractors"""
category = "imgbox" category = "imgbox"
archive_fmt = "{image_key}"
root = "https://imgbox.com" root = "https://imgbox.com"
def items(self): def items(self):
@ -62,7 +63,7 @@ class ImgboxGalleryExtractor(AsynchronousExtractor, ImgboxExtractor):
"""Extractor for image galleries from imgbox.com""" """Extractor for image galleries from imgbox.com"""
subcategory = "gallery" subcategory = "gallery"
directory_fmt = ["{category}", "{title} - {gallery_key}"] directory_fmt = ["{category}", "{title} - {gallery_key}"]
filename_fmt = "{num:>03}-{filename}" filename_fmt = "{num:>03}-{name}.{extension}"
pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/g/([A-Za-z0-9]{10})"] pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/g/([A-Za-z0-9]{10})"]
test = [ test = [
("https://imgbox.com/g/JaX5V5HX7g", { ("https://imgbox.com/g/JaX5V5HX7g", {

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2014-2017 Mike Fährmann # Copyright 2014-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -15,6 +15,7 @@ from .. import text
class ImgchiliExtractor(Extractor): class ImgchiliExtractor(Extractor):
"""Base class for imgchili extractors""" """Base class for imgchili extractors"""
category = "imgchili" category = "imgchili"
archive_fmt = "{image_id}"
root = "https://imgchili.net" root = "https://imgchili.net"
def __init__(self, match): def __init__(self, match):

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2015-2017 Mike Fährmann # Copyright 2015-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -18,6 +18,7 @@ class ImgthGalleryExtractor(Extractor):
subcategory = "gallery" subcategory = "gallery"
directory_fmt = ["{category}", "{gallery_id} {title}"] directory_fmt = ["{category}", "{gallery_id} {title}"]
filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
archive_fmt = "{gallery_id}_{num}"
pattern = [r"(?:https?://)?imgth\.com/gallery/(\d+)"] pattern = [r"(?:https?://)?imgth\.com/gallery/(\d+)"]
test = [("http://imgth.com/gallery/37/wallpaper-anime", { test = [("http://imgth.com/gallery/37/wallpaper-anime", {
"url": "4ae1d281ca2b48952cf5cca57e9914402ad72748", "url": "4ae1d281ca2b48952cf5cca57e9914402ad72748",

@ -16,6 +16,7 @@ import json
class ImgurExtractor(Extractor): class ImgurExtractor(Extractor):
"""Base class for imgur extractors""" """Base class for imgur extractors"""
category = "imgur" category = "imgur"
archive_fmt = "{hash}"
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) Extractor.__init__(self)

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2016-2017 Mike Fährmann # Copyright 2016-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -18,6 +18,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousExtractor):
category = "khinsider" category = "khinsider"
subcategory = "soundtrack" subcategory = "soundtrack"
directory_fmt = ["{category}", "{album}"] directory_fmt = ["{category}", "{album}"]
archive_fmt = "{album}_{name}"
pattern = [r"(?:https?://)?downloads\.khinsider\.com/" pattern = [r"(?:https?://)?downloads\.khinsider\.com/"
r"game-soundtracks/album/([^/?&#]+)"] r"game-soundtracks/album/([^/?&#]+)"]
test = [(("https://downloads.khinsider.com/game-soundtracks/" test = [(("https://downloads.khinsider.com/game-soundtracks/"

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2016-2017 Mike Fährmann # Copyright 2016-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -18,6 +18,7 @@ class LusciousAlbumExtractor(AsynchronousExtractor):
subcategory = "album" subcategory = "album"
directory_fmt = ["{category}", "{gallery_id} {title}"] directory_fmt = ["{category}", "{gallery_id} {title}"]
filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
archive_fmt = "{gallery_id}_{image_id}"
pattern = [(r"(?:https?://)?(?:www\.|members\.)?luscious\.net/" pattern = [(r"(?:https?://)?(?:www\.|members\.)?luscious\.net/"
r"(?:c/[^/?&#]+/)?(?:pictures/album|albums)/([^/?&#]+_(\d+))")] r"(?:c/[^/?&#]+/)?(?:pictures/album|albums)/([^/?&#]+_(\d+))")]
test = [ test = [

@ -59,6 +59,7 @@ class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):
class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
"""Extractor for manga-chapters from mangareader.net""" """Extractor for manga-chapters from mangareader.net"""
archive_fmt = "{manga}_{chapter}_{page}"
pattern = [ pattern = [
(r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"), (r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"),
(r"(?:https?://)?(?:www\.)?mangareader\.net" (r"(?:https?://)?(?:www\.)?mangareader\.net"

@ -16,6 +16,7 @@ from urllib.parse import urljoin
class MangastreamChapterExtractor(ChapterExtractor): class MangastreamChapterExtractor(ChapterExtractor):
"""Extractor for manga-chapters from mangastream.com""" """Extractor for manga-chapters from mangastream.com"""
category = "mangastream" category = "mangastream"
archive_fmt = "{chapter_id}_{page}"
pattern = [(r"(?:https?://)?(?:www\.)?(?:readms|mangastream)\.(?:com|net)/" pattern = [(r"(?:https?://)?(?:www\.)?(?:readms|mangastream)\.(?:com|net)/"
r"r(?:ead)?/([^/]*/([^/]+)/(\d+))")] r"r(?:ead)?/([^/]*/([^/]+)/(\d+))")]
test = [("https://readms.net/r/onepunch_man/087/4874/1", None)] test = [("https://readms.net/r/onepunch_man/087/4874/1", None)]

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2015-2017 Mike Fährmann # Copyright 2015-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -19,6 +19,7 @@ class NhentaiGalleryExtractor(Extractor):
subcategory = "gallery" subcategory = "gallery"
directory_fmt = ["{category}", "{gallery_id} {title}"] directory_fmt = ["{category}", "{gallery_id} {title}"]
filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
archive_fmt = "{gallery_id}_{num}"
pattern = [r"(?:https?://)?(?:www\.)?nhentai\.net/g/(\d+)"] pattern = [r"(?:https?://)?(?:www\.)?nhentai\.net/g/(\d+)"]
test = [("http://nhentai.net/g/147850/", { test = [("http://nhentai.net/g/147850/", {
"url": "5179dbf0f96af44005a0ff705a0ad64ac26547d0", "url": "5179dbf0f96af44005a0ff705a0ad64ac26547d0",

@ -18,6 +18,7 @@ class NijieExtractor(AsynchronousExtractor):
category = "nijie" category = "nijie"
directory_fmt = ["{category}", "{artist_id}"] directory_fmt = ["{category}", "{artist_id}"]
filename_fmt = "{category}_{artist_id}_{image_id}_p{index:>02}.{extension}" filename_fmt = "{category}_{artist_id}_{image_id}_p{index:>02}.{extension}"
archive_fmt = "{image_id}_{index}"
cookiedomain = "nijie.info" cookiedomain = "nijie.info"
popup_url = "https://nijie.info/view_popup.php?id=" popup_url = "https://nijie.info/view_popup.php?id="

@ -17,6 +17,7 @@ class PahealExtractor(SharedConfigExtractor):
basecategory = "booru" basecategory = "booru"
category = "paheal" category = "paheal"
filename_fmt = "{category}_{id}_{md5}.{extension}" filename_fmt = "{category}_{id}_{md5}.{extension}"
archive_fmt = "{id}"
root = "http://rule34.paheal.net" root = "http://rule34.paheal.net"
def items(self): def items(self):

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann # Copyright 2017-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -17,6 +17,7 @@ class PawooExtractor(Extractor):
category = "pawoo" category = "pawoo"
directory_fmt = ["{category}", "{account[username]}"] directory_fmt = ["{category}", "{account[username]}"]
filename_fmt = "{category}_{id}_{media[id]}.{extension}" filename_fmt = "{category}_{id}_{media[id]}.{extension}"
archive_fmt = "{media[id]}"
def __init__(self): def __init__(self):
Extractor.__init__(self) Extractor.__init__(self)

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2016-2017 Mike Fährmann # Copyright 2016-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -16,6 +16,7 @@ class PinterestExtractor(Extractor):
"""Base class for pinterest extractors""" """Base class for pinterest extractors"""
category = "pinterest" category = "pinterest"
filename_fmt = "{category}_{pin_id}.{extension}" filename_fmt = "{category}_{pin_id}.{extension}"
archive_fmt = "{pin_id}"
def __init__(self): def __init__(self):
Extractor.__init__(self) Extractor.__init__(self)

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2014-2017 Mike Fährmann # Copyright 2014-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -19,6 +19,7 @@ class PixivExtractor(Extractor):
category = "pixiv" category = "pixiv"
directory_fmt = ["{category}", "{user[id]} {user[account]}"] directory_fmt = ["{category}", "{user[id]} {user[account]}"]
filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}" filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}"
archive_fmt = "{id}{num}"
illust_url = "https://www.pixiv.net/member_illust.php?mode=medium" illust_url = "https://www.pixiv.net/member_illust.php?mode=medium"
def __init__(self): def __init__(self):

@ -18,6 +18,7 @@ class ReadcomiconlineBase():
category = "readcomiconline" category = "readcomiconline"
directory_fmt = ["{category}", "{comic}", "{issue:>03}"] directory_fmt = ["{category}", "{comic}", "{issue:>03}"]
filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}" filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
archive_fmt = "{comic}_{issue}_{page}"
root = "http://readcomiconline.to" root = "http://readcomiconline.to"
useragent = "Wget/1.19.2 (linux-gnu)" useragent = "Wget/1.19.2 (linux-gnu)"

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann # Copyright 2017-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -118,6 +118,7 @@ class RedditImageExtractor(Extractor):
"""Extractor for reddit-hosted images""" """Extractor for reddit-hosted images"""
category = "reddit" category = "reddit"
subcategory = "image" subcategory = "image"
archive_fmt = "{name}"
pattern = [r"(?:https?://)?i\.redd(?:\.it|ituploads\.com)" pattern = [r"(?:https?://)?i\.redd(?:\.it|ituploads\.com)"
r"/[^/?&#]+(?:\?[^#]*)?"] r"/[^/?&#]+(?:\?[^#]*)?"]
test = [ test = [

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2014-2017 Mike Fährmann # Copyright 2014-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -20,6 +20,7 @@ class SankakuExtractor(SharedConfigExtractor):
basecategory = "booru" basecategory = "booru"
category = "sankaku" category = "sankaku"
filename_fmt = "{category}_{id}_{md5}.{extension}" filename_fmt = "{category}_{id}_{md5}.{extension}"
archive_fmt = "{id}"
cookienames = ("login", "pass_hash") cookienames = ("login", "pass_hash")
cookiedomain = "chan.sankakucomplex.com" cookiedomain = "chan.sankakucomplex.com"
subdomain = "chan" subdomain = "chan"

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2016-2017 Mike Fährmann # Copyright 2016-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -16,6 +16,7 @@ from ..cache import cache
class SeigaExtractor(Extractor): class SeigaExtractor(Extractor):
"""Base class for seiga extractors""" """Base class for seiga extractors"""
category = "seiga" category = "seiga"
archive_fmt = "{image_id}"
cookiedomain = ".nicovideo.jp" cookiedomain = ".nicovideo.jp"
def __init__(self): def __init__(self):

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2016-2017 Mike Fährmann # Copyright 2016-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -18,6 +18,7 @@ class SenmangaChapterExtractor(Extractor):
subcategory = "chapter" subcategory = "chapter"
directory_fmt = ["{category}", "{manga}", "{chapter_string}"] directory_fmt = ["{category}", "{manga}", "{chapter_string}"]
filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}" filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}"
archive_fmt = "{manga}_{chapter_string}_{page}"
pattern = [r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)"] pattern = [r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)"]
test = [ test = [
("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", { ("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", {

@ -18,6 +18,7 @@ class SlideshareExtractor(Extractor):
subcategory = "presentation" subcategory = "presentation"
directory_fmt = ["{category}", "{user}"] directory_fmt = ["{category}", "{user}"]
filename_fmt = "{presentation}-{num:>02}.{extension}" filename_fmt = "{presentation}-{num:>02}.{extension}"
archive_fmt = "{presentation}_{num}"
pattern = [r"(?:https?://)?(?:www\.)?slideshare\.net" pattern = [r"(?:https?://)?(?:www\.)?slideshare\.net"
r"/(?:mobile/)?([^/?&#]+)/([^/?&#]+)"] r"/(?:mobile/)?([^/?&#]+)/([^/?&#]+)"]
test = [ test = [

@ -38,7 +38,7 @@ class SpectrumnexusChapterExtractor(ChapterExtractor):
category = "spectrumnexus" category = "spectrumnexus"
directory_fmt = ["{category}", "{manga}", "{chapter_string}"] directory_fmt = ["{category}", "{manga}", "{chapter_string}"]
filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}" filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}"
archive_fmt = "{manga}_{chapter_string}_{page}"
pattern = [r"(?:https?://)?view\.thespectrum\.net/series/" pattern = [r"(?:https?://)?view\.thespectrum\.net/series/"
r"([^\.]+\.html)\?ch=(Chapter\+(\d+)|Volume\+(\d+))"] r"([^\.]+\.html)\?ch=(Chapter\+(\d+)|Volume\+(\d+))"]
test = [(("http://view.thespectrum.net/series/" test = [(("http://view.thespectrum.net/series/"

@ -17,7 +17,7 @@ import re
def _original_image(url): def _original_image(url):
match = re.match( match = re.match(
r"https?://\d+\.media\.tumblr\.com" r"https?://\d+\.media\.tumblr\.com"
r"((/[0-9a-f]+)?/tumblr_[^/?&#.]+)_\d+\.([0-9a-z]+)", r"((/[0-9a-f]+)?/tumblr_[^/?&#.]+_)\d+(\.[0-9a-z]+)",
url) url)
if not match: if not match:
@ -26,8 +26,8 @@ def _original_image(url):
path, key, ext = match.groups() path, key, ext = match.groups()
return ( return (
"".join((root, path, "_raw." if key else "_1280.", ext)), "".join((root, path, "raw" if key else "1280", ext)),
"".join((root, path, "_500.", ext)), "".join((root, path, "500", ext)),
url, url,
) )
@ -53,6 +53,7 @@ class TumblrExtractor(Extractor):
category = "tumblr" category = "tumblr"
directory_fmt = ["{category}", "{name}"] directory_fmt = ["{category}", "{name}"]
filename_fmt = "{category}_{blog_name}_{id}o{offset}.{extension}" filename_fmt = "{category}_{blog_name}_{id}o{offset}.{extension}"
archive_fmt = "{id}_{offset}"
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) Extractor.__init__(self)

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2016-2017 Mike Fährmann # Copyright 2016-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -18,6 +18,7 @@ class TwitterTweetExtractor(Extractor):
subcategory = "tweet" subcategory = "tweet"
directory_fmt = ["{category}", "{user}"] directory_fmt = ["{category}", "{user}"]
filename_fmt = "{tweet_id}_{num}.{extension}" filename_fmt = "{tweet_id}_{num}.{extension}"
archive_fmt = "{tweet_id}_{num}"
pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/" pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/"
r"(([^/]+)/status/(\d+))"] r"(([^/]+)/status/(\d+))"]
test = [ test = [

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann # Copyright 2017-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -18,6 +18,7 @@ class WarosuThreadExtractor(Extractor):
subcategory = "thread" subcategory = "thread"
directory_fmt = ["{category}", "{board}", "{thread} - {title}"] directory_fmt = ["{category}", "{board}", "{thread} - {title}"]
filename_fmt = "{tim}-{filename}{ext}" filename_fmt = "{tim}-{filename}{ext}"
archive_fmt = "{board}_{thread}_{tim}"
pattern = [r"(?:https?://)?(?:www\.)?warosu\.org/([^/]+)/thread/(\d+)"] pattern = [r"(?:https?://)?(?:www\.)?warosu\.org/([^/]+)/thread/(\d+)"]
test = [ test = [
("https://warosu.org/jp/thread/16656025", { ("https://warosu.org/jp/thread/16656025", {

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann # Copyright 2017-2018 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -29,6 +29,7 @@ class XvideosGalleryExtractor(XvideosExtractor):
subcategory = "gallery" subcategory = "gallery"
directory_fmt = ["{category}", "{user[name]}", "{title}"] directory_fmt = ["{category}", "{user[name]}", "{title}"]
filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
archive_fmt = "{gallery_id}_{num}"
pattern = [r"(?:https?://)?(?:www\.)?xvideos\.com" pattern = [r"(?:https?://)?(?:www\.)?xvideos\.com"
r"/profiles/([^/?&#]+)/photos/(\d+)"] r"/profiles/([^/?&#]+)/photos/(\d+)"]
test = [ test = [

@ -10,6 +10,7 @@ import sys
import time import time
import json import json
import hashlib import hashlib
import logging
from . import extractor, downloader, config, util, output, exception from . import extractor, downloader, config, util, output, exception
from .extractor.message import Message from .extractor.message import Message
@ -152,34 +153,57 @@ class DownloadJob(Job):
def __init__(self, url, parent=None): def __init__(self, url, parent=None):
Job.__init__(self, url, parent) Job.__init__(self, url, parent)
self.log = logging.getLogger("download")
self.pathfmt = None self.pathfmt = None
self.archive = None
self.sleep = None self.sleep = None
self.downloaders = {} self.downloaders = {}
self.out = output.select() self.out = output.select()
def handle_url(self, url, keywords): def handle_url(self, url, keywords, fallback=None):
"""Download the resource specified in 'url'""" """Download the resource specified in 'url'"""
if self._prepare_download(keywords): # prepare download
dlobj = self.get_downloader(url) self.pathfmt.set_keywords(keywords)
if not dlobj.download(url, self.pathfmt):
self._report_failure(dlobj) if self.pathfmt.exists(self.archive):
self.out.skip(self.pathfmt.path)
return
if self.sleep:
time.sleep(self.sleep)
# download from URL
if not self.get_downloader(url).download(url, self.pathfmt):
# use fallback URLs if available
for num, url in enumerate(fallback or (), 1):
self.log.info("Trying fallback URL #%d", num)
if self.get_downloader(url).download(url, self.pathfmt):
break
else:
# download failed
self.log.error(
"Failed to download %s", self.pathfmt.filename)
return
# download succeeded
if self.archive:
self.archive.add()
def handle_urllist(self, urls, keywords): def handle_urllist(self, urls, keywords):
"""Download the resource specified in 'url'""" """Download the resource specified in 'url'"""
if self._prepare_download(keywords): fallback = iter(urls)
for num, url in enumerate(urls): url = next(fallback)
dlobj = self.get_downloader(url) self.handle_url(url, keywords, fallback)
if num:
dlobj.log.info("Trying fallback URL #%d", num)
if dlobj.download(url, self.pathfmt):
return
self._report_failure(dlobj)
def handle_directory(self, keywords): def handle_directory(self, keywords):
"""Set and create the target directory for downloads""" """Set and create the target directory for downloads"""
if not self.pathfmt: if not self.pathfmt:
self.pathfmt = util.PathFormat(self.extractor) self.pathfmt = util.PathFormat(self.extractor)
self.sleep = self.extractor.config("sleep") self.sleep = self.extractor.config("sleep")
archive = self.extractor.config("archive")
if archive:
self.archive = util.DownloadArchive(self.extractor, archive)
self.pathfmt.set_directory(keywords) self.pathfmt.set_directory(keywords)
def handle_queue(self, url, keywords): def handle_queue(self, url, keywords):
@ -201,18 +225,6 @@ class DownloadJob(Job):
self.downloaders[scheme] = instance self.downloaders[scheme] = instance
return instance return instance
def _prepare_download(self, keywords):
self.pathfmt.set_keywords(keywords)
if self.pathfmt.exists():
self.out.skip(self.pathfmt.path)
return False
if self.sleep:
time.sleep(self.sleep)
return True
def _report_failure(self, dlobj):
dlobj.log.error("Failed to download %s", self.pathfmt.filename)
class KeywordJob(Job): class KeywordJob(Job):
"""Print available keywords""" """Print available keywords"""

@ -206,6 +206,12 @@ def build_parser():
) )
selection = parser.add_argument_group("Selection Options") selection = parser.add_argument_group("Selection Options")
selection.add_argument(
"--download-archive",
metavar="FILE", dest="archive", action=ConfigAction,
help=("Record all downloaded files in the archive file and "
"skip downloading any file already in it.")
)
selection.add_argument( selection.add_argument(
"--range", "--range",
metavar="RANGE", dest="image_range", metavar="RANGE", dest="image_range",

@ -19,6 +19,7 @@ import shutil
import string import string
import _string import _string
import hashlib import hashlib
import sqlite3
import datetime import datetime
import itertools import itertools
import urllib.parse import urllib.parse
@ -373,22 +374,31 @@ class PathFormat():
if os.altsep: if os.altsep:
self.basedirectory = self.basedirectory.replace(os.altsep, os.sep) self.basedirectory = self.basedirectory.replace(os.altsep, os.sep)
skipmode = extractor.config("skip", True) skip = extractor.config("skip", True)
if skipmode == "abort": if skip:
self.exists = self._exists_abort if skip == "abort":
elif skipmode == "exit": self._skipexc = exception.StopExtraction
self.exists = self._exists_exit elif skip == "exit":
elif not skipmode: self._skipexc = exit
self.exists = lambda: False else:
self._skipexc = None
else:
self.exists = lambda x=None: False
def open(self, mode="wb"): def open(self, mode="wb"):
"""Open file and return a corresponding file object""" """Open file and return a corresponding file object"""
return open(self.partpath or self.realpath, mode) return open(self.partpath or self.realpath, mode)
def exists(self): def exists(self, archive=None):
"""Return True if 'path' is complete and refers to an existing path""" if (self.has_extension and os.path.exists(self.realpath) or
if self.has_extension: archive and archive.check(self.keywords)):
return os.path.exists(self.realpath) if self._skipexc:
raise self._skipexc()
if not self.has_extension:
self.set_extension("")
if self.path[-1] == ".":
self.path = self.path[:-1]
return True
return False return False
def set_directory(self, keywords): def set_directory(self, keywords):
@ -473,16 +483,6 @@ class PathFormat():
shutil.copyfile(self.partpath, self.realpath) shutil.copyfile(self.partpath, self.realpath)
os.unlink(self.partpath) os.unlink(self.partpath)
def _exists_abort(self):
if self.has_extension and os.path.exists(self.realpath):
raise exception.StopExtraction()
return False
def _exists_exit(self):
if self.has_extension and os.path.exists(self.realpath):
exit()
return False
@staticmethod @staticmethod
def adjust_path(path): def adjust_path(path):
"""Enable longer-than-260-character paths on windows""" """Enable longer-than-260-character paths on windows"""
@ -535,3 +535,30 @@ class OAuthSession():
OAuthSession.quote(str(key)) + "=" + OAuthSession.quote(str(value)) OAuthSession.quote(str(key)) + "=" + OAuthSession.quote(str(value))
for key, value in sorted(params.items()) if value for key, value in sorted(params.items()) if value
) )
class DownloadArchive():
    """Persistent record of already downloaded files (SQLite3 database)"""

    def __init__(self, extractor, path):
        """Open the archive database at 'path', creating it if necessary

        Archive keys are built from the extractor's category followed by
        its 'archive_fmt' (or its 'filename_fmt' if 'archive_fmt' is
        empty), formatted with the keyword dictionary given to check().
        """
        con = sqlite3.connect(path)
        # autocommit mode: every INSERT is written without explicit commit
        con.isolation_level = None
        self.connection = con
        self.cursor = con.cursor()

        try:
            self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
                                "(entry PRIMARY KEY) WITHOUT ROWID")
        except sqlite3.OperationalError:
            # fallback for SQLite versions without WITHOUT ROWID support
            self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
                                "(entry PRIMARY KEY)")

        self.keygen = (
            extractor.category +
            (extractor.archive_fmt or extractor.filename_fmt)
        ).format_map
        # key generated by the most recent check() call
        self._key = None

    def check(self, kwdict):
        """Return True if item described by 'kwdict' exists in archive"""
        # remember the key so that add() can record this item later
        self._key = self.keygen(kwdict)
        self.cursor.execute(
            "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (self._key,))
        return self.cursor.fetchone() is not None

    def add(self):
        """Add last item used in 'check()' to archive"""
        if self._key is None:
            # check() has not been called yet; there is nothing to record
            return
        self.cursor.execute(
            "INSERT OR IGNORE INTO archive VALUES (?)", (self._key,))

    def close(self):
        """Close the underlying database connection"""
        self.connection.close()

Loading…
Cancel
Save