[downloader:http] improve 'adjust-extensions' (#776)

Check file headers against a list of file signatures before
downloading the whole file and writing it to disk.

The file signature check needs some improvements (*),
but it produces usable results for the most part.

(*)
- 'webp', 'wav', and others start with 'RIFF'
- 'svg' uses the same "signature" as all XML documents
- 'webm' has the same signature as 'mkv' files
- only 'mp3' files in an ID3v2 container get recognized
pull/1195/head
Mike Fährmann 4 years ago
parent 46323ae6ff
commit 536c088462
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -44,12 +44,14 @@ class HttpDownloader(DownloaderBase):
if self.minsize: if self.minsize:
minsize = text.parse_bytes(self.minsize) minsize = text.parse_bytes(self.minsize)
if not minsize: if not minsize:
self.log.warning("Invalid minimum filesize (%r)", self.minsize) self.log.warning(
"Invalid minimum file size (%r)", self.minsize)
self.minsize = minsize self.minsize = minsize
if self.maxsize: if self.maxsize:
maxsize = text.parse_bytes(self.maxsize) maxsize = text.parse_bytes(self.maxsize)
if not maxsize: if not maxsize:
self.log.warning("Invalid maximum filesize (%r)", self.maxsize) self.log.warning(
"Invalid maximum file size (%r)", self.maxsize)
self.maxsize = maxsize self.maxsize = maxsize
if self.rate: if self.rate:
rate = text.parse_bytes(self.rate) rate = text.parse_bytes(self.rate)
@ -84,17 +86,20 @@ class HttpDownloader(DownloaderBase):
if tries: if tries:
if response: if response:
response.close() response.close()
response = None
self.log.warning("%s (%s/%s)", msg, tries, self.retries+1) self.log.warning("%s (%s/%s)", msg, tries, self.retries+1)
if tries > self.retries: if tries > self.retries:
return False return False
time.sleep(tries) time.sleep(tries)
tries += 1
tries += 1
headers = {} headers = {}
file_header = None
# check for .part file # check for .part file
filesize = pathfmt.part_size() file_size = pathfmt.part_size()
if filesize: if file_size:
headers["Range"] = "bytes={}-".format(filesize) headers["Range"] = "bytes={}-".format(file_size)
# file-specific headers # file-specific headers
extra = pathfmt.kwdict.get("_http_headers") extra = pathfmt.kwdict.get("_http_headers")
if extra: if extra:
@ -118,9 +123,9 @@ class HttpDownloader(DownloaderBase):
offset = 0 offset = 0
size = response.headers.get("Content-Length") size = response.headers.get("Content-Length")
elif code == 206: # Partial Content elif code == 206: # Partial Content
offset = filesize offset = file_size
size = response.headers["Content-Range"].rpartition("/")[2] size = response.headers["Content-Range"].rpartition("/")[2]
elif code == 416 and filesize: # Requested Range Not Satisfiable elif code == 416 and file_size: # Requested Range Not Satisfiable
break break
else: else:
msg = "'{} {}' for '{}'".format(code, response.reason, url) msg = "'{} {}' for '{}'".format(code, response.reason, url)
@ -129,7 +134,14 @@ class HttpDownloader(DownloaderBase):
self.log.warning(msg) self.log.warning(msg)
return False return False
# check filesize # set missing filename extension from MIME type
if not pathfmt.extension:
pathfmt.set_extension(self._find_extension(response))
if pathfmt.exists():
pathfmt.temppath = ""
return True
# check file size
size = text.parse_int(size, None) size = text.parse_int(size, None)
if size is not None: if size is not None:
if self.minsize and size < self.minsize: if self.minsize and size < self.minsize:
@ -143,50 +155,55 @@ class HttpDownloader(DownloaderBase):
size, self.maxsize) size, self.maxsize)
return False return False
# set missing filename extension # check filename extension against file header
if not pathfmt.extension: if self.adjust_extension and not offset and \
pathfmt.set_extension(self.get_extension(response)) pathfmt.extension in FILE_SIGNATURES:
if pathfmt.exists(): try:
file_header = next(response.iter_content(16), b"")
except (RequestException, SSLError, OpenSSLError) as exc:
msg = str(exc)
print()
continue
if self._adjust_extension(pathfmt, file_header) and \
pathfmt.exists():
pathfmt.temppath = "" pathfmt.temppath = ""
return True return True
# set open mode # set open mode
if not offset: if not offset:
mode = "w+b" mode = "w+b"
if filesize: if file_size:
self.log.debug("Unable to resume partial download") self.log.debug("Unable to resume partial download")
else: else:
mode = "r+b" mode = "r+b"
self.log.debug("Resuming download at byte %d", offset) self.log.debug("Resuming download at byte %d", offset)
# start downloading # download content
self.out.start(pathfmt.path)
self.downloading = True self.downloading = True
with pathfmt.open(mode) as file: with pathfmt.open(mode) as fp:
if offset: if file_header:
file.seek(offset) fp.write(file_header)
elif offset:
# download content if self.adjust_extension and \
pathfmt.extension in FILE_SIGNATURES:
self._adjust_extension(pathfmt, fp.read(16))
fp.seek(offset)
self.out.start(pathfmt.path)
try: try:
self.receive(response, file) self.receive(fp, response.iter_content(self.chunk_size))
except (RequestException, SSLError, OpenSSLError) as exc: except (RequestException, SSLError, OpenSSLError) as exc:
msg = str(exc) msg = str(exc)
print() print()
continue continue
# check filesize # check file size
if size and file.tell() < size: if size and fp.tell() < size:
msg = "filesize mismatch ({} < {})".format( msg = "file size mismatch ({} < {})".format(
file.tell(), size) fp.tell(), size)
print() print()
continue continue
# check filename extension
if self.adjust_extension:
adj_ext = self.check_extension(file, pathfmt.extension)
if adj_ext:
pathfmt.set_extension(adj_ext)
break break
self.downloading = False self.downloading = False
@ -198,16 +215,18 @@ class HttpDownloader(DownloaderBase):
return True return True
def receive(self, response, file): @staticmethod
for data in response.iter_content(self.chunk_size): def receive(fp, content):
file.write(data) write = fp.write
for data in content:
write(data)
def _receive_rate(self, response, file): def _receive_rate(self, fp, content):
t1 = time.time() t1 = time.time()
rt = self.rate rt = self.rate
for data in response.iter_content(self.chunk_size): for data in content:
file.write(data) fp.write(data)
t2 = time.time() # current time t2 = time.time() # current time
actual = t2 - t1 # actual elapsed time actual = t2 - t1 # actual elapsed time
@ -220,81 +239,91 @@ class HttpDownloader(DownloaderBase):
else: else:
t1 = t2 t1 = t2
def get_extension(self, response): def _find_extension(self, response):
"""Get filename extension from MIME type"""
mtype = response.headers.get("Content-Type", "image/jpeg") mtype = response.headers.get("Content-Type", "image/jpeg")
mtype = mtype.partition(";")[0] mtype = mtype.partition(";")[0]
if "/" not in mtype: if "/" not in mtype:
mtype = "image/" + mtype mtype = "image/" + mtype
if mtype in MIMETYPE_MAP: if mtype in MIME_TYPES:
return MIMETYPE_MAP[mtype] return MIME_TYPES[mtype]
exts = mimetypes.guess_all_extensions(mtype, strict=False) ext = mimetypes.guess_extension(mtype, strict=False)
if exts: if ext:
exts.sort() return ext[1:]
return exts[-1][1:] self.log.warning("Unknown MIME type '%s'", mtype)
return "bin"
self.log.warning(
"No filename extension found for MIME type '%s'", mtype)
return "txt"
@staticmethod @staticmethod
def check_extension(file, extension): def _adjust_extension(pathfmt, file_header):
"""Check filename extension against fileheader""" """Check filename extension against file header"""
if extension in FILETYPE_CHECK: sig = FILE_SIGNATURES[pathfmt.extension]
file.seek(0) if not file_header.startswith(sig):
header = file.read(8) for ext, sig in FILE_SIGNATURES.items():
if len(header) >= 8 and not FILETYPE_CHECK[extension](header): if file_header.startswith(sig):
for ext, check in FILETYPE_CHECK.items(): pathfmt.set_extension(ext)
if ext != extension and check(header): return True
return ext return False
return None
FILETYPE_CHECK = {
"jpg": lambda h: h[0:2] == b"\xff\xd8",
"png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",
"gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97,
}
MIMETYPE_MAP = { MIME_TYPES = {
"image/jpeg": "jpg", "image/jpeg" : "jpg",
"image/jpg": "jpg", "image/jpg" : "jpg",
"image/png": "png", "image/png" : "png",
"image/gif": "gif", "image/gif" : "gif",
"image/bmp": "bmp", "image/bmp" : "bmp",
"image/x-bmp": "bmp", "image/x-bmp" : "bmp",
"image/x-ms-bmp": "bmp", "image/x-ms-bmp": "bmp",
"image/webp": "webp", "image/webp" : "webp",
"image/svg+xml": "svg", "image/svg+xml" : "svg",
"image/x-photoshop" : "psd",
"application/x-photoshop" : "psd",
"image/vnd.adobe.photoshop": "psd", "image/vnd.adobe.photoshop": "psd",
"image/x-photoshop": "psd",
"application/x-photoshop": "psd",
"video/webm": "webm", "video/webm": "webm",
"video/ogg": "ogg", "video/ogg" : "ogg",
"video/mp4": "mp4", "video/mp4" : "mp4",
"audio/wav": "wav", "audio/wav" : "wav",
"audio/x-wav": "wav", "audio/x-wav": "wav",
"audio/webm": "webm", "audio/webm" : "webm",
"audio/ogg": "ogg", "audio/ogg" : "ogg",
"audio/mpeg": "mp3", "audio/mpeg" : "mp3",
"application/zip": "zip", "application/zip" : "zip",
"application/x-zip": "zip", "application/x-zip": "zip",
"application/x-zip-compressed": "zip", "application/x-zip-compressed": "zip",
"application/rar": "rar", "application/rar" : "rar",
"application/x-rar": "rar", "application/x-rar": "rar",
"application/x-rar-compressed": "rar", "application/x-rar-compressed": "rar",
"application/x-7z-compressed": "7z", "application/x-7z-compressed" : "7z",
"application/ogg": "ogg", "application/ogg": "ogg",
"application/octet-stream": "bin", "application/octet-stream": "bin",
} }
# Maps a filename extension to the byte prefix ("magic number") a file of
# that type is expected to start with.
# taken from https://en.wikipedia.org/wiki/List_of_file_signatures
#
# Known limitations of prefix-only matching:
# - "webp" and "wav" share the generic RIFF container prefix, so they cannot
#   be told apart here; a lookup that scans this table in order will hit
#   "webp" first (assumes dict insertion order is preserved).
# - "svg" is matched by the XML declaration, which any XML document may use.
# - "webm" uses the same leading bytes as 'mkv' files.
# - "mp3" is only matched when the file begins with an ID3 tag.
FILE_SIGNATURES = {
"jpg" : b"\xFF\xD8\xFF",
"png" : b"\x89PNG\r\n\x1A\n",
"gif" : b"GIF8",
"bmp" : b"\x42\x4D",
"webp": b"RIFF",
"svg" : b"<?xml",
"psd" : b"8BPS",
"webm": b"\x1A\x45\xDF\xA3",
"ogg" : b"OggS",
"wav" : b"RIFF",
"mp3" : b"ID3",
"zip" : b"\x50\x4B",
"rar" : b"\x52\x61\x72\x21\x1A\x07",
"7z" : b"\x37\x7A\xBC\xAF\x27\x1C",
# check 'bin' files against all other file signatures
# NOTE(review): the four NUL bytes look like a placeholder that real files
# are unlikely to start with, so a "bin" file fails its own check and gets
# compared against the other entries -- confirm against the caller.
"bin" : b"\x00\x00\x00\x00",
}
__downloader__ = HttpDownloader __downloader__ = HttpDownloader

Loading…
Cancel
Save