diff --git a/docs/configuration.rst b/docs/configuration.rst
index b9eca00d..a89e5341 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -5586,6 +5586,63 @@ Description
     See `metadata.event`_ for a list of available events.
 
 
+hash.chunk-size
+---------------
+Type
+    ``integer``
+Default
+    ``32768``
+Description
+    Number of bytes read per chunk during file hash computation.
+
+
+hash.event
+----------
+Type
+    * ``string``
+    * ``list`` of ``strings``
+Default
+    ``"file"``
+Description
+    The event(s) for which `file hashes <hash.hashes_>`__ are computed.
+
+    See `metadata.event`_ for a list of available events.
+
+
+hash.filename
+-------------
+Type
+    * ``bool``
+Default
+    ``false``
+Description
+    Rebuild `filenames <extractor.*.filename_>`__ after computing
+    `hash digests <hash.hashes_>`__ and adding them to the metadata dict.
+
+
+hash.hashes
+-----------
+Type
+    * ``string``
+    * ``object`` (`field name` -> `hash algorithm`)
+Default
+    ``"md5,sha1"``
+Example
+    .. code:: json
+
+        "sha256:hash_sha,sha3_512:hash_sha3"
+
+    .. code:: json
+
+        {
+            "hash_sha" : "sha256",
+            "hash_sha3": "sha3_512"
+        }
+
+Description
+    Hash digests to compute.
+
+
 metadata.mode
 -------------
 Type
@@ -6694,6 +6751,8 @@ Description
         | (requires `downloader.*.part`_ = ``true`` and `extractor.*.skip`_ = ``false``)
     ``exec``
         Execute external commands
+    ``hash``
+        Compute file hash digests
     ``metadata``
         Write metadata to separate files
     ``mtime``
diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py
index c17eacc6..7837b063 100644
--- a/gallery_dl/postprocessor/__init__.py
+++ b/gallery_dl/postprocessor/__init__.py
@@ -12,6 +12,7 @@ modules = [
     "classify",
     "compare",
     "exec",
+    "hash",
     "metadata",
     "mtime",
     "python",
diff --git a/gallery_dl/postprocessor/hash.py b/gallery_dl/postprocessor/hash.py
new file mode 100644
index 00000000..92a74779
--- /dev/null
+++ b/gallery_dl/postprocessor/hash.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Compute file hash digests"""
+
+from .common import PostProcessor
+import hashlib
+
+DEFAULT_CHUNK_SIZE = 32768
+DEFAULT_HASHES = (("md5", "md5"), ("sha1", "sha1"))
+
+
+class HashPP(PostProcessor):
+    """Post processor adding file hash digests to the metadata dict"""
+
+    def __init__(self, job, options):
+        PostProcessor.__init__(self, job)
+
+        # read(0) returns b"" at once, which would end the read loop
+        # and yield the digests of empty input; use the default instead
+        self.chunk_size = options.get("chunk-size") or DEFAULT_CHUNK_SIZE
+        self.filename = options.get("filename")
+        self.hashes = self._parse_hashes(options.get("hashes"))
+
+        events = options.get("event")
+        if events is None:
+            events = ("file",)
+        elif isinstance(events, str):
+            events = events.split(",")
+        job.register_hooks({event: self.run for event in events}, options)
+
+    @staticmethod
+    def _parse_hashes(hashes):
+        """Convert a 'hashes' option value to (field name, algorithm) pairs
+
+        Accepts a "algorithm[:field],..." string, a {field: algorithm}
+        dict, or a list of such strings and/or (field, algorithm) pairs.
+        """
+        if isinstance(hashes, dict):
+            return list(hashes.items())
+        if not hashes:
+            return DEFAULT_HASHES
+        if isinstance(hashes, str):
+            hashes = hashes.split(",")
+
+        pairs = []
+        for spec in hashes:
+            if not isinstance(spec, str):
+                key, name = spec
+                pairs.append((key, name))
+                continue
+            # tolerate surrounding whitespace and empty entries,
+            # e.g. "md5, sha1," instead of failing in hashlib.new()
+            name, _, key = spec.partition(":")
+            name = name.strip()
+            if name:
+                pairs.append((key.strip() or name, name))
+        return pairs or DEFAULT_HASHES
+
+    def run(self, pathfmt):
+        """Hash the file of 'pathfmt' and store the digests in its kwdict"""
+        hashes = [
+            (key, hashlib.new(name))
+            for key, name in self.hashes
+        ]
+
+        size = self.chunk_size
+        with self._open(pathfmt) as fp:
+            while True:
+                data = fp.read(size)
+                if not data:
+                    break
+                for _, h in hashes:
+                    h.update(data)
+
+        for key, h in hashes:
+            pathfmt.kwdict[key] = h.hexdigest()
+
+        if self.filename:
+            pathfmt.build_path()
+
+    def _open(self, pathfmt):
+        """Open 'temppath' for reading; fall back to 'realpath' on OSError"""
+        try:
+            return open(pathfmt.temppath, "rb")
+        except OSError:
+            return open(pathfmt.realpath, "rb")
+
+
+__postprocessor__ = HashPP
diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index 3089b82a..1d80df2f 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -240,6 +240,88 @@ class ExecTest(BasePostprocessorTest):
         self.assertFalse(i.wait.called)
 
 
+class HashTest(BasePostprocessorTest):
+
+    def test_default(self):
+        self._create({})
+
+        with self.pathfmt.open() as fp:
+            fp.write(b"Foo Bar\n")
+
+        self._trigger()
+
+        kwdict = self.pathfmt.kwdict
+        self.assertEqual(
+            "35c9c9c7c90ad764bae9e2623f522c24", kwdict["md5"], "md5")
+        self.assertEqual(
+            "14d3d804494ef4e57d72de63e4cfee761240471a", kwdict["sha1"], "sha1")
+
+    def test_custom_hashes(self):
+        self._create({"hashes": "sha256:a,sha512:b"})
+
+        with self.pathfmt.open() as fp:
+            fp.write(b"Foo Bar\n")
+
+        self._trigger()
+
+        kwdict = self.pathfmt.kwdict
+        self.assertEqual(
+            "4775b55be17206445d7015a5fc7656f38a74b880670523c3b175455f885f2395",
+            kwdict["a"], "sha256")
+        self.assertEqual(
+            "6028f9e6957f4ca929941318c4bba6258713fd5162f9e33bd10e1c456d252700"
+            "3e1095b50736c4fd1e2deea152e3c8ecd5993462a747208e4d842659935a1c62",
+            kwdict["b"], "sha512")
+
+    def test_custom_hashes_dict(self):
+        self._create({"hashes": {"a": "sha256", "b": "sha512"}})
+
+        with self.pathfmt.open() as fp:
+            fp.write(b"Foo Bar\n")
+
+        self._trigger()
+
+        kwdict = self.pathfmt.kwdict
+        self.assertEqual(
+            "4775b55be17206445d7015a5fc7656f38a74b880670523c3b175455f885f2395",
+            kwdict["a"], "sha256")
+        self.assertEqual(
+            "6028f9e6957f4ca929941318c4bba6258713fd5162f9e33bd10e1c456d252700"
+            "3e1095b50736c4fd1e2deea152e3c8ecd5993462a747208e4d842659935a1c62",
+            kwdict["b"], "sha512")
+
+    def test_custom_hashes_whitespace(self):
+        self._create({"hashes": " sha256 : a , sha512:b ,"})
+
+        with self.pathfmt.open() as fp:
+            fp.write(b"Foo Bar\n")
+
+        self._trigger()
+
+        kwdict = self.pathfmt.kwdict
+        self.assertEqual(
+            "4775b55be17206445d7015a5fc7656f38a74b880670523c3b175455f885f2395",
+            kwdict["a"], "sha256")
+        self.assertEqual(
+            "6028f9e6957f4ca929941318c4bba6258713fd5162f9e33bd10e1c456d252700"
+            "3e1095b50736c4fd1e2deea152e3c8ecd5993462a747208e4d842659935a1c62",
+            kwdict["b"], "sha512")
+
+    def test_chunk_size_zero(self):
+        self._create({"chunk-size": 0})
+
+        with self.pathfmt.open() as fp:
+            fp.write(b"Foo Bar\n")
+
+        self._trigger()
+
+        kwdict = self.pathfmt.kwdict
+        self.assertEqual(
+            "35c9c9c7c90ad764bae9e2623f522c24", kwdict["md5"], "md5")
+        self.assertEqual(
+            "14d3d804494ef4e57d72de63e4cfee761240471a", kwdict["sha1"], "sha1")
+
+
 class MetadataTest(BasePostprocessorTest):
 
     def test_metadata_default(self):