[pp:hash] add 'hash' post processor (#6099)

pull/5071/merge
Mike Fährmann 3 weeks ago
parent f52cf54e16
commit ae9b0da755
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -5586,6 +5586,63 @@ Description
See `metadata.event`_ for a list of available events.
hash.chunk-size
---------------
Type
``integer``
Default
``32768``
Description
Number of bytes read per chunk during file hash computation.
hash.event
----------
Type
* ``string``
* ``list`` of ``strings``
Default
``"file"``
Description
The event(s) for which `file hashes <hash.hashes_>`__ are computed.
See `metadata.event`_ for a list of available events.
hash.filename
-------------
Type
* ``bool``
Default
``false``
Description
Rebuild `filenames <extractor.*.filename_>`__ after computing
`hash digests <hash.hashes_>`__ and adding them to the metadata dict.
hash.hashes
-----------
Type
* ``string``
* ``object`` (`field name` -> `hash algorithm`)
Default
``"md5,sha1"``
Example
.. code:: json
"sha256:hash_sha,sha3_512:hash_sha3"
.. code:: json
{
"hash_sha" : "sha256",
"hash_sha3": "sha3_512"
}
Description
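Hash digests to compute.
As a ``string``, a comma-separated list of ``algorithm`` or ``algorithm:field name`` entries; without a field name, the digest is stored under the algorithm's name.
As an ``object``, a mapping of metadata field names to hash algorithm names.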
metadata.mode
-------------
Type
@ -6694,6 +6751,8 @@ Description
| (requires `downloader.*.part`_ = ``true`` and `extractor.*.skip`_ = ``false``)
``exec``
Execute external commands
``hash``
Compute file hash digests
``metadata``
Write metadata to separate files
``mtime``

@ -12,6 +12,7 @@ modules = [
"classify",
"compare",
"exec",
"hash",
"metadata",
"mtime",
"python",

@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Compute file hash digests"""
from .common import PostProcessor
import hashlib
class HashPP(PostProcessor):
    """Post processor that computes hash digests of a file.

    The hex digests are written into the path format's metadata dict
    (``pathfmt.kwdict``) under configurable field names.
    """

    def __init__(self, job, options):
        """Read the post processor options and register the event hooks.

        Recognized options:
          chunk-size  number of bytes read per chunk (default: 32768)
          filename    if true, rebuild the path after storing the digests
          hashes      digests to compute (see _parse_hashes)
          event       event name(s) that trigger run() (default: "file");
                      either a list or a comma-separated string
        """
        PostProcessor.__init__(self, job)

        self.chunk_size = options.get("chunk-size", 32768)
        self.filename = options.get("filename")
        self.hashes = self._parse_hashes(options.get("hashes"))

        events = options.get("event")
        if events is None:
            events = ("file",)
        elif isinstance(events, str):
            events = events.split(",")
        job.register_hooks({event: self.run for event in events}, options)

    @staticmethod
    def _parse_hashes(hashes):
        """Normalize the 'hashes' option to (field name, algorithm) pairs.

        - dict: its (field name, algorithm) items
        - str: comma-separated 'algorithm' or 'algorithm:fieldname'
          entries; without a field name the algorithm name is used
        - any other truthy value: used as-is
        - otherwise: md5 and sha1 stored under their own names
        """
        default = (("md5", "md5"), ("sha1", "sha1"))

        if isinstance(hashes, dict):
            return list(hashes.items())

        if isinstance(hashes, str):
            pairs = []
            for entry in hashes.split(","):
                name, _, key = entry.partition(":")
                # tolerate whitespace around entries ("md5, sha1")
                name = name.strip()
                key = key.strip()
                if not name:
                    # empty entry, e.g. from a trailing comma;
                    # hashlib.new("") would raise ValueError later on
                    continue
                pairs.append((key or name, name))
            # a string without any usable entry counts as 'not set'
            return pairs or default

        if hashes:
            return hashes
        return default

    def run(self, pathfmt):
        """Hash the file of 'pathfmt' and store the digests in its kwdict."""
        # fresh hash objects for every file
        hashes = [
            (key, hashlib.new(name))
            for key, name in self.hashes
        ]

        # read the file once and feed each chunk to all hash objects
        size = self.chunk_size
        with self._open(pathfmt) as fp:
            while True:
                data = fp.read(size)
                if not data:
                    break
                for _, h in hashes:
                    h.update(data)

        for key, h in hashes:
            pathfmt.kwdict[key] = h.hexdigest()

        if self.filename:
            # rebuild the path so the new digest fields are taken into account
            pathfmt.build_path()

    def _open(self, pathfmt):
        """Open the file in binary read mode.

        'pathfmt.temppath' is tried first; on OSError,
        'pathfmt.realpath' is opened instead.
        """
        try:
            return open(pathfmt.temppath, "rb")
        except OSError:
            return open(pathfmt.realpath, "rb")
# Module-level name bound to this module's post processor class.
# NOTE(review): presumably looked up by the postprocessor loader - confirm
__postprocessor__ = HashPP

@ -240,6 +240,57 @@ class ExecTest(BasePostprocessorTest):
self.assertFalse(i.wait.called)
class HashTest(BasePostprocessorTest):
    """Digest computation tests for the 'hash' post processor."""

    # expected digests of the test payload b"Foo Bar\n"
    MD5 = "35c9c9c7c90ad764bae9e2623f522c24"
    SHA1 = "14d3d804494ef4e57d72de63e4cfee761240471a"
    SHA256 = (
        "4775b55be17206445d7015a5fc7656f38a74b880670523c3b175455f885f2395"
    )
    SHA512 = (
        "6028f9e6957f4ca929941318c4bba6258713fd5162f9e33bd10e1c456d252700"
        "3e1095b50736c4fd1e2deea152e3c8ecd5993462a747208e4d842659935a1c62"
    )

    def _hash_test_file(self, options):
        """Create the post processor, write the payload, trigger it,
        and return the resulting metadata dict."""
        self._create(options)

        with self.pathfmt.open() as fp:
            fp.write(b"Foo Bar\n")
        self._trigger()

        return self.pathfmt.kwdict

    def test_default(self):
        """Without options, md5 and sha1 are stored under their own names."""
        result = self._hash_test_file({})

        self.assertEqual(self.MD5, result["md5"], "md5")
        self.assertEqual(self.SHA1, result["sha1"], "sha1")

    def test_custom_hashes(self):
        """'hashes' given as an 'algorithm:fieldname' string."""
        result = self._hash_test_file({"hashes": "sha256:a,sha512:b"})

        self.assertEqual(self.SHA256, result["a"], "sha256")
        self.assertEqual(self.SHA512, result["b"], "sha512")

    def test_custom_hashes_dict(self):
        """'hashes' given as a 'fieldname -> algorithm' mapping."""
        result = self._hash_test_file(
            {"hashes": {"a": "sha256", "b": "sha512"}})

        self.assertEqual(self.SHA256, result["a"], "sha256")
        self.assertEqual(self.SHA512, result["b"], "sha512")
class MetadataTest(BasePostprocessorTest):
def test_metadata_default(self):

Loading…
Cancel
Save