Add support for thumbnail offload to scripts/s3_media_upload (#61)

Media are now considered deleted only if both the original file and all
thumbnails have been deleted.

`cache.db` files built before this change may incorrectly mark media as
deleted while their thumbnails still exist in the local cache. This can
be resolved by either:
 a) deleting `cache.db` and running an `update` to crawl through the
    entire local cache again. This may take an extremely long time for
    large Synapse deployments.
 b) uploading the contents of local_thumbnails/ and remote_thumbnail/
    manually, then deleting the uploaded files. Note that a running
    Synapse instance may write new thumbnails during the process.

    If the S3 storage provider has been installed since the very start
    and configured to store both local and remote media synchronously,
    all thumbnails should already be in S3 and the upload step can be
    skipped.

This commit changes the behavior of the `write` command. Previously,
`write` would only output the paths of undeleted files. Now the output
contains a mix of file paths and thumbnail directory paths, some of
which may already have been deleted or may never have existed.
This commit is contained in:
Sean Quah 2021-09-15 10:18:26 +01:00 committed by GitHub
parent a5b15d644d
commit 04e3d31b40
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -19,6 +19,8 @@ SCHEMA = """
filesystem_id TEXT NOT NULL, filesystem_id TEXT NOT NULL,
-- Type is "local" or "remote" -- Type is "local" or "remote"
type TEXT NOT NULL, type TEXT NOT NULL,
-- indicates whether the media and all its thumbnails have been deleted from the
-- local cache
known_deleted BOOLEAN NOT NULL known_deleted BOOLEAN NOT NULL
); );
@ -116,6 +118,55 @@ def to_path(origin, filesystem_id, m_type):
return file_path return file_path
def to_thumbnail_dir(origin, filesystem_id, m_type):
    """Get a relative path to the given media's thumbnail directory

    Args:
        origin (str): The server the media originated from. Ignored for
            "local" media.
        filesystem_id (str): The filesystem ID of the media.
        m_type (str): The media type: either "local" or "remote".

    Returns:
        str: Relative path to the directory holding the media's thumbnails.

    Raises:
        Exception: If `m_type` is neither "local" nor "remote".
    """
    if m_type == "local":
        # Thumbnails are sharded into nested directories by the first two
        # character pairs of the filesystem ID, mirroring the media layout.
        thumbnail_path = os.path.join(
            "local_thumbnails",
            filesystem_id[:2],
            filesystem_id[2:4],
            filesystem_id[4:],
        )
    elif m_type == "remote":
        # NOTE: the on-disk directory really is "remote_thumbnail" (singular).
        thumbnail_path = os.path.join(
            "remote_thumbnail",
            origin,
            filesystem_id[:2],
            filesystem_id[2:4],
            filesystem_id[4:],
        )
    else:
        # Bug fix: the original passed the format string and argument as two
        # separate Exception args (logging style), so the message was never
        # interpolated. Format it explicitly instead.
        raise Exception("Unexpected media type %r" % (m_type,))
    return thumbnail_path
def get_local_files(base_path, origin, filesystem_id, m_type):
    """Get a list of relative paths to undeleted files for the given media

    Includes the original media file (if it still exists in the local cache)
    and every regular file remaining in the media's thumbnail directory.
    """
    surviving = []

    # The original media file, if it has not been deleted yet.
    original_rel = to_path(origin, filesystem_id, m_type)
    if os.path.exists(os.path.join(base_path, original_rel)):
        surviving.append(original_rel)

    # Any thumbnails still on disk. The thumbnail directory may be missing
    # entirely, or unexpectedly not be a directory; either way there are no
    # thumbnails to report.
    thumb_rel = to_thumbnail_dir(origin, filesystem_id, m_type)
    try:
        with os.scandir(os.path.join(base_path, thumb_rel)) as entries:
            surviving.extend(
                os.path.join(thumb_rel, entry.name)
                for entry in entries
                if entry.is_file()
            )
    except (FileNotFoundError, NotADirectoryError):
        pass

    return surviving
def check_file_in_s3(s3, bucket, key): def check_file_in_s3(s3, bucket, key):
"""Check the file exists in S3 (though it could be different) """Check the file exists in S3 (though it could be different)
""" """
@ -136,6 +187,11 @@ def run_write(sqlite_conn, output_file):
file_path = to_path(origin, filesystem_id, m_type) file_path = to_path(origin, filesystem_id, m_type)
print(file_path, file=output_file) print(file_path, file=output_file)
# Print thumbnail directories with a trailing '/'
thumbnail_path = to_thumbnail_dir(origin, filesystem_id, m_type)
thumbnail_path = os.path.join(thumbnail_path, "")
print(thumbnail_path, file=output_file)
def run_update_db(postgres_conn, sqlite_conn, before_date): def run_update_db(postgres_conn, sqlite_conn, before_date):
"""Entry point for update-db command """Entry point for update-db command
@ -200,9 +256,8 @@ def run_check_delete(sqlite_conn, base_path):
print("Checking on ", get_not_deleted_count(sqlite_conn), " undeleted files") print("Checking on ", get_not_deleted_count(sqlite_conn), " undeleted files")
for origin, media_id, filesystem_id, m_type in it: for origin, media_id, filesystem_id, m_type in it:
rel_file_path = to_path(origin, filesystem_id, m_type) local_files = get_local_files(base_path, origin, filesystem_id, m_type)
file_path = os.path.join(base_path, rel_file_path) if not local_files:
if not os.path.exists(file_path):
deleted.append((origin, media_id)) deleted.append((origin, media_id))
with sqlite_conn: with sqlite_conn:
@ -222,9 +277,11 @@ def run_upload(s3, bucket, sqlite_conn, base_path, should_delete, storage_class)
""" """
total = get_not_deleted_count(sqlite_conn) total = get_not_deleted_count(sqlite_conn)
uploaded = 0 uploaded_media = 0
uploaded_files = 0
uploaded_bytes = 0 uploaded_bytes = 0
deleted = 0 deleted_media = 0
deleted_files = 0
deleted_bytes = 0 deleted_bytes = 0
# This is a progress bar # This is a progress bar
@ -235,50 +292,67 @@ def run_upload(s3, bucket, sqlite_conn, base_path, should_delete, storage_class)
it = get_not_deleted(sqlite_conn) it = get_not_deleted(sqlite_conn)
for origin, media_id, filesystem_id, m_type in it: for origin, media_id, filesystem_id, m_type in it:
rel_file_path = to_path(origin, filesystem_id, m_type) local_files = get_local_files(base_path, origin, filesystem_id, m_type)
local_path = os.path.join(base_path, rel_file_path) if not local_files:
path_exists = os.path.exists(local_path)
if not path_exists:
mark_as_deleted(sqlite_conn, origin, media_id) mark_as_deleted(sqlite_conn, origin, media_id)
continue continue
if not check_file_in_s3(s3, bucket, rel_file_path): # Counters of uploaded and deleted files for this media only
try: media_uploaded_files = 0
s3.upload_file( media_deleted_files = 0
local_path,
bucket,
rel_file_path,
ExtraArgs={"StorageClass": storage_class},
)
except Exception as e:
print("Failed to upload file %s: %s", local_path, e)
continue
uploaded += 1 for rel_file_path in local_files:
uploaded_bytes += os.path.getsize(local_path) local_path = os.path.join(base_path, rel_file_path)
if should_delete: if not check_file_in_s3(s3, bucket, rel_file_path):
size = os.path.getsize(local_path) try:
os.remove(local_path) s3.upload_file(
local_path,
bucket,
rel_file_path,
ExtraArgs={"StorageClass": storage_class},
)
except Exception as e:
print("Failed to upload file %s: %s", local_path, e)
continue
try: media_uploaded_files += 1
# This may have lead to an empty directory, so lets remove all uploaded_files += 1
# that are empty uploaded_bytes += os.path.getsize(local_path)
os.removedirs(os.path.dirname(local_path))
except Exception:
# The directory might not be empty, or maybe we don't have
# permission. Either way doesn't really matter.
pass
if should_delete:
size = os.path.getsize(local_path)
os.remove(local_path)
try:
# This may have lead to an empty directory, so lets remove all
# that are empty
os.removedirs(os.path.dirname(local_path))
except Exception:
# The directory might not be empty, or maybe we don't have
# permission. Either way doesn't really matter.
pass
media_deleted_files += 1
deleted_files += 1
deleted_bytes += size
if media_uploaded_files:
uploaded_media += 1
if media_deleted_files:
deleted_media += 1
if media_deleted_files == len(local_files):
# Mark as deleted only if *all* the local files have been deleted
mark_as_deleted(sqlite_conn, origin, media_id) mark_as_deleted(sqlite_conn, origin, media_id)
deleted += 1 print("Uploaded", uploaded_media, "media out of", total)
deleted_bytes += size print("Uploaded", uploaded_files, "files")
print("Uploaded", uploaded, "files out of", total)
print("Uploaded", humanize.naturalsize(uploaded_bytes, gnu=True)) print("Uploaded", humanize.naturalsize(uploaded_bytes, gnu=True))
print("Deleted", deleted, "files") print("Deleted", deleted_media, "media")
print("Deleted", deleted_files, "files")
print("Deleted", humanize.naturalsize(deleted_bytes, gnu=True)) print("Deleted", humanize.naturalsize(deleted_bytes, gnu=True))
@ -359,8 +433,8 @@ def main():
write_parser = subparsers.add_parser( write_parser = subparsers.add_parser(
"write", "write",
help="Outputs all files in local cache that we may not have deleted," help="Outputs all file and directory paths in the local cache that we may not"
" check-deleted should be run first to update cache.", " have deleted. check-deleted should be run first to update cache.",
) )
write_parser.add_argument( write_parser.add_argument(
"out", "out",