From 04e3d31b40d2deb1d7ad9367d56dd555c3428bc3 Mon Sep 17 00:00:00 2001
From: Sean Quah <8349537+squahtx@users.noreply.github.com>
Date: Wed, 15 Sep 2021 10:18:26 +0100
Subject: [PATCH] Add support for thumbnail offload to `scripts/s3_media_upload`
 (#61)

Media are now considered deleted only once the original file and all of
its thumbnails have been deleted.

`cache.db`s built before this change may incorrectly count media as
deleted while their thumbnails still exist in the local cache. This can
be resolved by either:
a) deleting `cache.db` and running an `update` to crawl through the
   entire local cache again. This may take an extremely long time for
   large Synapse deployments.
b) uploading the contents of local_thumbnails/ and remote_thumbnail/
   manually, then deleting the uploaded files (see the sketch at the end
   of this message). Note that a running Synapse instance may write new
   thumbnails while this is in progress.
If the S3 storage provider has been installed from the very start and
configured to store both local and remote media synchronously, all
thumbnails should already be in S3 and the upload step can be skipped.

This commit changes the behavior of the `write` command. Previously,
`write` would only output the paths of undeleted files. Now the output
contains a mix of file paths and thumbnail directory paths, some of
which may already have been deleted and so no longer exist.
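For example, `write` output now looks something like this (the media IDs
below are made up; the trailing '/' marks a thumbnail directory, which
may no longer exist):

    local_content/ab/cd/efghijklmnop
    local_thumbnails/ab/cd/efghijklmnop/
    remote_content/matrix.org/qr/st/uvwxyz012345
    remote_thumbnail/matrix.org/qr/st/uvwxyz012345/

As a rough sketch of option (b) above, assuming boto3 is available and
that S3 keys mirror paths relative to the media store directory, as they
do for this script (the media store path and bucket name below are
placeholders to adjust per deployment):

    import os

    import boto3

    def upload_tree(base_path, subdir, bucket, storage_class="STANDARD"):
        # Upload every file under base_path/subdir, keyed in S3 by its
        # path relative to base_path.
        s3 = boto3.client("s3")
        for dirpath, _dirnames, filenames in os.walk(os.path.join(base_path, subdir)):
            for name in filenames:
                local_path = os.path.join(dirpath, name)
                key = os.path.relpath(local_path, base_path)
                s3.upload_file(
                    local_path, bucket, key, ExtraArgs={"StorageClass": storage_class}
                )

    media_store = "/var/lib/synapse/media_store"  # placeholder
    upload_tree(media_store, "local_thumbnails", "my-bucket")
    upload_tree(media_store, "remote_thumbnail", "my-bucket")

Delete the local copies only after the uploads succeed, keeping in mind
that a running Synapse may write new thumbnails concurrently.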
---
 scripts/s3_media_upload | 154 +++++++++++++++++++++++++++++----------
 1 file changed, 114 insertions(+), 40 deletions(-)

diff --git a/scripts/s3_media_upload b/scripts/s3_media_upload
index 39c0bbc..99ce56d 100755
--- a/scripts/s3_media_upload
+++ b/scripts/s3_media_upload
@@ -19,6 +19,8 @@ SCHEMA = """
         filesystem_id TEXT NOT NULL,
         -- Type is "local" or "remote"
         type TEXT NOT NULL,
+        -- Indicates whether the media and all its thumbnails have been deleted from
+        -- the local cache
         known_deleted BOOLEAN NOT NULL
     );
 
@@ -116,6 +118,55 @@ def to_path(origin, filesystem_id, m_type):
     return file_path
 
 
+def to_thumbnail_dir(origin, filesystem_id, m_type):
+    """Get a relative path to the given media's thumbnail directory
+    """
+    if m_type == "local":
+        thumbnail_path = os.path.join(
+            "local_thumbnails",
+            filesystem_id[:2],
+            filesystem_id[2:4],
+            filesystem_id[4:],
+        )
+    elif m_type == "remote":
+        thumbnail_path = os.path.join(
+            "remote_thumbnail",
+            origin,
+            filesystem_id[:2],
+            filesystem_id[2:4],
+            filesystem_id[4:],
+        )
+    else:
+        raise Exception("Unexpected media type %r" % (m_type,))
+
+    return thumbnail_path
+
+
+def get_local_files(base_path, origin, filesystem_id, m_type):
+    """Get a list of relative paths to undeleted files for the given media
+    """
+    local_files = []
+
+    original_path = to_path(origin, filesystem_id, m_type)
+    if os.path.exists(os.path.join(base_path, original_path)):
+        local_files.append(original_path)
+
+    thumbnail_path = to_thumbnail_dir(origin, filesystem_id, m_type)
+    try:
+        with os.scandir(os.path.join(base_path, thumbnail_path)) as dir_entries:
+            for dir_entry in dir_entries:
+                if dir_entry.is_file():
+                    local_files.append(os.path.join(thumbnail_path, dir_entry.name))
+    except FileNotFoundError:
+        # The thumbnail directory does not exist
+        pass
+    except NotADirectoryError:
+        # The thumbnail path exists but is not a directory
+        pass
+
+    return local_files
+
+
 def check_file_in_s3(s3, bucket, key):
     """Check the file exists in S3 (though it could be different)
     """
@@ -136,6 +187,11 @@ def run_write(sqlite_conn, output_file):
         file_path = to_path(origin, filesystem_id, m_type)
         print(file_path, file=output_file)
 
+        # Print thumbnail directories with a trailing '/'
+        thumbnail_path = to_thumbnail_dir(origin, filesystem_id, m_type)
+        thumbnail_path = os.path.join(thumbnail_path, "")
+        print(thumbnail_path, file=output_file)
+
 
 def run_update_db(postgres_conn, sqlite_conn, before_date):
     """Entry point for update-db command
@@ -200,9 +256,8 @@ def run_check_delete(sqlite_conn, base_path):
     print("Checking on ", get_not_deleted_count(sqlite_conn), " undeleted files")
 
     for origin, media_id, filesystem_id, m_type in it:
-        rel_file_path = to_path(origin, filesystem_id, m_type)
-        file_path = os.path.join(base_path, rel_file_path)
-        if not os.path.exists(file_path):
+        local_files = get_local_files(base_path, origin, filesystem_id, m_type)
+        if not local_files:
             deleted.append((origin, media_id))
 
     with sqlite_conn:
@@ -222,9 +277,11 @@ def run_upload(s3, bucket, sqlite_conn, base_path, should_delete, storage_class)
     """
     total = get_not_deleted_count(sqlite_conn)
 
-    uploaded = 0
+    uploaded_media = 0
+    uploaded_files = 0
     uploaded_bytes = 0
-    deleted = 0
+    deleted_media = 0
+    deleted_files = 0
     deleted_bytes = 0
 
     # This is a progress bar
@@ -235,50 +292,67 @@ def run_upload(s3, bucket, sqlite_conn, base_path, should_delete, storage_class)
     it = get_not_deleted(sqlite_conn)
 
     for origin, media_id, filesystem_id, m_type in it:
-        rel_file_path = to_path(origin, filesystem_id, m_type)
+        local_files = get_local_files(base_path, origin, filesystem_id, m_type)
 
-        local_path = os.path.join(base_path, rel_file_path)
-        path_exists = os.path.exists(local_path)
-        if not path_exists:
+        if not local_files:
             mark_as_deleted(sqlite_conn, origin, media_id)
             continue
 
-        if not check_file_in_s3(s3, bucket, rel_file_path):
-            try:
-                s3.upload_file(
-                    local_path,
-                    bucket,
-                    rel_file_path,
-                    ExtraArgs={"StorageClass": storage_class},
-                )
-            except Exception as e:
-                print("Failed to upload file %s: %s", local_path, e)
-                continue
+        # Counters of uploaded and deleted files for this media only
+        media_uploaded_files = 0
+        media_deleted_files = 0
 
-            uploaded += 1
-            uploaded_bytes += os.path.getsize(local_path)
+        for rel_file_path in local_files:
+            local_path = os.path.join(base_path, rel_file_path)
 
-        if should_delete:
-            size = os.path.getsize(local_path)
-            os.remove(local_path)
+            if not check_file_in_s3(s3, bucket, rel_file_path):
+                try:
+                    s3.upload_file(
+                        local_path,
+                        bucket,
+                        rel_file_path,
+                        ExtraArgs={"StorageClass": storage_class},
+                    )
+                except Exception as e:
+                    print("Failed to upload file %s: %s" % (local_path, e))
+                    continue
 
-            try:
-                # This may have lead to an empty directory, so lets remove all
-                # that are empty
-                os.removedirs(os.path.dirname(local_path))
-            except Exception:
-                # The directory might not be empty, or maybe we don't have
-                # permission. Either way doesn't really matter.
-                pass
+
+                media_uploaded_files += 1
+                uploaded_files += 1
+                uploaded_bytes += os.path.getsize(local_path)
 
+            if should_delete:
+                size = os.path.getsize(local_path)
+                os.remove(local_path)
+
+                try:
+                    # This may have led to an empty directory, so let's remove
+                    # any that are now empty
+                    os.removedirs(os.path.dirname(local_path))
+                except Exception:
+                    # The directory might not be empty, or maybe we don't have
+                    # permission. Either way it doesn't really matter.
+                    pass
+
+                media_deleted_files += 1
+                deleted_files += 1
+                deleted_bytes += size
+
+        if media_uploaded_files:
+            uploaded_media += 1
+
+        if media_deleted_files:
+            deleted_media += 1
+
+        if media_deleted_files == len(local_files):
+            # Mark as deleted only if *all* the local files have been deleted
             mark_as_deleted(sqlite_conn, origin, media_id)
-            deleted += 1
-            deleted_bytes += size
-
-    print("Uploaded", uploaded, "files out of", total)
+    print("Uploaded", uploaded_media, "media out of", total)
+    print("Uploaded", uploaded_files, "files")
     print("Uploaded", humanize.naturalsize(uploaded_bytes, gnu=True))
-    print("Deleted", deleted, "files")
+    print("Deleted", deleted_media, "media")
+    print("Deleted", deleted_files, "files")
     print("Deleted", humanize.naturalsize(deleted_bytes, gnu=True))
@@ -359,8 +433,8 @@ def main():
 
     write_parser = subparsers.add_parser(
         "write",
-        help="Outputs all files in local cache that we may not have deleted,"
-        " check-deleted should be run first to update cache.",
+        help="Outputs all file and directory paths in the local cache that we may not"
+        " have deleted. check-deleted should be run first to update cache.",
    )
     write_parser.add_argument(
         "out",