diff --git a/scripts/s3_media_upload b/scripts/s3_media_upload
index 39c0bbc..99ce56d 100755
--- a/scripts/s3_media_upload
+++ b/scripts/s3_media_upload
@@ -19,6 +19,8 @@ SCHEMA = """
         filesystem_id TEXT NOT NULL,
         -- Type is "local" or "remote"
         type TEXT NOT NULL,
+        -- indicates whether the media and all its thumbnails have been deleted from the
+        -- local cache
         known_deleted BOOLEAN NOT NULL
     );
 
@@ -116,6 +118,60 @@ def to_path(origin, filesystem_id, m_type):
     return file_path
 
 
+def to_thumbnail_dir(origin, filesystem_id, m_type):
+    """Get a relative path to the given media's thumbnail directory
+
+    Raises an Exception if m_type is neither "local" nor "remote".
+    """
+    if m_type == "local":
+        thumbnail_path = os.path.join(
+            "local_thumbnails",
+            filesystem_id[:2],
+            filesystem_id[2:4],
+            filesystem_id[4:],
+        )
+    elif m_type == "remote":
+        thumbnail_path = os.path.join(
+            "remote_thumbnail",
+            origin,
+            filesystem_id[:2],
+            filesystem_id[2:4],
+            filesystem_id[4:],
+        )
+    else:
+        raise Exception("Unexpected media type %r" % (m_type,))
+
+    return thumbnail_path
+
+
+def get_local_files(base_path, origin, filesystem_id, m_type):
+    """Get a list of relative paths to undeleted files for the given media
+
+    The paths are relative to base_path: the original media file, if it still
+    exists, followed by each file directly inside the media's thumbnail directory.
+    """
+    local_files = []
+
+    original_path = to_path(origin, filesystem_id, m_type)
+    if os.path.exists(os.path.join(base_path, original_path)):
+        local_files.append(original_path)
+
+    thumbnail_path = to_thumbnail_dir(origin, filesystem_id, m_type)
+    try:
+        with os.scandir(os.path.join(base_path, thumbnail_path)) as dir_entries:
+            for dir_entry in dir_entries:
+                if dir_entry.is_file():
+                    local_files.append(os.path.join(thumbnail_path, dir_entry.name))
+    except FileNotFoundError:
+        # The thumbnail directory does not exist
+        pass
+    except NotADirectoryError:
+        # The thumbnail directory is not a directory for some reason
+        pass
+
+    return local_files
+
+
 def check_file_in_s3(s3, bucket, key):
     """Check the file exists in S3 (though it could be different)
     """
@@ -136,6 +192,11 @@ def run_write(sqlite_conn, output_file):
         file_path = to_path(origin, filesystem_id, m_type)
         print(file_path, file=output_file)
 
+        # Print thumbnail directories with a trailing '/'
+        thumbnail_path = to_thumbnail_dir(origin, filesystem_id, m_type)
+        thumbnail_path = os.path.join(thumbnail_path, "")
+        print(thumbnail_path, file=output_file)
+
 
 def run_update_db(postgres_conn, sqlite_conn, before_date):
     """Entry point for update-db command
@@ -200,9 +261,8 @@ def run_check_delete(sqlite_conn, base_path):
     print("Checking on ", get_not_deleted_count(sqlite_conn), " undeleted files")
 
     for origin, media_id, filesystem_id, m_type in it:
-        rel_file_path = to_path(origin, filesystem_id, m_type)
-        file_path = os.path.join(base_path, rel_file_path)
-        if not os.path.exists(file_path):
+        local_files = get_local_files(base_path, origin, filesystem_id, m_type)
+        if not local_files:
             deleted.append((origin, media_id))
 
     with sqlite_conn:
@@ -222,9 +282,11 @@ def run_upload(s3, bucket, sqlite_conn, base_path, should_delete, storage_class)
     """
     total = get_not_deleted_count(sqlite_conn)
 
-    uploaded = 0
+    uploaded_media = 0
+    uploaded_files = 0
     uploaded_bytes = 0
-    deleted = 0
+    deleted_media = 0
+    deleted_files = 0
     deleted_bytes = 0
 
     # This is a progress bar
@@ -235,50 +297,67 @@ def run_upload(s3, bucket, sqlite_conn, base_path, should_delete, storage_class)
         it = get_not_deleted(sqlite_conn)
 
     for origin, media_id, filesystem_id, m_type in it:
-        rel_file_path = to_path(origin, filesystem_id, m_type)
+        local_files = get_local_files(base_path, origin, filesystem_id, m_type)
 
-        local_path = os.path.join(base_path, rel_file_path)
-        path_exists = os.path.exists(local_path)
-        if not path_exists:
+        if not local_files:
             mark_as_deleted(sqlite_conn, origin, media_id)
             continue
 
-        if not check_file_in_s3(s3, bucket, rel_file_path):
-            try:
-                s3.upload_file(
-                    local_path,
-                    bucket,
-                    rel_file_path,
-                    ExtraArgs={"StorageClass": storage_class},
-                )
-            except Exception as e:
-                print("Failed to upload file %s: %s", local_path, e)
-                continue
+        # Counters of uploaded and deleted files for this media only
+        media_uploaded_files = 0
+        media_deleted_files = 0
 
-            uploaded += 1
-            uploaded_bytes += os.path.getsize(local_path)
+        for rel_file_path in local_files:
+            local_path = os.path.join(base_path, rel_file_path)
 
-        if should_delete:
-            size = os.path.getsize(local_path)
-            os.remove(local_path)
+            if not check_file_in_s3(s3, bucket, rel_file_path):
+                try:
+                    s3.upload_file(
+                        local_path,
+                        bucket,
+                        rel_file_path,
+                        ExtraArgs={"StorageClass": storage_class},
+                    )
+                except Exception as e:
+                    print("Failed to upload file %s: %s" % (local_path, e))
+                    continue
 
-            try:
-                # This may have lead to an empty directory, so lets remove all
-                # that are empty
-                os.removedirs(os.path.dirname(local_path))
-            except Exception:
-                # The directory might not be empty, or maybe we don't have
-                # permission. Either way doesn't really matter.
-                pass
+                media_uploaded_files += 1
+                uploaded_files += 1
+                uploaded_bytes += os.path.getsize(local_path)
 
+            if should_delete:
+                size = os.path.getsize(local_path)
+                os.remove(local_path)
+
+                try:
+                    # This may have lead to an empty directory, so lets remove all
+                    # that are empty
+                    os.removedirs(os.path.dirname(local_path))
+                except Exception:
+                    # The directory might not be empty, or maybe we don't have
+                    # permission. Either way doesn't really matter.
+                    pass
+
+                media_deleted_files += 1
+                deleted_files += 1
+                deleted_bytes += size
+
+        if media_uploaded_files:
+            uploaded_media += 1
+
+        if media_deleted_files:
+            deleted_media += 1
+
+        if media_deleted_files == len(local_files):
+            # Mark as deleted only if *all* the local files have been deleted
             mark_as_deleted(sqlite_conn, origin, media_id)
 
-            deleted += 1
-            deleted_bytes += size
-
-    print("Uploaded", uploaded, "files out of", total)
+    print("Uploaded", uploaded_media, "media out of", total)
+    print("Uploaded", uploaded_files, "files")
     print("Uploaded", humanize.naturalsize(uploaded_bytes, gnu=True))
-    print("Deleted", deleted, "files")
+    print("Deleted", deleted_media, "media")
+    print("Deleted", deleted_files, "files")
     print("Deleted", humanize.naturalsize(deleted_bytes, gnu=True))
 
 
@@ -359,8 +438,8 @@ def main():
 
     write_parser = subparsers.add_parser(
         "write",
-        help="Outputs all files in local cache that we may not have deleted,"
-        " check-deleted should be run first to update cache.",
+        help="Outputs all file and directory paths in the local cache that we may not"
+        " have deleted. check-deleted should be run first to update cache.",
     )
     write_parser.add_argument(
         "out",