From e8a36966e888e2b58837a67476b12ca69905ba33 Mon Sep 17 00:00:00 2001
From: Andrew Johnston
Date: Thu, 21 Aug 2025 16:30:07 -0800
Subject: [PATCH] delete any existing objects prior to uploading to s3

---
 CHANGELOG.md                     |  7 +++++++
 src/hyp3_opera_rtc/upload_rtc.py | 11 +++++++++--
 tests/test_upload_rtc.py         | 12 ++++++++++++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2fa6347..e669970 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [PEP 440](https://www.python.org/dev/peps/pep-0440/)
 and uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.8]
+
+### Fixed
+- `upload_rtc.py` now deletes any pre-existing objects from S3 prior to uploading output files. Eliminates duplicate
+  output files in the rare case where a HyP3 job attempt is terminated in the middle of uploading objects and then later
+  retried.
+
 ## [0.1.7]
 
 ### Fixed
diff --git a/src/hyp3_opera_rtc/upload_rtc.py b/src/hyp3_opera_rtc/upload_rtc.py
index 13d6ef0..f568c49 100644
--- a/src/hyp3_opera_rtc/upload_rtc.py
+++ b/src/hyp3_opera_rtc/upload_rtc.py
@@ -3,7 +3,7 @@
 from shutil import copyfile, make_archive
 from xml.etree import ElementTree as et
 
-from hyp3lib.aws import upload_file_to_s3
+from hyp3lib import aws
 
 import hyp3_opera_rtc
 
@@ -12,6 +12,12 @@ class FailedToFindLineageStatementError(Exception):
     pass
 
 
+def delete_prefix(bucket: str, bucket_prefix: str) -> None:
+    response = aws.S3_CLIENT.list_objects_v2(Bucket=bucket, Prefix=bucket_prefix)
+    for obj in response.get('Contents', []):
+        aws.S3_CLIENT.delete_object(Bucket=bucket, Key=obj['Key'])
+
+
 def upload_rtc(bucket: str, bucket_prefix: str, output_dir: Path) -> None:
     output_files = [f for f in output_dir.iterdir() if not f.is_dir()]
 
@@ -20,8 +26,9 @@ def upload_rtc(bucket: str, bucket_prefix: str, output_dir: Path) -> None:
     output_zip = make_zip(output_files, output_dir)
     output_files.append(output_zip)
 
+    delete_prefix(bucket, bucket_prefix)
     for output_file in output_files:
-        upload_file_to_s3(output_file, bucket, bucket_prefix, chunk_size=100_000_000)
+        aws.upload_file_to_s3(output_file, bucket, bucket_prefix, chunk_size=100_000_000)
 
 
 def make_zip(output_files: list[Path], output_dir: Path) -> Path:
diff --git a/tests/test_upload_rtc.py b/tests/test_upload_rtc.py
index 56274d1..1070d1c 100644
--- a/tests/test_upload_rtc.py
+++ b/tests/test_upload_rtc.py
@@ -61,6 +61,18 @@ def test_upload_slc_rtc(rtc_slc_results_dir, s3_bucket):
     assert file_suffixs == {'.json': 1, '.log': 1, '.h5': 27, '.xml': 27, '.png': 27, '.tif': 27 * 3}
 
 
+def test_upload_slc_rtc_with_existing_objects(rtc_slc_results_dir, s3_bucket):
+    prefix = 'myPrefix'
+    aws.S3_CLIENT.put_object(Bucket=s3_bucket, Key=f'{prefix}/foo.txt')
+    aws.S3_CLIENT.put_object(Bucket=s3_bucket, Key=f'{prefix}/bar.json')
+
+    upload_rtc.upload_rtc(s3_bucket, prefix, rtc_slc_results_dir)
+
+    resp = aws.S3_CLIENT.list_objects_v2(Bucket=s3_bucket, Prefix=prefix)
+    file_suffixes = dict(Counter(Path(obj['Key']).suffix for obj in resp['Contents']))
+    assert file_suffixes == {'.json': 1, '.log': 1, '.h5': 27, '.xml': 27, '.png': 27, '.tif': 27 * 3}
+
+
 def test_make_zip_name(rtc_burst_output_files):
     zip_filename = upload_rtc.make_zip_name([Path(f) for f in rtc_burst_output_files])
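
Note on the prefix cleanup added above: `delete_prefix` issues a single `list_objects_v2`
call, which returns at most 1000 keys per response; the suffix counts asserted in the tests
sum to well under that, so one page is enough for these jobs. Purely as a sketch, and
assuming `hyp3lib.aws.S3_CLIENT` is a standard boto3 S3 client (consistent with the calls
used in the patch), a paginated variant of the same idea could look like this; the name
`delete_prefix_paginated` is hypothetical and not part of the patch.

    def delete_prefix_paginated(bucket: str, bucket_prefix: str) -> None:
        # Walk every page of list results so prefixes holding more than
        # 1000 pre-existing objects are also fully cleaned up.
        paginator = aws.S3_CLIENT.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=bucket, Prefix=bucket_prefix):
            for obj in page.get('Contents', []):
                aws.S3_CLIENT.delete_object(Bucket=bucket, Key=obj['Key'])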