I am trying to write a Python script that moves specific files from a source S3 bucket to a target S3 bucket. The objective is to copy specific files to the target bucket on the initial run. On subsequent runs, it compares the maximum LastModified date in the target with each file's LastModified date in the source, and uses that to copy only the new files from the source to the target.
This is what I have written:
import boto3
import os
from datetime import datetime, timezone


def get_max_last_modified_time(bucket):
    """
    Get the maximum Last Modified time of files in an S3 bucket.

    Args:
    - bucket (str): The name of the S3 bucket.

    Returns:
    - datetime: The maximum Last Modified time of files in the bucket (timezone-aware).
    """
    s3_client = boto3.client('s3')
    response = s3_client.list_objects_v2(Bucket=bucket)
    files = response.get('Contents', [])
    if not files:
        return None
    # Convert Last Modified timestamps to datetime objects with UTC timezone
    last_modified_times = [file['LastModified'].astimezone(timezone.utc) for file in files]
    # Return the maximum Last Modified time as a timezone-aware datetime object
    return max(last_modified_times)


def copy_files(source_bucket, target_bucket):
    """
    Copy new files from the source bucket to the target bucket, excluding files
    containing 'example' in their filename.

    Args:
    - source_bucket (str): The name of the source bucket.
    - target_bucket (str): The name of the target bucket.

    Returns:
    - None
    """
    # Initialize the S3 client
    s3_client = boto3.client('s3')
    # Get the maximum Last Modified time of files in the source bucket
    source_max_last_modified_time = get_max_last_modified_time(source_bucket)
    # Get the maximum Last Modified time of files in the target bucket
    target_max_last_modified_time = get_max_last_modified_time(target_bucket)
    # If there are no files in the target bucket, fall back to datetime.min
    if not target_max_last_modified_time:
        target_max_last_modified_time = datetime.min
    # List objects in the source bucket
    response = s3_client.list_objects_v2(Bucket=source_bucket)
    files = response.get('Contents', [])
    # Iterate over the files in the source bucket
    for file_obj in files:
        file_key = file_obj['Key']
        file_name = os.path.basename(file_key)
        # Check if the file name contains 'example'
        if 'example' in file_name.lower():
            print(f"File containing 'example' found: '{file_name}'")
        else:
            # Get the Last Modified time of the file in the source bucket
            source_last_modified_time = file_obj['LastModified']
            # Copy only files modified after the newest file in the target
            if source_last_modified_time > target_max_last_modified_time:
                # Copy the file to the target bucket
                s3_client.copy_object(
                    Bucket=target_bucket,
                    CopySource={'Bucket': source_bucket, 'Key': file_key},
                    Key=file_key
                )
                print(f"Successfully copied new file '{file_name}' from '{source_bucket}' to '{target_bucket}'")


if __name__ == "__main__":
    # Specify the source and target bucket names
    source_bucket = 'queen-data-lake'
    target_bucket = 'queen-output'
    # Call the copy_files function
    copy_files(source_bucket, target_bucket)
I seem to be getting the error below. I also wonder whether this is an effective approach for handling millions of files.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/var/folders/06/2zj_616d5vx22y29nq1jq2hc0000gn/T/ipykernel_30361/3639448357.py in <module>
120
121 # Call the copy_files function
--> 122 copy_files(source_bucket, target_bucket)
/var/folders/06/2zj_616d5vx22y29nq1jq2hc0000gn/T/ipykernel_30361/3639448357.py in copy_files(source_bucket, target_bucket)
105
106 # Skip files that have not been modified since the last execution
--> 107 if source_last_modified_time > target_max_last_modified_time:
108 # Copy the file to the target bucket
109 s3_client.copy_object(
TypeError: can't compare offset-naive and offset-aware datetimes
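
For what it's worth, I can reproduce the same TypeError outside the script with a minimal snippet (my own reduction, not part of the script), by comparing a naive datetime like datetime.min against an aware one like the values S3 returns for LastModified:

from datetime import datetime, timezone

naive = datetime.min                # no tzinfo, like my datetime.min fallback
aware = datetime.now(timezone.utc)  # tzinfo set, like boto3's LastModified values
print(aware > naive)                # raises: can't compare offset-naive and offset-aware datetimes

So I suspect the datetime.min fallback for an empty target bucket is what triggers the comparison error.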
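
On the "millions of files" part: as far as I can tell, list_objects_v2 returns at most 1,000 keys per call, and my script only ever reads the first page. I assume a full scan would need a paginator, something like this rough sketch (untested, using my target bucket as an example):

import boto3

s3_client = boto3.client('s3')
paginator = s3_client.get_paginator('list_objects_v2')

max_last_modified = None
# Walk every page of results instead of just the first 1,000 keys
for page in paginator.paginate(Bucket='queen-output'):
    for obj in page.get('Contents', []):
        if max_last_modified is None or obj['LastModified'] > max_last_modified:
            max_last_modified = obj['LastModified']
print(max_last_modified)

Even with pagination, listing both buckets end to end on every run feels expensive at that scale, which is why I'm asking whether this approach is effective.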