I’m hoping to upload potentially large files (up to 5 GiB) to both Amazon S3 and Alibaba OSS, with verification – preferably MD5 verification.
My code currently looks like:
#!/usr/bin/env python3.10
"""
This toy program is an exploration of how to do verified, multipart file uploads to AWS S3 and Alibaba OSS.
It's a modified version of /questions/58921396/boto3-multipart-upload-and-md5-checking
"""
import os
import sys
import hashlib
import boto3
from botocore.exceptions import ClientError
from botocore.client import Config
from boto3.s3.transfer import TransferConfig
chunk_size = 2**23
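# Note: 2**23 is 8 MiB. This must match the TransferConfig multipart_chunksize used
# below, and it happens to equal boto3's default multipart_threshold, so the 16M
# test file and larger are uploaded multipart while the 1k file is a single PUT.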
# This function is a re-worked function taken from here: /questions/43794838/multipart-upload-to-s3-with-hash-verification
# Credits to user: /users/518169/hyperknot
def calculate_s3_etag(file_path, chunk_size=chunk_size):
    """Calculate an S3/OSS-style ETag: a plain MD5 for one part, else the MD5 of the concatenated part MD5s plus "-{part count}"."""
    chunk_md5s = []
    with open(file_path, "rb") as fp:
        while True:
            data = fp.read(chunk_size)
            if not data:
                break
            chunk_md5s.append(hashlib.md5(data))
    num_hashes = len(chunk_md5s)
    if not num_hashes:
        # do whatever you want to do here
        raise ValueError("Empty file: %s" % file_path)
    if num_hashes == 1:
        return chunk_md5s[0].hexdigest()
    digest_byte_string = b"".join(m.digest() for m in chunk_md5s)
    digests_md5 = hashlib.md5(digest_byte_string)
    return f"{digests_md5.hexdigest().lower()}-{num_hashes}"

def s3_md5sum(bucket_name, resource_name, client):
    """Fetch the uploaded object's ETag via a HEAD request, stripped of quotes and lowercased."""
    try:
        et = client.head_object(Bucket=bucket_name, Key=resource_name)["ETag"]
        assert et[0] == '"' and et[-1] == '"'
        return et[1:-1].lower()
    except ClientError:
        # do whatever you want to do here
        raise

def upload_one_file(
    *,
    filename,
    endpoint_url,
    aws_credentials,
    aws_region,
    bucket,
    addressing_style=None,
):
    """Upload one file to S3/OSS. The file can be up to 5 GiB in size; the caller compares the returned ETags to verify it."""
    config = {"region_name": aws_region}
    if addressing_style:
        config["s3"] = {"addressing_style": addressing_style}
    kwargs = {"config": Config(**config), **aws_credentials}
    if endpoint_url:
        kwargs["endpoint_url"] = endpoint_url
    client = boto3.client("s3", **kwargs)
    # Pin the part size to chunk_size so calculate_s3_etag and the service agree
    # on part boundaries; the default multipart_threshold (8 MiB) is left alone.
    transfer_config = TransferConfig(multipart_chunksize=chunk_size)
    client.upload_file(filename, bucket, filename, Config=transfer_config)
    tag = calculate_s3_etag(filename)
    result = s3_md5sum(bucket, filename, client)
    return (tag, result)

class AuthorizationData:
    """Hold things about what S3/OSS to use."""

    def __init__(
        self, *, name, bucket, aws_region, aws_access_key_id, aws_secret_access_key, endpoint_url=None, addressing_style=None
    ):
        """Initialize."""
        self.name = name
        self.endpoint_url = endpoint_url
        self.bucket = bucket
        self.aws_region = aws_region
        self.aws_credentials = {
            "aws_access_key_id": aws_access_key_id,
            "aws_secret_access_key": aws_secret_access_key,
        }
        self.addressing_style = addressing_style

def create_test_inputs():
    """Create test files."""
    for filename, blocksize, total_size in (
        ("dastromberg-urandom-1k", 2**10, 2**10),
        ("dastromberg-urandom-16M", 2**20, 2**24),
        ("dastromberg-urandom-256M", 2**20, 2**28),
        ("dastromberg-urandom-1G", 2**20, 2**30),
        ("dastromberg-urandom-5G", 2**20, 5 * 2**30),
    ):
        if not os.path.isfile(filename):
            length_written = 0
            print("Creating test input %s" % filename)
            with (
                open("/dev/urandom", "rb") as infile,
                open(filename, "wb") as outfile,
            ):
                while length_written < total_size:
                    block = infile.read(blocksize)
                    outfile.write(block)
                    len_block = len(block)
                    assert blocksize == len_block
                    length_written += len_block
            assert length_written == total_size
        assert os.path.getsize(filename) == total_size, "Final length does not match. Perhaps remove %s and try again?" % filename

def main():
    """Test some AWS S3- and Alibaba OSS-use."""
    # This works.
    aws = AuthorizationData(
        name="AWS",
        bucket="aaa",
        aws_region="us-west-2",
        aws_access_key_id="bbb",
        aws_secret_access_key=os.environ["aws_secret_key"],
    )
    alibaba = AuthorizationData(
        name="Alibaba",
        # endpoint_url="https://blz-p-cdn-gamepublishing.oss-cn-hangzhou.aliyuncs.com/",
        endpoint_url="https://oss-cn-hangzhou.aliyuncs.com",
        bucket="ccc",
        aws_region="cn-hangzhou",
        aws_access_key_id="ddd",
        aws_secret_access_key=os.environ["alibaba_secret_key"],
        addressing_style="virtual",
    )
    all_good = True
    create_test_inputs()
    for auth in (
        aws,
        alibaba,
    ):
        for filename in (
            "dastromberg-urandom-1k",
            "dastromberg-urandom-16M",
            "dastromberg-urandom-256M",
            # "dastromberg-urandom-1G",
            # "dastromberg-urandom-5G",
        ):
            print(f"Checking {auth.name}: {filename}")
            (tag, result) = upload_one_file(
                filename=filename,
                endpoint_url=auth.endpoint_url,
                aws_credentials=auth.aws_credentials,
                aws_region=auth.aws_region,
                bucket=auth.bucket,
                addressing_style=auth.addressing_style,
            )
            if tag == result:
                print("Verification succeeded: %s, %s" % (tag, result))
            else:
                all_good = False
                print("Verification failed: %s, %s" % (tag, result), file=sys.stderr)
    if all_good:
        print("All tests passed")
    else:
        print("One or more tests failed", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
Running it, AWS S3 verifies for all test inputs, but Alibaba only verifies for the 1 kilobyte test – the 16 megabyte test fails, as do the larger tests. That pattern lines up with boto3’s multipart threshold: files of 8 MiB and up are uploaded multipart, and OSS evidently does not compute multipart ETags the way S3 does (MD5 of the concatenated part MD5s, plus a part-count suffix).
In https://www.alibabacloud.com/help/en/oss/use-cases/can-i-use-etag-values-as-oss-md5-hashes-to-check-data-consistency Alibaba says that they don’t recommend using MD5 for verification.
In https://www.alibabacloud.com/help/en/oss/use-cases/check-data-transmission-integrity-by-using-crc-64 they explore using x-oss-hash-crc64ecma fairly thoroughly. I’d rather use MD5 (with boto3), but I can use crc64ecma if necessary (hopefully still with boto3).
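Here is a rough sketch of what I imagine CRC-64/ECMA verification would look like, assuming the third-party crcmod package (which Alibaba’s own oss2 SDK uses internally) and assuming boto3 exposes the nonstandard x-oss-hash-crc64ecma response header through ResponseMetadata – the polynomial parameters are the CRC-64/XZ variant that the OSS documentation describes:

import crcmod

# CRC-64/XZ parameters, matching what OSS documents for x-oss-hash-crc64ecma
# (and what the oss2 SDK passes to crcmod).
crc64_fun = crcmod.mkCrcFun(0x142F0E1EBA9EA3693, initCrc=0, rev=True, xorOut=0xFFFFFFFFFFFFFFFF)

def calculate_crc64ecma(file_path, chunk_size=chunk_size):
    """Calculate the CRC-64/ECMA of a local file, as a decimal string."""
    value = 0
    with open(file_path, "rb") as fp:
        while True:
            data = fp.read(chunk_size)
            if not data:
                break
            value = crc64_fun(data, value)  # crcmod supports incremental updates
    return str(value)

def oss_crc64(bucket_name, resource_name, client):
    """Fetch the server-side CRC-64 via a HEAD request; OSS sends it as a decimal string."""
    response = client.head_object(Bucket=bucket_name, Key=resource_name)
    return response["ResponseMetadata"]["HTTPHeaders"].get("x-oss-hash-crc64ecma")

I haven’t confirmed that the header survives boto3’s response parsing, so treat the last function as a guess.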
Has anyone explored this?
Any suggestions?