Our database has data quality issues where the same file name appears multiple times with different casings. This data was used as the base information in our Python application, and because the string handling there is case-sensitive, for a given file name we ended up creating as many folders as there are casing variants. We then applied our ACLs on top of this, which distributed the ACLs across the duplicate folders.
We have fixed the casing differences in the database, but now I'm trying to find a way to easily and efficiently migrate the ACLs applied to all those duplicate folders onto their correctly named directories (and delete the duplicate folders with the incorrect casing).
I'm not sure how to approach this. Is there some way to accomplish this quickly using Python?
Having duplicate folders in ADLS due to casing differences in filenames. How to move ACLs from one folder to another?
You can use the Python code below to move the ACLs from one folder to another and delete the duplicate folders (the casing variants) in Azure Data Lake Storage Gen2 using the Azure Python SDK (the azure-storage-file-datalake package).
Code:
from azure.storage.filedatalake import DataLakeServiceClient
from azure.core.exceptions import ResourceNotFoundError

def get_data_lake_service_client(account_name, account_key):
    service_client = DataLakeServiceClient(
        account_url=f"https://{account_name}.dfs.core.windows.net",
        credential=account_key
    )
    return service_client

def list_directories(service_client, container_name):
    # List every directory path in the container (useful for discovering duplicates).
    file_system_client = service_client.get_file_system_client(file_system=container_name)
    paths = file_system_client.get_paths()
    directories = [path.name for path in paths if path.is_directory]
    return directories

def get_directory_acls(service_client, container_name, directory_name):
    directory_client = service_client.get_directory_client(container_name, directory_name)
    acl_props = directory_client.get_access_control()
    return acl_props['acl']

def set_directory_acls(service_client, container_name, directory_name, acl):
    directory_client = service_client.get_directory_client(container_name, directory_name)
    directory_client.set_access_control(acl=acl)

def delete_directory(service_client, container_name, directory_name):
    directory_client = service_client.get_directory_client(container_name, directory_name)
    directory_client.delete_directory()

# Main function to migrate ACLs from the duplicate folders onto the correct one
def migrate_acls(service_client, container_name, correct_directory, duplicate_directories):
    try:
        correct_acl = get_directory_acls(service_client, container_name, correct_directory)
        for duplicate in duplicate_directories:
            duplicate_acl = get_directory_acls(service_client, container_name, duplicate)
            # Copy the duplicate's ACL onto the correct directory only when it differs.
            # Note: set_access_control replaces the ACL string; it does not merge entries.
            if duplicate_acl != correct_acl:
                set_directory_acls(service_client, container_name, correct_directory, duplicate_acl)
            delete_directory(service_client, container_name, duplicate)
            print(f"Deleted duplicate folder: {duplicate}")
    except ResourceNotFoundError as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    account_name = "venkat8912"
    account_key = "<Account key>"
    container_name = "test"
    correct_directory = "data"
    duplicate_directories = ["Data", "dAta"]

    service_client = get_data_lake_service_client(account_name, account_key)
    migrate_acls(service_client, container_name, correct_directory, duplicate_directories)
Output:
Deleted duplicate folder: Data
Deleted duplicate folder: dAta
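One caveat: set_access_control replaces the ACL on the target directory and applies only to that single path, not to the files and folders beneath it. If you instead want to union the ACL entries collected from all the duplicates and push the result down the whole tree, a sketch along these lines should work with the same SDK. Here merge_acl_strings is a hypothetical helper I'm adding for illustration, and when the same principal appears in more than one ACL the last occurrence wins (the permission bits are not combined):
from azure.storage.filedatalake import DataLakeServiceClient

def merge_acl_strings(*acl_strings):
    # ACL strings are comma-separated entries such as "user::rwx" or
    # "user:<object-id>:r-x". Key each entry on everything before the final
    # colon so repeated principals collapse; the last occurrence wins.
    merged = {}
    for acl in acl_strings:
        for entry in acl.split(","):
            principal, _, perms = entry.rpartition(":")
            merged[principal] = perms
    return ",".join(f"{principal}:{perms}" for principal, perms in merged.items())

# Usage sketch: gather the ACLs first, merge them, then apply the result
# recursively so paths already under the correct folder pick up the entries too.
# duplicate_acls = [get_directory_acls(service_client, container_name, d)
#                   for d in duplicate_directories]
# merged = merge_acl_strings(correct_acl, *duplicate_acls)
# directory_client = service_client.get_directory_client(container_name, correct_directory)
# directory_client.update_access_control_recursive(acl=merged)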
Reference:
Use Python to manage ACLs in Azure Data Lake Storage – Azure Storage | Microsoft Learn
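As a side note, the list_directories helper in the script above is defined but never called. If you don't want to hand-maintain duplicate_directories, you could use it to discover the casing duplicates automatically by grouping directory names case-insensitively, roughly like this (a minimal sketch that arbitrarily treats the first name in each group as the canonical one; substitute your own rule for picking it):
from collections import defaultdict

# Group directory names by their lowercased form; any group with more than
# one member is a set of casing duplicates.
groups = defaultdict(list)
for name in list_directories(service_client, container_name):
    groups[name.lower()].append(name)

for names in groups.values():
    if len(names) > 1:
        correct, duplicates = names[0], names[1:]  # choose your canonical casing here
        migrate_acls(service_client, container_name, correct, duplicates)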