I have a Python script that deletes files and directories older than X days. However, it is running against a huge directory tree with millions of files and directories, and at the current rate (judging by the disk-space metric) it will take roughly six weeks to finish.
The main bottleneck appears to be listing the files and directories. Could anyone suggest code changes or optimizations that would reduce the runtime?
Not sure it's relevant, but for context: this runs as a Job in Kubernetes, so resources are not an issue.
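To be clear, the "listing is the bottleneck" guess comes from watching the disk-space metric, not from real profiling. A rough way to double-check it would be to time the traversal and stat calls alone, with no deletes; the snippet below is just a throwaway sketch for that, not part of the actual job:

import os
import time

def time_walk_only(root_directory):
    # Rough check: how long does listing + stat-ing everything take, with no deletes?
    start = time.perf_counter()
    examined = 0
    for dirpath, dirnames, filenames in os.walk(root_directory):
        for name in filenames:
            try:
                os.path.getmtime(os.path.join(dirpath, name))  # stat only, nothing removed
                examined += 1
            except OSError:
                pass  # file vanished or is unreadable; ignore for timing purposes
    elapsed = time.perf_counter() - start
    print(f"Examined {examined} files in {elapsed:.1f}s")

If that alone already takes days, the walk/stat side really is the problem; if it finishes quickly, the os.remove calls are.

The current code: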
import logging
import os
import re

def delete_files(root_directory, delete_time_threshold):
    # Walk the whole tree and delete any file whose mtime is older than the threshold.
    global EXAMINED_FILES
    global DELETED_FILES
    try:
        for dirpath, dirnames, filenames in os.walk(root_directory):
            for file in filenames:
                file_path = os.path.join(dirpath, file)
                try:
                    file_mtime = os.path.getmtime(file_path)
                    EXAMINED_FILES += 1
                    if file_mtime < delete_time_threshold:
                        os.remove(file_path)
                        logging.debug(f"File {file} deleted because mtime {file_mtime} is older than threshold {delete_time_threshold}")
                        DELETED_FILES += 1
                except Exception as e:
                    logging.error(f"Error deleting file {file_path}: {e}")
    except Exception as e:
        logging.error(f"Error walking root directory {root_directory}: {e}")

def delete_empty_directories(root_directory, allowed_empty_dirs):
    # Second pass: remove directories left empty, unless they are on the allow-list.
    global EXAMINED_DIRS
    global DELETED_DIRS
    global SKIPPED_DELETE_DIRS
    try:
        for dirpath, dirnames, filenames in os.walk(root_directory):
            if dirpath != root_directory:  # don't look at the root directory
                EXAMINED_DIRS += 1
                try:
                    if not dirnames and not filenames:
                        relative_dirpath = re.sub(f'^{root_directory}/', '', dirpath)
                        if relative_dirpath and relative_dirpath in allowed_empty_dirs:
                            logging.debug(f"Skipping deletion of allowed empty directory: {dirpath}")
                            SKIPPED_DELETE_DIRS += 1
                        else:
                            os.rmdir(dirpath)
                            logging.debug(f"Deleted empty directory: {dirpath}")
                            DELETED_DIRS += 1
                except Exception as e:
                    logging.error(f"Error deleting directory {dirpath}: {e}")
    except Exception as e:
        logging.error(f"Error walking root directory {root_directory}: {e}")
Thank you!