I am working on a Python script to remove duplicate MP3 files from a main folder and its subfolders. The challenge is that the file names differ, but the audio content is the same. Additionally, I need to ensure that if any of the duplicate files have hotcues set in Rekordbox (indicated by color attributes in the Rekordbox XML file), those files are preserved.
Current Approach:
Compute a hash for the audio content of each MP3 file using pydub.
Parse the Rekordbox XML file to check if a track has hotcues.
Compare hashes to identify duplicates.
Remove duplicates, ensuring that files with hotcues are kept.
Problem:
The script sometimes fails to correctly identify and preserve files with hotcues, leading to unintended deletions.
Requirements:
Correctly identify and remove duplicate MP3 files.
Preserve any file with hotcues as indicated by color attributes in <POSITION_MARK> elements in the Rekordbox XML.
Example of <POSITION_MARK> with hotcue:
xml
<POSITION_MARK Name="" Type="0" Start="0.236" Num="1" Red="69" Green="172" Blue="219"/>
Current Script:
Here is the latest version of the script I am using:
python
import os
import hashlib
from pydub import AudioSegment
import xml.etree.ElementTree as ET
rekordbox_xml_path = '/path/to/data.xml'
def get_audio_hash(file_path):
audio = AudioSegment.from_file(file_path)
audio_bytes = audio.raw_data
return hashlib.md5(audio_bytes).hexdigest()
def has_hotcues(track):
for mark in track.findall('.//POSITION_MARK'):
if 'Red' in mark.attrib or 'Green' in mark.attrib or 'Blue' in mark.attrib:
return True
return False
def get_track_by_file_path(xml_root, file_path):
for track in xml_root.findall('.//TRACK'):
location = track.get('Location')
if location and os.path.basename(location) == os.path.basename(file_path):
return track
return None
def find_and_remove_duplicates(main_folder, xml_path):
seen_hashes = {}
duplicates_to_remove = []
tree = ET.parse(xml_path)
xml_root = tree.getroot()
for root, _, files in os.walk(main_folder):
for file in files:
if file.lower().endswith(".mp3"):
file_path = os.path.join(root, file)
audio_hash = get_audio_hash(file_path)
if audio_hash in seen_hashes:
existing_file_path = seen_hashes[audio_hash]
track_existing = get_track_by_file_path(xml_root, existing_file_path)
track_new = get_track_by_file_path(xml_root, file_path)
if track_existing is not None and has_hotcues(track_existing):
print(f"Hotcue found, keeping: {existing_file_path} (removing {file_path})")
duplicates_to_remove.append(file_path)
elif track_new is not None and has_hotcues(track_new):
print(f"Hotcue found, keeping: {file_path} (removing {existing_file_path})")
duplicates_to_remove.append(existing_file_path)
seen_hashes[audio_hash] = file_path
else:
print(f"Duplicate found: {file_path} (duplicate of {existing_file_path}), removing...")
duplicates_to_remove.append(file_path)
else:
seen_hashes[audio_hash] = file_path
for duplicate in duplicates_to_remove:
if os.path.exists(duplicate):
os.remove(duplicate)
print(f"Removed: {duplicate}")
main_folder = '/path/to/main_folder'
find_and_remove_duplicates(main_folder, rekordbox_xml_path)
Question:
How can I modify the script to reliably detect and preserve MP3 files with hotcues while removing other duplicates?