I have a Python script that reads all XML files in a specified directory, translates them using the Google API, and saves them in another directory.
Since the API is slow, I divide the XML into chunks for each thread to process.
However, the code freezes within ThreadPoolExecutor without finishing or returning an error, even with try-except blocks.
I have added timeouts and logs in all functions that could be causing the deadlock but couldn’t find anything.
This issue usually doesn’t occur with lighter XMLs (<70kb), and the code runs fine for them.
import os
import xml.etree.ElementTree as ET
from googletrans import Translator
from time import sleep
import httpx
import re
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError
translator = Translator(timeout=httpx.Timeout(10))
translation_cache = {}
cache_lock = threading.Lock()
def translate_text(base):
with cache_lock:
if base in translation_cache:
return translation_cache[base]
translations = [base]
languages = ['pt', 'ig', 'zh-cn', 'si', 'ig', 'zu', 'hu', 'el', 'ig', 'gu', 'tr', 'pt']
for i in range(len(languages) - 1):
src_lang = languages[i]
dest_lang = languages[i + 1]
for attempt in range(0, 7):
try:
trad = translator.translate(translations[-1], src=src_lang, dest=dest_lang)
translations.append(trad.text)
break
except Exception as e:
print(f"Translation error on attempt {attempt + 1}: {e}")
if attempt != 6:
sleep(3)
continue
return base
with cache_lock:
translation_cache[base] = translations[-1]
return translations[-1]
def process_element(element):
base = element.text
if not base or base[0] == "%" or base.isspace():
return
index = re.findall(r"<.*?gt", base)
r = 0
if index:
for result in index:
r += 1
print("Made Replace")
base = base.replace(result, f"__{r}__")
translated_text = translate_text(base)
if index:
for result in reversed(index):
try:
translated_text = translated_text.replace(f"__{r}__", result)
r -= 1
except Exception as e:
print(f"Error replacing: {e}")
r -= 1
element.text = translated_text
print(element.text)
sleep(0.1)
print("Element Made")
def process_chunk(chunk):
for element in chunk:
print(f"Processing element: {element.tag}")
process_element(element)
return
def findfiles(root, chunk_size=1):
looplist = os.listdir(root)
for i in looplist:
path = os.path.join(root, i)
if os.path.isfile(path) and os.path.splitext(path)[1] == ".xml":
ext_dir = "trad_" + root
ext_file = os.path.join(ext_dir, "trad_" + os.path.basename(path))
if not os.path.exists(ext_dir):
os.makedirs(ext_dir)
print(f"Processing file: {path}")
with open(path, "r+", encoding="utf8") as basefile:
tree = ET.parse(basefile)
baseline = tree.findall(".//text")
chunks = [baseline[i:i + chunk_size] for i in range(0, len(baseline), chunk_size)]
with ThreadPoolExecutor(max_workers=8) as executor:
futures = [executor.submit(process_chunk, chunk) for chunk in chunks]
for future in as_completed(futures):
try:
result = future.result(timeout=100)
print("Future result obtained")
except TimeoutError:
print(f"Process timed out after 100 seconds!")
except Exception as e:
print(f"Exception in thread: {e}")
print("Threads Finished")
with open(ext_file, "w", encoding="utf8") as basefile:
tree.write(ext_file, encoding="utf8", xml_declaration=True)
print("Files Saved")
os.remove(path)
print("Original File Deleted")
elif os.path.isdir(path):
findfiles(path)
if __name__ == "__main__":
print("Starting the process...")
root = "Xml_msg"
findfiles(root)
print("Process completed.")
It remains stuck like this indefinitely:
Like this
With a lighter XML works well like this:
Like this
Higor Freitas is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
1