I’m working on a project that takes in JSON file, cleans them up, removes any invalid characters and/or errors, and anonymizes the data. Once the files are cleaned and anonymized, they are exported as new JSON files – and then go through another process. However, I keep running into issues with invalid characters that I have not been able to resolved, and the errors prevent the new JSON files from being created.
Here is the current code:
import os
import json
import uuid
import chardet
import re
def clean_json(json_data):
# This function removes invalid characters from the JSON data
cleaned_data = ''
for char in json_data:
try:
char.encode('utf-8')
cleaned_data += char
except UnicodeEncodeError:
pass # Skip invalid characters
return cleaned_data
def anonymize_json(json_data):
def anonymize_value(value):
if isinstance(value, str) and 'user' in value.lower():
return str(uuid.uuid4())
else:
return value
if isinstance(json_data, dict):
return {key: anonymize_value(val) for key, val in json_data.items()}
elif isinstance(json_data, list):
return [anonymize_json(item) for item in json_data]
else:
return anonymize_value(json_data)
def process_json_file(json_file, output_folder):
print(f"Processing file: {json_file}")
try:
with open(json_file, 'rb') as file:
raw_data = file.read()
# Detect the encoding of the file
encoding = chardet.detect(raw_data)['encoding']
# Decode the data using the detected encoding and remove invalid characters
cleaned_data = raw_data.decode(encoding, errors='ignore')
cleaned_data = clean_json(cleaned_data)
except FileNotFoundError:
print(f"File not found: {json_file}")
return
if cleaned_data.strip():
try:
json_obj = json.loads(cleaned_data)
print(f"JSON loaded successfully for file: {json_file}")
except json.JSONDecodeError as e:
print(f"Error decoding JSON in file: {json_file}. Error: {e}")
error_position = e.pos
snippet = cleaned_data[max(0, error_position-50):min(len(cleaned_data), error_position+50)]
print(f"Error occurred near: {snippet}")
return
else:
print(f"Cleaned JSON data is empty for file: {json_file}")
return
anonymized_data = anonymize_json(json_obj)
print(f"Anonymized data created for file: {json_file}")
anonymized_json_file = os.path.join(output_folder, os.path.splitext(os.path.basename(json_file))[0] + '_anonymized.json')
with open(anonymized_json_file, 'w', encoding='utf-8') as outfile:
json.dump(anonymized_data, outfile, ensure_ascii=False, indent=4)
print(f"Anonymized JSON file created: {anonymized_json_file}")
def main():
input_folder = 'data'
output_folder = 'anonymized_data'
if not os.path.exists(output_folder):
os.makedirs(output_folder)
for file_name in os.listdir(input_folder):
if file_name.endswith('.json'):
json_file = os.path.join(input_folder, file_name)
process_json_file(json_file, output_folder)
if __name__ == '__main__':
main()
And here is an example of some text that fails (I cut it off because it was so long):
“comment”: “on behalf of our organization)nnJoin us – The school↓s Solar Electric Vehicle TeamnWe are a student c”
In the editor it shows up as:
enter image description here
And my terminal has throws these errors/messages:
Error occurred near: on behalf of our organization)nnJoin us - The school↓s Solar Electric Vehicle TeamnWe are a student c
Processing file: file1.json
Error decoding JSON in file: file2.json. Error: Invalid control character at: line 34 column 216 (char 3678)
Error occurred near: what happens when class↓s lab are full?
I’ve tried removing the invalid characters, replacing with empty string, ignoring the errors. I’m having trouble understanding why none of these methods worked. I’ve tried different libraries (chardet, re, pandas) – but to no avail. Even with each library, I’m still getting this invalid character problem. I know I must be doing something wrong – but I’m not sure where or why
Vic is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.