I’m currently facing an issue with UTF-8 encoding while using a text generation AI in my project. Here’s a summary of the situation:
Problem Description:
When attempting to load data from a JSON or .pth file for my AI model, I encounter a UTF-8 encoding error. I’ve tried multiple encodings (utf-8, utf-16, latin-1, cp1252) without success. The specific error message I’m encountering is: UnicodeDecodeError: ‘utf-8’ codec can’t decode byte 0xe9 in position 570: invalid continuation byte
Steps Taken:
I’ve checked and adjusted the encodings in the Python code to try and resolve the issue.
I’ve used different methods to load JSON and .pth files while specifying encodings.
Expected Outcome:
I expect my program to load the data successfully without any encoding errors, allowing my AI model to function as intended.
Code or Screenshots:
# Project root: the directory two levels above this file.
BASE_DIR = os.path.dirname(
    os.path.dirname(os.path.abspath(__file__))
)

# Text encodings attempted, in this order, when reading JSON files.
encodings_to_try = [
    "utf-8",
    "utf-16",
    "latin-1",
    "cp1252",
]

# Serialized .pth files stored directly under the project root.
data_file = os.path.join(BASE_DIR, "data.pth")
generate_data_file = os.path.join(BASE_DIR, "generate_data.pth")
lstm_data_file = os.path.join(BASE_DIR, "lstm_data.pth")

# JSON files: the intent definitions sit in the data/ subdirectory, the
# unanswered-questions file sits directly under the project root.
intents_file = os.path.join(BASE_DIR, "data", "intent.json")
unanswered_questions_file = os.path.join(BASE_DIR, "unanswered_questions.json")
def load_json_file(file_path, encodings=None):
    """Load a JSON document, trying several text encodings in turn.

    The file is read once as raw bytes; the bytes are then decoded with each
    candidate encoding until one of them yields valid JSON.

    Args:
        file_path: Path of the JSON file to read.
        encodings: Encodings to try, in order. When ``None`` (the default),
            the module-level ``encodings_to_try`` list is used.

    Returns:
        The parsed JSON value, or ``None`` if no encoding produced valid
        JSON.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    if encodings is None:
        encodings = encodings_to_try

    with open(file_path, 'rb') as json_data:
        raw = json_data.read()

    for encoding in encodings:
        try:
            text = raw.decode(encoding)
            # A UTF-8 file saved with a byte-order mark decodes to a leading
            # U+FEFF, which the json parser rejects; drop it before parsing.
            if text.startswith('\ufeff'):
                text = text[1:]
            return json.loads(text)
        # Catch UnicodeError rather than only UnicodeDecodeError: some codec
        # failures (e.g. the text-mode UTF-16 reader on a stream without a
        # BOM) raise the base class, and any codec failure should simply
        # move on to the next encoding instead of aborting the whole loop.
        except (UnicodeError, json.JSONDecodeError) as e:
            print(f"Error with encoding {encoding}: {e}")

    print(f"Failed to load {file_path}. Check file encoding or content.")
    return None
def load_pt_files(file_path, map_location=None):
    """Load a serialized PyTorch object, returning ``None`` on failure.

    Args:
        file_path: Path of the file handed to ``torch.load``.
        map_location: Forwarded unchanged to ``torch.load``. Passing
            ``'cpu'`` (or a ``torch.device``) lets a checkpoint that was
            saved on a GPU be loaded on a machine without CUDA. The default
            ``None`` keeps ``torch.load``'s own default behavior.

    Returns:
        The object stored in the file, or ``None`` if loading raised any
        exception (the error is printed).
    """
    # SECURITY NOTE: torch.load relies on pickle; only load files that come
    # from a trusted source.
    try:
        return torch.load(file_path, map_location=map_location)
    except Exception as e:
        # Deliberately best-effort: callers check the result for None.
        print(f"Error loading {file_path}: {e}")
        return None
# Load the intent definitions and the three serialized .pth files. Each
# loader returns None on failure, so every result is checked before use.
intents = load_json_file(intents_file)
data = load_pt_files(data_file)
generate_data = load_pt_files(generate_data_file)
lstm_data = load_pt_files(lstm_data_file)

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Intent classifier: rebuilt from the sizes and weights stored in data.pth.
if data is not None:
    input_size = data["input_size"]
    hidden_size = data["hidden_size"]
    output_size = data["output_size"]
    all_words = data['all_words']
    tags = data['tags']
    model_state = data["model_state"]

    model = NeuralNet(input_size, hidden_size, output_size).to(device)
    model.load_state_dict(model_state)
    model.eval()

# Character-level LSTM models. Both are sized from the character set stored
# in lstm_data.pth.
if lstm_data is not None:
    texts = lstm_data["texts"]
    characters = lstm_data["characters"]

    lstm_model = LSTMModel(input_size=len(characters), hidden_size=128, output_size=len(characters)).to(device)
    lstm_model.load_state_dict(lstm_data['model_state'])
    lstm_model.eval()

    # The second model takes its weights from generate_data.pth, which may
    # have failed to load independently of lstm_data.pth; subscripting None
    # here would abort the whole module import with a TypeError.
    if generate_data is not None:
        lstm_model0 = LSTMModel(input_size=len(characters), hidden_size=128, output_size=len(characters)).to(device)
        lstm_model0.load_state_dict(generate_data['model_state'])
        lstm_model0.eval()

bot_name = "Sam"
Any assistance in resolving this UTF-8 encoding issue would be greatly appreciated! Thank you in advance for your suggestions and guidance.
def _log_exchange(msg, response):
    """Append a question/response pair to the stored unanswered-questions list."""
    unanswered_questions = load_unanswered_questions()
    unanswered_questions.append({"question": msg, "response": response})
    save_unanswered_questions(unanswered_questions)


def _generated_media_response(msg, tag, category, generate, show, unknown_category_msg):
    """Generate and display media for *category* and return the reply text.

    Args:
        msg: The user's original message (used for logging).
        tag: The predicted intent tag; selects the response templates.
        category: Requested category; must be a member of ``DATASET_DIRS``.
        generate: Callable producing the media for a category, or ``None``.
        show: Callable displaying the generated media.
        unknown_category_msg: Reply returned when *category* is not in
            ``DATASET_DIRS``.

    Returns:
        The reply string to send back to the user.
    """
    if category not in DATASET_DIRS:
        return unknown_category_msg

    media = generate(category)
    if media is None:
        return f"Desole, je ne trouve pas de categorie pour {category}."

    show(media)
    # Collect the response templates of every intent carrying this tag.
    templates = [
        resp
        for candidate in intents['intents']
        if candidate['tag'] == tag
        for resp in candidate['responses']
    ]
    response = random.choice(templates).format(category)
    _log_exchange(msg, response)
    return response


def get_response(msg):
    """Return the chatbot's reply to the user message *msg*.

    The message is tokenized, converted to a bag-of-words vector and
    classified by the intent model. When the top class probability exceeds
    0.75 the reply comes from the matching intent (with special handling for
    the economic forecast and the video/image generation tags). Otherwise the
    message is scanned for known keywords and text is generated from the
    first one found. If nothing produces a reply, a fixed "please rephrase"
    message is returned.

    Args:
        msg: The raw text entered by the user.

    Returns:
        The reply string.
    """
    sentence = tokenize(msg)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).float().to(device)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        output = model(X)
        _, predicted = torch.max(output, dim=1)
        probs = torch.softmax(output, dim=1)

    tag = tags[predicted.item()]
    prob = probs[0][predicted.item()]

    if prob.item() > 0.75:
        if tag == "economic_forecast":
            forecast = forecast_gdp(os.path.join(BASE_DIR, 'data', 'economic_data.csv'))
            return f"The GDP forecast for the next period is: {forecast}"

        # The intents file may have failed to load (None); treat that as
        # "no intent matches" instead of crashing on a subscript.
        known_intents = intents['intents'] if intents is not None else []
        for intent in known_intents:
            if tag != intent["tag"]:
                continue

            if tag in ("generate_video", "generate_image"):
                # The last token of the message is taken as the category; an
                # empty token list yields "" which is rejected as unknown.
                category = sentence[-1] if sentence else ""
                if tag == "generate_video":
                    return _generated_media_response(
                        msg, tag, category, generate_video, show_generated_video,
                        f"Desole, la categorie {category} n'existe pas dans les donnees de videos.",
                    )
                return _generated_media_response(
                    msg, tag, category, generate_image, show_generated_image,
                    f"Desole, la categorie {category} n'existe pas dans les donnees d'images.",
                )

            return random.choice(intent['responses'])
    else:
        for keyword in keywords.values():
            if keyword in msg:
                response = generate_text_based_on_keyword(keyword, length=100, temperature=1.2)
                _log_exchange(msg, response)
                return response

    # No reply was produced for an unrecognized question.
    response = "Je suis desole, mais je ne comprends pas. Pouvez-vous reformuler votre question ?"
    _log_exchange(msg, response)
    return response
user23298973 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.