I’m working on a project where I need to extract references from a SQL database, preprocess them, and use them to train a BERT model for token classification. Below is the code I have so far:
# -*- coding: utf-8 -*-
import re
import pyodbc
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification
import torch
# Function to connect to the database and query the Atti table
def query_references(server, database, username, password):
    try:
        conn_str = (
            f"DRIVER={{ODBC Driver 17 for SQL Server}};"
            f"SERVER={server};"
            f"DATABASE={database};"
            f"UID={username};"
            f"PWD={password}"
        )
        conn = pyodbc.connect(conn_str)
        cursor = conn.cursor()
        query = "SELECT TOP(100) Reference FROM Atti WHERE Reference IS NOT NULL"
        cursor.execute(query)
        references = cursor.fetchall()
        conn.close()
        # pyodbc returns Row objects; keep only the single Reference column
        references = [ref[0] for ref in references]
        return references
    except Exception as e:
        print(f"Error while connecting to the database or executing the query: {e}")
        return []
# Function to preprocess the normative references
def preprocess_references(references):
    try:
        processed_references = []
        for ref in references:
            # Strip everything except ASCII letters, digits, basic punctuation, '/', '-' and spaces
            ref = re.sub(r'[^A-Za-z0-9.,;:/\- ]+', '', ref)
            # Collapse runs of whitespace into a single space
            ref = re.sub(r'\s+', ' ', ref).strip()
            processed_references.append(ref)
        return processed_references
    except Exception as e:
        print(f"Error while preprocessing the references: {e}")
        return []
def tokenize_and_align_labels(references, tokenizer):
    try:
        # Tokenize the inputs
        tokenized_inputs = tokenizer(references, padding=True, truncation=True, return_tensors="pt", is_split_into_words=False)
        labels = []
        for i, ref in enumerate(references):
            try:
                print(f"Processing reference {i}: {ref}")
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                if word_ids is None:
                    print(f"No word_ids found for reference: {ref}")
                    continue
                print(f"Word IDs for reference {i}: {word_ids}")
                label_ids = []
                previous_word_id = None
                for idx, word_id in enumerate(word_ids):
                    if word_id is None:
                        # Special tokens ([CLS], [SEP], padding) are masked out of the loss
                        label_ids.append(-100)
                    else:
                        if previous_word_id is None or word_id != previous_word_id:
                            # First sub-token of every word gets label 1
                            label_ids.append(1)
                        else:
                            # Continuation sub-tokens of the same word are masked out
                            label_ids.append(-100)
                    previous_word_id = word_id
                # Check consistency of word_ids and label_ids
                if len(word_ids) != len(label_ids):
                    print(f"Mismatch in lengths for reference {i}: {ref}")
                    print(f"Word IDs Length: {len(word_ids)}")
                    print(f"Label IDs Length: {len(label_ids)}")
                    print(f"Word IDs: {word_ids}")
                    print(f"Label IDs: {label_ids}")
                    raise ValueError("Mismatch between word_ids and label_ids length")
                print(f"Label IDs for reference {i}: {label_ids}")
                labels.append(label_ids)
            except Exception as e:
                print(f"Error processing reference {i}: {ref}, error: {e}")
                raise
        tokenized_inputs["labels"] = labels
        # Check that all lists have the same length
        for key in tokenized_inputs.keys():
            print(f"Key: {key}, Length: {len(tokenized_inputs[key])}")
            if len(tokenized_inputs[key]) != len(labels):
                raise ValueError(f"Mismatch between tokenized inputs and labels for key: {key}")
        print("Tokenized inputs:", tokenized_inputs)
        return tokenized_inputs
    except Exception as e:
        print(f"Error while tokenizing and aligning the labels: {e}")
        return None
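# For illustration (hypothetical word_ids, not output from the tokenizer above):
# a sequence like [None, 0, 0, 1, 2, None] would be turned by the loop above into
# the label_ids [-100, 1, -100, 1, 1, -100], i.e. special tokens and continuation
# sub-tokens are masked out and the first sub-token of every word is labeled 1.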
def join_predicted_tokens(tokens, labels):
    result = []
    current_reference = []
    for token, label in zip(tokens, labels):
        if label == 1:
            if token.startswith("##"):
                current_reference[-1] += token[2:]
            else:
                current_reference.append(token)
        elif current_reference:
            result.append(" ".join(current_reference))
            current_reference = []
    if current_reference:
        result.append(" ".join(current_reference))
    return result
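# Rough illustration with made-up tokens and labels (not real model output):
# join_predicted_tokens(["D", "##.", "M", "##.", "n", "##.", "278"], [1, 1, 1, 1, 1, 1, 1])
# glues the "##" pieces back onto the previous token and returns ["D. M. n. 278"].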
def clean_references(references):
    cleaned_references = []
    for ref in references:
        ref = re.sub(r'\s([?.!,;:/])', r'\1', ref)  # Remove spaces before punctuation
        ref = re.sub(r'([?.!,;:/])\s', r'\1', ref)  # Remove spaces after punctuation
        cleaned_references.append(ref)
    return cleaned_references
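# For example (made-up input): clean_references(["D. M. n. 278 / 1861"])
# returns ["D.M.n.278/1861"], since the spaces around '.' and '/' are stripped.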
# Custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_inputs):
        self.tokenized_inputs = tokenized_inputs
    def __len__(self):
        return len(self.tokenized_inputs["input_ids"])
    def __getitem__(self, idx):
        # Note: input_ids / attention_mask are already tensors here, while labels is a list of lists
        item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_inputs.items()}
        return item
# Function to train the model
def train_model(train_dataset, eval_dataset, tokenizer):
    try:
        model = BertForTokenClassification.from_pretrained('dbmdz/bert-base-italian-cased', num_labels=2)
        training_args = TrainingArguments(
            output_dir='./results',
            evaluation_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
        )
        data_collator = DataCollatorForTokenClassification(tokenizer)
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer
        )
        trainer.train()
        trainer.save_model('./results')
        tokenizer.save_pretrained('./results')
        return model
    except Exception as e:
        print(f"Error while training the model: {e}")
        return None
# Function to predict references
def predict_references(text, model, tokenizer):
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=2)
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        labels = predictions[0].tolist()
        result = join_predicted_tokens(tokens, labels)
        cleaned_result = clean_references(result)
        return cleaned_result
    except Exception as e:
        print(f"Error while predicting the normative references: {e}")
        return []
# Main code
if __name__ == "__main__":
    server = '192.168.0.xxx'
    database = 'Pc'
    username = 'user'
    password = 'pass4321'
    references = query_references(server, database, username, password)
    if references:
        print(f"Number of references retrieved: {len(references)}")
        processed_references = preprocess_references(references)
        if processed_references:
            print(f"Number of preprocessed references: {len(processed_references)}")
            df = pd.DataFrame(processed_references, columns=['Reference'])
            tokenizer = BertTokenizerFast.from_pretrained('dbmdz/bert-base-italian-cased')
            tokenized_inputs = tokenize_and_align_labels(df['Reference'].tolist(), tokenizer)
            if tokenized_inputs:
                print(f"Verified tokenized inputs: {tokenized_inputs}")
                print(f"Length of input_ids: {len(tokenized_inputs['input_ids'])}")
                print(f"Length of labels: {len(tokenized_inputs['labels'])}")
                dataset = CustomDataset(tokenized_inputs)
                train_dataset, eval_dataset = train_test_split(dataset, test_size=0.1)
                model = train_model(train_dataset, eval_dataset, tokenizer)
                if model:
                    text = "Questo è un nuovo documento con riferimento normativo D.M. n. 278/1861, DECRETO MINISTERIALE n. 278/1861"
                    predicted_references = predict_references(text, model, tokenizer)
                    print("Identified normative references:")
                    for ref in predicted_references:
                        print(ref)
The problem I'm facing is this: the references are retrieved from the SQL database, tokenized, and used to train the model as intended, but along the way I get unexpected output and various warnings. The output includes:
Length of input_ids: 100
Length of labels: 100
Various warnings about tensor copying and model initialization
Training and evaluation logs
Despite all of this apparently completing, the prediction step returns essentially the entire input text rather than just the normative references, which is not what I expected.
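One observation that may or may not be related: in tokenize_and_align_labels the first sub-token of every word gets label 1 and everything else gets -100, so the training data never contains any 0 labels. A quick check along these lines (reusing tokenized_inputs from the main block) would show the label distribution:

from collections import Counter
label_counts = Counter(
    label
    for label_ids in tokenized_inputs["labels"]
    for label in label_ids
    if label != -100
)
print(label_counts)  # with the current alignment logic this only ever contains label 1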
Could someone help me understand what might be going wrong? Thank you!
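As a side note, in case it matters: I suspect the warnings about tensor copying come from CustomDataset.__getitem__, where torch.tensor() is called on values that are already tensors (input_ids and attention_mask come back as tensors because of return_tensors="pt"). A variant of that method I could try, assuming that really is the source of the warnings, would be:

def __getitem__(self, idx):
    # input_ids / attention_mask are already tensors, labels is still a list of lists,
    # so only wrap the non-tensor values in torch.tensor()
    return {
        key: val[idx].clone().detach() if torch.is_tensor(val) else torch.tensor(val[idx])
        for key, val in self.tokenized_inputs.items()
    }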
Here is the output