I’m working on a project where I need to extract references from a SQL database, preprocess them, and use them to train a BERT model for token classification. Below is the code I have so far:
# -*- coding: utf-8 -*-
import re
import pyodbc
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification
import torch
# Function to connect to the database and query the Atti table
def query_references(server, database, username, password):
    try:
        conn_str = (
            f"DRIVER={{ODBC Driver 17 for SQL Server}};"
            f"SERVER={server};"
            f"DATABASE={database};"
            f"UID={username};"
            f"PWD={password}"
        )
        conn = pyodbc.connect(conn_str)
        cursor = conn.cursor()
        query = "SELECT TOP(100) Reference FROM Atti WHERE Reference IS NOT NULL"
        cursor.execute(query)
        references = cursor.fetchall()
        conn.close()
        # pyodbc returns Row objects; keep only the single Reference column
        references = [ref[0] for ref in references]
        return references
    except Exception as e:
        print(f"Error while connecting to the database or executing the query: {e}")
        return []
# Function to preprocess the normative references
def preprocess_references(references):
    try:
        processed_references = []
        for ref in references:
            # Strip everything except ASCII letters, digits, basic punctuation, '/', '-' and spaces
            ref = re.sub(r'[^A-Za-z0-9.,;:/\- ]+', '', ref)
            # Collapse runs of whitespace into a single space
            ref = re.sub(r'\s+', ' ', ref).strip()
            processed_references.append(ref)
        return processed_references
    except Exception as e:
        print(f"Error while preprocessing the references: {e}")
        return []
def tokenize_and_align_labels(references, tokenizer):
    try:
        # Tokenize the inputs
        tokenized_inputs = tokenizer(references, padding=True, truncation=True, return_tensors="pt", is_split_into_words=False)
        labels = []
        for i, ref in enumerate(references):
            try:
                print(f"Processing reference {i}: {ref}")
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                if word_ids is None:
                    print(f"No word_ids found for reference: {ref}")
                    continue
                print(f"Word IDs for reference {i}: {word_ids}")
                label_ids = []
                previous_word_id = None
                for idx, word_id in enumerate(word_ids):
                    if word_id is None:
                        # Special tokens ([CLS], [SEP], padding) are masked out of the loss
                        label_ids.append(-100)
                    else:
                        if previous_word_id is None or word_id != previous_word_id:
                            # First sub-token of every word gets label 1
                            label_ids.append(1)
                        else:
                            # Continuation sub-tokens of the same word are masked out
                            label_ids.append(-100)
                    previous_word_id = word_id
                # Check consistency of word_ids and label_ids
                if len(word_ids) != len(label_ids):
                    print(f"Mismatch in lengths for reference {i}: {ref}")
                    print(f"Word IDs Length: {len(word_ids)}")
                    print(f"Label IDs Length: {len(label_ids)}")
                    print(f"Word IDs: {word_ids}")
                    print(f"Label IDs: {label_ids}")
                    raise ValueError("Mismatch between word_ids and label_ids length")
                print(f"Label IDs for reference {i}: {label_ids}")
                labels.append(label_ids)
            except Exception as e:
                print(f"Error processing reference {i}: {ref}, error: {e}")
                raise
        tokenized_inputs["labels"] = labels
        # Check that all lists have the same length
        for key in tokenized_inputs.keys():
            print(f"Key: {key}, Length: {len(tokenized_inputs[key])}")
            if len(tokenized_inputs[key]) != len(labels):
                raise ValueError(f"Mismatch between tokenized inputs and labels for key: {key}")
        print("Tokenized inputs:", tokenized_inputs)
        return tokenized_inputs
    except Exception as e:
        print(f"Error while tokenizing and aligning the labels: {e}")
        return None
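# For illustration (hypothetical word_ids, not output from the tokenizer above):
# a sequence like [None, 0, 0, 1, 2, None] would be turned by the loop above into
# the label_ids [-100, 1, -100, 1, 1, -100], i.e. special tokens and continuation
# sub-tokens are masked out and the first sub-token of every word is labeled 1.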
def join_predicted_tokens(tokens, labels):
    result = []
    current_reference = []
    for token, label in zip(tokens, labels):
        if label == 1:
            if token.startswith("##"):
                current_reference[-1] += token[2:]
            else:
                current_reference.append(token)
        elif current_reference:
            result.append(" ".join(current_reference))
            current_reference = []
    if current_reference:
        result.append(" ".join(current_reference))
    return result
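# Rough illustration with made-up tokens and labels (not real model output):
# join_predicted_tokens(["D", "##.", "M", "##.", "n", "##.", "278"], [1, 1, 1, 1, 1, 1, 1])
# glues the "##" pieces back onto the previous token and returns ["D. M. n. 278"].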
def clean_references(references):
    cleaned_references = []
    for ref in references:
        ref = re.sub(r'\s([?.!,;:/])', r'\1', ref)  # Remove spaces before punctuation
        ref = re.sub(r'([?.!,;:/])\s', r'\1', ref)  # Remove spaces after punctuation
        cleaned_references.append(ref)
    return cleaned_references
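# For example (made-up input): clean_references(["D. M. n. 278 / 1861"])
# returns ["D.M.n.278/1861"], since the spaces around '.' and '/' are stripped.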
# Custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_inputs):
        self.tokenized_inputs = tokenized_inputs
    def __len__(self):
        return len(self.tokenized_inputs["input_ids"])
    def __getitem__(self, idx):
        # Note: input_ids / attention_mask are already tensors here, while labels is a list of lists
        item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_inputs.items()}
        return item
# Function to train the model
def train_model(train_dataset, eval_dataset, tokenizer):
    try:
        model = BertForTokenClassification.from_pretrained('dbmdz/bert-base-italian-cased', num_labels=2)
        training_args = TrainingArguments(
            output_dir='./results',
            evaluation_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
        )
        data_collator = DataCollatorForTokenClassification(tokenizer)
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer
        )
        trainer.train()
        trainer.save_model('./results')
        tokenizer.save_pretrained('./results')
        return model
    except Exception as e:
        print(f"Error while training the model: {e}")
        return None
# Function to predict references
def predict_references(text, model, tokenizer):
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=2)
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        labels = predictions[0].tolist()
        result = join_predicted_tokens(tokens, labels)
        cleaned_result = clean_references(result)
        return cleaned_result
    except Exception as e:
        print(f"Error while predicting the normative references: {e}")
        return []
# Main code
if __name__ == "__main__":
    server = '192.168.0.xxx'
    database = 'Pc'
    username = 'user'
    password = 'pass4321'
    references = query_references(server, database, username, password)
    if references:
        print(f"Number of references retrieved: {len(references)}")
        processed_references = preprocess_references(references)
        if processed_references:
            print(f"Number of preprocessed references: {len(processed_references)}")
            df = pd.DataFrame(processed_references, columns=['Reference'])
            tokenizer = BertTokenizerFast.from_pretrained('dbmdz/bert-base-italian-cased')
            tokenized_inputs = tokenize_and_align_labels(df['Reference'].tolist(), tokenizer)
            if tokenized_inputs:
                print(f"Verified tokenized inputs: {tokenized_inputs}")
                print(f"Length of input_ids: {len(tokenized_inputs['input_ids'])}")
                print(f"Length of labels: {len(tokenized_inputs['labels'])}")
                dataset = CustomDataset(tokenized_inputs)
                train_dataset, eval_dataset = train_test_split(dataset, test_size=0.1)
                model = train_model(train_dataset, eval_dataset, tokenizer)
                if model:
                    text = "Questo è un nuovo documento con riferimento normativo D.M. n. 278/1861, DECRETO MINISTERIALE n. 278/1861"
                    predicted_references = predict_references(text, model, tokenizer)
                    print("Identified normative references:")
                    for ref in predicted_references:
                        print(ref)
The problem I'm facing is this: the references are retrieved from the SQL database, tokenized, and used to train the model as intended, but along the way I get unexpected output and various warnings. The output includes:
Length of input_ids: 100
Length of labels: 100
Various warnings about tensor copying and model initialization
Training and evaluation logs
Despite all of this apparently completing, the prediction step returns essentially the entire input text rather than just the normative references, which is not what I expected.
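One observation that may or may not be related: in tokenize_and_align_labels the first sub-token of every word gets label 1 and everything else gets -100, so the training data never contains any 0 labels. A quick check along these lines (reusing tokenized_inputs from the main block) would show the label distribution:

from collections import Counter
label_counts = Counter(
    label
    for label_ids in tokenized_inputs["labels"]
    for label in label_ids
    if label != -100
)
print(label_counts)  # with the current alignment logic this only ever contains label 1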
Could someone help me understand what might be going wrong? Thank you!
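As a side note, in case it matters: I suspect the warnings about tensor copying come from CustomDataset.__getitem__, where torch.tensor() is called on values that are already tensors (input_ids and attention_mask come back as tensors because of return_tensors="pt"). A variant of that method I could try, assuming that really is the source of the warnings, would be:

def __getitem__(self, idx):
    # input_ids / attention_mask are already tensors, labels is still a list of lists,
    # so only wrap the non-tensor values in torch.tensor()
    return {
        key: val[idx].clone().detach() if torch.is_tensor(val) else torch.tensor(val[idx])
        for key, val in self.tokenized_inputs.items()
    }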
Here is the output