I'm trying to tokenize the Amir13/wnut2017-persian dataset and align the NER labels, but I keep hitting an IndexError. Here is my code:
!pip install datasets
!pip install transformers
from datasets import load_dataset
raw_dataset = load_dataset("Amir13/wnut2017-persian")
print(raw_dataset)
DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 3386
    })
    validation: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 1007
    })
    test: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 1284
    })
})
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-armanner-uncased")
inputs = tokenizer(raw_dataset['train'][0:1]['tokens'], is_split_into_words=True)
print(inputs.tokens())
['[CLS]', '[', "'", '@', 'paul', '##wa', '##l', '##k', "'", ',', "'", 'این', "'", ',', "'", 'منظره', "'", ',', "'", 'ای', "'", ',', "'", 'است', "'", ',', "'", 'که', "'", ',', "'", 'دو', "'", ',', "'", 'هفته', "'", ',', "'", 'است', "'", ',', "'", 'از', "'", ',', "'", 'انجا', "'", ',', "'", 'زندگی', "'", ',', "'", 'می', "'", ',', "'", 'کنم', '.', "'", ',', "'", 'ساختمان', "'", ',', "'", 'امپایر', "'", ',', "'", 'استیت', "'", ',', "'", '=', "'", ',', "'", 'es', '##b', "'", ',', "'", '.', "'", ',', "'", 'غروب', "'", ',', "'", 'گذشته', "'", ',', "'", 'اینجا', "'", ',', "'", 'طوفان', "'", ',', "'", 'بسیار', "'", ',', "'", 'بدی', "'", ',', "'", 'بود', '.', "'", ']', '[SEP]']
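One thing that strikes me in this output is that the brackets, quotes and commas of the original list show up as separate tokens, as if the whole 'tokens' field were one long string rather than a list of words. This is just the quick check I would run to confirm that suspicion (the variable name is mine):

# Check whether 'tokens' and 'ner_tags' are real lists or string-encoded lists
sample = raw_dataset["train"][0]
print(type(sample["tokens"]))    # a list is expected; str would explain the output above
print(type(sample["ner_tags"]))  # same question for the labels
print(sample["tokens"][:60])     # first slice of the field, works for both str and list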
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
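To rule out the alignment logic itself, this is a small sanity check I would run on a hand-made batch where 'tokens' is a real list of words and 'ner_tags' a real list of ints (the label values here are made up purely for illustration), and it seems to run without the error:

# Toy batch with properly typed columns; label ids are arbitrary example values
toy_batch = {
    "tokens": [["ساختمان", "امپایر", "استیت"]],
    "ner_tags": [[0, 1, 2]],
}
out = tokenize_and_align_labels(toy_batch)
print(out["labels"])  # special tokens and non-first sub-word pieces come out as -100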
tokenized_dataset = raw_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_dataset["train"].column_names
)
Running this map call fails with:
in tokenize_and_align_labels(examples)
     11                 label_ids.append(-100)
     12             elif word_idx != previous_word_idx:  # Only label the first token of a given word.
---> 13                 label_ids.append(label[word_idx])
     14             else:
     15                 label_ids.append(-100)

IndexError: string index out of range
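The "string index out of range" message makes me think label is a string here, i.e. the 'ner_tags' column (and probably 'tokens' too) is stored as the string form of a list rather than as an actual list. If that is the case, something like the sketch below might be the fix; I'm assuming every row really is a string-encoded list that ast.literal_eval can parse, which I haven't verified against the dataset's schema:

import ast

def parse_string_columns(example):
    # Assumption: 'tokens' and 'ner_tags' hold strings like "['word1', 'word2', ...]"
    # ast.literal_eval turns them back into real Python lists.
    example["tokens"] = ast.literal_eval(example["tokens"])
    example["ner_tags"] = ast.literal_eval(example["ner_tags"])
    return example

parsed_dataset = raw_dataset.map(parse_string_columns)
tokenized_dataset = parsed_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=parsed_dataset["train"].column_names
)

If the parsed ner_tags turn out to be tag names rather than integer ids, they would presumably still need a label-to-id mapping before training. Is the string-encoded column really the cause of this IndexError, and is converting it like this the right approach, or is something wrong in my alignment function?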