I am trying to experiment with the databricks-dolly-15k dataset to make it suitable for fine tuning a Llama2 model according to this article by Phil Schmid. The initial part of building the dataset is quite clear.
# Setup: load the Dolly dataset from the Hub and the Llama-2 tokenizer.
import os
from random import randint
from itertools import chain
from functools import partial

from datasets import load_dataset  # was missing: load_dataset is called below
from transformers import AutoTokenizer

# NOTE(review): huggingface_hub itself reads HF_TOKEN / HUGGING_FACE_HUB_TOKEN;
# HUGGINGFACEHUB_API_TOKEN is the LangChain variable name — confirm which one
# the downstream code actually reads.
hf_key = os.getenv("HF_API_KEY")
if hf_key is not None:  # os.environ values must be str; assigning None raises TypeError
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_key

# Load dataset from the hub
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

model_id = "meta-llama/Llama-2-7b-hf"  # sharded weights
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
# Llama-2 ships without a pad token; reuse EOS so padding/batching works.
tokenizer.pad_token = tokenizer.eos_token
def format_dolly(sample):
    """Format one Dolly record into an instruction prompt.

    Joins "### Instruction", an optional "### Context", and "### Response"
    sections with blank lines. (The pasted code had lost its backslashes:
    "Instructionn" / "nn" are restored to "Instruction\n" / "\n\n".)
    """
    instruction = f"### Instruction\n{sample['instruction']}"
    # Dolly stores a missing context as an empty string; skip the section then.
    context = f"### Context\n{sample['context']}" if len(sample['context']) > 0 else None
    response = f"### Response\n{sample['response']}"
    prompt = "\n\n".join(part for part in [instruction, context, response] if part is not None)
    return prompt
# template dataset to add prompt to each sample
def template_dataset(sample):
    """Attach the formatted prompt, terminated by the EOS token, as "text"."""
    prompt = format_dolly(sample)
    sample["text"] = prompt + tokenizer.eos_token
    return sample
# apply prompt template per sample
# Apply the prompt template to every sample, dropping the original columns
# so only the new "text" field remains.
dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))

# Print a random sample. random.randint is inclusive at BOTH ends, so the
# upper bound must be len(dataset) - 1 — the original randint(0, len(dataset))
# could raise IndexError on the last draw.
print(dataset[randint(0, len(dataset) - 1)]["text"])
At this point the dataset contains 15,011 records, each a dictionary with the key “text” whose value is a prompt like this.
The next step of tokenization produces a Dataset with features ‘input_ids’ and ‘attention_mask’
# Tokenize every prompt; yields a Dataset with "input_ids" and
# "attention_mask" columns (all original columns are dropped).
def _tokenize(sample):
    return tokenizer(sample["text"])

tokenized_dataset = dataset.map(
    _tokenize,
    batched=True,
    remove_columns=list(dataset.features),
)
This is where the author defines a function chunk and maps it to the tokenized_dataset
# Carries tokens left over from one batch into the next, so no tokens are
# dropped at batch boundaries. "token_type_ids" is listed for tokenizers that
# emit it; the Llama tokenizer only produces input_ids / attention_mask.
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

def chunk(sample, chunk_length=2048):
    """Re-pack a batch of tokenized samples into fixed-size chunks.

    For each column (input_ids, attention_mask, ...) the per-sample token
    lists are concatenated into one long flat list, prefixed with the
    remainder kept from the previous batch, then sliced into pieces of
    exactly ``chunk_length`` tokens. Any tail that does not fill a complete
    chunk is stored in the global ``remainder`` for the next batch.

    Returns the chunked columns plus a "labels" column that is a copy of
    "input_ids" (standard causal-LM training setup).
    """
    global remainder

    # Flatten each column: list-of-lists -> one flat list, with the previous
    # batch's leftovers prepended. .get() tolerates keys absent from remainder.
    concatenated_examples = {
        k: remainder.get(k, []) + list(chain(*sample[k])) for k in sample.keys()
    }

    # Total tokens available in this batch (every column has equal length).
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # Number of tokens that form whole chunks; 0 when the batch is shorter
    # than one chunk. (The original only assigned this inside an `if`, so a
    # short batch raised NameError on the next line.)
    batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

    # Split every column into chunk_length-sized pieces.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }

    # Save the tail that didn't fill a complete chunk for the next batch.
    remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}

    # Labels mirror input_ids for causal language modeling.
    result["labels"] = result["input_ids"].copy()
    return result
# Map the chunking function over the tokenized dataset in batches; each
# output row becomes one fixed-length 2048-token training example.
lm_dataset = tokenized_dataset.map(
    lambda batch: chunk(batch, chunk_length=2048),
    batched=True,
)
I cannot, for the life of me, understand the chunk function. I tried to apply it to a single sample:
x = iter(tokenized_dataset)
sample= next(x)  # ONE example: sample["input_ids"] is presumably a flat list of ints
# This fails because chunk() expects a *batch* (each sample[k] a list of
# per-example lists). For a single example, sample[k] is a flat list of ints,
# so chain(*sample[k]) tries to iterate each int -> TypeError. Wrapping each
# value in a list ({k: [v] for k, v in sample.items()}) would mimic a batch.
concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}
This produces the error shown above. It's too Pythonic for me to understand.
Can someone write a for-loop version of this to aid my understanding?