I’m trying to create a proof of concept (PoC) for a local code assistant by fine-tuning the tiny_starcoder_py-vi06 model on my MacBook Pro with an M2 chip. My dataset file, collection.json, looks like this:
<code>[
{ "filename": "filename.ext", "text": "const a = 1; // other code" }
]
</code>
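To rule out the data itself, this is the quick sanity check I run before training; a minimal sketch, assuming the file is saved as collection.json exactly as referenced in the script below:
<code>import pandas as pd
from datasets import Dataset

# Load the JSON the same way the training script does and confirm the
# expected columns exist before any tokenization happens.
df = pd.read_json("collection.json")
print(df.columns.tolist())   # expecting ['filename', 'text']
print(len(df), "rows")

dataset = Dataset.from_pandas(df)
print(dataset)               # Dataset with features ['filename', 'text']
</code>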
Here’s the code I’m using:
<code>import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments


def main():
    model_name = "tiendung/tiny_starcoder_py-vi06"
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Reuse the EOS token as the padding token if none is defined.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
        model.resize_token_embeddings(len(tokenizer))

    # Load the dataset from JSON and tokenize the "text" column.
    df = pd.read_json("collection.json")
    dataset = Dataset.from_pandas(df)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask'])

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        save_steps=10_000,
        save_total_limit=2,
    )

    # Move the model to MPS if available, otherwise fall back to CPU.
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    model.to(device)

    # Attempt to move the tokenized data to the same device as the model.
    def move_to_device(batch):
        batch['input_ids'] = batch['input_ids'].to(device)
        batch['attention_mask'] = batch['attention_mask'].to(device)
        return batch

    tokenized_datasets = tokenized_datasets.map(move_to_device, batched=True)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_datasets,
    )

    trainer.train()
    trainer.save_model("fine_tuned_model")
    tokenizer.save_pretrained("fine_tuned_model")


if __name__ == "__main__":
    main()
</code>
Error Encountered:
When running the code, I get the following error:
<code>RuntimeError: Placeholder storage has not been allocated on MPS device!
</code>
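For what it’s worth, this is the kind of standalone check I’d use to confirm the MPS backend itself works, independent of the Trainer; a minimal sketch, assuming a PyTorch build with MPS support:
<code>import torch

# Confirm the MPS backend is compiled into this PyTorch install and usable.
print("MPS built:", torch.backends.mps.is_built())
print("MPS available:", torch.backends.mps.is_available())

# A trivial tensor operation on the MPS device should succeed if the
# backend itself is healthy.
x = torch.ones(3, 3, device="mps")
print((x * 2).sum())
</code>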
Questions:
- Is there something I might have missed in the setup for using the MPS device?
- Are there specific configurations or adjustments needed for training on an Apple M2 chip?
- Any recommendations for troubleshooting this issue?
Environment Details:
- OS: macOS
- Chip: Apple M2
- Transformers version: 4.42.4
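I haven’t listed the exact PyTorch version; if it matters for MPS support, this is the quick snippet I’d use to report it (just the standard version attributes):
<code>import torch
import transformers

print("transformers:", transformers.__version__)
print("torch:", torch.__version__)
</code>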
Any help or guidance would be greatly appreciated!