I have fine-tuned a Llama-3 model (model_name="meta-llama/Meta-Llama-3-8B") in the standard way, following this notebook: https://colab.research.google.com/drive/1Zmaceu65d7w4Tcd-cfnZRb6k_Tcv2b8g?usp=sharing
Using the merged model, I'm trying to deploy on AWS SageMaker, following this example: https://github.com/aws/amazon-sagemaker-examples/blob/main/advanced_functionality/pytorch_deploy_large_GPT_model/GPT-J-6B-model-parallel-inference-DJL.ipynb
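For context, by "merged model" I mean the LoRA adapter merged back into the base weights, roughly as sketched below (the adapter and output paths are placeholders; the notebook above has the actual code):

# Sketch of the merge step (paths are placeholders; the notebook has the real code).
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    torch_dtype=torch.float16,
    device_map="auto",
)
merged = PeftModel.from_pretrained(base, "path/to/lora-adapter").merge_and_unload()

merged.save_pretrained("cs_model")  # the directory the handler below loads as "cs_model"
AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B").save_pretrained("cs_model")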
My DJL inference handler code is pasted below.
The Docker image I'm using is the DeepSpeed DLC 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.26.0-deepspeed0.12.6-cu121 (from https://github.com/aws/deep-learning-containers/blob/master/available_images.md).
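For reference, the endpoint itself is created roughly like this with the SageMaker Python SDK (the S3 path, endpoint name, and instance type below are placeholders):

# Rough sketch of the endpoint creation (S3 path, names, and instance type are placeholders).
import sagemaker
from sagemaker.model import Model

session = sagemaker.Session()
role = sagemaker.get_execution_role()

model = Model(
    image_uri="763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.26.0-deepspeed0.12.6-cu121",
    model_data="s3://my-bucket/cs_model/code.tar.gz",  # placeholder: tarball with the handler code
    role=role,
    sagemaker_session=session,
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.12xlarge",   # placeholder: a multi-GPU instance for tensor parallelism
    endpoint_name="llama3-cs-model",  # placeholder endpoint name
)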
I'm getting the error below:

    .half() is not supported for quantized model. Please use the model as it is, since the model has already been casted to the correct dtype.

Any advice? Thanks.
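My guess is that deepspeed.init_inference is what triggers this: kernel injection tries to cast the already 4-bit-quantized (bitsandbytes) model with .half(). In case it helps the discussion, below is the untested alternative I'm considering, i.e. serving the quantized model without DeepSpeed kernel injection (I'm not sure it's the right fix, since it presumably gives up tensor parallelism):

# Untested sketch: serve the bitsandbytes 4-bit model without deepspeed.init_inference.
# device_map="auto" in init_model() below already places the quantized weights on the GPUs,
# so the pipeline gets the model as-is (no `device` argument, no extra casting).
def get_model_without_deepspeed(properties):
    model, tokenizer = init_model(model_name="cs_model")
    return pipeline(task="text-generation", model=model, tokenizer=tokenizer)

For reference, the full handler code: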
from djl_python import Input, Output
import os
import deepspeed
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

predictor = None


def init_model(model_name="cs_model"):
    # Load the merged model in 4-bit NF4 via bitsandbytes, with fp16 compute.
    compute_dtype = getattr(torch, "float16")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config,
        trust_remote_code=True,
    )
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
    )
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return model, tokenizer


def get_model(properties):
    tensor_parallel = properties["tensor_parallel_degree"]
    local_rank = int(os.getenv("LOCAL_RANK", "0"))
    model, tokenizer = init_model(model_name="cs_model")
    # I believe this is the call that raises the `.half()` error above:
    # DeepSpeed tries to cast the already-quantized model during kernel injection.
    model = deepspeed.init_inference(
        model,
        mp_size=tensor_parallel,
        dtype=model.dtype,
        replace_method="auto",
        replace_with_kernel_inject=True,
    )
    generator = pipeline(
        task="text-generation", model=model, tokenizer=tokenizer, device=local_rank
    )
    return generator


def handle(inputs: Input) -> None:
    global predictor
    if not predictor:
        predictor = get_model(inputs.get_properties())
    if inputs.is_empty():
        # Model server makes an empty call to warm up the model on startup
        return None
    data = inputs.get_as_string()
    result = predictor(data, do_sample=True, max_new_tokens=256)
    return Output().add(result)
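For completeness, this is how I intend to invoke the endpoint once it comes up (the endpoint name is a placeholder; right now it fails before reaching this point):

# Calling the endpoint with plain text, matching inputs.get_as_string() in the handler.
import boto3

runtime = boto3.client("sagemaker-runtime", region_name="us-east-1")
response = runtime.invoke_endpoint(
    EndpointName="llama3-cs-model",  # placeholder endpoint name
    ContentType="text/plain",
    Body="Write a short welcome message for a new customer.",
)
print(response["Body"].read().decode("utf-8"))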