I am trying to train a Linear Learner model in AWS SageMaker with predictor type "regressor".
I am stuck and have no clue what I need to do next.
I have uploaded my data to an S3 bucket.
Below is the code that uploads the data to the S3 bucket in the required format.
def export_data_lr(data, name, pre):
    # split data into X and y subsets
    X = data.drop(columns="price")
    y = data.price.copy()
    # apply the fitted preprocessing transformer
    X = pre.transform(X)
    file_name = get_file_name_lr(name)
    (
        y
        .to_frame()
        .join(X)
        .to_csv(file_name, index=False, header=False)
    )
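As a quick sanity check of the file layout (my understanding is that Linear Learner's CSV input wants the label in the first column and no header row, which is what y.to_frame().join(X) produces), I read a few rows back; this snippet is only an illustration and assumes pandas is available:

import pandas as pd

# read the exported file back: first column should be the price label, no header
sample = pd.read_csv(get_file_name_lr("train"), header=None, nrows=5)
print(sample.head())
print(sample.shape[1], "columns (1 label + features)")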
import os

import boto3

def upload_to_bucket_lr(name, BUCKET_NAME):
    file_name = get_file_name_lr(name)
    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX, f"{name}/{name}-lr.csv"))
        .upload_file(file_name)
    )
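For context, this is roughly how I call the two helpers for each split; train_df, val_df, test_df, and the fitted pre transformer are placeholders for my own objects:

# rough usage sketch -- the DataFrames and `pre` below are placeholders
for split, df in [("train", train_df), ("val", val_df), ("test", test_df)]:
    export_data_lr(df, split, pre)           # writes <split>-lr.csv locally
    upload_to_bucket_lr(split, BUCKET_NAME)  # uploads to s3://BUCKET_NAME/DATA_PREFIX/<split>/<split>-lr.csv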
Up to here everything works fine; the data is uploaded to the S3 bucket without any problem.
In my S3 bucket I can see my training, testing, and validation data.
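This is the quick listing I used to confirm the objects are there (a minimal sketch, assuming the same BUCKET_NAME and DATA_PREFIX as above):

import boto3

# list everything under the data prefix to confirm the train/val/test CSVs exist
s3 = boto3.Session().resource("s3")
for obj in s3.Bucket(BUCKET_NAME).objects.filter(Prefix=DATA_PREFIX):
    print(obj.key, obj.size)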
Below is the code for building the Linear Learner model with predictor_type "regressor".
I am not using many hyperparameters, for the sake of simplicity.
from sagemaker import image_uris
from sagemaker.estimator import Estimator

def return_lr_model(sagemaker, BUCKET_NAME, algorithm, version):
    OUTPUT_PATH = get_output_path(BUCKET_NAME)
    session = sagemaker.Session()
    region = session.boto_region_name
    role = sagemaker.get_execution_role()
    image_uri = image_uris.retrieve(region=region, framework="linear-learner")
    lr_model = Estimator(
        image_uri=image_uri,
        role=role,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        output_path=OUTPUT_PATH,
        sagemaker_session=session,
        base_job_name="LR-v1",
    )
    return lr_model
def perform_hyperparameter(lr_model):
    lr_model.set_hyperparameters(
        feature_dim="auto",
        predictor_type="regressor",
        normalize_data=False,
        epochs=15,
    )
    return lr_model
def create_lr_model(sagemaker, BUCKET_NAME, algorithm, version):
    lr_model = return_lr_model(sagemaker, BUCKET_NAME, algorithm, version)
    lr_model = perform_hyperparameter(lr_model)
    return lr_model

lr_model = create_lr_model(sagemaker, BUCKET_NAME, algorithm, version)
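Before calling fit I also print what the estimator will submit, to make sure the container image and the hyperparameters look right (as far as I know, hyperparameters() is the standard accessor on the SDK Estimator):

# inspect the resolved container image and the hyperparameters that will be sent
print(lr_model.image_uri)
print(lr_model.hyperparameters())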
Code to create the data channels:
from sagemaker.inputs import TrainingInput

def get_data_channels(train, val):
    train = get_file_name_lr(train)
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/train/{train}"
    train_data_channel = TrainingInput(bucket_path, content_type="csv")
    val = get_file_name_lr(val)
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/val/{val}"
    val_data_channel = TrainingInput(bucket_path, content_type="text/csv")
    data_channels = {
        "train": train_data_channel,
        "validation": val_data_channel,
    }
    return data_channels
data_channel = get_data_channels("train", "val")
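For debugging I also dumped the channel definitions; if I read the SDK correctly, TrainingInput exposes the request dictionary through its config attribute:

# print the S3 URI and ContentType each channel passes to the training job
for name, channel in data_channel.items():
    print(name, channel.config)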
Train the model:
lr_model.fit(data_channel)
Here is where my actual problem starts; below is the error I encounter while training the model.
Error:
INFO:sagemaker:Creating training-job with name: LR-v1-2024-06-16-05-01-56-352
2024-06-16 05:01:56 Starting - Starting the training job...
2024-06-16 05:02:17 Starting - Preparing the instances for training...
2024-06-16 05:02:48 Downloading - Downloading input data...
2024-06-16 05:03:23 Downloading - Downloading the training image.........
2024-06-16 05:04:29 Training - Training image download completed. Training in progress..Docker entrypoint called with argument(s): train
Running default environment configuration script
[06/16/2024 05:04:54 INFO 140601654445888] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.0', 'quantile': '0.5', 'loss_insensitivity': '0.01', 'huber_delta': '1.0', 'num_classes': '1', 'accuracy_top_k': '3', 'wd': 'auto', 'l1': 'auto', 'momentum': 'auto', 'learning_rate': 'auto', 'beta_1': 'auto', 'beta_2': 'auto', 'bias_lr_mult': 'auto', 'bias_wd_mult': 'auto', 'use_lr_scheduler': 'true', 'lr_scheduler_step': 'auto', 'lr_scheduler_factor': 'auto', 'lr_scheduler_minimum_lr': 'auto', 'positive_example_weight_mult': '1.0', 'balance_multiclass_weights': 'false', 'normalize_data': 'true', 'normalize_label': 'auto', 'unbias_data': 'auto', 'unbias_label': 'auto', 'num_point_for_scaler': '10000', '_kvstore': 'auto', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_log_level': 'info', '_tuning_objective_metric': '', 'early_stopping_patience': '3', 'early_stopping_tolerance': '0.001', '_enable_profiler': 'false'}
[06/16/2024 05:04:54 INFO 140601654445888] Merging with provided configuration from /opt/ml/input/config/hyperparameters.json: {'epochs': '15', 'feature_dim': 'auto', 'normalize_data': 'False', 'predictor_type': 'regressor'}
[06/16/2024 05:04:54 INFO 140601654445888] Final configuration: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.0', 'quantile': '0.5', 'loss_insensitivity': '0.01', 'huber_delta': '1.0', 'num_classes': '1', 'accuracy_top_k': '3', 'wd': 'auto', 'l1': 'auto', 'momentum': 'auto', 'learning_rate': 'auto', 'beta_1': 'auto', 'beta_2': 'auto', 'bias_lr_mult': 'auto', 'bias_wd_mult': 'auto', 'use_lr_scheduler': 'true', 'lr_scheduler_step': 'auto', 'lr_scheduler_factor': 'auto', 'lr_scheduler_minimum_lr': 'auto', 'positive_example_weight_mult': '1.0', 'balance_multiclass_weights': 'false', 'normalize_data': 'False', 'normalize_label': 'auto', 'unbias_data': 'auto', 'unbias_label': 'auto', 'num_point_for_scaler': '10000', '_kvstore': 'auto', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_log_level': 'info', '_tuning_objective_metric': '', 'early_stopping_patience': '3', 'early_stopping_tolerance': '0.001', '_enable_profiler': 'false', 'predictor_type': 'regressor'}
/opt/amazon/lib/python3.8/site-packages/mxnet/model.py:97: SyntaxWarning: "is" with a literal. Did you mean "=="?
if num_device is 1 and 'dist' not in kvstore:
/opt/amazon/lib/python3.8/site-packages/scipy/optimize/_shgo.py:495: SyntaxWarning: "is" with a literal. Did you mean "=="?
if cons['type'] is 'ineq':
/opt/amazon/lib/python3.8/site-packages/scipy/optimize/_shgo.py:743: SyntaxWarning: "is not" with a literal. Did you mean "!="?
if len(self.X_min) is not 0:
[06/16/2024 05:04:57 WARNING 140601654445888] Loggers have already been setup.
[06/16/2024 05:04:57 INFO 140601654445888] Final configuration: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.0', 'quantile': '0.5', 'loss_insensitivity': '0.01', 'huber_delta': '1.0', 'num_classes': '1', 'accuracy_top_k': '3', 'wd': 'auto', 'l1': 'auto', 'momentum': 'auto', 'learning_rate': 'auto', 'beta_1': 'auto', 'beta_2': 'auto', 'bias_lr_mult': 'auto', 'bias_wd_mult': 'auto', 'use_lr_scheduler': 'true', 'lr_scheduler_step': 'auto', 'lr_scheduler_factor': 'auto', 'lr_scheduler_minimum_lr': 'auto', 'positive_example_weight_mult': '1.0', 'balance_multiclass_weights': 'false', 'normalize_data': 'False', 'normalize_label': 'auto', 'unbias_data': 'auto', 'unbias_label': 'auto', 'num_point_for_scaler': '10000', '_kvstore': 'auto', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_log_level': 'info', '_tuning_objective_metric': '', 'early_stopping_patience': '3', 'early_stopping_tolerance': '0.001', '_enable_profiler': 'false', 'predictor_type': 'regressor'}
[06/16/2024 05:04:57 WARNING 140601654445888] Loggers have already been setup.
Process 7 is a worker.
[06/16/2024 05:04:57 INFO 140601654445888] Using default worker.
[06/16/2024 05:04:57 INFO 140601654445888] Checkpoint loading and saving are disabled.
[06/16/2024 05:04:57 ERROR 140601654445888] Customer Error: No iterator has been registered for ContentType ('csv', '1.0')
2024-06-16 05:05:13 Uploading - Uploading generated training model
2024-06-16 05:05:13 Failed - Training job failed
---------------------------------------------------------------------------
UnexpectedStatusException Traceback (most recent call last)
Cell In[278], line 1
----> 1 lr_model.fit(data_channel)
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/workflow/pipeline_context.py:346, in runnable_by_pipeline.<locals>.wrapper(*args, **kwargs)
342 return context
344 return _StepArguments(retrieve_caller_name(self_instance), run_func, *args, **kwargs)
--> 346 return run_func(*args, **kwargs)
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/estimator.py:1346, in EstimatorBase.fit(self, inputs, wait, logs, job_name, experiment_config)
1344 self.jobs.append(self.latest_training_job)
1345 if wait:
-> 1346 self.latest_training_job.wait(logs=logs)
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/estimator.py:2703, in _TrainingJob.wait(self, logs)
2701 # If logs are requested, call logs_for_jobs.
2702 if logs != "None":
-> 2703 self.sagemaker_session.logs_for_job(self.job_name, wait=True, log_type=logs)
2704 else:
2705 self.sagemaker_session.wait_for_job(self.job_name)
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/session.py:5797, in Session.logs_for_job(self, job_name, wait, poll, log_type, timeout)
5776 def logs_for_job(self, job_name, wait=False, poll=10, log_type="All", timeout=None):
5777 """Display logs for a given training job, optionally tailing them until job is complete.
5778
5779 If the output is a tty or a Jupyter cell, it will be color-coded
(...)
5795 exceptions.UnexpectedStatusException: If waiting and the training job fails.
5796 """
-> 5797 _logs_for_job(self, job_name, wait, poll, log_type, timeout)
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/session.py:8026, in _logs_for_job(sagemaker_session, job_name, wait, poll, log_type, timeout)
8023 last_profiler_rule_statuses = profiler_rule_statuses
8025 if wait:
-> 8026 _check_job_status(job_name, description, "TrainingJobStatus")
8027 if dot:
8028 print()
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/session.py:8079, in _check_job_status(job, desc, status_key_name)
8073 if "CapacityError" in str(reason):
8074 raise exceptions.CapacityError(
8075 message=message,
8076 allowed_statuses=["Completed", "Stopped"],
8077 actual_status=status,
8078 )
-> 8079 raise exceptions.UnexpectedStatusException(
8080 message=message,
8081 allowed_statuses=["Completed", "Stopped"],
8082 actual_status=status,
8083 )
UnexpectedStatusException: Error for Training job LR-v1-2024-06-16-05-01-56-352: Failed. Reason: ClientError: No iterator has been registered for ContentType ('csv', '1.0'), exit code: 2
I also tried creating a new bucket, but that did not help.
Thanks in advance; I hope to hear from you soon.