I am trying to train a model using SageMaker's `Estimator` class. My directory structure is as follows:
- temp
  - train_step
    - train.py
    - requirements.txt
  - temp.py
train.py:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
######################################## VERSION TESTING ##############################################################
import logging
import sys
import pkg_resources
import json
def get_python_and_package_versions():
    """Log and return the Python version and all installed package versions.

    The data is written to the module logger at INFO level as indented JSON,
    and is also returned so callers can inspect it programmatically.

    Returns:
        dict: ``{"python_version": <sys.version>,
        "installed_packages": {<normalised name>: <version>, ...}}``.
    """
    # Imported locally so the block is self-contained. importlib.metadata is
    # the standard-library (3.8+) replacement for the deprecated
    # pkg_resources API, so this function no longer needs setuptools.
    import re
    from importlib import metadata

    # Get Python version
    python_version = sys.version

    # Get installed package versions
    installed_packages = {}
    for dist in metadata.distributions():
        meta = dist.metadata
        name = meta.get("Name") if meta is not None else None
        if not name:
            # Broken or partial installs may carry no usable metadata.
            continue
        # Same normalisation pkg_resources applied to `pkg.key`: runs of
        # characters other than letters, digits and "." become "-", lowercased.
        key = re.sub(r"[^A-Za-z0-9.]+", "-", name).lower()
        # Distributions are yielded in sys.path order; keep the first hit,
        # which is the one an import would actually resolve to.
        installed_packages.setdefault(key, dist.version)

    # Combine Python version and installed package versions into a dictionary
    version_data = {
        "python_version": python_version,
        "installed_packages": installed_packages,
    }

    # Initialize logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    # Only attach a stdout handler when neither this logger nor any ancestor
    # already has one; otherwise every record would be emitted twice.
    if not logger.hasHandlers():
        handler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    # Log the version data
    logger.info("Python version and installed package versions:")
    logger.info(json.dumps(version_data, indent=4))

    return version_data
# Emit the environment report once, before any training work starts.
get_python_and_package_versions()
######################################## VERSION TESTING ##############################################################

# Fetch the Iris dataset and separate features from labels.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree Random Forest and fit it on the training portion
# (`fit` returns the estimator itself, so construction and fitting chain).
classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out samples and report the fraction classified correctly.
predictions = classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
temp.py:
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.image_uris import get_base_python_image_uri
from sagemaker.local import LocalSession
import boto3
# Sessions: a plain boto3 session wrapped in a SageMaker LocalSession, which is
# handed to the estimator together with instance_type="local".
boto_session = boto3.Session()
local_session = LocalSession(
    boto_session=boto_session,
    default_bucket=pipeline_bucket_name,  # bucket name
)

# Single "train" channel fed from S3.
inputs = dict(
    train=sagemaker.inputs.TrainingInput(s3_data=processor_output),  # S3 URI where data exists
)

estimator = Estimator(
    # Base Python 3.8 image for us-east-1.
    image_uri=get_base_python_image_uri("us-east-1", py_version="38"),
    role=role,  # Add execution role
    instance_count=1,
    instance_type="local",
    sagemaker_session=local_session,
    base_job_name=training_base_job_name,  # name of the training job
    output_path=artifact_location,  # s3-bucket-location
    # Training code: everything under ./train_step, started from train.py and
    # uploaded to code_location.
    source_dir="./train_step",
    entry_point="train.py",
    code_location=training_code_location,  # s3-bucket-location
    # Diagnostic command: list any train.py / train.csv visible in the container.
    # NOTE(review): container_entry_point takes the place of the image's default
    # entrypoint, so this job runs only `find`. source_dir/entry_point are
    # uploaded to code_location but are presumably only unpacked inside the
    # container by an image that ships the SageMaker training toolkit -
    # confirm this for the base Python image used here.
    container_entry_point=[
        "find", ".", "-type", "f",
        "(", "-name", "train.py", "-o", "-name", "train.csv", ")",
    ],
)

# Launch the job against the "train" channel.
train_args = estimator.fit(inputs=inputs)
It seems that the directory specified in `source_dir` is not being added to the container. When I searched the container for `train.py` and `train.csv`, I could find the path to `train.csv`, but none of the files from the `train_step` directory:
train.csv - ./opt/ml/input/data/train/train.csv
Note: `train.py` is dummy code, included only to make the problem reproducible without a dataset.
On checking the S3 location specified in `code_location`, I can see that the source code was uploaded there.