class RecommendationSystemModel(nn.Module):
def __init__(
self,
num_users,
num_movies,
embedding_size=128,
hidden_dim=128,
dropout_rate=0.2,
):
super(RecommendationSystemModel, self).__init__()
self.num_users = num_users
self.num_movies = num_movies
self.embedding_size = embedding_size
self.hidden_dim = hidden_dim
# Embedding layers
self.user_embedding = nn.Embedding(
num_embeddings=self.num_users, embedding_dim=self.embedding_size
)
self.movie_embedding = nn.Embedding(
num_embeddings=self.num_movies, embedding_dim=self.embedding_size
)
print(self.num_users, self.num_movies)
# Hidden layers
self.fc1 = nn.Linear(2 * self.embedding_size, self.hidden_dim)
self.fc2 = nn.Linear(self.hidden_dim, 1)
# Dropout layer
self.dropout = nn.Dropout(p=dropout_rate)
# Activation function
self.relu = nn.ReLU()
def forward(self, users, movies):
# Embeddings
user_embedded = self.user_embedding(users)
movie_embedded = self.movie_embedding(movies)
#print("User embedded shape:", user_embedded.shape)
#print("Movie embedded shape:", movie_embedded.shape)
# Concatenate user and movie embeddings
combined = torch.cat([user_embedded, movie_embedded], dim=1)
#print("Combined shape:", combined.shape)
# Pass through hidden layers with ReLU activation and dropout
x = self.relu(self.fc1(combined))
x = self.dropout(x)
output = self.fc2(x)
return output
return output
This is my RecSys model code. I used MovieLens 25M dataset with binary labels (ratings lower than 3 are considered as 0 where the ratings higher than 3 are considered as 1). I sampled 1M lines of the dataset. In my subsample number of user ids and movie ids are 12674, 20784 respectively. I am running my training on Kaggle P100 GPU. When I run this cell on Kaggle
recommendation_model = RecommendationSystemModel(
num_users=12674,
num_movies=20784,
embedding_size=128,
hidden_dim=64,
dropout_rate=0.1,
).to(device)
or this cell
optimizer = torch.optim.Adam(recommendation_model.parameters(), lr=1e-3)
loss_func = nn.CrossEntropyLoss()
EPOCHS = 2
# Function to log progress
def log_progress(epoch, step, total_loss, log_progress_step, data_size, losses):
avg_loss = total_loss / log_progress_step
sys.stderr.write(
f"r{epoch+1:02d}/{EPOCHS:02d} | Step: {step}/{data_size} | Avg Loss: {avg_loss:<6.9f}"
)
sys.stderr.flush()
losses.append(avg_loss)
total_loss = 0
log_progress_step = 100
losses = []
train_dataset_size = len(df_train)
print(f"Training on {train_dataset_size} samples...")
recommendation_model.train()
for e in range(EPOCHS):
step_count = 0 # Reset step count at the beginning of each epoch
for i, train_data in enumerate(train_loader):
output = recommendation_model(
train_data["users"].to(device), train_data["movies"].to(device)
)
# Reshape the model output to match the target's shape
output = output.squeeze() # Removes the singleton dimension
#print('train data clicks shape: ', train_data["clicks"].shape)
ratings = (
train_data["clicks"].to(torch.float32).to(device)
) # Assuming ratings is already 1D
loss = loss_func(output, ratings)
total_loss += loss.sum().item()
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Increment step count by the actual size of the batch
step_count += len(train_data["users"])
# Check if it's time to log progress
if (
step_count % log_progress_step == 0 or i == len(train_loader) - 1
): # Log at the end of each epoch
log_progress(
e, step_count, total_loss, log_progress_step, train_dataset_size, losses
)
total_loss = 0
I get the exception:
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA
to enable device-side assertions.
When I set
recommendation_model = RecommendationSystemModel(
num_users=1_000_000,
num_movies=1_000_000,
embedding_size=128,
hidden_dim=64,
dropout_rate=0.1,
).to(device)
where num_users and num_movies are equal, no exception occurs!
But once I get an exception, even if I changed the num_users and num_movies set to 1000000, I still get the exception! Never run into such problem before.
Any ideas?
I tried to create my model with different number of entries in terms of user ids and movie ids. I ended up with an exception. I want to solve it.
bilgi teorisi is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.