I followed the tutorial on the TensorFlow Recommenders website, and when I adapt it to my own dataset the accuracy suddenly becomes extremely low: roughly 0.27 on their dataset versus 0.0027 on mine.
At some point I had managed to adapt it correctly and was getting similar accuracy on my dataset, but somehow I broke the code and now I can't get it back.
This is the code that adapts the dataset:
<code>import numpy as np
import pandas as pd
import tensorflow as tf

movies_metadata = pd.read_csv('/content/drive/My Drive/movies_metadata.csv')
ratings = pd.read_csv('/content/drive/My Drive/ratings.csv')

# Keep only the needed columns and align the column names between the two files.
ratings = ratings[['userId', 'movieId', 'timestamp']].rename(
    columns={'movieId': 'movie_id', 'userId': 'user_id'})
movies_metadata = movies_metadata[['id', 'title']].rename(
    columns={'id': 'movie_id', 'title': 'movie_title'})

# Cast both id columns to str so the merge key has a consistent dtype.
ratings['movie_id'] = ratings['movie_id'].astype(str)
movies_metadata['movie_id'] = movies_metadata['movie_id'].astype(str)

combined_dataset = pd.merge(ratings, movies_metadata, on='movie_id', how='inner')
combined_dataset = combined_dataset.sample(100_000, random_state=1)

combined_dataset_tf = tf.data.Dataset.from_tensor_slices({
    'user_id': combined_dataset['user_id'].astype(str).values,
    'movie_title': combined_dataset['movie_title'].astype(str).values,
    'timestamp': combined_dataset['timestamp'].values,
})

movies = combined_dataset_tf.map(lambda x: x["movie_title"])

unique_user_ids = np.unique(np.concatenate(list(
    combined_dataset_tf.batch(1_000).map(lambda x: x["user_id"]))))
unique_movie_titles = np.unique(np.concatenate(list(
    movies.batch(1_000).map(lambda x: x))))
</code>
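In case it helps, this is roughly how I sanity-check the adapted data (just an inspection sketch using the variables above, not part of the tutorial; note that <code>movies</code> is built from the rating rows, so it contains one title per rating rather than one per unique movie):
<code># Rough sanity check of the adapted data (assumes the variables defined above).
print("sampled ratings:", len(combined_dataset))
print("unique users:", len(unique_user_ids))
print("unique movie titles:", len(unique_movie_titles))

# `movies` is derived from the rating rows, so titles repeat once per rating.
for example in combined_dataset_tf.take(1):
    print({k: v.numpy() for k, v in example.items()})
</code>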
Next, this is the adapted movie model:
<code>from typing import Dict, Text

class MovieModel(tf.keras.Model):

  def __init__(self):
    super().__init__()

    max_tokens = 10_000

    # Embedding of the whole title as a single categorical feature.
    self.title_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None),
        tf.keras.layers.Embedding(len(unique_movie_titles) + 1, 32)
    ])

    # Embedding of the title text, averaged over its tokens.
    self.title_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens)

    self.title_text_embedding = tf.keras.Sequential([
        self.title_vectorizer,
        tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
        tf.keras.layers.GlobalAveragePooling1D(),
    ])

    self.title_vectorizer.adapt(movies)

  def call(self, inputs):
    print(inputs.dtype)
    return tf.concat([
        self.title_embedding(inputs),
        self.title_text_embedding(inputs),
    ], axis=1)
</code>
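For context, the movie model is called directly on raw title strings; a minimal usage sketch looks like this (the titles below are just placeholders, and the output should be 64-dimensional since the two 32-dim branches are concatenated):
<code># Minimal usage sketch (assumes unique_movie_titles and movies from above exist).
movie_model = MovieModel()
sample_titles = tf.constant(["Toy Story", "Jumanji"])  # placeholder titles
embeddings = movie_model(sample_titles)
print(embeddings.shape)  # expected: (2, 64)
</code>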
And this is the full retrieval model:
<code>class MovielensModel(tfrs.models.Model):

  def __init__(self, layer_sizes):
    super().__init__()
    self.query_model = QueryModel(layer_sizes)
    self.candidate_model = CandidateModel(layer_sizes)
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.batch(128).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    # We only pass the user id and timestamp features into the query model. This
    # is to ensure that the training inputs would have the same keys as the
    # query inputs. Otherwise the discrepancy in input structure would cause an
    # error when loading the query model after saving it.
    query_embeddings = self.query_model({
        "user_id": features["user_id"],
        "timestamp": features["timestamp"],
    })
    movie_embeddings = self.candidate_model(features["movie_title"])

    return self.task(
        query_embeddings, movie_embeddings)
</code>
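The training step follows the tutorial; it is roughly like this (a sketch only: QueryModel and CandidateModel are defined as in the tutorial, and the split sizes and hyperparameters here are placeholders, not necessarily the exact values I used):
<code># Training sketch (assumes QueryModel/CandidateModel as in the TFRS tutorial;
# split sizes and hyperparameters below are placeholders).
tf.random.set_seed(42)
shuffled = combined_dataset_tf.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

model = MovielensModel(layer_sizes=[32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

model.fit(
    train.batch(2_048).cache(),
    validation_data=test.batch(4_096).cache(),
    epochs=3,
)
</code>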
The attached picture shows the validation results with the low accuracy. Thanks in advance for any responses!