I’m struggling with setting up PandasDataset.from_long_dataframe. The error code is not specified clearly, and I can’t fix it as I’m not sure where the problem is. I suspect there’s an issue with static_feature_columns=static_cat_columns and use_feat_static_cat=True, but I can’t recognize the relation between them.
Here is my code:
<code>import pandas as pd
import numpy as np
import mxnet as mx
from gluonts.dataset.pandas import PandasDataset
from gluonts.mx import DeepAREstimator, Trainer
from gluonts.evaluation import make_evaluation_predictions
from sklearn.metrics import mean_squared_log_error
# Set date range
final_custom_dataset['date'] = pd.to_datetime(final_custom_dataset['date'])
start_date = '2016-01-01'
end_date = '2017-08-15'
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
def generate_single_ts_with_features(final_custom_dataset, item_id, date_range) -> pd.DataFrame:
ts = final_custom_dataset[final_custom_dataset['new_id'] == item_id].copy()
ts.set_index('date', inplace=True)
ts = ts.reindex(date_range, fill_value=0)
ts['new_id'] = item_id
ts['date'] = ts.index
return ts
# Generate multiple time series data
multiple_ts = {
item_id: generate_single_ts_with_features(final_custom_dataset, item_id, date_range)
for item_id in ['25_AUTOMOTIVE', '26_AUTOMOTIVE'] # final_custom_dataset['new_id'].unique()
}
# Combine multiple time series data into a single DataFrame
multiple_ts_long = pd.concat(multiple_ts.values(), axis=0)
# Ensure all required columns are present
required_columns = [
"onpromotion", "onpromotion_lag1", "day_of_week_Friday", "day_of_week_Monday",
"day_of_week_Saturday", "day_of_week_Sunday", "day_of_week_Thursday",
"day_of_week_Tuesday", "day_of_week_Wednesday", "sales", "new_id", "date"
]
for col in required_columns:
if col not in multiple_ts_long.columns:
multiple_ts_long[col] = 0
# Define dynamic_real_columns and static_cat_columns
dynamic_real_columns = [
"onpromotion", "onpromotion_lag1"
]
static_cat_columns = [
"day_of_week_Friday", "day_of_week_Monday",
"day_of_week_Saturday", "day_of_week_Sunday",
"day_of_week_Thursday", "day_of_week_Tuesday",
"day_of_week_Wednesday"
]
# Create PandasDataset for long dataset
multiple_ts_long_dataset = PandasDataset.from_long_dataframe(
multiple_ts_long,
item_id="new_id",
timestamp="date",
feat_dynamic_real=dynamic_real_columns,
static_feature_columns=static_cat_columns,
target="sales",
freq='D' # freq setting added
)
# Set GPU context
ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()
# Compute cardinality and embedding_dimension
cardinality = [2 for _ in static_cat_columns]
embedding_dimension = [min(50, (cat + 1) // 2) for cat in cardinality]
# Train the model
estimator = DeepAREstimator(
freq="D",
prediction_length=16,
use_feat_dynamic_real=True,
use_feat_static_cat=True,
cardinality=cardinality,
embedding_dimension=embedding_dimension,
trainer=Trainer(epochs=1, ctx=ctx)
)
predictor = estimator.train(multiple_ts_long_dataset)
# Make predictions
forecast_it, ts_it = make_evaluation_predictions(
dataset=multiple_ts_long_dataset,
predictor=predictor,
num_samples=100
)
# Retrieve forecasts and actual values
forecasts = list(forecast_it)
tss = list(ts_it)
# Calculate MSLE
msle_values = []
for ts, forecast in zip(tss, forecasts):
actual = ts[-forecast.prediction_length:].values
predicted = forecast.mean
msle = mean_squared_log_error(actual, predicted)
msle_values.append(msle)
print(f"MSLE: {msle}")
# Print average MSLE
average_msle = np.mean(msle_values)
print(f"Average MSLE: {average_msle}")
</code>
<code>import pandas as pd
import numpy as np
import mxnet as mx
from gluonts.dataset.pandas import PandasDataset
from gluonts.mx import DeepAREstimator, Trainer
from gluonts.evaluation import make_evaluation_predictions
from sklearn.metrics import mean_squared_log_error
# Set date range
final_custom_dataset['date'] = pd.to_datetime(final_custom_dataset['date'])
start_date = '2016-01-01'
end_date = '2017-08-15'
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
def generate_single_ts_with_features(final_custom_dataset, item_id, date_range) -> pd.DataFrame:
ts = final_custom_dataset[final_custom_dataset['new_id'] == item_id].copy()
ts.set_index('date', inplace=True)
ts = ts.reindex(date_range, fill_value=0)
ts['new_id'] = item_id
ts['date'] = ts.index
return ts
# Generate multiple time series data
multiple_ts = {
item_id: generate_single_ts_with_features(final_custom_dataset, item_id, date_range)
for item_id in ['25_AUTOMOTIVE', '26_AUTOMOTIVE'] # final_custom_dataset['new_id'].unique()
}
# Combine multiple time series data into a single DataFrame
multiple_ts_long = pd.concat(multiple_ts.values(), axis=0)
# Ensure all required columns are present
required_columns = [
"onpromotion", "onpromotion_lag1", "day_of_week_Friday", "day_of_week_Monday",
"day_of_week_Saturday", "day_of_week_Sunday", "day_of_week_Thursday",
"day_of_week_Tuesday", "day_of_week_Wednesday", "sales", "new_id", "date"
]
for col in required_columns:
if col not in multiple_ts_long.columns:
multiple_ts_long[col] = 0
# Define dynamic_real_columns and static_cat_columns
dynamic_real_columns = [
"onpromotion", "onpromotion_lag1"
]
static_cat_columns = [
"day_of_week_Friday", "day_of_week_Monday",
"day_of_week_Saturday", "day_of_week_Sunday",
"day_of_week_Thursday", "day_of_week_Tuesday",
"day_of_week_Wednesday"
]
# Create PandasDataset for long dataset
multiple_ts_long_dataset = PandasDataset.from_long_dataframe(
multiple_ts_long,
item_id="new_id",
timestamp="date",
feat_dynamic_real=dynamic_real_columns,
static_feature_columns=static_cat_columns,
target="sales",
freq='D' # freq setting added
)
# Set GPU context
ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()
# Compute cardinality and embedding_dimension
cardinality = [2 for _ in static_cat_columns]
embedding_dimension = [min(50, (cat + 1) // 2) for cat in cardinality]
# Train the model
estimator = DeepAREstimator(
freq="D",
prediction_length=16,
use_feat_dynamic_real=True,
use_feat_static_cat=True,
cardinality=cardinality,
embedding_dimension=embedding_dimension,
trainer=Trainer(epochs=1, ctx=ctx)
)
predictor = estimator.train(multiple_ts_long_dataset)
# Make predictions
forecast_it, ts_it = make_evaluation_predictions(
dataset=multiple_ts_long_dataset,
predictor=predictor,
num_samples=100
)
# Retrieve forecasts and actual values
forecasts = list(forecast_it)
tss = list(ts_it)
# Calculate MSLE
msle_values = []
for ts, forecast in zip(tss, forecasts):
actual = ts[-forecast.prediction_length:].values
predicted = forecast.mean
msle = mean_squared_log_error(actual, predicted)
msle_values.append(msle)
print(f"MSLE: {msle}")
# Print average MSLE
average_msle = np.mean(msle_values)
print(f"Average MSLE: {average_msle}")
</code>
import pandas as pd
import numpy as np
import mxnet as mx
from gluonts.dataset.pandas import PandasDataset
from gluonts.mx import DeepAREstimator, Trainer
from gluonts.evaluation import make_evaluation_predictions
from sklearn.metrics import mean_squared_log_error
# Set date range
final_custom_dataset['date'] = pd.to_datetime(final_custom_dataset['date'])
start_date = '2016-01-01'
end_date = '2017-08-15'
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
def generate_single_ts_with_features(final_custom_dataset, item_id, date_range) -> pd.DataFrame:
ts = final_custom_dataset[final_custom_dataset['new_id'] == item_id].copy()
ts.set_index('date', inplace=True)
ts = ts.reindex(date_range, fill_value=0)
ts['new_id'] = item_id
ts['date'] = ts.index
return ts
# Generate multiple time series data
multiple_ts = {
item_id: generate_single_ts_with_features(final_custom_dataset, item_id, date_range)
for item_id in ['25_AUTOMOTIVE', '26_AUTOMOTIVE'] # final_custom_dataset['new_id'].unique()
}
# Combine multiple time series data into a single DataFrame
multiple_ts_long = pd.concat(multiple_ts.values(), axis=0)
# Ensure all required columns are present
required_columns = [
"onpromotion", "onpromotion_lag1", "day_of_week_Friday", "day_of_week_Monday",
"day_of_week_Saturday", "day_of_week_Sunday", "day_of_week_Thursday",
"day_of_week_Tuesday", "day_of_week_Wednesday", "sales", "new_id", "date"
]
for col in required_columns:
if col not in multiple_ts_long.columns:
multiple_ts_long[col] = 0
# Define dynamic_real_columns and static_cat_columns
dynamic_real_columns = [
"onpromotion", "onpromotion_lag1"
]
static_cat_columns = [
"day_of_week_Friday", "day_of_week_Monday",
"day_of_week_Saturday", "day_of_week_Sunday",
"day_of_week_Thursday", "day_of_week_Tuesday",
"day_of_week_Wednesday"
]
# Create PandasDataset for long dataset
multiple_ts_long_dataset = PandasDataset.from_long_dataframe(
multiple_ts_long,
item_id="new_id",
timestamp="date",
feat_dynamic_real=dynamic_real_columns,
static_feature_columns=static_cat_columns,
target="sales",
freq='D' # freq setting added
)
# Set GPU context
ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()
# Compute cardinality and embedding_dimension
cardinality = [2 for _ in static_cat_columns]
embedding_dimension = [min(50, (cat + 1) // 2) for cat in cardinality]
# Train the model
estimator = DeepAREstimator(
freq="D",
prediction_length=16,
use_feat_dynamic_real=True,
use_feat_static_cat=True,
cardinality=cardinality,
embedding_dimension=embedding_dimension,
trainer=Trainer(epochs=1, ctx=ctx)
)
predictor = estimator.train(multiple_ts_long_dataset)
# Make predictions
forecast_it, ts_it = make_evaluation_predictions(
dataset=multiple_ts_long_dataset,
predictor=predictor,
num_samples=100
)
# Retrieve forecasts and actual values
forecasts = list(forecast_it)
tss = list(ts_it)
# Calculate MSLE
msle_values = []
for ts, forecast in zip(tss, forecasts):
actual = ts[-forecast.prediction_length:].values
predicted = forecast.mean
msle = mean_squared_log_error(actual, predicted)
msle_values.append(msle)
print(f"MSLE: {msle}")
# Print average MSLE
average_msle = np.mean(msle_values)
print(f"Average MSLE: {average_msle}")
<code>---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-59-18bccb777b58> in <cell line: 56>()
54
55 # Create PandasDataset for long dataset
---> 56 multiple_ts_long_dataset = PandasDataset.from_long_dataframe(
57 multiple_ts_long,
58 item_id="new_id",
/usr/local/lib/python3.10/dist-packages/gluonts/dataset/pandas.py in from_long_dataframe(cls, dataframe, item_id, timestamp, static_feature_columns, static_features, **kwargs)
299 .set_index(item_id)
300 )
--> 301 assert len(other_static_features) == len(
302 dataframe[item_id].unique()
303 )
AssertionError:
</code>
<code>---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-59-18bccb777b58> in <cell line: 56>()
54
55 # Create PandasDataset for long dataset
---> 56 multiple_ts_long_dataset = PandasDataset.from_long_dataframe(
57 multiple_ts_long,
58 item_id="new_id",
/usr/local/lib/python3.10/dist-packages/gluonts/dataset/pandas.py in from_long_dataframe(cls, dataframe, item_id, timestamp, static_feature_columns, static_features, **kwargs)
299 .set_index(item_id)
300 )
--> 301 assert len(other_static_features) == len(
302 dataframe[item_id].unique()
303 )
AssertionError:
</code>
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-59-18bccb777b58> in <cell line: 56>()
54
55 # Create PandasDataset for long dataset
---> 56 multiple_ts_long_dataset = PandasDataset.from_long_dataframe(
57 multiple_ts_long,
58 item_id="new_id",
/usr/local/lib/python3.10/dist-packages/gluonts/dataset/pandas.py in from_long_dataframe(cls, dataframe, item_id, timestamp, static_feature_columns, static_features, **kwargs)
299 .set_index(item_id)
300 )
--> 301 assert len(other_static_features) == len(
302 dataframe[item_id].unique()
303 )
AssertionError:
I suspect there’s an issue with how I’m using static_feature_columns=static_cat_columns and use_feat_static_cat=True. Could anyone help me understand the relationship between these parameters and how to resolve this error?