I have the following preprocessing stage before applying linear regression. Is there a way to add the interaction terms 'Numeric_Var1 * Categorical_Var1' and 'Numeric_Var2 * Categorical_Var1' after splitting the dataset into train and test sets?
Sample Data and Example Code (before solution)
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Toy dataset: two numeric predictors, two categorical predictors and one
# numeric target, five observations each.
data = dict(
    Numeric_Var1=[10, 20, 30, 40, 50],
    Numeric_Var2=[2.5, 3.0, 4.5, 5.5, 6.0],
    Categorical_Var1=['A', 'B', 'A', 'B', 'B'],
    Categorical_Var2=['X', 'Y', 'X', 'Y', 'X'],
    Dependent_Var=[100, 200, 250, 300, 400],
)
df = pd.DataFrame(data)

# The target is a single column; every other column is a predictor, kept in
# the frame's original column order.
dep_var = 'Dependent_Var'
indep_vars = [column for column in df.columns if column != dep_var]

# Feature matrix (DataFrame) and target vector (Series).
X = df[indep_vars]
y = df[dep_var]
# Standardizing non-binary numeric variables
# NOTE(review): both scalers are fitted on the full dataset, i.e. before the
# train/test split further down, so test rows influence the scaling
# statistics. Fitting them on the training rows only would avoid that leakage.
numeric = ['Numeric_Var1', 'Numeric_Var2']
X_numerical = X[numeric]
sc_X = StandardScaler()
X_numerical_scaled = sc_X.fit_transform(X_numerical)
# Work on a copy so the unscaled feature frame X stays untouched.
X_scaled = X.copy()
X_scaled[numeric] = X_numerical_scaled
# StandardScaler expects 2-D input, hence the reshape of y to one column.
sc_y = StandardScaler()
y_scaled = sc_y.fit_transform(y.values.reshape(-1, 1))
# Encoding categorical variables into dummies
categorical_indices = [2, 3]  # positional indices of the categorical columns in X_scaled
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_indices)],
    remainder='passthrough',
)
X_scaled_dummified = ct.fit_transform(X_scaled)
# ColumnTransformer returns a sparse matrix only when the stacked output is
# sparse enough (see its sparse_threshold parameter); otherwise it already
# returns a dense ndarray, which has no .toarray(). Densify only when needed
# so the script works in both cases.
if hasattr(X_scaled_dummified, 'toarray'):
    X_scaled_array = X_scaled_dummified.toarray()
else:
    X_scaled_array = np.asarray(X_scaled_dummified)
# Splitting the dataset into the Training set and Test set
# The call is wrapped in parentheses so it can span several lines; the
# original broke the statement after "=", which is a SyntaxError.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_array,
    y_scaled,
    test_size=1/3,
    random_state=0,  # fixed seed so the split is reproducible
)
# Fit ordinary least squares on the training rows and report R^2 on the
# held-out rows.
m_lr = LinearRegression()
m_lr.fit(X_train, y_train)
p_lr = m_lr.predict(X_test)
r_lr = r2_score(y_test, p_lr)
print(r_lr)