from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np
# Selecting numerical columns
numerical_cols = data[['MonthlyCharges', 'TotalCharges', 'tenure']].to_numpy()
# Selecting an ordinal column
ordinal_cols = data[['Contract']].to_numpy()
# Selecting categorical columns
categorical_cols = data[['PaperlessBilling', 'PaymentMethod']].to_numpy()
# Selecting the target column
target_col = data['Churn'].to_numpy()
# Define the order for the 'Contract' column
contract_order = ['Month-to-month', 'One year', 'Two year']
# Create transformers for the numeric and categorical data
numeric_transformer = Pipeline(steps=[
('scaler', StandardScaler()),
('scaler', StandardScaler()),
('scaler', StandardScaler())])
# Assuming 'contract_column' is the name of the contract column in your DataFrame
ordinal_transformer = Pipeline(steps=[
('ordinal', OrdinalEncoder(categories=[contract_order]))])
# Assuming 'categorical_columns' is the list of other categorical columns in your DataFrame
categorical_transformer = Pipeline(steps=[
('onehot', OneHotEncoder(handle_unknown='ignore')),
('onehot', OneHotEncoder(handle_unknown='ignore')),
('onehot', OneHotEncoder(handle_unknown='ignore'))])
# Create a ColumnTransformer to apply the transformations to the correct columns
preprocessor = ColumnTransformer([
('num', numeric_transformer, numerical_cols),
('ord', ordinal_transformer, ordinal_cols),
('cat', categorical_transformer, categorical_cols)
])
# Create a pipeline that combines the preprocessor with the model
model = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LogisticRegression())])
print(numerical_cols)
print(ordinal_cols)
print(categorical_cols)
# Separate the features and the target
X = data[numerical_cols + ordinal_cols + categorical_cols]
y = data[target_col].apply(lambda x: 1 if x == 'Yes' else 0) # Encoding
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the model
model.fit(X_train, y_train)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[8], line 68
65 print(categorical_cols)
67 # Separate the features and the target
---> 68 X = data[numerical_cols + ordinal_cols + categorical_cols]
69 y = data[target_col].apply(lambda x: 1 if x == 'Yes' else 0) # Encoding
72 # Split data into train and test sets
TypeError: unsupported operand type(s) for +: 'float' and 'str'
I am trying to do logistic regression with data which involves a variety of datatypes. I split the data into the numerical_cols, ordinal_cols and categorical_cols variables. However, when I print them I see they have not been transformed. I think this is the reason for the error. How can I transform the variables correctly?
New contributor
s213439 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.