I am building a random forest model on PC orders data, most of which is in Chinese. The model itself trains fine and I have already checked its accuracy. However, I cannot generate an image of one of its trees: the export fails with a UnicodeEncodeError, most likely because of the Chinese characters in the dataset (they end up in the feature names). I have tried both StringIO and BytesIO as the output buffer, but neither works.
Here are my imports:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import tree
from sklearn.tree import export_graphviz
import pydot
from IPython.display import Image
import six
import sys
sys.modules['sklearn.externals.six'] = six
from io import StringIO,BytesIO
Here is my random forest
from sklearn.ensemble import RandomForestClassifier

# Separate the target column from the predictor columns.
y = finaldata['是否赢单']
X = finaldata.drop(columns=['是否赢单'])

# Group predictor columns by dtype so each group gets its own treatment.
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Numeric columns pass through unchanged; object columns are one-hot encoded
# with handle_unknown='ignore'.
one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(transformers=[
    ('num', 'passthrough', numerical_cols),
    ('cat', one_hot, categorical_cols),
])

# Preprocessing and the forest live in one pipeline so that the same
# transformation is applied at fit and predict time.
forest = RandomForestClassifier(random_state=42)
clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', forest)])

# 70/30 train/test split with a fixed seed, then fit and predict on the
# held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
y_pred = clf.fit(X_train, y_train).predict(X_test)
And here is the plotting code:
# Render the first tree of the fitted forest to a PNG and show it inline.
# Feature names are taken from the fitted preprocessor so the nodes are
# labelled with the transformed column names rather than column indices.
feature_names = clf.named_steps['preprocessor'].get_feature_names_out()
single_tree = clf.named_steps['classifier'].estimators_[0]

# out_file=None makes export_graphviz return the DOT source as a str, so no
# intermediate StringIO buffer is needed.
# NOTE(review): class_names assumes the target's classes sort as
# [loss, win] -- confirm against clf.classes_.
dot_data_str = export_graphviz(
    single_tree,
    out_file=None,
    filled=True,
    rounded=True,
    special_characters=True,
    feature_names=feature_names,
    class_names=['Loss', 'Win'],
)
(graph,) = pydot.graph_from_dot_data(dot_data_str)

# write_png forwards `encoding` to the code that serialises the DOT text.
# Left at its default (None), the platform locale codec is used -- cp1252 in
# the reported traceback -- which cannot encode the Chinese feature names and
# raises UnicodeEncodeError. Forcing UTF-8 avoids that.
# NOTE(review): if the labels then render as empty boxes, the Graphviz font
# presumably lacks CJK glyphs; a CJK-capable font name would be needed.
graph.write_png('decision_tree.png', encoding='utf-8')
Image(filename='decision_tree.png')
Here is the error message
UnicodeEncodeError Traceback (most recent call last)
Cell In [18], line 13
11 # Draw the graph using pydot
12 (graph,) = pydot.graph_from_dot_data(dot_data.getvalue())
---> 13 graph.write_png('decision_tree.png')
15 # Display the image
16 Image(filename='decision_tree.png')
File c:\Users\Theodore\AppData\Local\Programs\Python\Python310\lib\site-packages\pydot.py:1743, in Dot.__init__.<locals>.new_method(path, f, prog, encoding)
1739 def new_method(
1740 path, f=frmt, prog=self.prog,
1741 encoding=None):
1742 """Refer to docstring of method `write.`"""
-> 1743 self.write(
1744 path, format=f, prog=prog,
1745 encoding=encoding)
File c:\Users\Theodore\AppData\Local\Programs\Python\Python310\lib\site-packages\pydot.py:1828, in Dot.write(self, path, prog, format, encoding)
1826 f.write(s)
1827 else:
-> 1828 s = self.create(prog, format, encoding=encoding)
1829 with io.open(path, mode='wb') as f:
1830 f.write(s)
...
File c:\Users\Theodore\AppData\Local\Programs\Python\Python310\lib\encodings\cp1252.py:19, in IncrementalEncoder.encode(self, input, final)
18 def encode(self, input, final=False):
---> 19 return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 163-166: character maps to <undefined>
5