"""Train a small decision tree on the "adult" dataset and render it.

The script, executed top to bottom at import/run time:

1. reads ``./database/datasetadult.csv`` (path is relative to the current
   working directory),
2. uses the ``Adult`` column as the target and every other column as a
   feature,
3. ordinal-encodes the categorical feature columns,
4. fits an entropy-based ``DecisionTreeClassifier`` limited to depth 3,
5. prints train/test accuracy,
6. predicts the label for one randomly generated client, and
7. renders the fitted tree with Graphviz to ``decision_tree.png``.
"""
import random
import warnings

import category_encoders as ce
import graphviz
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): this silences *every* warning, including deprecation notices
# from the ML libraries. Kept to preserve the script's console output.
warnings.filterwarnings('ignore')

data = './database/datasetadult.csv'
df = pd.read_csv(data)

# Features are all columns except the target; the target is 'Adult'.
X = df.drop(['Adult'], axis=1)
y = df['Adult']

# Fixed random_state keeps the 67/33 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

encoder = ce.OrdinalEncoder(
    cols=[
        'Wrinkles',
        'Balding',
        'Beard',
        'Outfit',
        'Glasses',
        'Tattoo',
        'Hair',
        'Behaviour',
    ]
)

# Fit the encoder on the training split only, then reuse the learned mapping
# for the test split so no information from the test rows leaks into it.
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

clf_en = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
clf_en.fit(X_train, y_train)

y_pred_en = clf_en.predict(X_test)
print('Model accuracy score with criterion entropy: {0:0.4f}'.format(accuracy_score(y_test, y_pred_en)))

y_pred_train_en = clf_en.predict(X_train)
print('Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train_en)))

print('Training set score: {:.4f}'.format(clf_en.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(clf_en.score(X_test, y_test)))

# export_graphviz expects class names in the same (ascending) order as
# clf_en.classes_. Series.unique() returns labels in order of first
# appearance, which can swap the labels shown on the leaves, so the names are
# taken from the fitted classifier instead. str() keeps this working when the
# target labels are not strings.
dot_data = tree.export_graphviz(
    clf_en,
    out_file=None,
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in clf_en.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)

# New client with randomly chosen attributes, used as a smoke test of the
# encoder + classifier pipeline.
new_client = {
    "Wrinkles": random.choice(['Yes', 'No']),
    "Balding": random.choice(['Yes', 'No']),
    "Beard": random.choice(['Yes', 'No']),
    "Outfit": random.choice(['Messy', 'Casual', 'Formal']),
    "Glasses": random.choice(['Yes', 'No']),
    "Tattoo": random.choice(['Yes', 'No']),
    "Hair": random.choice(['Color', 'Grey', 'Natural']),
    "Behaviour": random.choice(['Energetic', 'Stressed', 'Calm']),
}

# A single-row frame; it must go through the same fitted encoder as the
# training data before being passed to the classifier.
new_client_df = pd.DataFrame(new_client, index=[0])
new_client_df_encoded = encoder.transform(new_client_df)
prediction = clf_en.predict(new_client_df_encoded)

print("\nNew client:")
print(new_client_df)
print("Prediction:", prediction[0])

# Render the exported DOT source as a PNG named after "decision_tree".
graph = graphviz.Source(dot_data)
graph.render("decision_tree", format='png')