import pandas as pd
import numpy as np
import json
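
# A from-scratch ID3 decision-tree learner on categorical features.
# ID3 grows the tree greedily: at each node it splits on the feature with
# the highest information gain,
#     IG(S, F) = H(S) - sum_v (|S_v| / |S|) * H(S_v),
# where H(S) = -sum_c p_c * log2(p_c) is the Shannon entropy of the class
# labels and S_v is the subset of rows where feature F takes value v.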
train_data_m = pd.read_csv("01.csv")
|
|
test_data_m = pd.read_csv("10.csv")
|
|
|
|


def calc_total_entropy(train_data, label, class_list):
    """Shannon entropy of the label column over the whole dataset."""
    total_row = train_data.shape[0]
    total_entr = 0

    for c in class_list:
        total_class_count = train_data[train_data[label] == c].shape[0]
        if total_class_count == 0:
            total_class_entr = 0
        else:
            total_class_entr = -(total_class_count / total_row) * np.log2(total_class_count / total_row)
        total_entr += total_class_entr

    return total_entr
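
# Worked check for calc_total_entropy: with a 50/50 class split each class
# contributes -(0.5) * log2(0.5) = 0.5, so H = 1.0 bit; a single-class
# (pure) dataset gives H = 0.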


def calc_entropy(feature_value_data, label, class_list):
    """Shannon entropy of the label column within one feature-value subset."""
    class_count = feature_value_data.shape[0]
    entropy = 0

    for c in class_list:
        label_class_count = feature_value_data[feature_value_data[label] == c].shape[0]
        entropy_class = 0
        if label_class_count != 0:
            probability_class = label_class_count / class_count
            entropy_class = -probability_class * np.log2(probability_class)
        entropy += entropy_class

    return entropy
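
# calc_entropy is the same formula as calc_total_entropy, only applied to
# one feature-value subset S_v; calc_info_gain below weights these subset
# entropies by |S_v| / |S|.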


def calc_info_gain(feature_name, train_data, label, class_list):
    """Information gain from splitting train_data on feature_name."""
    feature_value_list = train_data[feature_name].unique()
    total_row = train_data.shape[0]
    feature_info = 0.0

    for feature_value in feature_value_list:
        feature_value_data = train_data[train_data[feature_name] == feature_value]
        feature_value_count = feature_value_data.shape[0]
        feature_value_entropy = calc_entropy(feature_value_data, label, class_list)
        feature_value_probability = feature_value_count / total_row
        feature_info += feature_value_probability * feature_value_entropy

    return calc_total_entropy(train_data, label, class_list) - feature_info
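
# Worked check with Quinlan's classic play-tennis data, splitting on Wind:
# H(S) = 0.940; the 8 "weak" rows have H = 0.811 and the 6 "strong" rows
# have H = 1.0, so IG = 0.940 - (8/14 * 0.811 + 6/14 * 1.0) ~= 0.048.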


def find_most_informative_feature(train_data, label, class_list):
    """Return the feature with the highest information gain."""
    feature_list = train_data.columns.drop(label)
    max_info_gain = -1
    max_info_feature = None

    for feature in feature_list:
        feature_info_gain = calc_info_gain(feature, train_data, label, class_list)
        if max_info_gain < feature_info_gain:
            max_info_gain = feature_info_gain
            max_info_feature = feature

    return max_info_feature
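
# Note: the strict "<" above keeps the first maximum, so ties between
# equally informative features are broken by column order.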


def generate_sub_tree(feature_name, train_data, label, class_list):
    """Expand feature_name into branches: pure ones become leaf labels,
    impure ones are marked "?" for later expansion."""
    feature_value_count_dict = train_data[feature_name].value_counts(sort=False)
    tree = {}

    for feature_value, count in feature_value_count_dict.items():
        feature_value_data = train_data[train_data[feature_name] == feature_value]

        assigned_to_node = False
        for c in class_list:
            class_count = feature_value_data[feature_value_data[label] == c].shape[0]
            if class_count == count:  # every row in this branch has class c
                tree[feature_value] = c
                train_data = train_data[train_data[feature_name] != feature_value]
                assigned_to_node = True
        if not assigned_to_node:
            tree[feature_value] = "?"

    return tree, train_data
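
# Removing the rows of pure branches from train_data means the recursion in
# make_tree only ever revisits the rows that still need splitting; the "?"
# placeholder marks exactly those branches.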


def make_tree(root, prev_feature_value, train_data, label, class_list):
    """Recursively grow the tree in place under prev_feature_value."""
    if train_data.shape[0] != 0:
        max_info_feature = find_most_informative_feature(train_data, label, class_list)
        tree, train_data = generate_sub_tree(max_info_feature, train_data, label, class_list)
        next_root = None

        if prev_feature_value is not None:
            root[prev_feature_value] = dict()
            root[prev_feature_value][max_info_feature] = tree
            next_root = root[prev_feature_value][max_info_feature]
        else:
            root[max_info_feature] = tree
            next_root = root[max_info_feature]

        for node, branch in list(next_root.items()):
            if branch == "?":  # impure branch: recurse on its remaining rows
                feature_value_data = train_data[train_data[max_info_feature] == node]
                make_tree(next_root, node, feature_value_data, label, class_list)
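
# The finished tree is a plain nested dict; an illustrative shape (feature
# names here are made up, not from this dataset) would be
#   {'Outlook': {'Overcast': 'Yes',
#                'Rain': {'Wind': {'weak': 'Yes', 'strong': 'No'}}}}
# Leaves are class labels; inner dicts alternate feature -> value -> subtree.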


def id3(train_data_m, label):
    """Train an ID3 decision tree and return it as nested dicts."""
    train_data = train_data_m.copy()
    tree = {}
    class_list = train_data[label].unique()
    make_tree(tree, None, train_data, label, class_list)
    return tree
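
# Minimal usage sketch (assuming a DataFrame df whose class column is named
# "label"; NpEncoder is defined further down):
#   tree = id3(df, "label")
#   print(json.dumps(tree, indent=2, cls=NpEncoder))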


def predict(tree, instance):
    """Walk the tree for one row; return the class label, or None when the
    row has a feature value the tree never saw."""
    if not isinstance(tree, dict):  # reached a leaf
        return tree
    root_node = next(iter(tree))
    feature_value = instance[root_node]
    if feature_value in tree[root_node]:
        return predict(tree[root_node][feature_value], instance)
    return None
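
# predict returns None when a test row carries a feature value that never
# appeared on the corresponding training branch; evaluate below counts such
# rows as wrong.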


def evaluate(tree, test_data_m, label):
    """Fraction of test rows whose predicted label matches the true label."""
    correct_predict = 0
    wrong_predict = 0

    for _, row in test_data_m.iterrows():  # row is a Series, usable directly
        result = predict(tree, row)
        if result == row[label]:
            correct_predict += 1
        else:
            wrong_predict += 1

    accuracy = correct_predict / (correct_predict + wrong_predict)
    return accuracy


class NpEncoder(json.JSONEncoder):
    """JSON encoder that accepts numpy scalar and array types."""

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
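
# numpy scalars (e.g. values taken from .unique()) are not JSON-serializable
# by default; NpEncoder converts them to plain Python types so the tree can
# be dumped with json.dumps(tree, cls=NpEncoder).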


LABEL = 'go to: 1)next veget. 2)gas station 3)warehouse 4)sleep 5)GAME OVER'

tree = id3(train_data_m, LABEL)
# print(tree)
# json_str = json.dumps(tree, indent=2, cls=NpEncoder)
# print(json_str)

accuracy = evaluate(tree, test_data_m, LABEL)
print(accuracy)