import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


class GadId3Classifier:
    """ID3 decision tree classifier for categorical features."""

    def fit(self, features, target):
        # combine features and target into one frame so the tree builder
        # can subset both together
        data = features.copy()
        data[target.name] = target
        self.tree = self.decision_tree(data, data, features.columns, target.name)

    def predict(self, features):
        # convert input data into a list of sample dictionaries
        samples = features.to_dict(orient='records')
        predictions = []
        # make a prediction for every sample
        for sample in samples:
            predictions.append(self.make_prediction(sample, self.tree, 1.0))
        return predictions

    def entropy(self, attribute_column):
        # find unique values and their frequency counts for the given attribute
        values, counts = np.unique(attribute_column, return_counts=True)
        # calculate the entropy contribution of each unique value
        entropy_list = []
        for i in range(len(values)):
            probability = counts[i] / np.sum(counts)
            entropy_list.append(-probability * np.log2(probability))
        # total entropy is the sum of the individual contributions
        return np.sum(entropy_list)

    def information_gain(self, data, feature_attribute_name, target_attribute_name):
        # find total entropy of the given subset
        total_entropy = self.entropy(data[target_attribute_name])
        # find unique values and their frequency counts for the attribute to be split
        values, counts = np.unique(data[feature_attribute_name], return_counts=True)
        # calculate the weighted entropy of the split
        weighted_entropy_list = []
        for i in range(len(values)):
            subset_probability = counts[i] / np.sum(counts)
            subset_entropy = self.entropy(
                data.where(data[feature_attribute_name] == values[i]).dropna()[target_attribute_name])
            weighted_entropy_list.append(subset_probability * subset_entropy)
        total_weighted_entropy = np.sum(weighted_entropy_list)
        # information gain is the entropy reduction achieved by the split
        return total_entropy - total_weighted_entropy

    def decision_tree(self, data, original_data, feature_attribute_names,
                      target_attribute_name, parent_node_class=None):
        # base cases:
        # if the subset is empty, i.e. no samples, return majority class of original data
        # (this must be checked first, since a pure-node check on an empty
        # subset would fail with an IndexError)
        if len(data) == 0:
            majority_class_index = np.argmax(
                np.unique(original_data[target_attribute_name], return_counts=True)[1])
            return np.unique(original_data[target_attribute_name])[majority_class_index]
        # if data is pure, return that single class
        unique_classes = np.unique(data[target_attribute_name])
        if len(unique_classes) <= 1:
            return unique_classes[0]
        # if the data set contains no features to train with, return parent node class
        elif len(feature_attribute_names) == 0:
            return parent_node_class
        # if none of the above are true, construct a branch:
        else:
            # determine majority (parent node) class of the current branch
            majority_class_index = np.argmax(
                np.unique(data[target_attribute_name], return_counts=True)[1])
            parent_node_class = unique_classes[majority_class_index]
            # determine information gain for each feature and choose the
            # feature that best splits the data, i.e. the highest value
            ig_values = [self.information_gain(data, feature, target_attribute_name)
                         for feature in feature_attribute_names]
            best_feature_index = np.argmax(ig_values)
            best_feature = feature_attribute_names[best_feature_index]
            # create tree structure, empty at first
            tree = {best_feature: {}}
            # remove best feature from available features; it becomes the parent node
            feature_attribute_names = [i for i in feature_attribute_names if i != best_feature]
            # create nodes under the parent node, one per attribute value
            parent_attribute_values = np.unique(data[best_feature])
            for value in parent_attribute_values:
                sub_data = data.where(data[best_feature] == value).dropna()
                # call the algorithm recursively on the subset
                subtree = self.decision_tree(sub_data, original_data, feature_attribute_names,
                                             target_attribute_name, parent_node_class)
                # add subtree under the current branch value
                tree[best_feature][value] = subtree
            return tree

    def make_prediction(self, sample, tree, default=1):
        # walk the tree using the sample's attribute values
        for attribute in list(sample.keys()):
            # check if the feature exists in the tree
            if attribute in list(tree.keys()):
                # an attribute value unseen during training has no branch;
                # fall back to the default prediction
                try:
                    result = tree[attribute][sample[attribute]]
                except KeyError:
                    return default
                # if more attributes exist within result, recursively find the best result
                if isinstance(result, dict):
                    return self.make_prediction(sample, result, default)
                else:
                    return result
        # no tree attribute matched the sample; fall back to the default
        return default


#data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
#df = pd.read_csv(data_url, header=None)
df = pd.read_csv("data_dd3.csv", header=None)

# rename known columns
columns = ['p_strength', 'p_agility', 'p_wisdom', 'p_health', 'p_melee_damage', 'p_ranged_damage',
           'p_magic_damage', 'p_armor_defence', 'p_armor_magic_protection', 'e_strength', 'e_agility',
           'e_wisdom', 'e_health', 'e_melee_damage', 'e_ranged_damage', 'e_magic_damage',
           'e_armor_defence', 'e_armor_magic_protection', 'e_attack_type', 'strategy']
#columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
#           'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'disease_present']
df.columns = columns

# convert disease_present feature to binary (heart-disease dataset only)
# df['disease_present'] = df.disease_present.replace([1,2,3,4], 1)

# drop rows with missing values; missing values are encoded as "?"
df = df.replace("?", np.nan)
df = df.dropna()

# organize data into input features and output target
#X = df.drop(columns="disease_present")
#y = df["disease_present"]
X = df.drop(columns="strategy")
y = df["strategy"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# initialize and fit model
model = GadId3Classifier()
model.fit(X_train, y_train)

# evaluate accuracy on the held-out test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
#print(y_pred)
#print(y_test)
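
# A minimal sanity check of the entropy and information-gain helpers on a
# hypothetical toy frame (the "feature"/"label" names below are illustrative,
# not columns of data_dd3.csv): a 50/50 label split has entropy of 1.0 bit,
# and a feature that separates the labels perfectly should recover all of it.
toy = pd.DataFrame({"feature": ["a", "a", "b", "b"], "label": [0, 0, 1, 1]})
print(model.entropy(toy["label"]))                      # expected: 1.0
print(model.information_gain(toy, "feature", "label"))  # expected: 1.0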