decision tree algorithm in Python

with datasets for model
2021-05-18 23:42:07 +02:00 · 2021-05-18 23:42:07 +02:00 · 26927b6a1d
commit 26927b6a1d
parent 97e762fd08
3 changed files with 6160 additions and 0 deletions
--- a/data_dd2.csv
+++ b/data_dd2.csv
--- a/data_dd3.csv
+++ b/data_dd3.csv
--- a/dt.py
+++ b/dt.py
@ -0,0 +1,160 @@
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score
+
+class GadId3Classifier:
+  """ID3 decision-tree classifier for categorical (discrete) features.
+
+  After ``fit`` the model is stored in ``self.tree`` as a nested dict of the
+  form ``{feature: {value: subtree_or_label}}``; a leaf is a bare class label.
+  ``self.default_class`` holds the most frequent training label and is the
+  answer given when a sample cannot be routed through the tree.
+  """
+
+  def fit(self, input, output):
+    """Build the decision tree from training data.
+
+    input:  DataFrame whose columns are the features.
+    output: named Series of class labels aligned with ``input``.
+
+    Raises ValueError when there are no training samples.
+    """
+    if len(output) == 0:
+      raise ValueError("cannot fit a decision tree on an empty data set")
+
+    data = input.copy()
+    data[output.name] = output
+
+    # remember the majority class as the fallback answer for predict()
+    classes, counts = np.unique(output, return_counts=True)
+    self.default_class = classes[np.argmax(counts)]
+
+    self.tree = self.decision_tree(data, data, input.columns, output.name)
+
+  def predict(self, input):
+    """Return a list with one predicted label per row of ``input``.
+
+    Rows that cannot be routed through the tree (e.g. a feature value never
+    seen during training) get the majority class of the training data.
+    """
+    # 1.0 is only used when the tree was assigned by hand without fit()
+    default = getattr(self, 'default_class', 1.0)
+
+    # convert input data into a list of {column: value} samples
+    samples = input.to_dict(orient='records')
+
+    return [self.make_prediction(sample, self.tree, default) for sample in samples]
+
+  def entropy(self, attribute_column):
+    """Return the Shannon entropy (in bits) of the values in ``attribute_column``."""
+    # frequency of every distinct value
+    _, counts = np.unique(attribute_column, return_counts=True)
+    probabilities = counts / np.sum(counts)
+
+    # every count is >= 1, so log2 never sees a zero probability
+    return np.sum(-probabilities * np.log2(probabilities))
+
+  def information_gain(self, data, feature_attribute_name, target_attribute_name):
+    """Return the entropy reduction of the target when splitting on a feature.
+
+    data:                   DataFrame holding both columns.
+    feature_attribute_name: column to split on.
+    target_attribute_name:  column holding the class labels.
+    """
+    # entropy of the target before the split
+    total_entropy = self.entropy(data[target_attribute_name])
+
+    # distinct values of the split feature and how often each occurs
+    values, counts = np.unique(data[feature_attribute_name], return_counts=True)
+    weights = counts / np.sum(counts)
+
+    # weighted entropy of the target inside every partition; rows are picked
+    # with a boolean mask so NaNs in unrelated columns do not discard samples
+    feature_column = data[feature_attribute_name]
+    total_weighted_entropy = sum(
+      weight * self.entropy(data.loc[feature_column == value, target_attribute_name])
+      for value, weight in zip(values, weights)
+    )
+
+    return total_entropy - total_weighted_entropy
+
+  def decision_tree(self, data, orginal_data, feature_attribute_names, target_attribute_name, parent_node_class=None):
+    """Recursively build an ID3 tree and return it (a nested dict or a leaf label).
+
+    data:                    samples that reached this node.
+    orginal_data:            the full training set (parameter name kept as-is
+                             for backward compatibility).
+    feature_attribute_names: features still available for splitting.
+    target_attribute_name:   column holding the class labels.
+    parent_node_class:       majority class of the parent node.
+    """
+    # base cases:
+    # if subset is empty, ie. no samples, return majority class of original data;
+    # this must be tested first because an empty subset has no classes to index
+    if len(data) == 0:
+      classes, counts = np.unique(orginal_data[target_attribute_name], return_counts=True)
+      return classes[np.argmax(counts)]
+
+    unique_classes, class_counts = np.unique(data[target_attribute_name], return_counts=True)
+    majority_class = unique_classes[np.argmax(class_counts)]
+
+    # if data is pure, return its only class
+    if len(unique_classes) == 1:
+      return unique_classes[0]
+
+    # if no features are left to split on, return the parent node class;
+    # at the root there is no parent, so use this node's own majority class
+    if len(feature_attribute_names) == 0:
+      return majority_class if parent_node_class is None else parent_node_class
+
+    # otherwise construct a branch:
+    # choose the feature which best splits the data, ie. highest information gain
+    ig_values = [self.information_gain(data, feature, target_attribute_name) for feature in feature_attribute_names]
+    best_feature = feature_attribute_names[np.argmax(ig_values)]
+
+    # the best feature becomes this node and is no longer available below it
+    remaining_features = [name for name in feature_attribute_names if name != best_feature]
+
+    # create one child per value of the best feature seen in this subset
+    branches = {}
+    for value in np.unique(data[best_feature]):
+      sub_data = data[data[best_feature] == value]
+      branches[value] = self.decision_tree(sub_data, orginal_data, remaining_features, target_attribute_name, majority_class)
+
+    return {best_feature: branches}
+
+  def make_prediction(self, sample, tree, default=1):
+    """Walk ``tree`` with one sample and return the predicted label.
+
+    sample:  dict mapping feature name to value.
+    tree:    nested dict produced by ``decision_tree`` (or a bare leaf label).
+    default: label returned when the sample cannot be routed through the tree.
+    """
+    # a tree built from pure data is a single leaf
+    if not isinstance(tree, dict):
+      return tree
+
+    for attribute in sample:
+      # only the feature this node splits on is a key of the tree
+      if attribute not in tree:
+        continue
+
+      try:
+        result = tree[attribute][sample[attribute]]
+      except (KeyError, TypeError):
+        # value was never seen for this feature during training
+        return default
+
+      # descend into the subtree, keeping the caller's default
+      if isinstance(result, dict):
+        return self.make_prediction(sample, result, default)
+      return result
+
+    # the sample carries none of the features this node splits on
+    return default
+
+# Driver script: load a CSV data set, train GadId3Classifier on a random
+# 75% of the rows and print the accuracy on the remaining 25%.
+
+# Alternative data source (UCI Cleveland heart-disease data), currently disabled:
+#data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
+#df = pd.read_csv(data_url, header=None)
+# the local CSV is read without a header row, so columns are named below
+df = pd.read_csv("data_dd3.csv", header=None)
+
+# assign column names; the last column, 'strategy', is the prediction target
+columns = ['p_strength','p_agility','p_wisdom','p_health','p_melee_damage','p_ranged_damage','p_magic_damage',
+           'p_armor_defence','p_armor_magic_protection','e_strength','e_agility','e_wisdom','e_health','e_melee_damage',
+           'e_ranged_damage','e_magic_damage','e_armor_defence','e_armor_magic_protection','e_attack_type','strategy']
+# column names for the disabled heart-disease data set:
+#columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
+           #'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'disease_present']
+df.columns = columns
+
+# heart-disease data set only: convert the disease_present feature to binary
+# df['disease_present'] = df.disease_present.replace([1,2,3,4], 1)
+
+# drop rows with missing values; a missing value is written as "?"
+df = df.replace("?", np.nan)
+df = df.dropna()
+
+# organize data into input (features) and output (target)
+# heart-disease variant, disabled:
+#X = df.drop(columns="disease_present")
+#y = df["disease_present"]
+X = df.drop(columns="strategy")
+y = df["strategy"]
+
+# hold out 25% of the rows for testing; no random_state is passed, so the
+# split (and therefore the printed accuracy) can differ between runs
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
+
+# initialize and fit model
+model = GadId3Classifier()
+model.fit(X_train, y_train)
+
+# predict the held-out rows and compute the accuracy score
+y_pred = model.predict(X_test)
+a = accuracy_score(y_test, y_pred)
+
+print(a)
+# debugging output, disabled:
+#print(y_pred)
+#print(y_test)