# Columns (names kept in Polish, matching `header` below):
# nawodnienie (watering), kiedyNawadniano (when it was last watered),
# coIleDniTrzebaNawadniac (every how many days watering is needed),
# czyMaPadac (whether rain is expected), kiedyPadalo (when it last rained).
# The last column is treated as the class label.
training_data = [
    ['n', 2, 3, 't', 1],
    ['s', 1, 3, 't', 1],
    ['s', 5, 2, 'n', 1],
    ['n', 3, 5, 'n', 1],
    ['s', 3, 1, 't', 2],
    ['n', 2, 4, 'n', 2],
    ['n', 4, 6, 't', 3],
    ['n', 6, 5, 't', 3],
    ['s', 1, 2, 't', 4],
    ['s', 7, 3, 'n', 5],
    ['n', 4, 4, 'n', 5],
    ['s', 5, 6, 't', 5],
    ['n', 2, 7, 't', 1],
    ['s', 5, 6, 't', 7],
    ['s', 5, 3, 'n', 7],
    ['n', 3, 2, 'n', 7],
    ['s', 3, 5, 't', 4],
    ['n', 3, 4, 'n', 4],
    ['n', 4, 3, 't', 6],
    ['n', 6, 3, 't', 6],
    ['s', 1, 4, 't', 6],
    ['s', 7, 5, 'n', 3],
    ['n', 2, 5, 'n', 3],
    ['s', 4, 6, 't', 3],
    ['s', 4, 8, 'n', 4],
]

header = ["nawodnienie", "kiedyNawadniano", "coIleDni", "czyMaPadac", "kiedyPadalo"]


def class_counts(rows):
    """Count how many rows of each class label (last column) appear in the dataset."""
    counts = {}
    for row in rows:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts


def is_numeric(value):
    """Return True if the value is numeric (int or float)."""
    return isinstance(value, (int, float))


class Question:
    """A question used to partition the dataset.

    Stores a column index and a value; `match` compares the corresponding
    feature of an example against that value (>= for numbers, == otherwise).
    """

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        return val == self.value

    def __repr__(self):
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (header[self.column], condition, str(self.value))


def partition(rows, question):
    """Split rows into those that satisfy the question and those that do not."""
    true_rows, false_rows = [], []
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows


def gini(rows):
    """Gini impurity of a set of rows."""
    counts = class_counts(rows)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        impurity -= prob_of_lbl ** 2
    return impurity


def info_gain(left, right, current_uncertainty):
    """Information gain: parent impurity minus the weighted impurity of the children."""
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)


def find_best_split(rows):
    """Try every feature/value question and keep the one with the highest gain."""
    best_gain = 0
    best_question = None
    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # the last column is the label

    for col in range(n_features):
        values = set(row[col] for row in rows)
        for val in values:
            question = Question(col, val)
            true_rows, false_rows = partition(rows, question)
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue  # this question does not split the data
            gain = info_gain(true_rows, false_rows, current_uncertainty)
            if gain >= best_gain:
                best_gain, best_question = gain, question
    return best_gain, best_question


class Leaf:
    """A leaf node: holds the class counts of the rows that reached it."""

    def __init__(self, rows):
        self.predictions = class_counts(rows)


class Decision_Node:
    """An internal node: holds a question and the two resulting branches."""

    def __init__(self, question, true_branch, false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch


def build_tree(rows):
    """Recursively build the tree; stop with a Leaf when no question gives any gain."""
    gain, question = find_best_split(rows)
    if gain == 0:
        return Leaf(rows)
    true_rows, false_rows = partition(rows, question)
    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)
    return Decision_Node(question, true_branch, false_branch)


def print_tree(node, spacing=""):
    """Print the tree, indenting each level to show depth."""
    if isinstance(node, Leaf):
        print(spacing + "Predict", node.predictions)
        return
    print(spacing + str(node.question))
    print(spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")
    print(spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")


my_tree = build_tree(training_data)
print_tree(my_tree)
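
# --- Not part of the original script: a minimal sketch of how the built tree
# --- could be used on an unseen row. The helper name `classify` and the sample
# --- feature row below are assumptions, not taken from the source.
def classify(row, node):
    """Walk the tree with a feature row until a Leaf is reached."""
    if isinstance(node, Leaf):
        return node.predictions
    if node.question.match(row):
        return classify(row, node.true_branch)
    return classify(row, node.false_branch)


# Hypothetical example: feature values only (the label column is not needed).
print(classify(['n', 3, 4, 't'], my_tree))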