# SzIProjekt/Traktorek/Natalia Wiśniewska- drzewo_decyzyjne.py
#
# Repository viewer metadata: 145 lines, 4.0 KiB, Python
# Revision timestamp shown by the viewer: 2020-06-03 12:38:28 +02:00
# Hand-written training set: one list per example, five values per example.
# class_counts() reads row[-1] as the class label and find_best_split() only
# considers the first len(row) - 1 columns as split features, so the last
# column (kiedyPadalo) is the value the tree predicts.
training_data = [
# Columns: nawodnienie, kiedyNawadniano, coIleDniTrzebaNawadniac, czyMaPadac, kiedyPadalo
# NOTE(review): judging by the Polish names these are presumably: irrigation
# state, days since last watering, required watering interval in days, whether
# rain is expected ('t' = yes / 'n' = no), and days since it last rained.
# Confirm the intended meaning (and the 'n'/'s' codes) with the author.
['n', 2, 3, 't', 1],
['s', 1, 3, 't', 1],
['s', 5, 2, 'n', 1],
['n', 3, 5, 'n', 1],
['s', 3, 1, 't', 2],
['n', 2, 4, 'n', 2],
['n', 4, 6, 't', 3],
['n', 6, 5, 't', 3],
['s', 1, 2, 't', 4],
['s', 7, 3, 'n', 5],
['n', 4, 4, 'n', 5],
['s', 5, 6, 't', 5],
['n', 2, 7, 't', 1],
['s', 5, 6, 't', 7],
['s', 5, 3, 'n', 7],
['n', 3, 2, 'n', 7],
['s', 3, 5, 't', 4],
['n', 3, 4, 'n', 4],
['n', 4, 3, 't', 6],
['n', 6, 3, 't', 6],
['s', 1, 4, 't', 6],
['s', 7, 5, 'n', 3],
['n', 2, 5, 'n', 3],
['s', 4, 6, 't', 3],
['s', 4, 8, 'n', 4]
]
# Column names, indexed by column number; Question.__repr__ uses them to print
# a human-readable split condition.
header = ["nawodnienie", "kiedyNawadniano", "coIleDni", "czyMaPadac", "kiedyPadalo"]
def class_counts(rows):
    """Tally how many rows carry each label.

    The label of a row is its last element (``row[-1]``).

    Args:
        rows: Iterable of indexable rows.

    Returns:
        A dict mapping each label to the number of rows that end with it.
    """
    tally = {}
    for record in rows:
        tag = record[-1]
        tally[tag] = tally.get(tag, 0) + 1
    return tally
def is_numeric(value):
    """Return True when *value* is an ``int`` or ``float`` instance.

    Note that ``bool`` is a subclass of ``int``, so booleans count as numeric.
    """
    return isinstance(value, (int, float))
class Question:
    """A yes/no test applied to one column of a row.

    Attributes are the column index being tested and the value the column is
    compared against.
    """

    def __init__(self, column, value):
        """Remember which column to test and the value to compare with."""
        self.column, self.value = column, value

    def match(self, example):
        """Return whether *example* satisfies this question.

        Numeric column values are tested with ``>=`` against the stored
        value; every other kind of value is tested for equality.
        """
        candidate = example[self.column]
        if not is_numeric(candidate):
            return candidate == self.value
        return candidate >= self.value

    def __repr__(self):
        """Render the question as text using the module-level ``header`` names."""
        comparator = ">=" if is_numeric(self.value) else "=="
        return "Czy %s %s %s?" % (
            header[self.column], comparator, str(self.value))
def partition(rows, question):
    """Split *rows* into those that satisfy *question* and those that do not.

    Args:
        rows: Iterable of rows.
        question: Object exposing ``match(row)``.

    Returns:
        A ``(true_rows, false_rows)`` pair of lists, each keeping the
        original row order.
    """
    matched, unmatched = [], []
    for record in rows:
        bucket = matched if question.match(record) else unmatched
        bucket.append(record)
    return matched, unmatched
def gini(rows):
    """Compute the Gini impurity of the labels found in *rows*.

    Starts from 1 and subtracts the squared share of every label reported by
    ``class_counts``. For rows that all share one label the result is 0; for
    an empty collection nothing is subtracted and 1 is returned.
    """
    label_tally = class_counts(rows)
    total = float(len(rows))
    remaining = 1
    # Subtract one label at a time (rather than summing first) so the
    # floating-point result is unchanged.
    for occurrences in label_tally.values():
        remaining -= (occurrences / total) ** 2
    return remaining
def info_gain(left, right, current_uncertainty):
    """Return how much a split reduces impurity.

    The result is *current_uncertainty* minus the Gini impurity of the two
    child groups, each weighted by its share of the combined row count.

    Args:
        left: Rows in the first child group.
        right: Rows in the second child group.
        current_uncertainty: Impurity of the rows before splitting.
    """
    left_share = float(len(left)) / (len(left) + len(right))
    weighted_left = left_share * gini(left)
    after_left = current_uncertainty - weighted_left
    weighted_right = (1 - left_share) * gini(right)
    return after_left - weighted_right
def _candidate_values(rows, col):
    """Return the distinct values of column *col* in a reproducible order."""
    distinct = {row[col] for row in rows}
    try:
        # Sorting removes the dependence on set iteration order, which for
        # strings changes between interpreter runs (hash randomization).
        return sorted(distinct)
    except TypeError:
        # Values that cannot be ordered against each other: fall back to the
        # order in which they first appear in the data.
        return list(dict.fromkeys(row[col] for row in rows))


def find_best_split(rows):
    """Find the question that yields the highest information gain.

    Every distinct value of every feature column (all columns except the
    last, which holds the label) is tried as a ``Question``. Candidates that
    leave one side of the partition empty are skipped.

    Args:
        rows: List of rows; may be empty.

    Returns:
        A ``(best_gain, best_question)`` tuple. ``(0, None)`` is returned for
        empty input or when no candidate produces a non-empty two-way split.
    """
    # Nothing to split; also avoids an IndexError on rows[0] below.
    if not rows:
        return 0, None

    best_gain = 0
    best_question = None
    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # the last column is the label

    for col in range(n_features):
        for val in _candidate_values(rows, col):
            question = Question(col, val)
            true_rows, false_rows = partition(rows, question)

            # A question that does not divide the data is useless.
            if not true_rows or not false_rows:
                continue

            gain = info_gain(true_rows, false_rows, current_uncertainty)
            # ">=" means that among equal gains the candidate examined last
            # wins; the candidate order is now deterministic.
            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question
class Leaf:
    """Terminal tree node.

    Stores, in ``predictions``, the label counts (as produced by
    ``class_counts``) of the rows that reached this node.
    """

    def __init__(self, rows):
        label_tally = class_counts(rows)
        self.predictions = label_tally
class Decision_Node:
    """Internal tree node: a question plus the subtree for each answer."""

    def __init__(self, question, true_branch, false_branch):
        """Store the question and the subtrees for its True / False outcome."""
        self.question, self.true_branch, self.false_branch = (
            question, true_branch, false_branch)
def build_tree(rows):
    """Recursively build a decision tree for *rows*.

    When ``find_best_split`` reports a gain of 0 the rows become a ``Leaf``.
    Otherwise the rows are partitioned by the best question and a
    ``Decision_Node`` is returned whose branches are built the same way,
    the True branch first.
    """
    best_gain, best_question = find_best_split(rows)
    if best_gain == 0:
        return Leaf(rows)

    matching, remaining = partition(rows, best_question)
    return Decision_Node(
        best_question,
        build_tree(matching),
        build_tree(remaining),
    )
def print_tree(node, spacing=""):
    """Print the tree rooted at *node* to standard output.

    A ``Leaf`` is printed as its label counts. Any other node is printed as
    its question followed by the True subtree and then the False subtree,
    each nested one level deeper by extending *spacing*.
    """
    if isinstance(node, Leaf):
        print(spacing + "Predict", node.predictions)
    else:
        print(spacing + str(node.question))
        branches = (
            ('--> True:', node.true_branch),
            ('--> False:', node.false_branch),
        )
        for caption, subtree in branches:
            print(spacing + caption)
            print_tree(subtree, spacing + " ")
# Script entry: fit a tree on the built-in training set at import/run time
# and keep it in the module-level name my_tree.
my_tree = build_tree(training_data)
# Dump the learned tree (questions and leaf label counts) to standard output.
print_tree(my_tree)