# SI2020/drzewaDecyzyjne.py

# Training set for the decision tree. Every row is four single-letter
# categorical feature codes followed by an integer class label (1, 2 or 3)
# in the last position.
training_data = [
    # columns: zyznosc (fertility), nawodnienie (irrigation), cien (shade), kwasowosc (acidity)
    ['z', 'n', 's', 'z', 1],
    ['z', 'n', 's', 'n', 1],
    ['j', 'n', 's', 'z', 1],
    ['z', 's', 's', 'n', 1],
    ['j', 'n', 'c', 'n', 1],
    ['z', 'n', 's', 'k', 1],
    ['z', 'n', 'c', 'k', 2],
    ['z', 's', 's', 'k', 2],
    ['z', 's', 'c', 'k', 2],
    ['j', 'n', 's', 'k', 2],
    ['z', 's', 'c', 'z', 3],
    ['j', 'n', 's', 'n', 3]
]

# Column names, index-aligned with the rows above; "wybor" ("choice") names
# the label column. Question.__repr__ looks feature names up here.
header = ["zyznosc", "nawodnienie", "cien", "kwasowosc", "wybor"]

def class_counts(rows):
    """Tally how many rows carry each class label.

    The label of a row is its last element.

    Args:
        rows: iterable of sequences whose final item is the class label.

    Returns:
        dict mapping each label to the number of rows that end with it;
        empty when ``rows`` is empty.
    """
    tally = {}
    for record in rows:
        tag = record[-1]
        tally[tag] = tally.get(tag, 0) + 1
    return tally


def is_numeric(value):
    """Return True when ``value`` is an ``int`` or ``float`` instance.

    ``bool`` values also pass, since ``bool`` is a subclass of ``int``.
    """
    return isinstance(value, (int, float))


class Question:
    """A yes/no test applied to a single column of a row.

    Attributes:
        column: index of the feature the question inspects.
        value: the value the feature is compared against.
    """

    def __init__(self, column, value):
        self.column, self.value = column, value

    def match(self, example):
        """Check whether ``example`` answers this question with "yes".

        When the example's value in ``self.column`` is numeric the test is
        ``>= self.value``; for any other value it is plain equality.
        """
        candidate = example[self.column]
        if not is_numeric(candidate):
            return candidate == self.value
        return candidate >= self.value

    def __repr__(self):
        """Render the question as text, using ``header`` for the column name."""
        # The displayed operator follows the type of the stored value.
        symbol = ">=" if is_numeric(self.value) else "=="
        return "Czy %s %s %s?" % (header[self.column], symbol, str(self.value))

def partition(rows, question):
    """Split ``rows`` by the answer each one gives to ``question``.

    Args:
        rows: iterable of rows to distribute.
        question: object exposing ``match(row)``; a truthy result sends the
            row to the first list, a falsy one to the second.

    Returns:
        Tuple ``(true_rows, false_rows)``; relative row order is kept.
    """
    accepted, rejected = [], []
    for record in rows:
        bucket = accepted if question.match(record) else rejected
        bucket.append(record)
    return accepted, rejected


def gini(rows):
    """Compute the Gini impurity of the class labels found in ``rows``.

    Starts from 1 and subtracts the squared share of every label, one label
    at a time. With no rows nothing is subtracted and the result is 1.

    Args:
        rows: sized collection of rows whose last element is the label.

    Returns:
        The impurity value.
    """
    impurity = 1
    for occurrences in class_counts(rows).values():
        share = occurrences / float(len(rows))
        impurity -= share ** 2
    return impurity


def info_gain(left, right, current_uncertainty):
    """Return how much a split reduces impurity.

    The result is ``current_uncertainty`` minus the Gini impurity of the two
    child groups, each weighted by its share of the combined rows.

    Args:
        left: rows of the first child group.
        right: rows of the second child group.
        current_uncertainty: impurity of the rows before splitting.
    """
    left_weight = float(len(left)) / (len(left) + len(right))
    # Subtract the children one after the other to keep the same
    # floating-point evaluation order.
    after_left = current_uncertainty - left_weight * gini(left)
    return after_left - (1 - left_weight) * gini(right)


def find_best_split(rows):
    """Find the question that gives the highest information gain on ``rows``.

    Every distinct value of every feature column (all columns but the last,
    which holds the label) is tried as a question. Questions that leave one
    side of the split empty are ignored.

    Args:
        rows: list of rows; the last element of each row is the label.

    Returns:
        Tuple ``(best_gain, best_question)``. ``best_question`` is ``None``
        when no usable split exists; ``(0, None)`` is returned for empty
        input.
    """
    best_gain = 0
    best_question = None
    if len(rows) == 0:
        # Nothing to split; also protects the rows[0] lookup below.
        return best_gain, best_question

    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # the last column is the label
    for col in range(n_features):
        # Gather distinct values in first-seen order instead of iterating a
        # set directly: set order for strings changes between interpreter
        # runs (hash randomization), and because ties are resolved with
        # ">=" below, that made the selected question -- and the resulting
        # tree -- differ from run to run on identical data.
        values = []
        seen = set()
        for row in rows:
            if row[col] not in seen:
                seen.add(row[col])
                values.append(row[col])

        for val in values:
            question = Question(col, val)
            true_rows, false_rows = partition(rows, question)
            # A split that sends everything one way separates nothing.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue
            gain = info_gain(true_rows, false_rows, current_uncertainty)
            # ">=" means the last of several equally good questions wins.
            if gain >= best_gain:
                best_gain, best_question = gain, question
    return best_gain, best_question


class Leaf:
    """Terminal tree node.

    Attributes:
        predictions: dict of label -> number of training rows with that
            label that reached this node (as produced by ``class_counts``).
    """

    def __init__(self, rows):
        label_tally = class_counts(rows)
        self.predictions = label_tally

class Decision_Node:
    """Internal tree node: a question plus the two subtrees it routes to.

    Attributes:
        question: the test applied to a row at this node.
        true_branch: subtree followed when the question matches.
        false_branch: subtree followed otherwise.
    """

    def __init__(self, question, true_branch, false_branch):
        self.question, self.true_branch, self.false_branch = (
            question,
            true_branch,
            false_branch,
        )

def build_tree(rows):
    """Recursively grow a decision tree from ``rows``.

    Args:
        rows: training rows; the last element of each row is the label.

    Returns:
        A ``Leaf`` when the best available split has a gain of exactly 0,
        otherwise a ``Decision_Node`` whose branches are built from the two
        halves of the best split.
    """
    best_gain, best_question = find_best_split(rows)
    if best_gain == 0:
        # No question improves purity any further.
        return Leaf(rows)

    matched, unmatched = partition(rows, best_question)
    # Arguments are evaluated left to right, so the true side is built first.
    return Decision_Node(best_question, build_tree(matched), build_tree(unmatched))


def print_tree(node, spacing=""):
    """Print the tree rooted at ``node`` to standard output.

    Args:
        node: a ``Leaf`` or a node with ``question``, ``true_branch`` and
            ``false_branch`` attributes.
        spacing: prefix put in front of every line printed for this node;
            each level below gets two more spaces.
    """
    if not isinstance(node, Leaf):
        print(spacing + str(node.question))
        deeper = spacing + "  "
        for caption, subtree in (('--> True:', node.true_branch),
                                 ('--> False:', node.false_branch)):
            print(spacing + caption)
            print_tree(subtree, deeper)
        return
    # Leaves show their label counts.
    print(spacing + "Predict", node.predictions)


# Build the decision tree from the in-file training set as soon as the
# module runs; the classification loop at the bottom of the file uses it.
my_tree = build_tree(training_data)

# Print the learned tree to standard output.
print_tree(my_tree)

def classify(row, node):
    """Route ``row`` down the tree and return the reached leaf's predictions.

    Args:
        row: sequence of feature values to classify.
        node: root of the (sub)tree to start from.

    Returns:
        The ``predictions`` dict of the ``Leaf`` the row ends up in.
    """
    current = node
    # Walk down until a leaf is hit, following the branch each question picks.
    while not isinstance(current, Leaf):
        if current.question.match(row):
            current = current.true_branch
        else:
            current = current.false_branch
    return current.predictions

def print_leaf(counts):
    """Turn a leaf's label counts into percentage strings.

    Despite the name nothing is printed; a new dict is returned.

    Args:
        counts: dict mapping label -> number of occurrences.

    Returns:
        dict mapping each label to its share of the total, truncated to a
        whole number and suffixed with ``"%"`` (e.g. ``"66%"``).
    """
    denominator = sum(counts.values()) * 1.0
    return {
        label: str(int(hits / denominator * 100)) + "%"
        for label, hits in counts.items()
    }


# Load the samples to classify: one whitespace-separated row per line.
with open('dane.txt', "r") as f:
    testing_data = [line.split() for line in f]

# Symbol (with its newline) written for each class label.
_DECISION_SYMBOLS = {1: "B\n", 2: "Z\n", 3: ".\n"}

# Open the output a single time. Mode "w" truncates any previous content
# (the file is created even when there is nothing to classify) and the
# with-block closes the handle even if classification raises, instead of
# re-opening the file in append mode for every row.
with open("decyzje.txt", "w") as f:
    for row in testing_data:
        pom = print_leaf(classify(row, my_tree))
        # A line is written only when the reached leaf holds exactly one
        # label; rows that land in a mixed leaf produce no output line.
        for label, symbol in _DECISION_SYMBOLS.items():
            if pom == {label: '100%'}:
                f.write(symbol)
Prześlij pliki do '' 2020-06-06 10:00:53 +02:00
			`training_data = [`
			`#zyznosc, nawodnienie, cien, kwasowosc`
			`['z', 'n', 's', 'z', 1],`
			`['z', 'n', 's', 'n', 1],`
			`['j', 'n', 's', 'z', 1],`
			`['z', 's', 's', 'n', 1],`
			`['j', 'n', 'c', 'n', 1],`
			`['z', 'n', 's', 'k', 1],`
			`['z', 'n', 'c', 'k', 2],`
			`['z', 's', 's', 'k', 2],`
			`['z', 's', 'c', 'k', 2],`
			`['j', 'n', 's', 'k', 2],`
			`['z', 's', 'c', 'z', 3],`
			`['j', 'n', 's', 'n', 3]`
			`]`

			`header = ["zyznosc", "nawodnienie", "cien", "kwasowosc", "wybor"]`

			`def class_counts(rows):`
			`counts = {}`
			`for row in rows:`
			`label = row[-1]`
			`if label not in counts:`
			`counts[label] = 0`
			`counts[label] += 1`
			`return counts`


			`def is_numeric(value):`
			`return isinstance(value, int) or isinstance(value, float)`


			`class Question:`
			`def __init__(self, column, value):`
			`self.column = column`
			`self.value = value`

			`def match(self, example):`
			`val = example[self.column]`
			`if is_numeric(val):`
			`return val >= self.value`
			`else:`
			`return val == self.value`

			`def __repr__(self):`
			`condition = "=="`
			`if is_numeric(self.value):`
			`condition = ">="`
			`return "Czy %s %s %s?" % (`
			`header[self.column], condition, str(self.value))`

			`def partition(rows, question):`
			`true_rows, false_rows = [], []`
			`for row in rows:`
			`if question.match(row):`
			`true_rows.append(row)`
			`else:`
			`false_rows.append(row)`
			`return true_rows, false_rows`


			`def gini(rows):`
			`counts = class_counts(rows)`
			`impurity = 1`
			`for lbl in counts:`
			`prob_of_lbl = counts[lbl] / float(len(rows))`
			`impurity -= prob_of_lbl**2`
			`return impurity`


			`def info_gain(left, right, current_uncertainty):`
			`p = float(len(left)) / (len(left) + len(right))`
			`return current_uncertainty - p * gini(left) - (1 - p) * gini(right)`


			`def find_best_split(rows):`
			`best_gain = 0`
			`best_question = None`
			`current_uncertainty = gini(rows)`
			`n_features = len(rows[0]) - 1`
			`for col in range(n_features):`
			`values = set([row[col] for row in rows])`
			`for val in values:`
			`question = Question(col, val)`
			`true_rows, false_rows = partition(rows, question)`
			`if len(true_rows) == 0 or len(false_rows) == 0:`
			`continue`
			`gain = info_gain(true_rows, false_rows, current_uncertainty)`
			`if gain >= best_gain:`
			`best_gain, best_question = gain, question`
			`return best_gain, best_question`


			`class Leaf:`
			`def __init__(self, rows):`
			`self.predictions = class_counts(rows)`

			`class Decision_Node:`
			`def __init__(self,`
			`question,`
			`true_branch,`
			`false_branch):`
			`self.question = question`
			`self.true_branch = true_branch`
			`self.false_branch = false_branch`

			`def build_tree(rows):`
			`gain, question = find_best_split(rows)`
			`if gain == 0:`
			`return Leaf(rows)`
			`true_rows, false_rows = partition(rows, question)`
			`true_branch = build_tree(true_rows)`
			`false_branch = build_tree(false_rows)`
			`return Decision_Node(question, true_branch, false_branch)`


			`def print_tree(node, spacing=""):`
			`if isinstance(node, Leaf):`
			`print (spacing + "Predict", node.predictions)`
			`return`
			`print (spacing + str(node.question))`
			`print (spacing + '--> True:')`
			`print_tree(node.true_branch, spacing + " ")`
			`print (spacing + '--> False:')`
			`print_tree(node.false_branch, spacing + " ")`


			`my_tree = build_tree(training_data)`

			`print_tree(my_tree)`

			`def classify(row, node):`
			`if isinstance(node, Leaf):`
			`return node.predictions`
			`if node.question.match(row):`
			`return classify(row, node.true_branch)`
			`else:`
			`return classify(row, node.false_branch)`

			`def print_leaf(counts):`
			`total = sum(counts.values()) * 1.0`
			`probs = {}`
			`for lbl in counts.keys():`
			`probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"`
			`return probs`


			`with open( 'dane.txt', "r" ) as f:`
			`testing_data = [ line.split() for line in f ]`


			`file = open("decyzje.txt", "w")`
			`file.write("")`
			`file.close()`

			`for row in testing_data:`
			`pom = print_leaf(classify(row, my_tree))`
			`f = open("decyzje.txt", "a")`
			`if pom == {1: '100%'}:`
			`f.write("B\n")`
			`if pom == {2: '100%'}:`
			`f.write("Z\n")`
			`if pom == {3: '100%'}:`
			`f.write(".\n")`
			`f.close()`