paranormal-or-skeptic/train.py

#!/usr/bin/python3

import sys
import pickle
from normalize import normalize

def train():
    """Train a two-class naive Bayes model from labelled lines on stdin.

    Every non-blank input line must contain a label and a document
    separated by a tab.  Documents whose (stripped) label is ``'S'`` are
    counted as skeptic; any other label is counted as paranormal.  The
    document text is passed to ``normalize`` and the returned terms are
    tallied per class.

    Side effects:
        Prints, one per line and in this order: the skeptic prior, the
        vocabulary size, the paranormal word total and the skeptic word
        total.  Then pickles the tuple ``(pskeptic, vocabulary_size,
        skeptic_words_total, paranormal_words_total, skeptic_count,
        paranormal_count)`` to ``model.pkl`` in the current directory.

    Raises:
        ValueError: if a non-blank line has no tab-separated document
            field, or if no documents were read at all.
    """
    documents_total = 0
    skeptic_documents_total = 0

    vocabulary = set()

    skeptic_words_total = 0
    paranormal_words_total = 0

    # Plain dicts (not Counter) so the pickled model keeps its original types.
    skeptic_count = {}
    paranormal_count = {}

    for line_number, line in enumerate(sys.stdin, start=1):
        line = line.rstrip()
        if not line:
            # Blank lines (e.g. a trailing empty line) carry no document;
            # without this guard they would fail with IndexError below.
            continue

        fields = line.split('\t')
        if len(fields) < 2:
            raise ValueError(
                "line {}: expected '<label><TAB><document>', got {!r}".format(
                    line_number, line))

        label = fields[0].strip()
        terms = normalize(fields[1])

        vocabulary.update(terms)
        documents_total += 1

        # Anything that is not labelled 'S' is treated as paranormal.
        if label == 'S':
            skeptic_documents_total += 1
            skeptic_words_total += len(terms)
            class_count = skeptic_count
        else:
            paranormal_words_total += len(terms)
            class_count = paranormal_count

        for term in terms:
            class_count[term] = class_count.get(term, 0) + 1

    if documents_total == 0:
        # The prior below would otherwise divide by zero.
        raise ValueError("no training documents were read from standard input")

    pskeptic = skeptic_documents_total / documents_total
    vocabulary_size = len(vocabulary)
    print(pskeptic)
    print(vocabulary_size)
    print(paranormal_words_total)
    print(skeptic_words_total)

    model = (pskeptic, vocabulary_size,
             skeptic_words_total, paranormal_words_total,
             skeptic_count, paranormal_count)

    # Context manager guarantees the file is flushed and closed.
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

# Script entry point: run training immediately. Because this call sits at
# module level, it also executes whenever this module is imported.
train()
Self-made naive bayes solution 2020-03-21 19:01:09 +01:00			`#!/usr/bin/python3`

			`import sys`
			`import pickle`
			`from normalize import normalize`

			`def train():`
			`dokuments_total = 0`
			`skeptic_dokuments_total = 0`

			`vocabulary = set()`

			`skeptick_words_total = 0`
			`paranormal_words_total = 0`

			`skeptic_count = {}`
			`paranormal_count = {}`

			`for line in sys.stdin:`
			`line = line.rstrip()`
			`fields = line.split('\t')`
			`label = fields[0].strip()`
			`dokument = fields[1]`
			`terms = normalize(dokument)`

			`for t in terms:`
			`vocabulary.add(t)`

			`dokuments_total += 1`
			`if label == 'S':`
			`skeptic_dokuments_total += 1`
			`skeptick_words_total += len(terms)`
			`for term in terms:`
			`if term in skeptic_count:`
			`skeptic_count[term] +=1`
			`else:`
			`skeptic_count[term] = 1`
			`else:`
			`paranormal_words_total += len(terms)`
			`for term in terms:`
			`if term in paranormal_count:`
			`paranormal_count[term] +=1`
			`else:`
			`paranormal_count[term] = 1`


			`pskeptic = skeptic_dokuments_total / dokuments_total`
			`vocabulary_size = len(vocabulary)`
			`print(pskeptic)`
			`print(vocabulary_size)`
			`print(paranormal_words_total)`
			`print(skeptick_words_total)`

			`model = (pskeptic, vocabulary_size,`
			`skeptick_words_total, paranormal_words_total,`
			`skeptic_count,paranormal_count)`

			`pickle.dump(model, open("model.pkl", "wb"))`

			`train()`