61 lines
1.5 KiB
Python
61 lines
1.5 KiB
Python
|
#!/usr/bin/python3
|
||
|
|
||
|
import sys
|
||
|
import pickle
|
||
|
from normalize import normalize
|
||
|
|
||
|
def train(*, lines=None, model_path="model.pkl", tokenizer=None):
    """Collect per-class word statistics from labelled documents and pickle them.

    Every input line is expected to have the form ``label<TAB>document``.
    A label equal to ``'S'`` (after stripping whitespace) is counted as a
    skeptic document; every other label is counted as paranormal.  Only the
    first two tab-separated fields are used.

    The pickled model is the tuple::

        (pskeptic, vocabulary_size,
         skeptic_words_total, paranormal_words_total,
         skeptic_count, paranormal_count)

    where ``pskeptic`` is the fraction of documents labelled ``'S'`` and the
    two ``*_count`` entries are plain dicts mapping term -> occurrences.
    NOTE(review): these look like the inputs of a naive Bayes classifier,
    presumably consumed by a separate prediction script -- confirm.

    The four summary numbers are also printed to stdout, in the order
    pskeptic, vocabulary size, paranormal word total, skeptic word total.

    Args:
        lines: Iterable of text lines to train on.  Defaults to ``sys.stdin``.
        model_path: Where the pickled model is written.
        tokenizer: Callable turning a document string into a sized,
            re-iterable collection of terms.  Defaults to the module-level
            ``normalize``.

    Raises:
        ValueError: If the input contains no usable training document.
    """
    if lines is None:
        lines = sys.stdin
    if tokenizer is None:
        # Looked up at call time so the file-level import remains the default.
        tokenizer = normalize

    documents_total = 0
    skeptic_documents_total = 0
    skipped_lines = 0

    vocabulary = set()

    skeptic_words_total = 0
    paranormal_words_total = 0

    skeptic_count = {}
    paranormal_count = {}

    for line in lines:
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            # Blank line or missing document field: indexing fields[1] would
            # abort the whole run, so skip the line and report it afterwards.
            skipped_lines += 1
            continue

        label = fields[0].strip()
        terms = tokenizer(fields[1])

        vocabulary.update(terms)
        documents_total += 1

        # Choose the per-class accumulator once instead of duplicating the
        # counting loop in both branches.
        if label == 'S':
            skeptic_documents_total += 1
            skeptic_words_total += len(terms)
            class_count = skeptic_count
        else:
            paranormal_words_total += len(terms)
            class_count = paranormal_count

        for term in terms:
            class_count[term] = class_count.get(term, 0) + 1

    if skipped_lines:
        print("train: skipped {} line(s) without a tab-separated document"
              .format(skipped_lines), file=sys.stderr)

    if documents_total == 0:
        # Without this guard the division below fails with an unexplained
        # ZeroDivisionError.
        raise ValueError("no training documents found in the input")

    pskeptic = skeptic_documents_total / documents_total
    vocabulary_size = len(vocabulary)
    print(pskeptic)
    print(vocabulary_size)
    print(paranormal_words_total)
    print(skeptic_words_total)

    model = (pskeptic, vocabulary_size,
             skeptic_words_total, paranormal_words_total,
             skeptic_count, paranormal_count)

    # Context manager guarantees the file is flushed and closed.
    with open(model_path, "wb") as model_file:
        pickle.dump(model, model_file)
|
||
|
|
||
|
# Script entry point: run training immediately. The call is not guarded by
# `if __name__ == "__main__"`, so importing this module also triggers it.
train()
|