paranormal-or-skeptic/trainNB.py

63 lines
1.5 KiB
Python
Raw Permalink Normal View History

2020-03-23 13:24:57 +01:00
#!/usr/bin/python3
import re
import sys
import pickle
from tokenize import tokenize
def train():
    """Train a skeptic/paranormal Naive Bayes model from stdin and pickle it.

    Every input line is expected to contain a label and a document separated
    by a tab. The label ``S`` marks a skeptic document; any other label is
    counted as paranormal. Only the second tab-separated field is used as the
    document text; it is split into terms with ``tokenize``.

    The model is written to ``model.pkl`` in the current directory as a
    tuple::

        (pskeptic, vocabulary_size, sceptic_words_total,
         paranormal_words_total, skeptic_count, paranormal_count)

    where ``pskeptic`` is the fraction of documents labelled ``S``,
    ``vocabulary_size`` is the number of distinct terms seen, the
    ``*_words_total`` values are per-class term totals and the ``*_count``
    values are plain dicts mapping term -> number of occurrences.

    Raises:
        ValueError: if stdin contained no usable training line.
    """
    documents_total = 0
    sceptic_documents_total = 0
    vocabulary = set()
    sceptic_words_total = 0
    paranormal_words_total = 0
    skeptic_count = {}
    paranormal_count = {}

    for line in sys.stdin:
        fields = line.rstrip().split('\t')
        # Blank lines, or lines without a tab-separated document, cannot be
        # used for training; skip them instead of failing on fields[1].
        if len(fields) < 2:
            continue
        label = fields[0].strip()
        document = fields[1]
        # Trace output of the original script, kept so stdout is unchanged.
        print(document)
        terms = tokenize(document)
        print(terms)

        vocabulary.update(terms)
        documents_total += 1

        # Pick the per-class counter once, then share the counting loop.
        if label == 'S':
            sceptic_documents_total += 1
            sceptic_words_total += len(terms)
            class_count = skeptic_count
        else:
            paranormal_words_total += len(terms)
            class_count = paranormal_count
        for term in terms:
            class_count[term] = class_count.get(term, 0) + 1

    if documents_total == 0:
        # Without documents the class prior is undefined; fail with a clear
        # message rather than a bare ZeroDivisionError.
        raise ValueError("no training documents read from stdin")

    pskeptic = sceptic_documents_total / documents_total
    vocabulary_size = len(vocabulary)
    model = (
        pskeptic,
        vocabulary_size,
        sceptic_words_total,
        paranormal_words_total,
        skeptic_count,
        paranormal_count,
    )
    # The context manager guarantees the file is flushed and closed.
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
# Run training only when this file is executed as a script, so that merely
# importing the module does not read stdin or overwrite model.pkl.
if __name__ == "__main__":
    train()