Self-made Naive Bayes solution

This commit is contained in:
Łukasz Szymula 2020-03-21 19:01:09 +01:00
parent 6d42bd9ed7
commit dd89b36a5a
5 changed files with 523531 additions and 1406 deletions

260927
dev-0/out.tsv

File diff suppressed because it is too large Load Diff

2
normalize.py Normal file
View File

@ -0,0 +1,2 @@
def normalize(d):
    """Tokenize a document by splitting it on single space characters.

    Only the literal space character is a separator, so consecutive
    spaces produce empty-string tokens and an empty document yields
    ``['']``.
    """
    separator = ' '
    tokens = d.split(separator)
    return tokens

29
predict.py Normal file
View File

@ -0,0 +1,29 @@
import sys
import pickle
import math
from normalize import normalize
# Load the trained model tuple from model.pkl.
# NOTE: pickle can execute arbitrary code while loading, so model.pkl must
# come from a trusted source (i.e. produced locally by the training script).
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# The unpacking order must match the order in which the tuple was pickled.
(pskeptic, vocabulary_size,
 skeptick_words_total, paranormal_words_total,
 skeptic_count, paranormal_count) = model

# Class priors in log space. A prior of exactly 0 (a class that never
# occurred in the training data) maps to -inf instead of raising a
# "math domain error" from math.log(0).
log_prior_skeptic = math.log(pskeptic) if pskeptic > 0 else -math.inf
log_prior_paranormal = math.log(1 - pskeptic) if pskeptic < 1 else -math.inf

# Add-one smoothing denominators; they do not depend on the input line.
skeptic_denominator = skeptick_words_total + vocabulary_size
paranormal_denominator = paranormal_words_total + vocabulary_size

# Classify each stdin line and print one label ("S" or "P") per line.
for line in sys.stdin:
    document = line.rstrip()
    terms = normalize(document)
    log_prob_skeptic = log_prior_skeptic
    log_prob_paranormal = log_prior_paranormal
    for term in terms:
        # Unseen terms count as 0 occurrences; .get() avoids inserting
        # them into the model dictionaries while predicting.
        log_prob_skeptic += math.log(
            (skeptic_count.get(term, 0) + 1) / skeptic_denominator)
        log_prob_paranormal += math.log(
            (paranormal_count.get(term, 0) + 1) / paranormal_denominator)
    # Ties go to "P", because the comparison is strict.
    if log_prob_skeptic > log_prob_paranormal:
        print("S")
    else:
        print("P")

File diff suppressed because it is too large Load Diff

60
train.py Executable file
View File

@ -0,0 +1,60 @@
#!/usr/bin/python3
import sys
import pickle
from normalize import normalize
def train():
    """Train a two-class naive Bayes model from labelled lines on stdin.

    Each input line is expected to hold a label, a tab, and then the
    document text. A label of ``'S'`` marks a skeptic document; any other
    label is counted as paranormal. Only the second tab-separated field
    is used as the document, and it is tokenized with ``normalize``.

    Prints the skeptic prior, the vocabulary size, the paranormal word
    total and the skeptic word total (in that order), then pickles the
    tuple ``(pskeptic, vocabulary_size, skeptick_words_total,
    paranormal_words_total, skeptic_count, paranormal_count)`` to
    ``model.pkl`` in the current working directory.

    Raises:
        IndexError: if a line has no second tab-separated field.
        ZeroDivisionError: if stdin yields no lines at all.
    """
    dokuments_total = 0
    skeptic_dokuments_total = 0
    vocabulary = set()
    skeptick_words_total = 0
    paranormal_words_total = 0
    skeptic_count = {}
    paranormal_count = {}

    for line in sys.stdin:
        fields = line.rstrip().split('\t')
        label = fields[0].strip()
        terms = normalize(fields[1])

        vocabulary.update(terms)
        dokuments_total += 1

        # Pick the per-class counter once, then count terms in one place.
        if label == 'S':
            skeptic_dokuments_total += 1
            skeptick_words_total += len(terms)
            class_count = skeptic_count
        else:
            paranormal_words_total += len(terms)
            class_count = paranormal_count
        for term in terms:
            class_count[term] = class_count.get(term, 0) + 1

    pskeptic = skeptic_dokuments_total / dokuments_total
    vocabulary_size = len(vocabulary)
    print(pskeptic)
    print(vocabulary_size)
    print(paranormal_words_total)
    print(skeptick_words_total)

    model = (pskeptic, vocabulary_size,
             skeptick_words_total, paranormal_words_total,
             skeptic_count, paranormal_count)
    # The context manager guarantees the model file is flushed and closed.
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)


train()