Self-made Naive Bayes solution
This commit is contained in:
parent
6d42bd9ed7
commit
dd89b36a5a
260927
dev-0/out.tsv
260927
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
2
normalize.py
Normal file
2
normalize.py
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
def normalize(d):
    """Split raw document text *d* into a list of tokens.

    Splits on single space characters only — runs of spaces produce
    empty-string tokens (``split(' ')``, deliberately not ``split()``,
    so tokenization stays identical between training and prediction).
    """
    separator = ' '
    return d.split(separator)
|
29
predict.py
Normal file
29
predict.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
import sys
|
||||||
|
import pickle
|
||||||
|
import math
|
||||||
|
from normalize import normalize
|
||||||
|
|
||||||
|
# Prediction script: classify each stdin line as skeptic ("S") or
# paranormal ("P") using the multinomial naive Bayes model written by
# train.py, with add-one (Laplace) smoothing.
#
# NOTE(review): pickle.load on an untrusted file can execute arbitrary
# code — model.pkl must come from a trusted source (our own train.py).
# "with" ensures the file handle is closed (original leaked it).
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Tuple layout must match the tuple constructed in train.py.
(pskeptic, vocabulary_size,
 skeptick_words_total, paranormal_words_total,
 skeptic_count, paranormal_count) = model

for line in sys.stdin:
    document = line.rstrip()
    terms = normalize(document)

    # Work in log space so long documents do not underflow to 0.0.
    log_prob_skeptic = math.log(pskeptic)
    log_prob_paranormal = math.log(1 - pskeptic)

    for term in terms:
        # dict.get(term, 0) replaces the original's zero-filling mutation
        # of the loaded model's count dicts — same value, no side effect.
        log_prob_skeptic += math.log(
            (skeptic_count.get(term, 0) + 1)
            / (skeptick_words_total + vocabulary_size))
        log_prob_paranormal += math.log(
            (paranormal_count.get(term, 0) + 1)
            / (paranormal_words_total + vocabulary_size))

    # Strict inequality preserved: ties classify as "P", as before.
    print("S" if log_prob_skeptic > log_prob_paranormal else "P")
|
263919
test-A/out.tsv
263919
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
60
train.py
Executable file
60
train.py
Executable file
@ -0,0 +1,60 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import pickle
|
||||||
|
from normalize import normalize
|
||||||
|
|
||||||
|
def train():
    """Train a multinomial naive Bayes model from TSV lines on stdin.

    Each input line is ``<label>\t<document>``; label ``S`` counts as
    skeptic, anything else as paranormal.  Writes the model tuple to
    ``model.pkl`` and prints summary statistics to stdout.

    Raises:
        ValueError: if stdin contains no (non-blank) training lines
            (the original crashed with ZeroDivisionError here).
    """
    documents_total = 0
    skeptic_documents_total = 0

    vocabulary = set()

    # Per-class total token counts (smoothing denominators in predict.py).
    skeptick_words_total = 0
    paranormal_words_total = 0

    # Per-class per-term token counts.
    skeptic_count = {}
    paranormal_count = {}

    for line in sys.stdin:
        line = line.rstrip()
        if not line:
            # Blank lines previously raised IndexError on fields[1].
            continue
        fields = line.split('\t')
        label = fields[0].strip()
        dokument = fields[1]
        terms = normalize(dokument)

        vocabulary.update(terms)

        documents_total += 1
        if label == 'S':
            skeptic_documents_total += 1
            skeptick_words_total += len(terms)
            for term in terms:
                skeptic_count[term] = skeptic_count.get(term, 0) + 1
        else:
            paranormal_words_total += len(terms)
            for term in terms:
                paranormal_count[term] = paranormal_count.get(term, 0) + 1

    if documents_total == 0:
        raise ValueError("no training data received on stdin")

    pskeptic = skeptic_documents_total / documents_total
    vocabulary_size = len(vocabulary)

    # Diagnostic output — order preserved from the original script.
    print(pskeptic)
    print(vocabulary_size)
    print(paranormal_words_total)
    print(skeptick_words_total)

    # Tuple layout consumed by predict.py — keep the order in sync.
    model = (pskeptic, vocabulary_size,
             skeptick_words_total, paranormal_words_total,
             skeptic_count, paranormal_count)

    # "with" closes the file even on error (original leaked the handle).
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)


if __name__ == "__main__":
    train()
|
Loading…
Reference in New Issue
Block a user