tokenizer_nltk
This commit is contained in:
parent
73b72d2df3
commit
c88dde7f62
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,24 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import sys
|
||||||
|
import pickle
|
||||||
|
from tokenizator import tokenizator
|
||||||
|
import math
|
||||||
|
def classify_terms(terms, pskeptic, vocabulary_size, skeptic_words_total,
                   paranormal_words_total, skeptics_count, paranormal_count):
    """Return "S" or "P" for a tokenized document using add-one smoothing.

    Args:
        terms: Iterable of tokens making up the document.
        pskeptic: Prior probability of the skeptic class; the paranormal
            prior is taken as ``1 - pskeptic``.
        vocabulary_size: Value added to each class total in the smoothing
            denominator.
        skeptic_words_total: Total word count for the skeptic class.
        paranormal_words_total: Total word count for the paranormal class.
        skeptics_count: Mapping of term -> count in the skeptic class.
        paranormal_count: Mapping of term -> count in the paranormal class.

    Returns:
        "S" when the skeptic log-probability is strictly greater than the
        paranormal one, otherwise "P" (ties go to "P").

    Raises:
        ValueError: If ``pskeptic`` is not strictly between 0 and 1
            (``math.log`` rejects non-positive arguments).
    """
    log_prob_skeptic = math.log(pskeptic)
    log_prob_paranormal = math.log(1 - pskeptic)

    # The denominators do not depend on the term, so compute them once.
    skeptic_denominator = skeptic_words_total + vocabulary_size
    paranormal_denominator = paranormal_words_total + vocabulary_size

    for term in terms:
        # Unseen terms count as 0; .get() avoids inserting them into the
        # model's dictionaries, which would otherwise grow with the input.
        log_prob_skeptic += math.log(
            (skeptics_count.get(term, 0) + 1) / skeptic_denominator)
        log_prob_paranormal += math.log(
            (paranormal_count.get(term, 0) + 1) / paranormal_denominator)

    return "S" if log_prob_skeptic > log_prob_paranormal else "P"


def main():
    """Load the model from ``model.pkl`` and label each stdin line."""
    # NOTE(security): pickle.load can execute arbitrary code; model.pkl must
    # come from a trusted source.
    with open('model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    (pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total,
     skeptics_count, paranormal_count) = model

    for line in sys.stdin:
        document = line.rstrip()
        terms = tokenizator(document)
        print(classify_terms(terms, pskeptic, vocabulary_size,
                             skeptic_words_total, paranormal_words_total,
                             skeptics_count, paranormal_count))


if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,6 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import nltk
|
||||||
|
# Runs at import time: asks NLTK to download the 'punkt' resource.
# NOTE(review): presumably needed by word_tokenize below — confirm.
nltk.download('punkt')
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
def tokenizator(d):
    """Tokenize the text *d* by delegating to NLTK's ``word_tokenize``.

    Returns whatever ``word_tokenize`` returns for *d*.
    """
    tokens = word_tokenize(d)
    return tokens
|
Loading…
Reference in New Issue