Self-made naive bayes solution

2020-03-21 19:01:09 +01:00 · 2020-03-21 19:01:09 +01:00 · dd89b36a5a
commit dd89b36a5a
parent 6d42bd9ed7
5 changed files with 523531 additions and 1406 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/normalize.py
+++ b/normalize.py
@ -0,0 +1,2 @@
 def normalize(d):
    """Split document *d* into terms on single space characters.

    Only the literal space is a separator, so consecutive spaces produce
    empty-string terms and an empty document yields ``['']``.
    """
    terms = d.split(' ')
    return terms
--- a/predict.py
+++ b/predict.py
@ -0,0 +1,29 @@
 import sys
 import pickle
 import math
 from normalize import normalize
 def classify_terms(terms, pskeptic, vocabulary_size, skeptick_words_total,
                    paranormal_words_total, skeptic_count, paranormal_count):
    """Return "S" (skeptic) or "P" (paranormal) for one tokenised document.

    Scores both classes with add-one smoothed naive Bayes in log space and
    picks the larger score; ties go to "P".

    Args:
        terms: iterable of terms of the document.
        pskeptic: prior probability of the skeptic class.
        vocabulary_size: number of distinct terms seen in training.
        skeptick_words_total: total number of terms in skeptic documents.
        paranormal_words_total: total number of terms in paranormal documents.
        skeptic_count: mapping term -> occurrences in skeptic documents.
        paranormal_count: mapping term -> occurrences in paranormal documents.
    """
    def log_prior(p):
        # math.log(0) raises ValueError; a prior of exactly 0 just means the
        # class can never win, which -inf expresses without crashing.
        return math.log(p) if p > 0 else float("-inf")

    log_prob_skeptic = log_prior(pskeptic)
    log_prob_paranormal = log_prior(1 - pskeptic)
    skeptic_denominator = skeptick_words_total + vocabulary_size
    paranormal_denominator = paranormal_words_total + vocabulary_size
    for term in terms:
        # .get() instead of inserting zeros: the count tables stay read-only,
        # so the model does not grow with every unseen term.
        log_prob_skeptic += math.log(
            (skeptic_count.get(term, 0) + 1) / skeptic_denominator)
        log_prob_paranormal += math.log(
            (paranormal_count.get(term, 0) + 1) / paranormal_denominator)
    # Decide once, after all terms have been scored (one label per document).
    if log_prob_skeptic > log_prob_paranormal:
        return "S"
    return "P"


 def main():
    """Print one label per stdin line using the model stored in model.pkl."""
    # NOTE: pickle.load can execute arbitrary code from the file; model.pkl
    # must be a trusted file produced by the training script.
    with open("model.pkl", "rb") as model_file:
        model = pickle.load(model_file)
    for line in sys.stdin:
        terms = normalize(line.rstrip())
        print(classify_terms(terms, *model))


 if __name__ == "__main__":
    main()
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train.py
+++ b/train.py
@ -0,0 +1,60 @@
 #!/usr/bin/python3
 import sys
 import pickle
 from normalize import normalize
 def train():
    """Train a two-class naive Bayes model from labelled lines on stdin.

    Each usable line has the form ``label<TAB>document``. The label ``S`` is
    counted as skeptic; every other label is counted as paranormal. Lines
    without a second tab-separated field are skipped.

    Prints the skeptic prior, the vocabulary size, the paranormal term total
    and the skeptic term total (in that order), then pickles the tuple
    ``(pskeptic, vocabulary_size, skeptick_words_total,
    paranormal_words_total, skeptic_count, paranormal_count)`` to
    ``model.pkl``.

    Raises:
        ValueError: if stdin contains no usable training line.
    """
    dokuments_total = 0
    skeptic_dokuments_total = 0
    vocabulary = set()
    skeptick_words_total = 0
    paranormal_words_total = 0
    skeptic_count = {}
    paranormal_count = {}
    for line in sys.stdin:
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            # rstrip() also removes a trailing tab, so a label with an empty
            # document (or a blank line) has no second field; skip it rather
            # than crash with IndexError.
            continue
        label = fields[0].strip()
        terms = normalize(fields[1])
        vocabulary.update(terms)
        dokuments_total += 1
        if label == 'S':
            skeptic_dokuments_total += 1
            skeptick_words_total += len(terms)
            counts = skeptic_count
        else:
            paranormal_words_total += len(terms)
            counts = paranormal_count
        for term in terms:
            counts[term] = counts.get(term, 0) + 1
    if dokuments_total == 0:
        # Without this the prior below fails with a bare ZeroDivisionError.
        raise ValueError("no training documents read from stdin")
    pskeptic = skeptic_dokuments_total / dokuments_total
    vocabulary_size = len(vocabulary)
    print(pskeptic)
    print(vocabulary_size)
    print(paranormal_words_total)
    print(skeptick_words_total)
    model = (pskeptic, vocabulary_size,
            skeptick_words_total, paranormal_words_total,
            skeptic_count, paranormal_count)
    # Context manager guarantees the model file is flushed and closed.
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
 if __name__ == "__main__":
    # Train only when executed as a script, so importing this module does not
    # read stdin or overwrite model.pkl.
    train()