Self-made Naive Bayes solution
This commit is contained in:
parent
6d42bd9ed7
commit
dd89b36a5a
260927
dev-0/out.tsv
260927
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
2
normalize.py
Normal file
2
normalize.py
Normal file
@ -0,0 +1,2 @@
|
||||
def normalize(d):
    """Tokenize document *d* by splitting on single space characters.

    Note: consecutive spaces yield empty-string tokens — this matches the
    tokenization the model was trained with, so it must not change.
    """
    tokens = d.split(' ')
    return tokens
|
29
predict.py
Normal file
29
predict.py
Normal file
@ -0,0 +1,29 @@
|
||||
"""Classify documents from stdin with the naive-Bayes model from train.py.

Reads one document per line, prints "S" (skeptic) or "P" (paranormal)
per line.
"""

import sys
import pickle
import math

from normalize import normalize

# Load the trained model; tuple layout must match the order pickled in
# train.py.  Use a context manager so the file handle is always closed.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

(pskeptic, vocabulary_size, skeptick_words_total,
 paranormal_words_total, skeptic_count, paranormal_count) = model

for line in sys.stdin:
    document = line.rstrip()
    terms = normalize(document)

    # Start from the log class priors: P(S) and P(P) = 1 - P(S).
    log_prob_skeptic = math.log(pskeptic)
    log_prob_paranormal = math.log(1 - pskeptic)

    for term in terms:
        # Laplace (add-one) smoothing; dict.get gives unseen terms a count
        # of 0 without mutating the loaded model dictionaries.
        log_prob_skeptic += math.log(
            (skeptic_count.get(term, 0) + 1)
            / (skeptick_words_total + vocabulary_size))
        log_prob_paranormal += math.log(
            (paranormal_count.get(term, 0) + 1)
            / (paranormal_words_total + vocabulary_size))

    # Higher log-posterior wins.
    print("S" if log_prob_skeptic > log_prob_paranormal else "P")
|
263919
test-A/out.tsv
263919
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
60
train.py
Executable file
60
train.py
Executable file
@ -0,0 +1,60 @@
|
||||
#!/usr/bin/python3
"""Train a naive-Bayes classifier on `label<TAB>document` lines from stdin.

Counts per-class term frequencies for the skeptic ('S') and paranormal
classes and pickles the resulting model to model.pkl for predict.py.
"""

import sys
import pickle

from normalize import normalize


def train():
    """Read labelled TSV lines from stdin and write model.pkl.

    Each input line is `label<TAB>document`; label 'S' marks the skeptic
    class, everything else counts as paranormal.  Also prints a few
    diagnostic statistics to stdout.
    """
    documents_total = 0
    skeptic_documents_total = 0

    vocabulary = set()

    skeptic_words_total = 0
    paranormal_words_total = 0

    skeptic_count = {}
    paranormal_count = {}

    for line in sys.stdin:
        line = line.rstrip()
        fields = line.split('\t')
        label = fields[0].strip()
        document = fields[1]
        terms = normalize(document)

        vocabulary.update(terms)

        documents_total += 1
        if label == 'S':
            skeptic_documents_total += 1
            skeptic_words_total += len(terms)
            for term in terms:
                skeptic_count[term] = skeptic_count.get(term, 0) + 1
        else:
            paranormal_words_total += len(terms)
            for term in terms:
                paranormal_count[term] = paranormal_count.get(term, 0) + 1

    # Prior probability of the skeptic class.
    pskeptic = skeptic_documents_total / documents_total
    vocabulary_size = len(vocabulary)

    # Diagnostic output, kept from the original implementation.
    print(pskeptic)
    print(vocabulary_size)
    print(paranormal_words_total)
    print(skeptic_words_total)

    # Tuple order must stay in sync with the unpacking in predict.py.
    model = (pskeptic, vocabulary_size,
             skeptic_words_total, paranormal_words_total,
             skeptic_count, paranormal_count)

    # Context manager guarantees the pickle is flushed and the handle closed.
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)


train()
|
Loading…
Reference in New Issue
Block a user