ISI-2 naive bayes, self-made tokenizer

2020-03-23 13:24:57 +01:00
11 changed files with 3130 additions and 2983 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,4 @@
-
+model.pkl
 *~
 *.swp
 *.bak
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/modelNB.txt
+++ b/modelNB.txt
@ -0,0 +1,13 @@
+P(c)
+P(nie c)
+|v|
+ile wyrazów w c
+ile wyrazów w nie c
+
+wystąpienia wyrazów w c
+    -the-1544
+    ....
+wystąpienia wyrazów w nie c
+...
+
+
--- a/predictNB.py
+++ b/predictNB.py
@ -0,0 +1,30 @@
+import sys
+import pickle
+from  math import log
+from tokenize import tokenize
+
+# Load the model tuple written by trainNB.py.
+# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
+# a model.pkl that was produced locally, never one from an untrusted source.
+with open("model.pkl", "rb") as model_file:
+    model = pickle.load(model_file)
+pskeptic, vocabulary_size, sceptic_words_total, paranormal_words_total, skeptic_count, paranormal_count = model
+
+# These values are identical for every document and every term, so they are
+# computed once instead of inside the loops.
+log_prior_skeptic = log(pskeptic)
+log_prior_paranormal = log(1 - pskeptic)
+skeptic_denominator = sceptic_words_total + vocabulary_size
+paranormal_denominator = paranormal_words_total + vocabulary_size
+
+# Classify one document per input line and print "S" (skeptic) or "P"
+# (paranormal) for each of them.
+for line in sys.stdin:
+    document = line.rstrip()
+    terms = tokenize(document)
+
+    log_prob_skeptic = log_prior_skeptic
+    log_prob_paranormal = log_prior_paranormal
+
+    for term in terms:
+        # .get() treats an unseen term as count 0 without inserting it, so the
+        # loaded model is not modified (and does not grow) while predicting.
+        # The "+ 1" is add-one smoothing: unseen terms never yield log(0).
+        log_prob_skeptic += log((skeptic_count.get(term, 0) + 1)
+                                / skeptic_denominator)
+        log_prob_paranormal += log((paranormal_count.get(term, 0) + 1)
+                                   / paranormal_denominator)
+
+    if log_prob_skeptic > log_prob_paranormal:
+        print("S")
+    else:
+        print("P")
--- a/scores.txt
+++ b/scores.txt
@ -13,3 +13,26 @@
 0.6965
 0.6954
 0.6965
+0.6978
+0.3583
+0.5421
+0.5398
+0.6978
+
+NB:
+
+0.7549
+0.7568
+0.7601
+0.7606
+0.7618
+0.7610
+0.7578
+0.7593
+0.7661
+0.7661
+0.7665
+0.7788
+0.7781
+0.7803
+0.7802
--- a/solution.py
+++ b/solution.py
@ -2,7 +2,7 @@ import re
 import sys

 for line in sys.stdin:
-    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
+    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep'.lower(), line.lower()):
        print("P")
    else:
        print("S")
@ -14,7 +14,5 @@ for line in sys.stdin:

 """

-
-happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
-
+UFO|paranormal|i|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story|moved|orbs|bright|aircraft|experienced|military|woke|Paranormal|Ghost|advanced|board|planet|shape|travel|lens|asleep|move|remember|hoax|circles|witnesses|extraterrestrial|freaked|walking|technology|abduction|seeing|eyes|wake|balloons|EVP|ETs|orb|Ouija|felt|dont|ship|phone|demons|flash|flares|cool|Looks|entities|demon|terrestrial|triangle|vu|incident|zoom|recording|sharing|location|happening|nosleep|filmed|dimensional|Greer|dimension|reflection|bedroom|UAP|crop|dreaming|im|dad|scary|ouija|entity|awake|shaped|Roswell|floating|Aliens|meteor|noise|distance|encounters|seconds
 """
--- a/startNB.sh
+++ b/startNB.sh
@ -0,0 +1,9 @@
+# Train the Naive Bayes model, write predictions for dev-0 and test-A, then
+# append the dev-0 score to scores.txt and show the score history.
+#
+# Stop at the first failing step (for a pipeline: when its last command
+# fails), so that a failed training or prediction run cannot be followed by
+# a score computed from stale output.
+set -e
+
+# paste puts the expected label in front of each training document
+# (tab-separated), which is the input format trainNB.py reads from stdin.
+xzcat train/in.tsv.xz | paste train/expected.tsv - | python3 trainNB.py
+
+
+xzcat dev-0/in.tsv.xz | python3 predictNB.py > dev-0/out.tsv
+
+xzcat test-A/in.tsv.xz | python3 predictNB.py > test-A/out.tsv
+
+geval -t dev-0 >>scores.txt
+cat scores.txt
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/tokenize.py
+++ b/tokenize.py
@ -0,0 +1,10 @@
+import re
+
+def tokenize(d):
+    """Split a raw document into a list of normalised tokens.
+
+    Normalisation steps, in order:
+      * whitespace runs and literal backslash-n sequences become one space,
+      * anything starting with http:, https: or www is replaced by a URL
+        placeholder (it ends up as 'url' because of the lower-casing below),
+      * the text is lower-cased,
+      * '.', ',', '?' and '!' are split off from the preceding text,
+      * every run of digits is replaced by 'NUM'.
+
+    Args:
+        d: the document text.
+
+    Returns:
+        The list of tokens; an empty list for an empty or whitespace-only
+        document. The list never contains empty strings.
+    """
+    d = re.sub(r'(\s+|\\n)', ' ', d)
+    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)
+    d = d.lower().replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !")
+    d = re.sub(r'\d+', 'NUM', d)
+
+    # str.split() without arguments ignores leading/trailing whitespace, so no
+    # empty-string tokens are produced (re.split(r'\s+', d) returned '' at the
+    # edges, e.g. after a trailing URL, and those were counted as words).
+    return d.split()
--- a/train/sNB.sh
+++ b/train/sNB.sh
@ -0,0 +1,2 @@
+# Print the decompressed in.tsv.xz with the matching line of expected.tsv
+# pasted in front of each line (tab-separated).
+xzcat in.tsv.xz | paste expected.tsv - 
+
--- a/trainNB.py
+++ b/trainNB.py
@ -0,0 +1,62 @@
+#!/usr/bin/python3
+
+
+import re
+import sys
+import pickle
+from tokenize import tokenize
+
+
+
+
+def train():
+    """Train a two-class Naive Bayes model from stdin and save it to model.pkl.
+
+    Every non-blank input line is read as ``label<TAB>document``. The label
+    'S' marks a skeptic document; any other label is counted as paranormal.
+    Only the first field after the label is used as the document text.
+
+    The pickled model is the tuple
+    ``(pskeptic, vocabulary_size, sceptic_words_total,
+    paranormal_words_total, skeptic_count, paranormal_count)``, where the
+    last two items are plain dicts mapping a term to its occurrence count.
+
+    Raises:
+        ValueError: if stdin contains no training documents.
+    """
+    documents_total = 0
+    sceptic_documents_total = 0
+    vocabulary = set()
+
+    sceptic_words_total = 0
+    paranormal_words_total = 0
+
+    skeptic_count = {}
+    paranormal_count = {}
+
+    for line in sys.stdin:
+        line = line.rstrip()
+        if not line:
+            # A blank line carries neither a label nor a document.
+            continue
+        fields = line.split('\t')
+        label = fields[0].strip()
+        # rstrip() above also removes a trailing tab, so a line whose
+        # document is empty consists of the label field only.
+        document = fields[1] if len(fields) > 1 else ''
+        terms = tokenize(document)
+        vocabulary.update(terms)
+
+        documents_total += 1
+        if label == 'S':
+            sceptic_documents_total += 1
+            sceptic_words_total += len(terms)
+            class_count = skeptic_count
+        else:
+            paranormal_words_total += len(terms)
+            class_count = paranormal_count
+        for term in terms:
+            class_count[term] = class_count.get(term, 0) + 1
+
+    if documents_total == 0:
+        # Without this check the prior below fails with ZeroDivisionError.
+        raise ValueError("no training documents were read from stdin")
+
+    pskeptic = sceptic_documents_total / documents_total
+    vocabulary_size = len(vocabulary)
+    model = (pskeptic,
+             vocabulary_size,
+             sceptic_words_total,
+             paranormal_words_total,
+             skeptic_count,
+             paranormal_count)
+    # The with-block makes sure the file is flushed and closed.
+    with open("model.pkl", "wb") as model_file:
+        pickle.dump(model, model_file)
+
+
+if __name__ == "__main__":
+    train()