Compare commits

..

1 Commits

Author SHA1 Message Date
ksanu
24dd877b29 ISI-2 naive bayes, self-made tokenizer 2020-03-23 13:24:57 +01:00
11 changed files with 3130 additions and 2983 deletions

2
.gitignore vendored
View File

@ -1,4 +1,4 @@
model.pkl
*~
*.swp
*.bak

File diff suppressed because it is too large Load Diff

13
modelNB.txt Normal file
View File

@ -0,0 +1,13 @@
P(c)
P(nie c)
|v|
ile wyrazów w c
ile wyrazów w nie c
wystąpienia wyrazów w c
-the-1544
....
wystąpienia wyrazów w nie c
...

30
predictNB.py Normal file
View File

@ -0,0 +1,30 @@
import sys
import pickle
from math import log
from tokenize import tokenize

# Load the model tuple produced by trainNB.py.
# NOTE(review): unpickling executes arbitrary code — only load model.pkl
# files you created yourself.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
(pskeptic, vocabulary_size, sceptic_words_total, paranormal_words_total,
 skeptic_count, paranormal_count) = model

# Classify each stdin line by comparing class log-probabilities with
# Laplace (add-one) smoothing over the shared vocabulary.
for line in sys.stdin:
    document = line.rstrip()
    terms = tokenize(document)
    log_prob_skeptic = log(pskeptic)
    log_prob_paranormal = log(1 - pskeptic)
    for term in terms:
        # dict.get(term, 0) handles unseen terms without mutating the
        # model's count tables (the original inserted 0 entries on the fly).
        log_prob_skeptic += log((skeptic_count.get(term, 0) + 1)
                                / (sceptic_words_total + vocabulary_size))
        log_prob_paranormal += log((paranormal_count.get(term, 0) + 1)
                                   / (paranormal_words_total + vocabulary_size))
    print("S" if log_prob_skeptic > log_prob_paranormal else "P")

View File

@ -13,3 +13,26 @@
0.6965
0.6954
0.6965
0.6978
0.3583
0.5421
0.5398
0.6978
NB:
0.7549
0.7568
0.7601
0.7606
0.7618
0.7610
0.7578
0.7593
0.7661
0.7661
0.7665
0.7788
0.7781
0.7803
0.7802

View File

@ -2,7 +2,7 @@ import re
import sys
for line in sys.stdin:
if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep'.lower(), line.lower()):
print("P")
else:
print("S")
@ -14,7 +14,5 @@ for line in sys.stdin:
"""
happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
UFO|paranormal|i|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story|moved|orbs|bright|aircraft|experienced|military|woke|Paranormal|Ghost|advanced|board|planet|shape|travel|lens|asleep|move|remember|hoax|circles|witnesses|extraterrestrial|freaked|walking|technology|abduction|seeing|eyes|wake|balloons|EVP|ETs|orb|Ouija|felt|dont|ship|phone|demons|flash|flares|cool|Looks|entities|demon|terrestrial|triangle|vu|incident|zoom|recording|sharing|location|happening|nosleep|filmed|dimensional|Greer|dimension|reflection|bedroom|UAP|crop|dreaming|im|dad|scary|ouija|entity|awake|shaped|Roswell|floating|Aliens|meteor|noise|distance|encounters|seconds
"""

9
startNB.sh Executable file
View File

@ -0,0 +1,9 @@
# Train the naive-Bayes model: join each training document with its label
# (label<TAB>document) and pipe the result into the trainer.
xzcat train/in.tsv.xz | paste train/expected.tsv - | python3 trainNB.py
# Predict labels for the dev and test inputs using the pickled model.
xzcat dev-0/in.tsv.xz | python3 predictNB.py > dev-0/out.tsv
xzcat test-A/in.tsv.xz | python3 predictNB.py > test-A/out.tsv
# Append the dev-0 evaluation result to the running score log and show it.
geval -t dev-0 >>scores.txt
cat scores.txt

File diff suppressed because it is too large Load Diff

10
tokenize.py Normal file
View File

@ -0,0 +1,10 @@
import re
def tokenize(d):
    """Split raw text *d* into a list of normalized tokens.

    Runs of whitespace (and literal ``\\n`` sequences) collapse to single
    spaces, URLs are replaced by a placeholder before lower-casing (so it
    surfaces as the token ``url``), sentence punctuation is split off as
    its own token, and digit runs become ``NUM``. The final whitespace
    split may yield empty-string tokens at the edges, matching what
    trainNB.py / predictNB.py were trained against.
    """
    text = re.sub(r'(\s+|\\n)', ' ', d)
    text = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', text)
    text = text.lower()
    # Detach each punctuation mark from the word it follows.
    for mark in ('.', ',', '?', '!'):
        text = text.replace(mark, ' ' + mark)
    text = re.sub(r'\d+', 'NUM', text)
    return re.split(r'\s+', text)

2
train/sNB.sh Normal file
View File

@ -0,0 +1,2 @@
xzcat in.tsv.xz | paste expected.tsv -

62
trainNB.py Normal file
View File

@ -0,0 +1,62 @@
#!/usr/bin/python3
"""Train a naive-Bayes skeptic/paranormal text classifier.

Reads ``label<TAB>document`` lines from stdin (label ``S`` means skeptic;
any other label counts as paranormal) and pickles the fitted parameters
to ``model.pkl`` for predictNB.py to load.
"""
import re
import sys
import pickle
from tokenize import tokenize


def train():
    """Accumulate class priors and per-term counts from stdin, save to model.pkl."""
    documents_total = 0
    sceptic_documents_total = 0
    vocabulary = set()
    sceptic_words_total = 0
    paranormal_words_total = 0
    skeptic_count = {}
    paranormal_count = {}
    for line in sys.stdin:
        line = line.rstrip()
        if not line:
            # Skip blank lines instead of crashing on fields[1] below.
            continue
        fields = line.split('\t')
        label = fields[0].strip()
        document = fields[1]
        # (debug prints of document/terms removed — they flooded stdout
        # with every training example)
        terms = tokenize(document)
        vocabulary.update(terms)
        documents_total += 1
        if label == 'S':
            sceptic_documents_total += 1
            sceptic_words_total += len(terms)
            for term in terms:
                skeptic_count[term] = skeptic_count.get(term, 0) + 1
        else:
            paranormal_words_total += len(terms)
            for term in terms:
                paranormal_count[term] = paranormal_count.get(term, 0) + 1
    # Prior P(skeptic); the complement is used for the paranormal class.
    pskeptic = sceptic_documents_total / documents_total
    model = (pskeptic,
             len(vocabulary),
             sceptic_words_total,
             paranormal_words_total,
             skeptic_count,
             paranormal_count)
    # Close the output file deterministically (the original leaked the handle).
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)


if __name__ == "__main__":
    train()