Compare commits
1 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
24dd877b29 |
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,4 +1,4 @@
|
||||
|
||||
model.pkl
|
||||
*~
|
||||
*.swp
|
||||
*.bak
|
||||
|
3070
dev-0/out.tsv
3070
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
13
modelNB.txt
Normal file
13
modelNB.txt
Normal file
@ -0,0 +1,13 @@
|
||||
P(c)
|
||||
P(nie c)
|
||||
|v|
|
||||
ile wyrazów w c
|
||||
ile wyrazów w nie c
|
||||
|
||||
wystąpienia wyrazów w c
|
||||
-the-1544
|
||||
....
|
||||
wystąpienia wyrazów w nie c
|
||||
...
|
||||
|
||||
|
30
predictNB.py
Normal file
30
predictNB.py
Normal file
@ -0,0 +1,30 @@
|
||||
import sys
import pickle
from math import log

from tokenize import tokenize  # project-local tokenize.py, not the stdlib module

# NOTE(review): pickle.load executes arbitrary code if the file is untrusted;
# model.pkl is produced locally by trainNB.py, so this is acceptable here.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Tuple order is the contract written by trainNB.py — do not reorder.
(pskeptic, vocabulary_size, skeptic_words_total,
 paranormal_words_total, skeptic_count, paranormal_count) = model

# Hoist the loop-invariant add-one-smoothing denominators out of the term loop.
skeptic_denominator = skeptic_words_total + vocabulary_size
paranormal_denominator = paranormal_words_total + vocabulary_size

# Naive Bayes classification in log space: class prior plus one smoothed
# log-likelihood per term.  Prints "S" (skeptic) or "P" (paranormal) per line.
for line in sys.stdin:
    terms = tokenize(line.rstrip())

    log_prob_skeptic = log(pskeptic)
    log_prob_paranormal = log(1 - pskeptic)

    for term in terms:
        # .get(term, 0) replaces the original's insertion of zero entries,
        # which silently mutated the loaded model dictionaries.
        log_prob_skeptic += log(
            (skeptic_count.get(term, 0) + 1) / skeptic_denominator)
        log_prob_paranormal += log(
            (paranormal_count.get(term, 0) + 1) / paranormal_denominator)

    print("S" if log_prob_skeptic > log_prob_paranormal else "P")
|
23
scores.txt
23
scores.txt
@ -13,3 +13,26 @@
|
||||
0.6965
|
||||
0.6954
|
||||
0.6965
|
||||
0.6978
|
||||
0.3583
|
||||
0.5421
|
||||
0.5398
|
||||
0.6978
|
||||
|
||||
NB:
|
||||
|
||||
0.7549
|
||||
0.7568
|
||||
0.7601
|
||||
0.7606
|
||||
0.7618
|
||||
0.7610
|
||||
0.7578
|
||||
0.7593
|
||||
0.7661
|
||||
0.7661
|
||||
0.7665
|
||||
0.7788
|
||||
0.7781
|
||||
0.7803
|
||||
0.7802
|
||||
|
@ -2,7 +2,7 @@ import re
|
||||
import sys
|
||||
|
||||
for line in sys.stdin:
|
||||
if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
|
||||
if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep'.lower(), line.lower()):
|
||||
print("P")
|
||||
else:
|
||||
print("S")
|
||||
@ -14,7 +14,5 @@ for line in sys.stdin:
|
||||
|
||||
"""
|
||||
|
||||
|
||||
happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
|
||||
|
||||
UFO|paranormal|i|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story|moved|orbs|bright|aircraft|experienced|military|woke|Paranormal|Ghost|advanced|board|planet|shape|travel|lens|asleep|move|remember|hoax|circles|witnesses|extraterrestrial|freaked|walking|technology|abduction|seeing|eyes|wake|balloons|EVP|ETs|orb|Ouija|felt|dont|ship|phone|demons|flash|flares|cool|Looks|entities|demon|terrestrial|triangle|vu|incident|zoom|recording|sharing|location|happening|nosleep|filmed|dimensional|Greer|dimension|reflection|bedroom|UAP|crop|dreaming|im|dad|scary|ouija|entity|awake|shaped|Roswell|floating|Aliens|meteor|noise|distance|encounters|seconds
|
||||
"""
|
||||
|
9
startNB.sh
Executable file
9
startNB.sh
Executable file
@ -0,0 +1,9 @@
|
||||
# Train the naive Bayes model: join expected labels with the training
# documents (label<TAB>text) and stream the result into the trainer.
xzcat train/in.tsv.xz | paste train/expected.tsv - | python3 trainNB.py

# Classify the dev and test sets with the freshly trained model.
xzcat dev-0/in.tsv.xz | python3 predictNB.py > dev-0/out.tsv

xzcat test-A/in.tsv.xz | python3 predictNB.py > test-A/out.tsv

# Evaluate on dev-0, append the score to the running log, and show it.
geval -t dev-0 >>scores.txt
cat scores.txt
|
2886
test-A/out.tsv
2886
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
10
tokenize.py
Normal file
10
tokenize.py
Normal file
@ -0,0 +1,10 @@
|
||||
import re


def tokenize(d):
    """Tokenize document *d* into a list of normalized terms.

    Normalization steps, in order:
      1. collapse whitespace runs and literal "\\n" escapes to single spaces;
      2. replace http(s)/www URLs with the placeholder term "URL"
         (lower-cased to "url" by the next step);
      3. lower-case and detach sentence punctuation (. , ? !) as own tokens;
      4. replace digit runs with the placeholder "NUM".

    Returns the list of non-empty tokens.  (The original split on
    whitespace without filtering, so leading/trailing space produced
    empty-string tokens that polluted the vocabulary and counts.)
    """
    d = re.sub(r'(\s+|\\n)', ' ', d)
    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)
    d = d.lower().replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !")
    d = re.sub(r'\d+', 'NUM', d)

    # Filter the empty strings re.split emits at the string boundaries.
    return [t for t in re.split(r'\s+', d) if t]
|
2
train/sNB.sh
Normal file
2
train/sNB.sh
Normal file
@ -0,0 +1,2 @@
|
||||
# Emit the training stream: labels from expected.tsv pasted (tab-joined)
# in front of the decompressed documents from in.tsv.xz.
xzcat in.tsv.xz | paste expected.tsv -
|
||||
|
62
trainNB.py
Normal file
62
trainNB.py
Normal file
@ -0,0 +1,62 @@
|
||||
#!/usr/bin/python3
"""Train a two-class (skeptic vs. paranormal) naive Bayes model.

Reads `label<TAB>document` lines from stdin (label 'S' = skeptic,
anything else counted as paranormal) and pickles the model to model.pkl
in the tuple order that predictNB.py unpacks.
"""

import re
import sys
import pickle

from tokenize import tokenize  # project-local tokenize.py, not the stdlib module


def train():
    """Accumulate counts from stdin and write the model to model.pkl.

    Raises SystemExit when stdin carries no usable training lines
    (the original crashed with ZeroDivisionError instead).
    """
    documents_total = 0
    skeptic_documents_total = 0
    vocabulary = set()

    skeptic_words_total = 0
    paranormal_words_total = 0

    skeptic_count = {}
    paranormal_count = {}

    for line in sys.stdin:
        # maxsplit=1 keeps documents that themselves contain tabs intact
        # (the original split on every tab and kept only the first field).
        fields = line.rstrip().split('\t', 1)
        if len(fields) != 2:
            # Skip malformed lines instead of crashing with IndexError.
            continue
        label = fields[0].strip()
        terms = tokenize(fields[1])
        vocabulary.update(terms)

        documents_total += 1
        if label == 'S':
            skeptic_documents_total += 1
            skeptic_words_total += len(terms)
            counts = skeptic_count
        else:
            paranormal_words_total += len(terms)
            counts = paranormal_count
        for term in terms:
            counts[term] = counts.get(term, 0) + 1

    if documents_total == 0:
        raise SystemExit("trainNB.py: no training data on stdin")

    pskeptic = skeptic_documents_total / documents_total
    # Tuple order is the contract predictNB.py unpacks — do not reorder.
    model = (pskeptic,
             len(vocabulary),
             skeptic_words_total,
             paranormal_words_total,
             skeptic_count,
             paranormal_count)
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)


train()
|
Loading…
Reference in New Issue
Block a user