Compare commits
No commits in common. "master" and "ISI-39" have entirely different histories.
2	.gitignore	vendored
@@ -1,4 +1,4 @@
local
*~
*.swp
*.bak
104	Makefile	Normal file
@@ -0,0 +1,104 @@
SHELL=/bin/bash
.SECONDARY:
.DELETE_ON_ERROR:


KENLM_DIR=/home/jakub/ISI/kenlm/build/bin

# $< - first prerequisite
# $@ - target

# Top-level targets: per-document probability outputs for dev-0 and test-A.
dev-0/out.tsv: dev-0/local/paranormal_out.tsv dev-0/local/skeptic_out.tsv test-A/out.tsv
	python3 normalize_out.py dev-0/local/paranormal_out.tsv dev-0/local/skeptic_out.tsv $@
	geval -t dev-0 >> score.txt

test-A/out.tsv: test-A/local/paranormal_out.tsv test-A/local/skeptic_out.tsv
	python3 normalize_out.py test-A/local/paranormal_out.tsv test-A/local/skeptic_out.tsv $@


# Reduce the grepped "Total: ..." lines to one log-probability per line.
dev-0/local/paranormal_out.tsv: dev-0/local/paranormal_out_total parse_out.py dev-0/local/
	cat $< | python3 parse_out.py > $@

test-A/local/paranormal_out.tsv: test-A/local/paranormal_out_total parse_out.py test-A/local/
	cat $< | python3 parse_out.py > $@

dev-0/local/skeptic_out.tsv: dev-0/local/skeptic_out_total parse_out.py dev-0/local/
	cat $< | python3 parse_out.py > $@

test-A/local/skeptic_out.tsv: test-A/local/skeptic_out_total parse_out.py test-A/local/
	cat $< | python3 parse_out.py > $@

# Keep only the "Total: ..." scores from the raw KenLM query output.
dev-0/local/paranormal_out_total: dev-0/local/raw_paranormal_out dev-0/local/
	cat $< | egrep -o 'Total: -?[0-9]+(\.[0-9]+)?' > $@

test-A/local/paranormal_out_total: test-A/local/raw_paranormal_out test-A/local/
	cat $< | egrep -o 'Total: -?[0-9]+(\.[0-9]+)?' > $@

dev-0/local/skeptic_out_total: dev-0/local/raw_skeptic_out dev-0/local/
	cat $< | egrep -o 'Total: -?[0-9]+(\.[0-9]+)?' > $@

test-A/local/skeptic_out_total: test-A/local/raw_skeptic_out test-A/local/
	cat $< | egrep -o 'Total: -?[0-9]+(\.[0-9]+)?' > $@

# Score the tokenized inputs against each class language model.
dev-0/local/raw_paranormal_out: local/paranormal.lm.arpa dev-0/local/tokenized.dev-0.in.tsv dev-0/local/
	cat dev-0/local/tokenized.dev-0.in.tsv | $(KENLM_DIR)/query $< > $@

test-A/local/raw_paranormal_out: local/paranormal.lm.arpa test-A/local/tokenized.test-A.in.tsv test-A/local/
	cat test-A/local/tokenized.test-A.in.tsv | $(KENLM_DIR)/query $< > $@

dev-0/local/raw_skeptic_out: local/skeptic.lm.arpa dev-0/local/tokenized.dev-0.in.tsv dev-0/local/
	cat dev-0/local/tokenized.dev-0.in.tsv | $(KENLM_DIR)/query $< > $@

test-A/local/raw_skeptic_out: local/skeptic.lm.arpa test-A/local/tokenized.test-A.in.tsv test-A/local/
	cat test-A/local/tokenized.test-A.in.tsv | $(KENLM_DIR)/query $< > $@

# Tokenize the evaluation inputs.
dev-0/local/tokenized.dev-0.in.tsv: dev-0/in.tsv.xz mytokenize.py mytokenizer.py dev-0/local/
	xzcat $< | python3 mytokenizer.py > $@

test-A/local/tokenized.test-A.in.tsv: test-A/in.tsv.xz mytokenize.py mytokenizer.py test-A/local/
	xzcat $< | python3 mytokenizer.py > $@

test-A/local/:
	mkdir test-A/local/

dev-0/local/:
	mkdir dev-0/local/

#local/paranormal.lm.bin: local/paranormal.lm.arpa
#	$(KENLM_DIR)/build_binary <$< >$@

# Train a 5-gram KenLM model per class.
local/paranormal.lm.arpa: local/x100tokenized.in.paranormal local/
	$(KENLM_DIR)/lmplz -o 5 -S 50% --discount_fallback <$< >$@
# --discount_fallback
local/skeptic.lm.arpa: local/x100tokenized.in.skeptic local/
	$(KENLM_DIR)/lmplz -o 5 -S 50% --discount_fallback <$< >$@

# Replicate the tokenized training text 10x and then 100x before LM training.
local/x100tokenized.in.paranormal: local/x10tokenized.in.paranormal
	cat $< $< $< $< $< $< $< $< $< $< > $@

local/x10tokenized.in.paranormal: local/tokenized.in.paranormal local/
	cat $< $< $< $< $< $< $< $< $< $< > $@

local/x100tokenized.in.skeptic: local/x10tokenized.in.skeptic local/
	cat $< $< $< $< $< $< $< $< $< $< > $@

local/x10tokenized.in.skeptic: local/tokenized.in.skeptic local/
	cat $< $< $< $< $< $< $< $< $< $< > $@

local/tokenized.in.paranormal: local/in.paranormal mytokenize.py mytokenizer.py local/
	cat $< | python3 mytokenizer.py > $@

local/tokenized.in.skeptic: local/in.skeptic mytokenize.py mytokenizer.py local/
	cat $< | python3 mytokenizer.py > $@

# Split the training set by label (1 = paranormal, 0 = skeptic).
local/in.paranormal: train/in.tsv.xz train/expected.tsv local/
	xzcat $< | paste train/expected.tsv - | egrep '^1.*' > $@

local/in.skeptic: train/in.tsv.xz train/expected.tsv local/
	xzcat $< | paste train/expected.tsv - | egrep '^0.*' > $@

local/:
	mkdir local/
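In short, the Makefile splits the training data by label, tokenizes it, trains one 5-gram KenLM model per class, scores every dev-0 and test-A document against both models, and turns the two scores into a probability with normalize_out.py; running `make dev-0/out.tsv` (the first target) drives the whole chain and appends a geval report to score.txt.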
@@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).

Output label is `S` and `P`.
Output label is the probability of a paranormal subreddit.

Sources
-------
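As implemented in normalize_out.py (added below), this probability is currently a hard value: 0.7 for documents the paranormal model scores higher and 0.3 otherwise.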
@@ -1 +1 @@
--metric Accuracy --precision 4
--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
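With geval invoked as in the Makefile (`geval -t dev-0 >> score.txt`), these five metrics produce the five lines recorded in score.txt further down; the F0 and F9999999 variants are relabelled Precision and Recall via the <> aliases.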
10544	dev-0/expected.tsv
File diff suppressed because it is too large
10544	dev-0/out.tsv
File diff suppressed because it is too large
32	mytokenize.py	Normal file
@@ -0,0 +1,32 @@
import re


"""
Takes a document and returns a list of tokens.
"""


def tokenize(d):
    d = re.sub(r'(\\n)', '', d)
    p = re.compile('[a-zA-Z\']+')
    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)
    d = d.lower()
    return p.findall(d)


"""
def tokenize(d):
    d = re.sub(r'(\s+|\\n)', ' ', d)
    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)
    d = re.sub(r'\d+', 'NUM', d)

    d = d.lower().replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !").replace("*", " * ")
    d = d.replace("\"", " \" ")

    return re.split(r'\s+', d)
"""
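A minimal usage sketch for the active tokenize() above; the sample sentence and URL are illustrative, not taken from the data:

from mytokenize import tokenize

# Illustrative document; the URL is hypothetical.
doc = "Saw a strange light last night! https://example.com/clip"
print(tokenize(doc))
# URLs collapse to the URL placeholder and everything is lowercased:
# ['saw', 'a', 'strange', 'light', 'last', 'night', 'url']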
11	mytokenizer.py	Normal file
@@ -0,0 +1,11 @@
from mytokenize import tokenize
import lzma  # unused here; the Makefile already decompresses with xzcat before piping to stdin
import sys


for line in sys.stdin:
    tokens = tokenize(line)
    print(" ".join(tokens))
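In the Makefile above this script is only ever used as a stdin filter, e.g. `xzcat dev-0/in.tsv.xz | python3 mytokenizer.py > dev-0/local/tokenized.dev-0.in.tsv`.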
78	normalize_out.py	Normal file
@@ -0,0 +1,78 @@
import sys
import numpy as np


def sigmoid(mean, x):
    # Squash a KenLM total log-probability into (0, 1); the 0.00001 factor
    # flattens the curve and the class mean shifts its argument.
    y = np.true_divide(1.0, (1.0 + np.power(np.e, -1.0*(0.00001*(x+mean)))))
    return y


paranormal_out_path = sys.argv[1]
skeptic_out_path = sys.argv[2]

x_min = 0.0
x_max = -10000.0
paranormal_out = []
with open(paranormal_out_path, 'r') as f:
    for line in f:
        v = float(line.rstrip())
        paranormal_out.append(v)
        if v > x_max:
            x_max = v
        if v < x_min:
            x_min = v

paranormal_mean = np.mean(paranormal_out)
print("p mean: " + str(paranormal_mean))

skeptic_out = []
with open(skeptic_out_path, 'r') as f:
    for line in f:
        v = float(line.rstrip())
        skeptic_out.append(v)
        if v > x_max:
            x_max = v
        if v < x_min:
            x_min = v

skeptic_mean = np.mean(skeptic_out)
print("s mean: " + str(skeptic_mean))

if len(skeptic_out) != len(paranormal_out):
    print("ERROR! s len: %d" % len(skeptic_out), "\tp len: %d" % len(paranormal_out))
    quit(-1)

print("min: %f" % x_min, "\tmax: %f" % x_max)
#print(log_out)

#out_path = "dev-0/out.tsv"
out_path = sys.argv[3]
with open(out_path, 'w') as o:
    for i in range(0, len(skeptic_out)):
        paranormal = sigmoid(paranormal_mean, paranormal_out[i])
        skeptic = sigmoid(skeptic_mean, skeptic_out[i])
        if skeptic > paranormal:
            out_p = 0.3
            #out_p = skeptic
        else:
            out_p = 0.7
            #out_p = paranormal

        o.write(str(out_p) + "\n")
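One property of the decision rule above worth spelling out: sigmoid() is strictly increasing in its argument, so comparing the two sigmoid values is equivalent to comparing the mean-shifted KenLM totals directly. A small sketch of that equivalence, with made-up score and mean values:

import numpy as np

def sigmoid(mean, x):
    return np.true_divide(1.0, (1.0 + np.power(np.e, -1.0*(0.00001*(x+mean)))))

p_score, s_score = -950.0, -910.0   # hypothetical per-document KenLM totals
p_mean, s_mean = -1200.0, -1150.0   # hypothetical class means

# Same boolean either way, because the sigmoid is monotonic.
assert (sigmoid(p_mean, p_score) > sigmoid(s_mean, s_score)) == ((p_score + p_mean) > (s_score + s_mean))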
8	parse_out.py	Normal file
@@ -0,0 +1,8 @@
import sys


for line in sys.stdin:
    log_prob = line.rstrip().split(" ")[1]
    print(log_prob)
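For context, the lines fed to parse_out.py are the "Total: <number>" matches that the Makefile greps out of the KenLM query output, so taking field 1 simply drops the "Total:" prefix. A one-line illustration with a made-up value:

line = "Total: -123.456"              # illustrative input line
print(line.rstrip().split(" ")[1])    # prints: -123.456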
5	score.txt	Normal file
@@ -0,0 +1,5 @@
Likelihood 0.5500
Accuracy 0.7153
F1.0 0.3471
Precision 0.9215
Recall 0.2138
15	scores.txt
@@ -1,15 +0,0 @@
0.6920
0.6857
0.6969
0.6931
0.6927
0.6952
0.6969
0.6969
0.6959
0.6959
0.6965
0.6965
0.6965
0.6954
0.6965
20	solution.py
@@ -1,20 +0,0 @@
import re
import sys

for line in sys.stdin:
    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
        print("P")
    else:
        print("S")




"""

happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story

"""
4	start.sh
@@ -1,4 +0,0 @@
xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv

xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
geval -t dev-0 >>scores.txt
10304	test-A/out.tsv
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
579158	train/expected.tsv
File diff suppressed because it is too large
@@ -1,4 +0,0 @@
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
comm -23 sortedP sortedS > PsetsubtractionS
cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt