Compare commits

...

No commits in common. "master" and "ISI-39" have entirely different histories.

19 changed files with 305516 additions and 348709 deletions

2
.gitignore vendored
View File

@ -1,4 +1,4 @@
local
*~
*.swp
*.bak

104
Makefile Normal file
View File

@ -0,0 +1,104 @@
# Pipeline: train two KenLM 5-gram language models (paranormal vs. skeptic),
# score the dev-0/test-A inputs with both models, and convert the per-line
# log-probabilities into the final out.tsv probability files.
SHELL=/bin/bash
.SECONDARY:
.DELETE_ON_ERROR:
KENLM_DIR=/home/jakub/ISI/kenlm/build/bin
# $< - first prerequisite
# $@ - target

# Building dev-0/out.tsv also forces test-A/out.tsv and appends the geval score.
dev-0/out.tsv: dev-0/local/paranormal_out.tsv dev-0/local/skeptic_out.tsv test-A/out.tsv
	python3 normalize_out.py dev-0/local/paranormal_out.tsv dev-0/local/skeptic_out.tsv $@
	geval -t dev-0 >> score.txt

test-A/out.tsv: test-A/local/paranormal_out.tsv test-A/local/skeptic_out.tsv
	python3 normalize_out.py test-A/local/paranormal_out.tsv test-A/local/skeptic_out.tsv $@

# Directories appear after "|" (order-only prerequisites) so their changing
# mtimes do not trigger spurious rebuilds of everything inside them.
dev-0/local/paranormal_out.tsv: dev-0/local/paranormal_out_total parse_out.py | dev-0/local/
	python3 parse_out.py < $< > $@

test-A/local/paranormal_out.tsv: test-A/local/paranormal_out_total parse_out.py | test-A/local/
	python3 parse_out.py < $< > $@

dev-0/local/skeptic_out.tsv: dev-0/local/skeptic_out_total parse_out.py | dev-0/local/
	python3 parse_out.py < $< > $@

test-A/local/skeptic_out.tsv: test-A/local/skeptic_out_total parse_out.py | test-A/local/
	python3 parse_out.py < $< > $@

# Keep only the per-sentence "Total: ..." log-probabilities from query output.
# (grep -E replaces the obsolescent egrep; same ERE pattern.)
dev-0/local/paranormal_out_total: dev-0/local/raw_paranormal_out | dev-0/local/
	grep -E -o 'Total: -?[0-9]+(\.[0-9]+)?' < $< > $@

test-A/local/paranormal_out_total: test-A/local/raw_paranormal_out | test-A/local/
	grep -E -o 'Total: -?[0-9]+(\.[0-9]+)?' < $< > $@

dev-0/local/skeptic_out_total: dev-0/local/raw_skeptic_out | dev-0/local/
	grep -E -o 'Total: -?[0-9]+(\.[0-9]+)?' < $< > $@

test-A/local/skeptic_out_total: test-A/local/raw_skeptic_out | test-A/local/
	grep -E -o 'Total: -?[0-9]+(\.[0-9]+)?' < $< > $@

# Score every tokenized input line with the given ARPA model.
dev-0/local/raw_paranormal_out: local/paranormal.lm.arpa dev-0/local/tokenized.dev-0.in.tsv | dev-0/local/
	$(KENLM_DIR)/query $< < dev-0/local/tokenized.dev-0.in.tsv > $@

test-A/local/raw_paranormal_out: local/paranormal.lm.arpa test-A/local/tokenized.test-A.in.tsv | test-A/local/
	$(KENLM_DIR)/query $< < test-A/local/tokenized.test-A.in.tsv > $@

dev-0/local/raw_skeptic_out: local/skeptic.lm.arpa dev-0/local/tokenized.dev-0.in.tsv | dev-0/local/
	$(KENLM_DIR)/query $< < dev-0/local/tokenized.dev-0.in.tsv > $@

test-A/local/raw_skeptic_out: local/skeptic.lm.arpa test-A/local/tokenized.test-A.in.tsv | test-A/local/
	$(KENLM_DIR)/query $< < test-A/local/tokenized.test-A.in.tsv > $@

dev-0/local/tokenized.dev-0.in.tsv: dev-0/in.tsv.xz mytokenize.py mytokenizer.py | dev-0/local/
	xzcat $< | python3 mytokenizer.py > $@

test-A/local/tokenized.test-A.in.tsv: test-A/in.tsv.xz mytokenize.py mytokenizer.py | test-A/local/
	xzcat $< | python3 mytokenizer.py > $@

test-A/local/:
	mkdir -p test-A/local/

dev-0/local/:
	mkdir -p dev-0/local/

# Optional speed-up: $(KENLM_DIR)/build_binary can convert .arpa to .bin.
# 5-gram model, capped at 50% of RAM; --discount_fallback keeps lmplz from
# aborting when Kneser-Ney discount estimation fails on this corpus.
local/paranormal.lm.arpa: local/x100tokenized.in.paranormal | local/
	$(KENLM_DIR)/lmplz -o 5 -S 50% --discount_fallback <$< >$@

local/skeptic.lm.arpa: local/x100tokenized.in.skeptic | local/
	$(KENLM_DIR)/lmplz -o 5 -S 50% --discount_fallback <$< >$@

# Duplicate the corpus 10x, twice (100x total), to feed lmplz more data.
local/x100tokenized.in.paranormal: local/x10tokenized.in.paranormal | local/
	cat $< $< $< $< $< $< $< $< $< $< > $@

local/x10tokenized.in.paranormal: local/tokenized.in.paranormal | local/
	cat $< $< $< $< $< $< $< $< $< $< > $@

local/x100tokenized.in.skeptic: local/x10tokenized.in.skeptic | local/
	cat $< $< $< $< $< $< $< $< $< $< > $@

local/x10tokenized.in.skeptic: local/tokenized.in.skeptic | local/
	cat $< $< $< $< $< $< $< $< $< $< > $@

local/tokenized.in.paranormal: local/in.paranormal mytokenize.py mytokenizer.py | local/
	python3 mytokenizer.py < $< > $@

local/tokenized.in.skeptic: local/in.skeptic mytokenize.py mytokenizer.py | local/
	python3 mytokenizer.py < $< > $@

# Split the training data by its expected label: 1 = paranormal, 0 = skeptic.
local/in.paranormal: train/in.tsv.xz train/expected.tsv | local/
	xzcat $< | paste train/expected.tsv - | grep -E '^1.*' > $@

local/in.skeptic: train/in.tsv.xz train/expected.tsv | local/
	xzcat $< | paste train/expected.tsv - | grep -E '^0.*' > $@

local/:
	mkdir -p local/

View File

@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).
Output label is `S` and `P`.
Output label is the probability of a paranormal subreddit.
Sources
-------

View File

@ -1 +1 @@
--metric Accuracy --precision 4
--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

32
mytokenize.py Normal file
View File

@ -0,0 +1,32 @@
import re

# Pre-compiled once at import time instead of on every call.
# _ESCAPED_NEWLINE strips literal backslash-n sequences left in the export;
# _URL collapses links into a single placeholder; _WORD keeps letter runs.
_ESCAPED_NEWLINE = re.compile(r'(\\n)')
_URL = re.compile(r'(https?:|www)\S+(\s+|$)')
_WORD = re.compile('[a-zA-Z\']+')


def tokenize(d):
    """Take a document (one line of text) and return a list of tokens.

    Removes literal ``\\n`` escape sequences, replaces each URL with the
    placeholder ``URL`` (lower-cased with the rest of the text, so it
    appears as ``url``), lower-cases everything, and returns the runs of
    ASCII letters/apostrophes.
    """
    d = _ESCAPED_NEWLINE.sub('', d)
    d = _URL.sub(' URL ', d)
    d = d.lower()
    return _WORD.findall(d)

11
mytokenizer.py Normal file
View File

@ -0,0 +1,11 @@
from mytokenize import tokenize
import lzma
import sys

# Filter: read raw documents (one per line) from stdin and emit one
# space-joined token line per document.
for doc in sys.stdin:
    sys.stdout.write(" ".join(tokenize(doc)) + "\n")

78
normalize_out.py Normal file
View File

@ -0,0 +1,78 @@
import sys
import numpy as np


def sigmoid(mean, x):
    """Squash the log-probability x (shifted by mean) into (0, 1).

    The 1e-5 factor flattens the curve so the large-magnitude KenLM
    totals do not all saturate at exactly 0 or 1.
    """
    # np.exp(z) is the idiomatic equivalent of np.power(np.e, z).
    return np.true_divide(1.0, (1.0 + np.exp(-1.0 * (0.00001 * (x + mean)))))


def _load_scores(path):
    """Read one float per line from path; return them as a list."""
    with open(path, 'r') as f:
        return [float(line.rstrip()) for line in f]


# argv: <paranormal scores> <skeptic scores> <output path>
paranormal_out_path = sys.argv[1]
skeptic_out_path = sys.argv[2]

paranormal_out = _load_scores(paranormal_out_path)
paranormal_mean = np.mean(paranormal_out)
print("p mean: " + str(paranormal_mean))

skeptic_out = _load_scores(skeptic_out_path)
skeptic_mean = np.mean(skeptic_out)
print("s mean: " + str(skeptic_mean))

if len(skeptic_out) != len(paranormal_out):
    # BUG FIX: the original printed len(skeptic_out) for both lengths.
    print("ERROR! s len: %d" % len(skeptic_out), "\tp len: %d" % len(paranormal_out))
    quit(-1)

# Same extremes as the original running min/max (seeded at 0.0 / -10000.0).
x_min = min([0.0] + paranormal_out + skeptic_out)
x_max = max([-10000.0] + paranormal_out + skeptic_out)
print("min: %f" % x_min, "\tmax: %f" % x_max)

out_path = sys.argv[3]
with open(out_path, 'w') as o:
    for i in range(0, len(skeptic_out)):
        paranormal = sigmoid(paranormal_mean, paranormal_out[i])
        skeptic = sigmoid(skeptic_mean, skeptic_out[i])
        # Emit a fixed soft probability for whichever model scored higher:
        # 0.7 -> paranormal wins, 0.3 -> skeptic wins.
        if skeptic > paranormal:
            out_p = 0.3
        else:
            out_p = 0.7
        o.write(str(out_p) + "\n")

8
parse_out.py Normal file
View File

@ -0,0 +1,8 @@
import sys

# Each stdin line looks like "Total: <log-prob>"; print just the number.
for raw in sys.stdin:
    fields = raw.rstrip().split(" ")
    print(fields[1])

5
score.txt Normal file
View File

@ -0,0 +1,5 @@
Likelihood 0.5500
Accuracy 0.7153
F1.0 0.3471
Precision 0.9215
Recall 0.2138

View File

@ -1,15 +0,0 @@
0.6920
0.6857
0.6969
0.6931
0.6927
0.6952
0.6969
0.6969
0.6959
0.6959
0.6965
0.6965
0.6965
0.6954
0.6965

View File

@ -1,20 +0,0 @@
import re
import sys

# Keyword baseline: label a post "P" (paranormal) when it mentions any of the
# hand-picked high-signal words, otherwise "S" (skeptic).
_KEYWORDS = re.compile(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep')

for line in sys.stdin:
    print("P" if _KEYWORDS.search(line) else "S")

"""
happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
"""

View File

@ -1,4 +0,0 @@
# Run the keyword baseline (solution.py) over the dev and test inputs,
# then append the dev-0 evaluation results to scores.txt.
xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv
xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
geval -t dev-0 >>scores.txt

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +0,0 @@
# Data exploration: collect the sorted vocabulary of P-labelled and
# S-labelled training posts, then keep the words that occur only in P
# (set difference via comm) and rank them by frequency.
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
comm -23 sortedP sortedS > PsetsubtractionS
cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt