Compare commits

...

No commits in common. "master" and "ISI-14" have entirely different histories.

13 changed files with 305314 additions and 348701 deletions

4
.gitignore vendored
View File

@ -1,4 +1,5 @@
in.tsv
model.pkl
*~
*.swp
*.bak
@ -6,3 +7,4 @@
*.o
.DS_Store
.token
.idea

View File

@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).
Output label is `S` and `P`.
Output labels are 0 (for skeptic) and 1 (for paranormal).
Sources
-------

View File

@ -1 +1 @@
--metric Accuracy --precision 4
--metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,15 +0,0 @@
0.6920
0.6857
0.6969
0.6931
0.6927
0.6952
0.6969
0.6969
0.6959
0.6959
0.6965
0.6965
0.6965
0.6954
0.6965

View File

@ -1,20 +1,42 @@
import re
import sys
import pandas as pd
import numpy as np
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): this listing is a rendered diff; removed and added lines appear
# interleaved without +/- markers and block indentation has been lost. The
# comments below describe each statement as written — confirm against the
# actual file before relying on the overall control flow.

# Bag-of-words vectorizer shared by the training and prediction steps.
count_vect = CountVectorizer()
# Keyword rule: for every line read from stdin, print "P" when any word of the
# alternation matches (case-sensitive re.search), otherwise print "S".
# NOTE(review): presumably the earlier rule-based classifier that the
# logistic-regression code below replaces — confirm it is not meant to run
# together with it.
for line in sys.stdin:
if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
print("P")
else:
print("S")
# Training input: tab-separated, no header row, columns named "text" and
# "date"; QUOTE_NONE makes quote characters inside posts ordinary text.
#load data:
train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)
texts = train["text"]
# Expected labels for the training set, read without a header row.
y = pd.read_csv("train/expected.tsv", header=None)
#print(y)
# Learn the vocabulary from the training texts and fit a logistic-regression
# classifier with default settings on the resulting count matrix.
# NOTE(review): y is the DataFrame returned by read_csv rather than a 1-d
# Series/array — verify this is what LogisticRegression.fit should receive.
#train
X_train_counts = count_vect.fit_transform(texts)
clf = LogisticRegression().fit(X_train_counts, y)
# Debug output: first training text and the sizes of inputs and labels.
print(texts[0])
print(len(texts))
print(len(y))
# Load the "text" column of the dev-0 and test-A inputs (same TSV layout as
# the training file), vectorize them with the vocabulary fitted above, and
# predict a label for every row.
#predict
dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)["text"]
testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)["text"]
dev0_new_counts = count_vect.transform(dev0)
testA_new_counts = count_vect.transform(testA)
predicted_dev0 = clf.predict(dev0_new_counts)
predicted_testA = clf.predict(testA_new_counts)
# Debug output: number of dev-0 inputs and number of dev-0 predictions.
print(len(dev0))
print(len(predicted_dev0))
# NOTE(review): as rendered, the triple-quoted string below encloses both the
# dev-0/out.tsv writer and a longer keyword alternation, so neither would
# execute. This may be an artifact of the diff interleaving — confirm whether
# dev-0 predictions are meant to be written.
"""
with open("dev-0/out.tsv", "w") as out1:
for line in predicted_dev0:
out1.write(str(line))
out1.write("\n")
happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
"""
# Write the test-A predictions to test-A/out.tsv, one label per line.
with open("test-A/out.tsv", "w") as out2:
for line in predicted_testA:
out2.write(str(line))
out2.write("\n")

View File

@ -1,4 +0,0 @@
# Decompress each input set to stdout, pipe it through solution.py, and store
# the printed labels next to the input as out.tsv.
xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv
xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
# Append the geval output for dev-0 to scores.txt (">>" keeps earlier entries).
# NOTE(review): presumably "-t dev-0" selects the dev-0 directory to score —
# confirm against the geval documentation.
geval -t dev-0 >>scores.txt

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +0,0 @@
# Rank the words that appear in P-labelled posts after subtracting those that
# appear in S-labelled posts.
#
# Steps 1-2: glue each expected label to its decompressed input line, keep the
# lines matching the P (resp. S) label pattern, split them into alphabetic
# words, and sort so comm can compare the two lists.
# "grep -E" replaces the obsolescent "egrep" alias; the patterns are unchanged.
xzcat in.tsv.xz | paste expected.tsv - | grep -E -o '^ P.*' | grep -E -o '[[:alpha:]]+' | sort > sortedP
xzcat in.tsv.xz | paste expected.tsv - | grep -E -o '^ S.*' | grep -E -o '[[:alpha:]]+' | sort > sortedS
# Step 3: keep the lines present only in sortedP (-2 drops lines unique to
# sortedS, -3 drops common lines). Both lists keep duplicates, so repeated
# words are matched occurrence by occurrence.
comm -23 sortedP sortedS > PsetsubtractionS
# Step 4: count the remaining occurrences per word and list the most frequent
# first. uniq reads the file directly; no cat is needed.
uniq -c PsetsubtractionS | sort -nr > PsetsubtractionS_counted.txt