Compare commits

...

No commits in common. "ISI-14" and "master" have entirely different histories.

13 changed files with 348701 additions and 305314 deletions

4
.gitignore vendored
View File

@ -1,5 +1,4 @@
in.tsv
model.pkl
*~
*.swp
*.bak
@ -7,4 +6,3 @@ model.pkl
*.o
.DS_Store
.token
.idea

View File

@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).
Output label is 0 (for skeptic) and 1 (for paranormal).
Output label is either `S` (for skeptic) or `P` (for paranormal).
Sources
-------

View File

@ -1 +1 @@
--metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
--metric Accuracy --precision 4

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

15
scores.txt Normal file
View File

@ -0,0 +1,15 @@
0.6920
0.6857
0.6969
0.6931
0.6927
0.6952
0.6969
0.6969
0.6959
0.6959
0.6965
0.6965
0.6965
0.6954
0.6965

View File

@ -1,42 +1,20 @@
# NOTE(review): this span appears to interleave two revisions of the same
# script with indentation stripped: (a) a CountVectorizer + LogisticRegression
# pipeline that reads the train/dev-0/test-A TSV files, and (b) a keyword
# regex classifier that prints "P" or "S" for every line read from stdin.
# Confirm which lines are current before running it.
import pandas as pd
import numpy as np
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer with default settings; it is fitted on the training texts below
# and reused (transform only) for the dev-0 and test-A texts.
count_vect = CountVectorizer()
import re
import sys
#load data:
# Tab-separated file without a header row; the two columns are named "text"
# and "date". csv.QUOTE_NONE disables quote handling while parsing.
train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)
texts = train["text"]
# Labels file, read without a header row.
y = pd.read_csv("train/expected.tsv", header=None)
# Keyword classifier: for each stdin line, print "P" if re.search finds any of
# the listed alternatives anywhere in the line (case-sensitive), else "S".
for line in sys.stdin:
if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
print("P")
else:
print("S")
#print(y)
#train
# Learn the vocabulary and count matrix from the training texts, then fit a
# LogisticRegression with default parameters on those counts.
X_train_counts = count_vect.fit_transform(texts)
clf = LogisticRegression().fit(X_train_counts, y)
# Debug output: first training text and the number of texts and labels.
print(texts[0])
print(len(texts))
print(len(y))
#predict
# Only the "text" column of each evaluation input is kept.
dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)["text"]
testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)["text"]
# transform (not fit_transform) so the vocabulary fitted on train is reused.
dev0_new_counts = count_vect.transform(dev0)
testA_new_counts = count_vect.transform(testA)
predicted_dev0 = clf.predict(dev0_new_counts)
predicted_testA = clf.predict(testA_new_counts)
# Debug output: number of dev-0 inputs and predictions.
print(len(dev0))
print(len(predicted_dev0))
# Write the dev-0 predictions, one per line.
with open("dev-0/out.tsv", "w") as out1:
for line in predicted_dev0:
out1.write(str(line))
out1.write("\n")
# The triple-quoted span below is a bare string literal, so nothing inside it
# runs. In this view it encloses the test-A writer and a longer keyword list.
# NOTE(review): confirm that leaving the test-A output disabled is intended.
"""
with open("test-A/out.tsv", "w") as out2:
for line in predicted_testA:
out2.write(str(line))
out2.write("\n")
happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
"""

4
start.sh Executable file
View File

@ -0,0 +1,4 @@
# Decompress each split's input, feed it to solution.py on stdin and store
# the printed labels next to it; dev-0 is processed before test-A.
for split in dev-0 test-A; do
    xzcat "$split/in.tsv.xz" | python3 solution.py > "$split/out.tsv"
done
# Append geval's output for dev-0 to the running scores.txt log.
geval -t dev-0 >>scores.txt

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

1
train/PssS_c_clean Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

4
train/s.sh Executable file
View File

@ -0,0 +1,4 @@
# Build sorted word lists for the P and S classes from the training data.
# paste puts the label from expected.tsv in front of every input line, the
# first grep keeps the lines matching one label pattern, and the second grep
# emits every run of alphabetic characters on its own line.
# grep -E replaces the obsolescent egrep alias (same extended-regex matching).
xzcat in.tsv.xz | paste expected.tsv - | grep -E -o '^ P.*' | grep -E -o '[[:alpha:]]+' | sort > sortedP
xzcat in.tsv.xz | paste expected.tsv - | grep -E -o '^ S.*' | grep -E -o '[[:alpha:]]+' | sort > sortedS
# comm requires sorted input; -23 suppresses columns 2 and 3, leaving only
# the lines that appear in sortedP but not in sortedS.
comm -23 sortedP sortedS > PsetsubtractionS
# Collapse adjacent duplicates into counts and list the most frequent first.
# uniq reads the file directly, so no cat is needed.
uniq -c PsetsubtractionS | sort -nr > PsetsubtractionS_counted.txt