Compare commits
No commits in common. "master" and "ISI-14" have entirely different histories.
4
.gitignore
vendored
4
.gitignore
vendored
@ -1,4 +1,5 @@
|
||||
|
||||
in.tsv
|
||||
model.pkl
|
||||
*~
|
||||
*.swp
|
||||
*.bak
|
||||
@ -6,3 +7,4 @@
|
||||
*.o
|
||||
.DS_Store
|
||||
.token
|
||||
.idea
|
@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
|
||||
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
|
||||
Glitch-in-the-Matrix, conspiracytheories).
|
||||
|
||||
Output label is `S` and `P`.
|
||||
The output label is either 0 (for skeptic) or 1 (for paranormal).
|
||||
|
||||
Sources
|
||||
-------
|
||||
|
@ -1 +1 @@
|
||||
--metric Accuracy --precision 4
|
||||
--metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
|
||||
|
10544
dev-0/expected.tsv
10544
dev-0/expected.tsv
File diff suppressed because it is too large
Load Diff
10544
dev-0/out.tsv
10544
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
15
scores.txt
15
scores.txt
@ -1,15 +0,0 @@
|
||||
0.6920
|
||||
0.6857
|
||||
0.6969
|
||||
0.6931
|
||||
0.6927
|
||||
0.6952
|
||||
0.6969
|
||||
0.6969
|
||||
0.6959
|
||||
0.6959
|
||||
0.6965
|
||||
0.6965
|
||||
0.6965
|
||||
0.6954
|
||||
0.6965
|
46
solution.py
46
solution.py
@ -1,20 +1,42 @@
|
||||
import re
|
||||
import sys
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import csv
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
count_vect = CountVectorizer()
|
||||
|
||||
for line in sys.stdin:
|
||||
if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
|
||||
print("P")
|
||||
else:
|
||||
print("S")
|
||||
#load data:
|
||||
train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)
|
||||
texts = train["text"]
|
||||
y = pd.read_csv("train/expected.tsv", header=None)
|
||||
|
||||
#print(y)
|
||||
#train
|
||||
X_train_counts = count_vect.fit_transform(texts)
|
||||
clf = LogisticRegression().fit(X_train_counts, y)
|
||||
print(texts[0])
|
||||
print(len(texts))
|
||||
print(len(y))
|
||||
|
||||
#predict
|
||||
dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)["text"]
|
||||
testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)["text"]
|
||||
|
||||
dev0_new_counts = count_vect.transform(dev0)
|
||||
testA_new_counts = count_vect.transform(testA)
|
||||
|
||||
predicted_dev0 = clf.predict(dev0_new_counts)
|
||||
predicted_testA = clf.predict(testA_new_counts)
|
||||
|
||||
print(len(dev0))
|
||||
print(len(predicted_dev0))
|
||||
|
||||
"""
|
||||
with open("dev-0/out.tsv", "w") as out1:
|
||||
for line in predicted_dev0:
|
||||
out1.write(str(line))
|
||||
out1.write("\n")
|
||||
|
||||
|
||||
happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
|
||||
|
||||
"""
|
||||
with open("test-A/out.tsv", "w") as out2:
|
||||
for line in predicted_testA:
|
||||
out2.write(str(line))
|
||||
out2.write("\n")
|
||||
|
4
start.sh
4
start.sh
@ -1,4 +0,0 @@
|
||||
xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv
|
||||
|
||||
xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
|
||||
geval -t dev-0 >>scores.txt
|
10304
test-A/out.tsv
10304
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
579158
train/expected.tsv
579158
train/expected.tsv
File diff suppressed because it is too large
Load Diff
@ -1,4 +0,0 @@
|
||||
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
|
||||
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
|
||||
comm -23 sortedP sortedS > PsetsubtractionS
|
||||
cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt
|
Loading…
Reference in New Issue
Block a user