13 changed files with 348701 additions and 305314 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,4 @@
-in.tsv
-model.pkl
+
 *~
 *.swp
 *.bak
@ -7,4 +6,3 @@ model.pkl
 *.o
 .DS_Store
 .token
-.idea
--- a/README.md
+++ b/README.md
@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
 "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
 ,Glitch-in-the-Matrix, conspiracytheories).

-Output label is 0 (for skeptic) and  1 (for paranormal).
+Output label is `S` (for skeptic) or `P` (for paranormal).

 Sources
 -------
--- a/config.txt
+++ b/config.txt
@ -1 +1 @@
--metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall>  --precision 4 --in-header in-header.tsv --out-header out-header.tsv
+--metric Accuracy --precision 4  
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/scores.txt
+++ b/scores.txt
@ -0,0 +1,15 @@
+0.6920
+0.6857
+0.6969
+0.6931
+0.6927
+0.6952
+0.6969
+0.6969
+0.6959
+0.6959
+0.6965
+0.6965
+0.6965
+0.6954
+0.6965
--- a/solution.py
+++ b/solution.py
@ -1,42 +1,20 @@
-import pandas as pd
-import numpy as np
-import csv
-from sklearn.linear_model import LogisticRegression
-from sklearn.feature_extraction.text import CountVectorizer
-count_vect = CountVectorizer()
+import re
+import sys

-#load data:
-train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)
-texts = train["text"]
-y = pd.read_csv("train/expected.tsv", header=None)
+for line in sys.stdin:
+    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
+        print("P")
+    else:
+        print("S")

-#print(y)
-#train
-X_train_counts = count_vect.fit_transform(texts)
-clf = LogisticRegression().fit(X_train_counts, y)
-print(texts[0])
-print(len(texts))
-print(len(y))

-#predict
-dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)["text"]
-testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)["text"]

-dev0_new_counts = count_vect.transform(dev0)
-testA_new_counts = count_vect.transform(testA)

-predicted_dev0 = clf.predict(dev0_new_counts)
-predicted_testA = clf.predict(testA_new_counts)

-print(len(dev0))
-print(len(predicted_dev0))

-with open("dev-0/out.tsv", "w") as out1:
-    for line in predicted_dev0:
-        out1.write(str(line))
-        out1.write("\n")
+"""

-with open("test-A/out.tsv", "w") as out2:
-    for line in predicted_testA:
-        out2.write(str(line))
-        out2.write("\n")
+
+happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
+
+"""
--- a/start.sh
+++ b/start.sh
@ -0,0 +1,4 @@
+xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv
+
+xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
+geval -t dev-0 >>scores.txt
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/PsetsubtractionS_counted.txt
+++ b/train/PsetsubtractionS_counted.txt
--- a/train/PssS_c_clean
+++ b/train/PssS_c_clean
--- a/train/expected.tsv
+++ b/train/expected.tsv
--- a/train/s.sh
+++ b/train/s.sh
@ -0,0 +1,4 @@
+xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
+xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
+comm -23 sortedP sortedS > PsetsubtractionS
+cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt