Initial

2020-06-07 12:52:17 +02:00 · 2020-06-07 12:52:17 +02:00 · 56d1fdc698
commit 56d1fdc698
19 changed files with 305397 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,8 @@
+
+*~
+*.swp
+*.bak
+*.pyc
+*.o
+.DS_Store
+.token
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -0,0 +1,2 @@
+# Default ignored files
+/workspace.xml
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JavaScriptSettings">
+    <option name="languageLevel" value="ES6" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
+</project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/paranormal-skeptic-tf-idf.iml" filepath="$PROJECT_DIR$/.idea/paranormal-skeptic-tf-idf.iml" />
+    </modules>
+  </component>
+</project>
--- a/.idea/paranormal-skeptic-tf-idf.iml
+++ b/.idea/paranormal-skeptic-tf-idf.iml
@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
+  </component>
+</module>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
--- a/README.md
+++ b/README.md
@ -0,0 +1,13 @@
+Skeptic vs paranormal subreddits
+================================
+
+Classify a reddit post as coming either from the Skeptic subreddit or from one of the
+"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
+Glitch-in-the-Matrix, conspiracytheories).
+
+Output label is the probability of a paranormal subreddit.
+
+Sources
+-------
+
+Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
--- a/config.txt
+++ b/config.txt
@ -0,0 +1 @@
+--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall>  --precision 4 --in-header in-header.tsv --out-header out-header.tsv
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/in.tsv.xz
+++ b/dev-0/in.tsv.xz
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/in-header.tsv
+++ b/in-header.tsv
@ -0,0 +1 @@
+PostText	Timestamp
--- a/main.py
+++ b/main.py
@ -0,0 +1,58 @@
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.neighbors import KNeighborsClassifier
+import pandas as pd
+import csv
+
+
+def generate_output(predicted_proba, path):
+    f3 = open(path, "w")
+
+    predicted_proba[predicted_proba < 0.05] = 0.05
+    predicted_proba[predicted_proba > 0.95] = 0.95
+
+    string = ""
+    for probability in predicted_proba:
+        string += f"{probability[1]}\n"
+    f3.write(string)
+
+
+training_data = pd.read_csv('train/in.tsv.xz', header=None, names=["text", "date"], quoting=csv.QUOTE_NONE)
+y_train = pd.read_csv('train/expected.tsv', header=None, sep='	')
+
+count_vect = CountVectorizer()
+X_train_counts = count_vect.fit_transform(training_data['text'].values.astype('U'))
+
+# transform a count matrix to a normalized tf-idf representation (tf-idf transformer)
+tfidf_transformer = TfidfTransformer()
+X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
+
+knn = KNeighborsClassifier(n_neighbors=15)
+
+# training our classifier ; y_train will have numbers assigned for each category in train data
+clf = knn.fit(X_train_tfidf, y_train)
+
+# Input Data to predict their classes of the given y_train
+dev_data = pd.read_csv("dev-0/in.tsv.xz", header=None, sep='\t', error_bad_lines=False)
+# building up feature vector of our input
+X_new_counts = count_vect.transform(dev_data[0])
+# We call transform instead of fit_transform because it's already been fit
+X_new_tfidf = tfidf_transformer.transform(X_new_counts)
+
+print("Generating dev-0 output...")
+predicted_proba_dev = knn.predict_proba(X_new_tfidf)
+generate_output(predicted_proba_dev, "dev-0/out.tsv")
+print("Generated dev-0 output!")
+
+# Input Data to predict their classes of the given y_train
+test_data = pd.read_csv("test-A/in.tsv.xz", header=None, sep='\t', error_bad_lines=False)
+
+# building up feature vector of our input
+X_new_counts = count_vect.transform(test_data[0])
+# We call transform instead of fit_transform because it's already been fit
+X_new_tfidf = tfidf_transformer.transform(X_new_counts)
+
+print("Generating test-A output...")
+predicted_proba_test = knn.predict_proba(X_new_tfidf)
+generate_output(predicted_proba_test, "test-A/out.tsv")
+print("Generated test-A output!")
--- a/out-header.tsv
+++ b/out-header.tsv
@ -0,0 +1 @@
+Label
--- a/test-A/in.tsv.xz
+++ b/test-A/in.tsv.xz
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/expected.tsv
+++ b/train/expected.tsv
--- a/train/in.tsv.xz
+++ b/train/in.tsv.xz
				`@ -0,0 +1 @@`
				`--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv`