This commit is contained in:
Yevheniia Tsapkova 2020-06-07 12:52:17 +02:00
commit 56d1fdc698
19 changed files with 305397 additions and 0 deletions

8
.gitignore vendored Executable file
View File

@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token

2
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
# Default ignored files
/workspace.xml

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

7
.idea/misc.xml Normal file
View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/paranormal-skeptic-tf-idf.iml" filepath="$PROJECT_DIR$/.idea/paranormal-skeptic-tf-idf.iml" />
</modules>
</component>
</project>

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
</component>
</module>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

13
README.md Executable file
View File

@ -0,0 +1,13 @@
Skeptic vs paranormal subreddits
================================
Classify a Reddit post as coming either from the Skeptic subreddit or from one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).
Output label is the probability of a paranormal subreddit.
Sources
-------
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.

1
config.txt Executable file
View File

@ -0,0 +1 @@
--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv

5272
dev-0/expected.tsv Executable file

File diff suppressed because it is too large Load Diff

BIN
dev-0/in.tsv.xz Executable file

Binary file not shown.

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

1
in-header.tsv Executable file
View File

@ -0,0 +1 @@
PostText Timestamp
1 PostText Timestamp

58
main.py Normal file
View File

@ -0,0 +1,58 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import csv
def generate_output(predicted_proba, path):
    """Write clamped class-1 probabilities to ``path``, one value per line.

    Args:
        predicted_proba: 2-D NumPy array with one row per sample; the value
            at column index 1 of each row is what gets written.
        path: Destination file. It is created, or truncated if it exists.

    Every value is clamped to the range [0.05, 0.95] before being written.
    NOTE(review): presumably this keeps a confidently wrong prediction from
    collapsing the Likelihood metric -- confirm against the evaluator.
    """
    # ndarray.clip returns a new array, so the caller's array is not modified.
    clipped = predicted_proba.clip(0.05, 0.95)
    # The context manager guarantees the file is flushed and closed even if
    # formatting a row raises.
    with open(path, "w") as out_file:
        out_file.writelines(f"{row[1]}\n" for row in clipped)
# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
# The input files are tab-separated (in-header.tsv names the two columns
# PostText and Timestamp), so the separator is given explicitly here just as
# it is for the dev-0 / test-A reads below. QUOTE_NONE keeps quote characters
# inside posts from being treated as field delimiters.
training_data = pd.read_csv(
    'train/in.tsv.xz',
    header=None,
    names=["text", "date"],
    sep='\t',
    quoting=csv.QUOTE_NONE,
)
y_train = pd.read_csv('train/expected.tsv', header=None, sep=' ')

# Bag-of-words counts; astype('U') turns missing cells into strings so the
# vectorizer never sees NaN.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(training_data['text'].values.astype('U'))

# Transform the count matrix to a normalized tf-idf representation.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Fit the classifier. The labels are flattened to 1-D because they are read
# as a single-column DataFrame and scikit-learn expects a 1-D target here.
knn = KNeighborsClassifier(n_neighbors=15)
clf = knn.fit(X_train_tfidf, y_train.values.ravel())

# ---------------------------------------------------------------------------
# Prediction
# ---------------------------------------------------------------------------
# Both evaluation splits go through exactly the same steps, so they share one
# loop; each split's predictions are written to <split>/out.tsv.
for split in ("dev-0", "test-A"):
    # NOTE(review): error_bad_lines is deprecated since pandas 1.3 and removed
    # in 2.0 (replacement: on_bad_lines='skip'). It is kept because the pandas
    # version this project runs on is not visible here. Skipped lines also
    # make the output shorter than the input -- confirm this is acceptable.
    split_data = pd.read_csv(
        f"{split}/in.tsv.xz", header=None, sep='\t', error_bad_lines=False
    )
    # Only transform (never fit) here: the vocabulary and idf weights were
    # already learned from the training data.
    X_new_counts = count_vect.transform(split_data[0].values.astype('U'))
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    print(f"Generating {split} output...")
    predicted_proba = knn.predict_proba(X_new_tfidf)
    generate_output(predicted_proba, f"{split}/out.tsv")
    print(f"Generated {split} output!")

1
out-header.tsv Executable file
View File

@ -0,0 +1 @@
Label
1 Label

BIN
test-A/in.tsv.xz Executable file

Binary file not shown.

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

289579
train/expected.tsv Executable file

File diff suppressed because it is too large Load Diff

BIN
train/in.tsv.xz Executable file

Binary file not shown.