Initial
This commit is contained in:
commit
56d1fdc698
8
.gitignore
vendored
Executable file
8
.gitignore
vendored
Executable file
@ -0,0 +1,8 @@
|
|||||||
|
|
||||||
|
*~
|
||||||
|
*.swp
|
||||||
|
*.bak
|
||||||
|
*.pyc
|
||||||
|
*.o
|
||||||
|
.DS_Store
|
||||||
|
.token
|
2
.idea/.gitignore
vendored
Normal file
2
.idea/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/workspace.xml
|
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<settings>
|
||||||
|
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||||
|
<version value="1.0" />
|
||||||
|
</settings>
|
||||||
|
</component>
|
7
.idea/misc.xml
Normal file
7
.idea/misc.xml
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="JavaScriptSettings">
|
||||||
|
<option name="languageLevel" value="ES6" />
|
||||||
|
</component>
|
||||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
|
||||||
|
</project>
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/paranormal-skeptic-tf-idf.iml" filepath="$PROJECT_DIR$/.idea/paranormal-skeptic-tf-idf.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
11
.idea/paranormal-skeptic-tf-idf.iml
Normal file
11
.idea/paranormal-skeptic-tf-idf.iml
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$" />
|
||||||
|
<orderEntry type="inheritedJdk" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
<component name="TestRunnerService">
|
||||||
|
<option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
|
||||||
|
</component>
|
||||||
|
</module>
|
6
.idea/vcs.xml
Normal file
6
.idea/vcs.xml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
13
README.md
Executable file
13
README.md
Executable file
@ -0,0 +1,13 @@
|
|||||||
|
Skeptic vs paranormal subreddits
|
||||||
|
================================
|
||||||
|
|
||||||
|
Classify a Reddit post as coming either from the Skeptic subreddit or from one of the
|
||||||
|
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
|
||||||
|
Glitch-in-the-Matrix, conspiracytheories).
|
||||||
|
|
||||||
|
The output label is the probability that the post comes from a paranormal subreddit.
|
||||||
|
|
||||||
|
Sources
|
||||||
|
-------
|
||||||
|
|
||||||
|
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
|
1
config.txt
Executable file
1
config.txt
Executable file
@ -0,0 +1 @@
|
|||||||
|
--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
|
5272
dev-0/expected.tsv
Executable file
5272
dev-0/expected.tsv
Executable file
File diff suppressed because it is too large
Load Diff
BIN
dev-0/in.tsv.xz
Executable file
BIN
dev-0/in.tsv.xz
Executable file
Binary file not shown.
5272
dev-0/out.tsv
Normal file
5272
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
1
in-header.tsv
Executable file
1
in-header.tsv
Executable file
@ -0,0 +1 @@
|
|||||||
|
PostText Timestamp
|
|
58
main.py
Normal file
58
main.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.feature_extraction.text import TfidfTransformer
|
||||||
|
from sklearn.neighbors import KNeighborsClassifier
|
||||||
|
import pandas as pd
|
||||||
|
import csv
|
||||||
|
|
||||||
|
|
||||||
|
def generate_output(predicted_proba, path):
    """Write one clamped class probability per line to ``path``.

    Args:
        predicted_proba: 2-D NumPy array shaped like the result of
            ``predict_proba``; column 1 (the second class) of every row is
            the value that gets written.
        path: Destination file. It is created, or truncated if it exists.

    Every probability is clamped to the range [0.05, 0.95] before writing,
    presumably so that a confident but wrong prediction cannot wreck a
    likelihood-based score. The clamp works on a copy, so the caller's
    array is not modified.
    """
    # clip() returns a new array instead of overwriting the argument.
    clipped = predicted_proba.clip(0.05, 0.95)

    # The context manager flushes and closes the file even if a write fails.
    with open(path, "w") as out_file:
        out_file.writelines(f"{row[1]}\n" for row in clipped)
|
||||||
|
|
||||||
|
|
||||||
|
# Train a k-nearest-neighbours classifier on TF-IDF features of the training
# texts, then write class probabilities for the dev-0 and test-A splits.

# pandas 1.3 renamed ``error_bad_lines=False`` to ``on_bad_lines="skip"`` and
# pandas 2.0 removed the old spelling, so use whichever this install accepts.
_pandas_major_minor = tuple(int(part) for part in pd.__version__.split(".")[:2])
if _pandas_major_minor >= (1, 3):
    skip_bad_lines = {"on_bad_lines": "skip"}
else:
    skip_bad_lines = {"error_bad_lines": False}

# The input files are tab-separated and unquoted: quote characters are part of
# the text, so quoting is disabled for every split.
training_data = pd.read_csv(
    'train/in.tsv.xz',
    header=None,
    names=["text", "date"],
    sep='\t',
    quoting=csv.QUOTE_NONE,
)
y_train = pd.read_csv('train/expected.tsv', header=None, sep=' ')

# Bag-of-words counts; the cast to unicode turns missing texts into strings,
# which the vectorizer would otherwise reject.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(training_data['text'].values.astype('U'))

# Transform the count matrix into a normalized tf-idf representation.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

knn = KNeighborsClassifier(n_neighbors=15)

# Fit on the first (only) label column as a 1-D array; a one-column DataFrame
# makes scikit-learn emit a column-vector warning.
clf = knn.fit(X_train_tfidf, y_train[0].values)

# --- dev-0 -------------------------------------------------------------------
dev_data = pd.read_csv(
    "dev-0/in.tsv.xz",
    header=None,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    **skip_bad_lines,
)
# Only transform() here: vocabulary and idf weights were fitted on the
# training data and must be reused unchanged.
X_new_counts = count_vect.transform(dev_data[0].values.astype('U'))
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

print("Generating dev-0 output...")
predicted_proba_dev = knn.predict_proba(X_new_tfidf)
generate_output(predicted_proba_dev, "dev-0/out.tsv")
print("Generated dev-0 output!")

# --- test-A ------------------------------------------------------------------
test_data = pd.read_csv(
    "test-A/in.tsv.xz",
    header=None,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    **skip_bad_lines,
)
X_new_counts = count_vect.transform(test_data[0].values.astype('U'))
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

print("Generating test-A output...")
predicted_proba_test = knn.predict_proba(X_new_tfidf)
generate_output(predicted_proba_test, "test-A/out.tsv")
print("Generated test-A output!")
|
1
out-header.tsv
Executable file
1
out-header.tsv
Executable file
@ -0,0 +1 @@
|
|||||||
|
Label
|
|
BIN
test-A/in.tsv.xz
Executable file
BIN
test-A/in.tsv.xz
Executable file
Binary file not shown.
5152
test-A/out.tsv
Normal file
5152
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
289579
train/expected.tsv
Executable file
289579
train/expected.tsv
Executable file
File diff suppressed because it is too large
Load Diff
BIN
train/in.tsv.xz
Executable file
BIN
train/in.tsv.xz
Executable file
Binary file not shown.
Loading…
Reference in New Issue
Block a user