ISI-9 ready-made, MultinomialNB

This commit is contained in:
ksanu 2020-03-23 18:43:43 +01:00
parent f9b346e3fb
commit c05e1e4df7
8 changed files with 76 additions and 0 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
in.tsv
model.pkl
*~
*.swp

2
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
# Default ignored files
/workspace.xml

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/paranormal-or-skeptic.iml" filepath="$PROJECT_DIR$/.idea/paranormal-or-skeptic.iml" />
</modules>
</component>
</project>

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.7" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

View File

@ -0,0 +1,38 @@
import pandas as pd
import numpy as np
import csv
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
#load data:
train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)
texts = train["text"]
y = pd.read_csv("train/expected.tsv", header=None)
#print(y)
#train
X_train_counts = count_vect.fit_transform(texts)
clf = MultinomialNB().fit(X_train_counts, y)
print(texts[0])
print(len(texts))
print(len(y))
#predict
dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)["text"]
testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)["text"]
dev0_new_counts = count_vect.transform(dev0)
testA_new_counts = count_vect.transform(testA)
predicted_dev0 = clf.predict(dev0_new_counts)
predicted_testA = clf.predict(testA_new_counts)
print(len(dev0))
print(len(predicted_dev0))
with open("dev-0/out.tsv"):
print(predicted_dev0)
with open("test-A/out.tsv"):
print(predicted_testA)