naive bayes

This commit is contained in:
domzal 2022-05-08 15:06:12 +02:00
parent 756ef4277a
commit dc2cadc034
11 changed files with 10642 additions and 0 deletions

8
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View File

@ -0,0 +1,19 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="W29" />
<option value="E501" />
<option value="W29" />
<option value="E501" />
<option value="W29" />
<option value="E501" />
<option value="W29" />
<option value="E501" />
</list>
</option>
</inspection_tool>
</profile>
</component>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/paranormal-or-skeptic-ISI-public.iml" filepath="$PROJECT_DIR$/.idea/paranormal-or-skeptic-ISI-public.iml" />
</modules>
</component>
</project>

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

135
run.ipynb Normal file
View File

@ -0,0 +1,135 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"#!/usr/bin/env python\n",
"# coding: utf-8\n",
"\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"import lzma\n",
"\n",
"X_train = lzma.open(\"train/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_train = open('train/expected.tsv').readlines()\n",
"X_dev0 = lzma.open(\"dev-0/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_expected_dev0 = open(\"dev-0/expected.tsv\", \"r\").readlines()\n",
"X_test = lzma.open(\"test-A/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [],
"source": [
"count_vect = CountVectorizer()\n",
"X_train_counts = count_vect.fit_transform(X_train)\n",
"X_dev0_counts = count_vect.transform(X_dev0)\n",
"X_test_counts = count_vect.transform(X_test)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"clf = MultinomialNB().fit(X_train_counts, y_train)\n",
"\n",
"y_predicted_dev0_MNB = clf.predict(X_dev0_counts)\n",
"y_predicted_test_MNB = clf.predict(X_test_counts)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy dev0: 0.8025417298937785\n"
]
}
],
"source": [
"accuracy_dev0_MNB = accuracy_score(y_expected_dev0, y_predicted_dev0_MNB)\n",
"print(f\"Accuracy dev0: {accuracy_dev0_MNB}\")\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"open(\"dev-0/out.tsv\", mode='w').writelines(y_predicted_dev0_MNB)\n",
"open(\"test-A/out.tsv\", mode='w').writelines(y_predicted_test_MNB)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

24
run.py Normal file
View File

@ -0,0 +1,24 @@
#!/usr/bin/env python
# coding: utf-8
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import lzma
X_train = lzma.open("train/in.tsv.xz", mode='rt', encoding='utf-8').readlines()
y_train = open('train/expected.tsv').readlines()
X_dev0 = lzma.open("dev-0/in.tsv.xz", mode='rt', encoding='utf-8').readlines()
y_expected_dev0 = open("dev-0/expected.tsv", "r").readlines()
X_test = lzma.open("test-A/in.tsv.xz", mode='rt', encoding='utf-8').readlines()
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_dev0_counts = count_vect.transform(X_dev0)
X_test_counts = count_vect.transform(X_test)
clf = MultinomialNB().fit(X_train_counts, y_train)
y_predicted_dev0_MNB = clf.predict(X_dev0_counts)
y_predicted_test_MNB = clf.predict(X_test_counts)
open("dev-0/out.tsv", mode='w').writelines(y_predicted_dev0_MNB)
open("test-A/out.tsv", mode='w').writelines(y_predicted_test_MNB)

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff