This commit is contained in:
Sebastian 2022-05-11 00:52:19 +02:00
parent 9cb2fb2612
commit 744d368d0b
11 changed files with 229462 additions and 0 deletions

View File

@ -0,0 +1,203 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a8bcddf9-596c-4493-bf2a-8e32255115ce",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import sklearn\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "da067d47-0543-48b3-bdf4-844061f827c9",
"metadata": {},
"outputs": [],
"source": [
"# Load the tab-separated training set (no header row). Malformed rows are skipped\n",
"# with a warning: on_bad_lines='warn' is the supported replacement for\n",
"# error_bad_lines=False, which pandas deprecated in 1.3 and removed in 2.0.\n",
"train = pd.read_csv('train/train.tsv', header=None, sep='\\t', on_bad_lines='warn')\n",
"# Train on the first 2000 rows only.\n",
"train = train.head(2000)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "94390d90-898c-42df-8482-0e1b8a3ea706",
"metadata": {},
"outputs": [],
"source": [
"x_train = train[1]\n",
"y_train = train[0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "df870ce3-c258-4de0-bbda-f5d71a53163c",
"metadata": {},
"outputs": [],
"source": [
"# Dev inputs (first column only) and their expected labels.\n",
"# on_bad_lines='warn' replaces the deprecated error_bad_lines=False (same skip-and-warn behaviour).\n",
"x_dev = pd.read_csv('dev-0/in.tsv', header=None, sep='\\t', on_bad_lines='warn')\n",
"x_dev = x_dev[0]\n",
"y_dev = pd.read_csv('dev-0/expected.tsv', header=None, sep='\\t', on_bad_lines='warn')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ce5621d9-655a-46d7-b235-8638daac733e",
"metadata": {},
"outputs": [],
"source": [
"vectorizer = TfidfVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "bca4dc07-fdcd-4ae5-8f24-584a3cda3b79",
"metadata": {},
"outputs": [],
"source": [
"x_train = vectorizer.fit_transform(x_train)\n",
"x_dev = vectorizer.transform(x_dev)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "96840a5e-bfb9-4fae-a5f9-7acc0d7e4c53",
"metadata": {},
"outputs": [],
"source": [
"gnb = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "aed08803-9aef-43e2-8ee2-1d79458b49ac",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GaussianNB()"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnb.fit(x_train.toarray(), y_train)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7461bb8d-3b3d-4164-9d47-a62b73dc0e36",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9418561995597946\n"
]
}
],
"source": [
"dev_predicted = gnb.predict(x_dev.toarray())\n",
"\n",
"with open('dev-0/out.tsv', 'wt') as f:\n",
" for i in dev_predicted:\n",
" f.write(str(i)+'\\n')\n",
"\n",
"dev_out = pd.read_csv('dev-0/out.tsv', header=None, sep='\\t')\n",
"dev_expected = pd.read_csv('dev-0/expected.tsv', header=None, sep='\\t')\n",
"print(accuracy_score(dev_out, dev_expected))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2e18bdbe-6d06-42e3-b952-0d5e7bc60325",
"metadata": {},
"outputs": [],
"source": [
"with open('test-A/in.tsv', 'r', encoding = 'utf-8') as f:\n",
" x_test = f.readlines()\n",
" \n",
"x_test = pd.Series(x_test)\n",
"x_test = vectorizer.transform(x_test)\n",
"\n",
"test_predicted = gnb.predict(x_test.toarray())\n",
"\n",
"with open('test-A/out.tsv', 'wt') as f:\n",
" for i in test_predicted:\n",
" f.write(str(i)+'\\n')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9463e664-5f74-4a96-8959-03eb224715e7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook run.ipynb to script\n",
"[NbConvertApp] Writing 1502 bytes to run.py\n"
]
}
],
"source": [
"!jupyter nbconvert --to script run.ipynb"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

5452
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

203
run.ipynb Normal file
View File

@ -0,0 +1,203 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a8bcddf9-596c-4493-bf2a-8e32255115ce",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import sklearn\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "da067d47-0543-48b3-bdf4-844061f827c9",
"metadata": {},
"outputs": [],
"source": [
"# Load the tab-separated training set (no header row). Malformed rows are skipped\n",
"# with a warning: on_bad_lines='warn' is the supported replacement for\n",
"# error_bad_lines=False, which pandas deprecated in 1.3 and removed in 2.0.\n",
"train = pd.read_csv('train/train.tsv', header=None, sep='\\t', on_bad_lines='warn')\n",
"# Train on the first 2000 rows only.\n",
"train = train.head(2000)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "94390d90-898c-42df-8482-0e1b8a3ea706",
"metadata": {},
"outputs": [],
"source": [
"x_train = train[1]\n",
"y_train = train[0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "df870ce3-c258-4de0-bbda-f5d71a53163c",
"metadata": {},
"outputs": [],
"source": [
"# Dev inputs (first column only) and their expected labels.\n",
"# on_bad_lines='warn' replaces the deprecated error_bad_lines=False (same skip-and-warn behaviour).\n",
"x_dev = pd.read_csv('dev-0/in.tsv', header=None, sep='\\t', on_bad_lines='warn')\n",
"x_dev = x_dev[0]\n",
"y_dev = pd.read_csv('dev-0/expected.tsv', header=None, sep='\\t', on_bad_lines='warn')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ce5621d9-655a-46d7-b235-8638daac733e",
"metadata": {},
"outputs": [],
"source": [
"vectorizer = TfidfVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "bca4dc07-fdcd-4ae5-8f24-584a3cda3b79",
"metadata": {},
"outputs": [],
"source": [
"x_train = vectorizer.fit_transform(x_train)\n",
"x_dev = vectorizer.transform(x_dev)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "96840a5e-bfb9-4fae-a5f9-7acc0d7e4c53",
"metadata": {},
"outputs": [],
"source": [
"gnb = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "aed08803-9aef-43e2-8ee2-1d79458b49ac",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GaussianNB()"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnb.fit(x_train.toarray(), y_train)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7461bb8d-3b3d-4164-9d47-a62b73dc0e36",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9418561995597946\n"
]
}
],
"source": [
"dev_predicted = gnb.predict(x_dev.toarray())\n",
"\n",
"with open('dev-0/out.tsv', 'wt') as f:\n",
" for i in dev_predicted:\n",
" f.write(str(i)+'\\n')\n",
"\n",
"dev_out = pd.read_csv('dev-0/out.tsv', header=None, sep='\\t')\n",
"dev_expected = pd.read_csv('dev-0/expected.tsv', header=None, sep='\\t')\n",
"print(accuracy_score(dev_out, dev_expected))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2e18bdbe-6d06-42e3-b952-0d5e7bc60325",
"metadata": {},
"outputs": [],
"source": [
"with open('test-A/in.tsv', 'r', encoding = 'utf-8') as f:\n",
" x_test = f.readlines()\n",
" \n",
"x_test = pd.Series(x_test)\n",
"x_test = vectorizer.transform(x_test)\n",
"\n",
"test_predicted = gnb.predict(x_test.toarray())\n",
"\n",
"with open('test-A/out.tsv', 'wt') as f:\n",
" for i in test_predicted:\n",
" f.write(str(i)+'\\n')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9463e664-5f74-4a96-8959-03eb224715e7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook run.ipynb to script\n",
"[NbConvertApp] Writing 1502 bytes to run.py\n"
]
}
],
"source": [
"!jupyter nbconvert --to script run.ipynb"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

90
run.py Normal file
View File

@ -0,0 +1,90 @@
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
# In[2]:
# Load the tab-separated training set (no header row). Malformed rows are
# skipped with a warning: on_bad_lines='warn' is the supported replacement for
# error_bad_lines=False, which pandas deprecated in 1.3 and removed in 2.0.
train = pd.read_csv('train/train.tsv', header=None, sep='\t', on_bad_lines='warn')
# Train on the first 2000 rows only.
# NOTE(review): presumably capped because the classifier below is fitted on a
# dense array -- confirm before raising the limit.
train = train.head(2000)
# In[3]:
# Column 1 is used as the model input, column 0 as the target.
x_train = train[1]
y_train = train[0]
# In[4]:
# Dev inputs (first column only) and their expected labels.
x_dev = pd.read_csv('dev-0/in.tsv', header=None, sep='\t', on_bad_lines='warn')
x_dev = x_dev[0]
y_dev = pd.read_csv('dev-0/expected.tsv', header=None, sep='\t', on_bad_lines='warn')
# In[5]:
# TF-IDF features with the vectorizer's default settings.
vectorizer = TfidfVectorizer()
# In[6]:
# Learn the vocabulary/IDF weights from the training texts, then project the
# dev texts with that same fitted vectorizer (evaluated left to right).
x_train, x_dev = vectorizer.fit_transform(x_train), vectorizer.transform(x_dev)
# In[7]:
# In[8]:
# fit() returns the estimator itself, so construction and training are chained.
# The sparse TF-IDF matrix is densified with .toarray() before fitting.
gnb = GaussianNB().fit(x_train.toarray(), y_train)
# In[9]:
def _write_predictions(path, predictions):
    """Write one predicted label per line to ``path`` (overwrites the file).

    Args:
        path: Destination file path.
        predictions: Iterable of labels; each is converted with ``str()``.
    """
    with open(path, 'wt') as f:
        f.writelines(str(label) + '\n' for label in predictions)


# The model was fitted on a dense array, so the sparse dev matrix is densified too.
dev_predicted = gnb.predict(x_dev.toarray())
_write_predictions('dev-0/out.tsv', dev_predicted)

# Score in memory against the labels already loaded into y_dev instead of
# re-reading both files from disk; accuracy_score expects (y_true, y_pred).
print(accuracy_score(y_dev[0], dev_predicted))
# In[10]:
# Test inputs are read as raw lines, one document per line.
with open('test-A/in.tsv', 'r', encoding = 'utf-8') as f:
    x_test = f.readlines()

x_test = pd.Series(x_test)
x_test = vectorizer.transform(x_test)

test_predicted = gnb.predict(x_test.toarray())
_write_predictions('test-A/out.tsv', test_predicted)

File diff suppressed because it is too large Load Diff

5447
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

98132
train/train.tsv Normal file

File diff suppressed because it is too large Load Diff