Compare commits

..

No commits in common. "master" and "master" have entirely different histories.

11 changed files with 0 additions and 229462 deletions

View File

@ -1,203 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a8bcddf9-596c-4493-bf2a-8e32255115ce",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import sklearn\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "da067d47-0543-48b3-bdf4-844061f827c9",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\Programy\\anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3444: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version.\n",
"\n",
"\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n",
"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
]
}
],
"source": [
"# on_bad_lines='warn' skips malformed rows and reports them; it replaces the\n",
"# deprecated error_bad_lines=False, which newer pandas releases no longer accept.\n",
"train = pd.read_csv('train/train.tsv', header=None, sep='\\t', on_bad_lines='warn')\n",
"train = train.head(2000)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "94390d90-898c-42df-8482-0e1b8a3ea706",
"metadata": {},
"outputs": [],
"source": [
"x_train = train[1]\n",
"y_train = train[0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "df870ce3-c258-4de0-bbda-f5d71a53163c",
"metadata": {},
"outputs": [],
"source": [
"# on_bad_lines='warn' skips malformed rows and reports them; it replaces the\n",
"# deprecated error_bad_lines=False, which newer pandas releases no longer accept.\n",
"x_dev = pd.read_csv('dev-0/in.tsv', header=None, sep='\\t', on_bad_lines='warn')\n",
"x_dev = x_dev[0]\n",
"y_dev = pd.read_csv('dev-0/expected.tsv', header=None, sep='\\t', on_bad_lines='warn')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ce5621d9-655a-46d7-b235-8638daac733e",
"metadata": {},
"outputs": [],
"source": [
"vectorizer = TfidfVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "bca4dc07-fdcd-4ae5-8f24-584a3cda3b79",
"metadata": {},
"outputs": [],
"source": [
"x_train = vectorizer.fit_transform(x_train)\n",
"x_dev = vectorizer.transform(x_dev)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "96840a5e-bfb9-4fae-a5f9-7acc0d7e4c53",
"metadata": {},
"outputs": [],
"source": [
"gnb = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "aed08803-9aef-43e2-8ee2-1d79458b49ac",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GaussianNB()"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnb.fit(x_train.toarray(), y_train)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7461bb8d-3b3d-4164-9d47-a62b73dc0e36",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9418561995597946\n"
]
}
],
"source": [
"dev_predicted = gnb.predict(x_dev.toarray())\n",
"\n",
"with open('dev-0/out.tsv', 'wt') as f:\n",
" for i in dev_predicted:\n",
" f.write(str(i)+'\\n')\n",
"\n",
"dev_out = pd.read_csv('dev-0/out.tsv', header=None, sep='\\t')\n",
"dev_expected = pd.read_csv('dev-0/expected.tsv', header=None, sep='\\t')\n",
"print(accuracy_score(dev_out, dev_expected))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2e18bdbe-6d06-42e3-b952-0d5e7bc60325",
"metadata": {},
"outputs": [],
"source": [
"with open('test-A/in.tsv', 'r', encoding = 'utf-8') as f:\n",
" x_test = f.readlines()\n",
" \n",
"x_test = pd.Series(x_test)\n",
"x_test = vectorizer.transform(x_test)\n",
"\n",
"test_predicted = gnb.predict(x_test.toarray())\n",
"\n",
"with open('test-A/out.tsv', 'wt') as f:\n",
" for i in test_predicted:\n",
" f.write(str(i)+'\\n')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9463e664-5f74-4a96-8959-03eb224715e7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook run.ipynb to script\n",
"[NbConvertApp] Writing 1502 bytes to run.py\n"
]
}
],
"source": [
"!jupyter nbconvert --to script run.ipynb"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

203
run.ipynb
View File

@ -1,203 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a8bcddf9-596c-4493-bf2a-8e32255115ce",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import sklearn\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "da067d47-0543-48b3-bdf4-844061f827c9",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\Programy\\anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3444: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version.\n",
"\n",
"\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n",
"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
]
}
],
"source": [
"# on_bad_lines='warn' skips malformed rows and reports them; it replaces the\n",
"# deprecated error_bad_lines=False, which newer pandas releases no longer accept.\n",
"train = pd.read_csv('train/train.tsv', header=None, sep='\\t', on_bad_lines='warn')\n",
"train = train.head(2000)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "94390d90-898c-42df-8482-0e1b8a3ea706",
"metadata": {},
"outputs": [],
"source": [
"x_train = train[1]\n",
"y_train = train[0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "df870ce3-c258-4de0-bbda-f5d71a53163c",
"metadata": {},
"outputs": [],
"source": [
"# on_bad_lines='warn' skips malformed rows and reports them; it replaces the\n",
"# deprecated error_bad_lines=False, which newer pandas releases no longer accept.\n",
"x_dev = pd.read_csv('dev-0/in.tsv', header=None, sep='\\t', on_bad_lines='warn')\n",
"x_dev = x_dev[0]\n",
"y_dev = pd.read_csv('dev-0/expected.tsv', header=None, sep='\\t', on_bad_lines='warn')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ce5621d9-655a-46d7-b235-8638daac733e",
"metadata": {},
"outputs": [],
"source": [
"vectorizer = TfidfVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "bca4dc07-fdcd-4ae5-8f24-584a3cda3b79",
"metadata": {},
"outputs": [],
"source": [
"x_train = vectorizer.fit_transform(x_train)\n",
"x_dev = vectorizer.transform(x_dev)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "96840a5e-bfb9-4fae-a5f9-7acc0d7e4c53",
"metadata": {},
"outputs": [],
"source": [
"gnb = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "aed08803-9aef-43e2-8ee2-1d79458b49ac",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GaussianNB()"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnb.fit(x_train.toarray(), y_train)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7461bb8d-3b3d-4164-9d47-a62b73dc0e36",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9418561995597946\n"
]
}
],
"source": [
"dev_predicted = gnb.predict(x_dev.toarray())\n",
"\n",
"with open('dev-0/out.tsv', 'wt') as f:\n",
" for i in dev_predicted:\n",
" f.write(str(i)+'\\n')\n",
"\n",
"dev_out = pd.read_csv('dev-0/out.tsv', header=None, sep='\\t')\n",
"dev_expected = pd.read_csv('dev-0/expected.tsv', header=None, sep='\\t')\n",
"print(accuracy_score(dev_out, dev_expected))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2e18bdbe-6d06-42e3-b952-0d5e7bc60325",
"metadata": {},
"outputs": [],
"source": [
"with open('test-A/in.tsv', 'r', encoding = 'utf-8') as f:\n",
" x_test = f.readlines()\n",
" \n",
"x_test = pd.Series(x_test)\n",
"x_test = vectorizer.transform(x_test)\n",
"\n",
"test_predicted = gnb.predict(x_test.toarray())\n",
"\n",
"with open('test-A/out.tsv', 'wt') as f:\n",
" for i in test_predicted:\n",
" f.write(str(i)+'\\n')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9463e664-5f74-4a96-8959-03eb224715e7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook run.ipynb to script\n",
"[NbConvertApp] Writing 1502 bytes to run.py\n"
]
}
],
"source": [
"!jupyter nbconvert --to script run.ipynb"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

90
run.py
View File

@ -1,90 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
"""Fit a Gaussian naive Bayes classifier on TF-IDF text features.

The script reads ``train/train.tsv`` (column 0 is used as the label, column 1
as the text), fits the model on the first ``TRAIN_ROWS`` rows, writes one
predicted label per line to ``dev-0/out.tsv`` and ``test-A/out.tsv``, and
prints the accuracy on the dev set.
"""

import numpy as np
import pandas as pd
import sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# Number of leading training rows used to fit the model.
# NOTE(review): presumably capped because GaussianNB needs a dense matrix
# (see the .toarray() calls below) -- confirm before raising it.
TRAIN_ROWS = 2000


def read_tsv(path):
    """Load a headerless tab-separated file into a DataFrame.

    Malformed rows are skipped and reported as warnings.  ``on_bad_lines='warn'``
    is the supported replacement for the deprecated ``error_bad_lines=False``,
    which newer pandas releases no longer accept.
    """
    return pd.read_csv(path, header=None, sep='\t', on_bad_lines='warn')


def write_predictions(path, predictions):
    """Write one predicted label per line to ``path``."""
    with open(path, 'wt', encoding='utf-8') as f:
        f.writelines(str(label) + '\n' for label in predictions)


# --- Load data ---------------------------------------------------------------
train = read_tsv('train/train.tsv').head(TRAIN_ROWS)
x_train = train[1]
y_train = train[0]

# NOTE(review): the dev input goes through read_csv (first column only, bad rows
# skipped) while the test input below is read line by line -- confirm that the
# dev file never loses rows, otherwise predictions and labels will not line up.
x_dev = read_tsv('dev-0/in.tsv')[0]
y_dev = read_tsv('dev-0/expected.tsv')

# --- Vectorize ---------------------------------------------------------------
# The vocabulary is learned from the training texts only and reused unchanged
# for the dev and test texts.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_dev = vectorizer.transform(x_dev)

# --- Train -------------------------------------------------------------------
gnb = GaussianNB()
# The sparse TF-IDF matrix is converted to a dense array before fitting.
gnb.fit(x_train.toarray(), y_train)

# --- Evaluate on dev ---------------------------------------------------------
dev_predicted = gnb.predict(x_dev.toarray())
write_predictions('dev-0/out.tsv', dev_predicted)

# Score against the labels already loaded in y_dev instead of re-reading both
# the expected file and the freshly written predictions from disk.
print(accuracy_score(y_dev[0], dev_predicted))

# --- Predict on test ---------------------------------------------------------
with open('test-A/in.tsv', 'r', encoding='utf-8') as f:
    x_test = f.readlines()

x_test = vectorizer.transform(x_test)
test_predicted = gnb.predict(x_test.toarray())
write_predictions('test-A/out.tsv', test_predicted)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff