Compare commits
No commits in common. "master" and "master" have entirely different histories.
@ -1,203 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "a8bcddf9-596c-4493-bf2a-8e32255115ce",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import sklearn\n",
|
||||
"from sklearn.naive_bayes import GaussianNB\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"from sklearn.metrics import accuracy_score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "da067d47-0543-48b3-bdf4-844061f827c9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"D:\\Programy\\anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3444: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" exec(code_obj, self.user_global_ns, self.user_ns)\n",
|
||||
"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train = pd.read_csv('train/train.tsv', header=None, sep='\\t', error_bad_lines=False)\n",
|
||||
"train = train.head(2000)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "94390d90-898c-42df-8482-0e1b8a3ea706",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x_train = train[1]\n",
|
||||
"y_train = train[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "df870ce3-c258-4de0-bbda-f5d71a53163c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x_dev = pd.read_csv('dev-0/in.tsv', header=None, sep='\\t', error_bad_lines=False)\n",
|
||||
"x_dev = x_dev[0]\n",
|
||||
"y_dev = pd.read_csv('dev-0/expected.tsv', header=None, sep='\\t', error_bad_lines=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "ce5621d9-655a-46d7-b235-8638daac733e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectorizer = TfidfVectorizer()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "bca4dc07-fdcd-4ae5-8f24-584a3cda3b79",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x_train = vectorizer.fit_transform(x_train)\n",
|
||||
"x_dev = vectorizer.transform(x_dev)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "96840a5e-bfb9-4fae-a5f9-7acc0d7e4c53",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gnb = GaussianNB()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "aed08803-9aef-43e2-8ee2-1d79458b49ac",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"GaussianNB()"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"gnb.fit(x_train.toarray(), y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "7461bb8d-3b3d-4164-9d47-a62b73dc0e36",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.9418561995597946\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dev_predicted = gnb.predict(x_dev.toarray())\n",
|
||||
"\n",
|
||||
"with open('dev-0/out.tsv', 'wt') as f:\n",
|
||||
" for i in dev_predicted:\n",
|
||||
" f.write(str(i)+'\\n')\n",
|
||||
"\n",
|
||||
"dev_out = pd.read_csv('dev-0/out.tsv', header=None, sep='\\t')\n",
|
||||
"dev_expected = pd.read_csv('dev-0/expected.tsv', header=None, sep='\\t')\n",
|
||||
"print(accuracy_score(dev_out, dev_expected))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "2e18bdbe-6d06-42e3-b952-0d5e7bc60325",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('test-A/in.tsv', 'r', encoding = 'utf-8') as f:\n",
|
||||
" x_test = f.readlines()\n",
|
||||
" \n",
|
||||
"x_test = pd.Series(x_test)\n",
|
||||
"x_test = vectorizer.transform(x_test)\n",
|
||||
"\n",
|
||||
"test_predicted = gnb.predict(x_test.toarray())\n",
|
||||
"\n",
|
||||
"with open('test-A/out.tsv', 'wt') as f:\n",
|
||||
" for i in test_predicted:\n",
|
||||
" f.write(str(i)+'\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "9463e664-5f74-4a96-8959-03eb224715e7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[NbConvertApp] Converting notebook run.ipynb to script\n",
|
||||
"[NbConvertApp] Writing 1502 bytes to run.py\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!jupyter nbconvert --to script run.ipynb"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
5452
dev-0/out.tsv
5452
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
203
run.ipynb
203
run.ipynb
@ -1,203 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "a8bcddf9-596c-4493-bf2a-8e32255115ce",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import sklearn\n",
|
||||
"from sklearn.naive_bayes import GaussianNB\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"from sklearn.metrics import accuracy_score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "da067d47-0543-48b3-bdf4-844061f827c9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"D:\\Programy\\anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3444: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" exec(code_obj, self.user_global_ns, self.user_ns)\n",
|
||||
"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train = pd.read_csv('train/train.tsv', header=None, sep='\\t', error_bad_lines=False)\n",
|
||||
"train = train.head(2000)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "94390d90-898c-42df-8482-0e1b8a3ea706",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x_train = train[1]\n",
|
||||
"y_train = train[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "df870ce3-c258-4de0-bbda-f5d71a53163c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x_dev = pd.read_csv('dev-0/in.tsv', header=None, sep='\\t', error_bad_lines=False)\n",
|
||||
"x_dev = x_dev[0]\n",
|
||||
"y_dev = pd.read_csv('dev-0/expected.tsv', header=None, sep='\\t', error_bad_lines=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "ce5621d9-655a-46d7-b235-8638daac733e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectorizer = TfidfVectorizer()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "bca4dc07-fdcd-4ae5-8f24-584a3cda3b79",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x_train = vectorizer.fit_transform(x_train)\n",
|
||||
"x_dev = vectorizer.transform(x_dev)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "96840a5e-bfb9-4fae-a5f9-7acc0d7e4c53",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gnb = GaussianNB()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "aed08803-9aef-43e2-8ee2-1d79458b49ac",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"GaussianNB()"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"gnb.fit(x_train.toarray(), y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "7461bb8d-3b3d-4164-9d47-a62b73dc0e36",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.9418561995597946\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dev_predicted = gnb.predict(x_dev.toarray())\n",
|
||||
"\n",
|
||||
"with open('dev-0/out.tsv', 'wt') as f:\n",
|
||||
" for i in dev_predicted:\n",
|
||||
" f.write(str(i)+'\\n')\n",
|
||||
"\n",
|
||||
"dev_out = pd.read_csv('dev-0/out.tsv', header=None, sep='\\t')\n",
|
||||
"dev_expected = pd.read_csv('dev-0/expected.tsv', header=None, sep='\\t')\n",
|
||||
"print(accuracy_score(dev_out, dev_expected))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "2e18bdbe-6d06-42e3-b952-0d5e7bc60325",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('test-A/in.tsv', 'r', encoding = 'utf-8') as f:\n",
|
||||
" x_test = f.readlines()\n",
|
||||
" \n",
|
||||
"x_test = pd.Series(x_test)\n",
|
||||
"x_test = vectorizer.transform(x_test)\n",
|
||||
"\n",
|
||||
"test_predicted = gnb.predict(x_test.toarray())\n",
|
||||
"\n",
|
||||
"with open('test-A/out.tsv', 'wt') as f:\n",
|
||||
" for i in test_predicted:\n",
|
||||
" f.write(str(i)+'\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "9463e664-5f74-4a96-8959-03eb224715e7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[NbConvertApp] Converting notebook run.ipynb to script\n",
|
||||
"[NbConvertApp] Writing 1502 bytes to run.py\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!jupyter nbconvert --to script run.ipynb"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
90
run.py
90
run.py
@ -1,90 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
# In[1]:
|
||||
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import sklearn
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.metrics import accuracy_score
|
||||
|
||||
|
||||
# In[2]:
|
||||
|
||||
|
||||
# Load the tab-separated training file (no header row).
# `on_bad_lines='warn'` replaces the deprecated `error_bad_lines=False`
# (removed in pandas 2.0): rows with an unexpected number of fields are
# skipped and a warning is emitted for each of them, as before.
train = pd.read_csv('train/train.tsv', header=None, sep='\t', on_bad_lines='warn')

# Keep only the first 2000 rows.
# NOTE(review): presumably this bounds the size of the dense matrix produced
# later by `.toarray()` -- confirm before raising the limit.
train = train.head(2000)
|
||||
|
||||
|
||||
# In[3]:
|
||||
|
||||
|
||||
# Column 0 is used as the target passed to the classifier,
# column 1 as the raw text that gets vectorized.
y_train, x_train = train[0], train[1]
|
||||
|
||||
|
||||
# In[4]:
|
||||
|
||||
|
||||
# Load the dev inputs and keep only the first column as the text to classify.
# `on_bad_lines='warn'` replaces the deprecated `error_bad_lines=False`
# (removed in pandas 2.0) and keeps the same skip-and-warn behaviour.
# NOTE(review): any row skipped here would shift the predictions relative to
# dev-0/expected.tsv -- confirm the dev file contains no malformed rows.
x_dev = pd.read_csv('dev-0/in.tsv', header=None, sep='\t', on_bad_lines='warn')[0]

# Expected dev labels, one per row.
y_dev = pd.read_csv('dev-0/expected.tsv', header=None, sep='\t', on_bad_lines='warn')
|
||||
|
||||
|
||||
# In[5]:
|
||||
|
||||
|
||||
# TF-IDF features: the vectorizer is fitted on the training text only and the
# fitted instance is then reused to transform the dev text.
vectorizer = TfidfVectorizer()
x_train, x_dev = vectorizer.fit_transform(x_train), vectorizer.transform(x_dev)
|
||||
|
||||
|
||||
# In[7]:
|
||||
|
||||
|
||||
# Gaussian Naive Bayes classifier, fitted on the densified TF-IDF matrix
# (`.toarray()` converts the vectorizer output to a dense array).
# `fit` returns the estimator itself, so construction and fitting are chained.
gnb = GaussianNB().fit(x_train.toarray(), y_train)
|
||||
|
||||
|
||||
# In[9]:
|
||||
|
||||
|
||||
# Predict a label for every dev document from the densified dev features.
dev_predicted = gnb.predict(x_dev.toarray())

# Persist the predictions, one label per line.
with open('dev-0/out.tsv', 'wt') as out_file:
    out_file.writelines(str(label) + '\n' for label in dev_predicted)

# Score the written predictions against the expected labels.
# accuracy_score's signature is (y_true, y_pred), so the expected labels go
# first; accuracy is symmetric, so the printed value is unchanged.
dev_out = pd.read_csv('dev-0/out.tsv', header=None, sep='\t')
dev_expected = pd.read_csv('dev-0/expected.tsv', header=None, sep='\t')
print(accuracy_score(dev_expected, dev_out))
|
||||
|
||||
|
||||
# In[10]:
|
||||
|
||||
|
||||
# Read the test inputs as raw lines: every physical line of the file becomes
# one document (the trailing newline is kept by readlines()).
with open('test-A/in.tsv', 'r', encoding = 'utf-8') as in_file:
    x_test = pd.Series(in_file.readlines())

# Reuse the already-fitted vectorizer and classifier on the test documents.
x_test = vectorizer.transform(x_test)
test_predicted = gnb.predict(x_test.toarray())

# Persist the predictions, one label per line.
with open('test-A/out.tsv', 'wt') as out_file:
    for label in test_predicted:
        out_file.write(str(label) + '\n')
|
||||
|
File diff suppressed because it is too large
Load Diff
5447
test-A/out.tsv
5447
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
98132
train/train.tsv
98132
train/train.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user