This commit is contained in:
Julian Zabłoński 2022-06-15 10:19:33 +02:00
parent 87a9c7ca30
commit 7fd572eee7
11 changed files with 125375 additions and 125538 deletions

View File

@ -1,10 +1,14 @@
Sport Texts Classification Challenge
====================================
Sport Texts Classification Challenge - Ball
======================
Guess the sport discipline for a Polish article.
Guess whether a Polish sports article is about a sport played with a ball. Evaluation metrics: Accuracy, Likelihood.
Possible disciplines: pilka-nozna, siatkowka, sporty-walki, pilka-reczna, koszykowka, tenis, moto, zimowe. Evaluation metric is Accuracy.
Classes
-------
* `1` — ball
* `0` — no-ball
Directory structure
-------------------

View File

@ -1 +1 @@
--metric LikelihoodHashed --metric Accuracy --precision 5
--metric Likelihood --metric Accuracy --precision 5

File diff suppressed because it is too large Load Diff

10886
dev-0/in.tsv

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

167
run.ipynb
View File

@ -1,167 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.pipeline import make_pipeline"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\User\\anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3444: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version.\n",
"\n",
"\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n",
"b'Skipping line 3249: expected 2 fields, saw 3\\nSkipping line 66393: expected 2 fields, saw 3\\nSkipping line 76415: expected 2 fields, saw 3\\n'\n"
]
}
],
"source": [
"# Load the training set: column 0 is the label, column 1 is the article text.\n",
"# on_bad_lines='warn' replaces the deprecated error_bad_lines=False (removed in pandas 2.0):\n",
"# rows with an unexpected number of fields are skipped and reported instead of aborting the load.\n",
"data = pd.read_csv('train/train.tsv', sep='\\t', header=None, on_bad_lines='warn')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"X = data[1]\n",
"\n",
"with open('dev-0/in.tsv', 'r', encoding='utf8') as f:\n",
" Xdev = f.readlines()\n",
"Xdev = pd.Series(Xdev)\n",
"\n",
"with open('test-A/in.tsv', 'r', encoding='utf8') as f:\n",
" Xtest = f.readlines()\n",
"Xtest = pd.Series(Xtest)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"y = data[0].astype('string')\n",
"\n",
"ydev = pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None)\n",
"ydev = ydev.squeeze()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"model = make_pipeline(TfidfVectorizer(), MultinomialNB())"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),\n",
" ('multinomialnb', MultinomialNB())])"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.fit(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"predictions_dev0 = model.predict(Xdev)\n",
"predictions_dev0 = pd.Series(predictions_dev0)\n",
"predictions_dev0 = predictions_dev0"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"with open('dev-0/out.tsv', 'wt') as f:\n",
" for pred in predictions_dev0:\n",
" f.write(str(pred)+'\\n')"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"predictions_testA = model.predict(Xtest)\n",
"predictions_testA = pd.Series(predictions_testA)\n",
"predictions_testA = predictions_testA"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"with open('test-A/out.tsv', 'wt') as f:\n",
" for pred in predictions_testA:\n",
" f.write(str(pred)+'\\n')"
]
}
],
"metadata": {
"interpreter": {
"hash": "f08154012ddadd8e950e6e9e035c7a7b32c136e7647e9b7c77e02eb723a8bedb"
},
"kernelspec": {
"display_name": "Python 3.9.7 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

8
run.py
View File

@ -21,7 +21,7 @@ Xtest = pd.Series(Xtest)
y = data[0].astype('string')
y = data[0].astype('str')
ydev = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None)
ydev = ydev.squeeze()
@ -34,7 +34,7 @@ model.fit(X, y)
predictions_dev0 = model.predict(Xdev)
predictions_dev0 = pd.Series(predictions_dev0)
predictions_dev0 = predictions_dev0
predictions_dev0 = predictions_dev0.astype('int')
with open('dev-0/out.tsv', 'wt') as f:
@ -44,10 +44,10 @@ with open('dev-0/out.tsv', 'wt') as f:
predictions_testA = model.predict(Xtest)
predictions_testA = pd.Series(predictions_testA)
predictions_testA = predictions_testA
predictions_testA = predictions_testA.astype('int')
with open('test-A/out.tsv', 'wt') as f:
for pred in predictions_testA:
f.write(str(pred)+'\n')
f.write(str(pred)+'\n')

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.