This commit is contained in:
szymonj98 2022-06-18 14:48:44 +02:00
commit a49e52ee6c
5 changed files with 11050 additions and 0 deletions

5452
expected.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
in.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
projekt.docx Normal file

Binary file not shown.

146
projekt.ipynb Normal file
View File

@ -0,0 +1,146 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\szymo\\AppData\\Local\\Temp\\ipykernel_17472\\3947148253.py:28: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" data = pd.read_csv(f, sep='\\t',error_bad_lines=False,names=['isBall','text'])\n",
"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Naiwny bayes:\n",
"recal score = 0.9939463822427212\n",
"acuracy score = 0.9889948642699926\n",
"precision score = 0.9888156008029825\n",
"f score score = 0.9913743530764807\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\szymo\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Regresja logistyczna:\n",
"recal score = 0.9979821274142404\n",
"acuracy score = 0.9891782831988262\n",
"precision score = 0.9852020489470689\n",
"f score score = 0.9915509093512818\n"
]
}
],
"source": [
"import lzma\n",
"import pandas as pd\n",
"import numpy as np\n",
"import gzip\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.metrics import recall_score\n",
"from sklearn.metrics import precision_score\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.metrics import f1_score\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"def readFile(filename):\n",
"    \"\"\"Return the first tab-separated field of every line in `filename`.\n",
"\n",
"    The file is opened as UTF-8 text. For each line, everything before the\n",
"    first tab is kept and stripped of surrounding whitespace, so the result\n",
"    holds exactly one string per line of the file.\n",
"    \"\"\"\n",
"    with open(filename, 'r', encoding=\"utf-8\") as handle:\n",
"        return [row.split(\"\\t\")[0].strip() for row in handle]\n",
" \n",
"def writePred(filename, predictions):\n",
"    \"\"\"Write each item of `predictions` to `filename`, one per line.\n",
"\n",
"    Every item is converted with str() and followed by a newline. The file\n",
"    is written as UTF-8 so the output encoding matches what readFile expects\n",
"    instead of depending on the platform default.\n",
"    \"\"\"\n",
"    with open(filename, \"w\", encoding=\"utf-8\") as out_file:\n",
"        out_file.writelines(str(pred) + \"\\n\" for pred in predictions)\n",
"\n",
"def printScores(title, expected, predicted):\n",
"    \"\"\"Print `title` followed by recall, accuracy, precision and F1.\n",
"\n",
"    `expected` holds the true labels and `predicted` the model output;\n",
"    `predicted` is converted to a list before being passed to the metrics.\n",
"    \"\"\"\n",
"    predicted = list(predicted)\n",
"    print(title)\n",
"    print(\"recal score = \",recall_score(expected,predicted))\n",
"    print(\"acuracy score = \",accuracy_score(expected,predicted))\n",
"    print(\"precision score = \",precision_score(expected,predicted))\n",
"    print(\"f score score = \",f1_score(expected,predicted))\n",
"\n",
"# Load the training set. Rows with an unexpected number of fields are\n",
"# skipped with a warning; on_bad_lines='warn' replaces the deprecated\n",
"# error_bad_lines=False and keeps the same skip-and-warn behaviour.\n",
"with gzip.open('train.tsv.gz', 'rb') as f:\n",
"    data = pd.read_csv(f, sep='\\t', on_bad_lines='warn', names=['isBall','text'])\n",
"\n",
"x = np.asarray(data['text'])\n",
"y = np.asarray(data['isBall'])\n",
"\n",
"# Evaluation inputs and their expected labels (one integer label per line).\n",
"dev = readFile('in.tsv')\n",
"trueClass = [int(label) for label in readFile('expected.tsv')]\n",
"\n",
"# Naive Bayes on TF-IDF features.\n",
"model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
"model.fit(x,y)\n",
"pred = model.predict(dev)\n",
"printScores('Naiwny bayes:', trueClass, pred)\n",
"\n",
"# Logistic regression on TF-IDF features. max_iter is raised because the\n",
"# lbfgs solver stopped at its default iteration limit without converging.\n",
"model = make_pipeline(TfidfVectorizer(),LogisticRegression(max_iter=1000))\n",
"model.fit(x,y)\n",
"pred = model.predict(dev)\n",
"printScores('\\nRegresja logistyczna:', trueClass, pred)\n"
]
}
],
"metadata": {
"interpreter": {
"hash": "393784674bcf6e74f2fc9b1b5fb3713f9bd5fc6f8172c594e5cfa8e8c12849bc"
},
"kernelspec": {
"display_name": "Python 3.9.2 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

BIN
train.tsv.gz Normal file

Binary file not shown.