147 lines
5.0 KiB
Plaintext
147 lines
5.0 KiB
Plaintext
![]() |
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stderr",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"C:\\Users\\szymo\\AppData\\Local\\Temp\\ipykernel_17472\\3947148253.py:28: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
" data = pd.read_csv(f, sep='\\t',error_bad_lines=False,names=['isBall','text'])\n",
|
||
|
"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Naiwny bayes:\n",
|
||
|
"recal score = 0.9939463822427212\n",
|
||
|
"acuracy score = 0.9889948642699926\n",
|
||
|
"precision score = 0.9888156008029825\n",
|
||
|
"f score score = 0.9913743530764807\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"name": "stderr",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"c:\\Users\\szymo\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
|
||
|
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
|
||
|
"\n",
|
||
|
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
|
||
|
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
|
||
|
"Please also refer to the documentation for alternative solver options:\n",
|
||
|
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
|
||
|
" n_iter_i = _check_optimize_result(\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"\n",
|
||
|
"Regresja logistyczna:\n",
|
||
|
"recal score = 0.9979821274142404\n",
|
||
|
"acuracy score = 0.9891782831988262\n",
|
||
|
"precision score = 0.9852020489470689\n",
|
||
|
"f score score = 0.9915509093512818\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"import lzma\n",
|
||
|
"import pandas as pd\n",
|
||
|
"import numpy as np\n",
|
||
|
"import gzip\n",
|
||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||
|
"from sklearn.naive_bayes import MultinomialNB\n",
|
||
|
"from sklearn.pipeline import make_pipeline\n",
|
||
|
"from sklearn.metrics import recall_score\n",
|
||
|
"from sklearn.metrics import precision_score\n",
|
||
|
"from sklearn.metrics import accuracy_score\n",
|
||
|
"from sklearn.metrics import f1_score\n",
|
||
|
"from sklearn.linear_model import LogisticRegression\n",
|
||
|
"\n",
|
||
|
"def readFile(filename):\n",
"    \"\"\"Read `filename` as UTF-8 and collect one value per line.\n",
"\n",
"    For every line, the text before the first tab is taken and stripped of\n",
"    surrounding whitespace. The values are returned as a list in file order.\n",
"    \"\"\"\n",
"    with open(filename, 'r', encoding=\"utf-8\") as handle:\n",
"        first_fields = [row.split(\"\\t\")[0].strip() for row in handle]\n",
"    return first_fields\n",
|
||
|
" \n",
|
||
|
"def writePred(filename, predictions):\n",
"    \"\"\"Write each prediction to `filename`, one per line.\n",
"\n",
"    Every item is converted with str() and followed by a newline. The file\n",
"    is written as UTF-8 (the same encoding readFile reads with) instead of\n",
"    relying on the platform's default encoding.\n",
"    \"\"\"\n",
"    with open(filename, \"w\", encoding=\"utf-8\") as out_file:\n",
"        out_file.writelines(str(pred) + \"\\n\" for pred in predictions)\n",
|
||
|
"\n",
|
||
|
"def printScores(title, expected, predicted):\n",
"    \"\"\"Print `title`, then recall, accuracy, precision and F1 of `predicted` against `expected`.\"\"\"\n",
"    print(title)\n",
"    print(\"recal score = \",recall_score(expected,predicted))\n",
"    print(\"acuracy score = \",accuracy_score(expected,predicted))\n",
"    print(\"precision score = \",precision_score(expected,predicted))\n",
"    print(\"f score score = \",f1_score(expected,predicted))\n",
"\n",
"# Load the gzipped, tab-separated training set; columns are named isBall and text.\n",
"with gzip.open('train.tsv.gz', 'rb') as f:\n",
"    # on_bad_lines='warn' replaces the deprecated error_bad_lines=False:\n",
"    # rows with the wrong number of fields are still skipped and reported.\n",
"    data = pd.read_csv(f, sep='\\t', on_bad_lines='warn', names=['isBall','text'])\n",
"\n",
"x = np.asarray(data['text'])\n",
"y = np.asarray(data['isBall'])\n",
"\n",
"# Evaluation inputs and their expected labels (first tab-separated field of each line).\n",
"dev = readFile('in.tsv')\n",
"trueClass = [int(label) for label in readFile('expected.tsv')]\n",
"\n",
"# Naive Bayes on TF-IDF features.\n",
"model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
"model.fit(x,y)\n",
"pred = model.predict(dev)\n",
"printScores('Naiwny bayes:', trueClass, list(pred))\n",
"\n",
"# Logistic regression on the same TF-IDF features.\n",
"# NOTE(review): the recorded run ended with an lbfgs ConvergenceWarning; a larger\n",
"# max_iter would address it but changes the fitted model, so defaults are kept.\n",
"# To shorten the fit while experimenting, train on a prefix such as x[:50000], y[:50000].\n",
"model = make_pipeline(TfidfVectorizer(),LogisticRegression())\n",
"model.fit(x,y)\n",
"pred = model.predict(dev)\n",
"printScores('\\nRegresja logistyczna:', trueClass, list(pred))\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"interpreter": {
|
||
|
"hash": "393784674bcf6e74f2fc9b1b5fb3713f9bd5fc6f8172c594e5cfa8e8c12849bc"
|
||
|
},
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3.9.2 64-bit",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.9.2"
|
||
|
},
|
||
|
"orig_nbformat": 4
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|