projekt-uczenie/projekt.ipynb

147 lines
5.0 KiB
Plaintext
Raw Normal View History

2022-06-18 14:48:44 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\szymo\\AppData\\Local\\Temp\\ipykernel_17472\\3947148253.py:28: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" data = pd.read_csv(f, sep='\\t',error_bad_lines=False,names=['isBall','text'])\n",
"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Naiwny bayes:\n",
"recal score = 0.9939463822427212\n",
"acuracy score = 0.9889948642699926\n",
"precision score = 0.9888156008029825\n",
"f score score = 0.9913743530764807\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\szymo\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Regresja logistyczna:\n",
"recal score = 0.9979821274142404\n",
"acuracy score = 0.9891782831988262\n",
"precision score = 0.9852020489470689\n",
"f score score = 0.9915509093512818\n"
]
}
],
"source": [
"import lzma\n",
"import pandas as pd\n",
"import numpy as np\n",
"import gzip\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.metrics import recall_score\n",
"from sklearn.metrics import precision_score\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.metrics import f1_score\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"def readFile(filename):\n",
" X_dev = []\n",
" with open(filename, 'r', encoding=\"utf-8\") as dev_in:\n",
" for line in dev_in:\n",
" text = line.split(\"\\t\")[0].strip()\n",
" X_dev.append(text)\n",
" return X_dev\n",
" \n",
"def writePred(filename, predictions):\n",
" with open(filename, \"w\") as out_file:\n",
" for pred in predictions:\n",
" out_file.write(str(pred) + \"\\n\")\n",
"\n",
"with gzip.open('train.tsv.gz', 'rb') as f:\n",
" data = pd.read_csv(f, sep='\\t',error_bad_lines=False,names=['isBall','text'])\n",
"\n",
"x = data['text']\n",
"y = data['isBall']\n",
"\n",
"x = np.asarray(x)\n",
"y = np.asarray(y)\n",
"\n",
"model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
"model.fit(x,y)\n",
"\n",
"dev = readFile('in.tsv')\n",
"pred = model.predict(dev)\n",
"trueClass = readFile('expected.tsv')\n",
"\n",
"trueClass = [int(x) for x in trueClass]\n",
"print('Naiwny bayes:')\n",
"print(\"recal score = \",recall_score(trueClass,list(pred)))\n",
"print(\"acuracy score = \",accuracy_score(trueClass,list(pred)))\n",
"print(\"precision score = \",precision_score(trueClass,list(pred)))\n",
"print(\"f score score = \",f1_score(trueClass,list(pred)))\n",
"\n",
"#x = x[:50000]\n",
"#y = y[:50000]\n",
"model = make_pipeline(TfidfVectorizer(),LogisticRegression())\n",
"model.fit(x,y)\n",
"pred = model.predict(dev)\n",
"\n",
"print('\\nRegresja logistyczna:')\n",
"print(\"recal score = \",recall_score(trueClass,list(pred)))\n",
"print(\"acuracy score = \",accuracy_score(trueClass,list(pred)))\n",
"print(\"precision score = \",precision_score(trueClass,list(pred)))\n",
"print(\"f score score = \",f1_score(trueClass,list(pred)))\n"
]
}
],
"metadata": {
"interpreter": {
"hash": "393784674bcf6e74f2fc9b1b5fb3713f9bd5fc6f8172c594e5cfa8e8c12849bc"
},
"kernelspec": {
"display_name": "Python 3.9.2 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}