{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text classification: Naive Bayes vs. logistic regression\n",
    "\n",
    "Trains two TF-IDF pipelines (`MultinomialNB` and `LogisticRegression`) on `train.tsv.gz` and scores their predictions for `in.tsv` against the labels in `expected.tsv`.\n",
    "\n",
    "## Previously recorded results\n",
    "\n",
    "Metrics saved from the last run of the original notebook. That run used the default `LogisticRegression` settings, for which lbfgs stopped at its iteration limit, and it skipped 3 malformed training lines (25706, 58881, 73761: 3 tab-separated fields instead of 2). Re-run all cells to refresh these numbers.\n",
    "\n",
    "| Model | Recall | Accuracy | Precision | F1 |\n",
    "|---|---|---|---|---|\n",
    "| Naive Bayes | 0.9939463822427212 | 0.9889948642699926 | 0.9888156008029825 | 0.9913743530764807 |\n",
    "| Logistic regression | 0.9979821274142404 | 0.9891782831988262 | 0.9852020489470689 | 0.9915509093512818 |"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "import lzma  # not used below; kept from the original import list\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.metrics import precision_score\n",
    "from sklearn.metrics import recall_score\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.pipeline import make_pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: input files (relative to the notebook) and solver settings.\n",
    "TRAIN_PATH = \"train.tsv.gz\"\n",
    "DEV_IN_PATH = \"in.tsv\"\n",
    "DEV_EXPECTED_PATH = \"expected.tsv\"\n",
    "\n",
    "# The scikit-learn default (100 iterations) triggered a ConvergenceWarning on this data.\n",
    "LOGREG_MAX_ITER = 1000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def readFile(filename):\n",
    "    \"\"\"Return the first tab-separated field of each line of a UTF-8 text file.\n",
    "\n",
    "    Args:\n",
    "        filename: Path of the file to read.\n",
    "\n",
    "    Returns:\n",
    "        A list with one whitespace-stripped string per line of the file.\n",
    "    \"\"\"\n",
    "    with open(filename, \"r\", encoding=\"utf-8\") as dev_in:\n",
    "        return [line.split(\"\\t\")[0].strip() for line in dev_in]\n",
    "\n",
    "\n",
    "def writePred(filename, predictions):\n",
    "    \"\"\"Write each prediction on its own line of ``filename``.\n",
    "\n",
    "    Args:\n",
    "        filename: Path of the file to create or overwrite.\n",
    "        predictions: Iterable of values; each is converted with ``str``.\n",
    "    \"\"\"\n",
    "    # UTF-8 is set explicitly so the output matches what readFile expects.\n",
    "    with open(filename, \"w\", encoding=\"utf-8\") as out_file:\n",
    "        for pred in predictions:\n",
    "            out_file.write(str(pred) + \"\\n\")\n",
    "\n",
    "\n",
    "def print_scores(title, y_true, y_pred):\n",
    "    \"\"\"Print recall, accuracy, precision and F1 of ``y_pred`` against ``y_true``.\n",
    "\n",
    "    Args:\n",
    "        title: Heading line printed before the four scores.\n",
    "        y_true: Expected labels.\n",
    "        y_pred: Predicted labels, in the same order as ``y_true``.\n",
    "    \"\"\"\n",
    "    y_pred = list(y_pred)\n",
    "    print(title)\n",
    "    print(\"recal score = \", recall_score(y_true, y_pred))\n",
    "    print(\"acuracy score = \", accuracy_score(y_true, y_pred))\n",
    "    print(\"precision score = \", precision_score(y_true, y_pred))\n",
    "    print(\"f score score = \", f1_score(y_true, y_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# on_bad_lines replaces the deprecated error_bad_lines=False;\n",
    "# \"warn\" still skips malformed rows and reports each one.\n",
    "with gzip.open(TRAIN_PATH, \"rb\") as f:\n",
    "    train_df = pd.read_csv(f, sep=\"\\t\", on_bad_lines=\"warn\", names=[\"isBall\", \"text\"])\n",
    "\n",
    "x_train = np.asarray(train_df[\"text\"])\n",
    "y_train = np.asarray(train_df[\"isBall\"])\n",
    "\n",
    "dev = readFile(DEV_IN_PATH)\n",
    "trueClass = [int(label) for label in readFile(DEV_EXPECTED_PATH)]\n",
    "assert len(dev) == len(trueClass), \"in.tsv and expected.tsv must have the same number of lines\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nb_model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
    "nb_model.fit(x_train, y_train)\n",
    "\n",
    "print_scores(\"Naiwny bayes:\", trueClass, nb_model.predict(dev))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "logreg_model = make_pipeline(\n",
    "    TfidfVectorizer(),\n",
    "    LogisticRegression(max_iter=LOGREG_MAX_ITER),\n",
    ")\n",
    "logreg_model.fit(x_train, y_train)\n",
    "\n",
    "print_scores(\"\\nRegresja logistyczna:\", trueClass, logreg_model.predict(dev))"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "393784674bcf6e74f2fc9b1b5fb3713f9bd5fc6f8172c594e5cfa8e8c12849bc"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}