projekt

2022-06-18 14:48:44 +02:00 · 2022-06-18 14:48:44 +02:00 · a49e52ee6c
commit a49e52ee6c
5 changed files with 11050 additions and 0 deletions
--- a/expected.tsv
+++ b/expected.tsv
--- a/in.tsv
+++ b/in.tsv
--- a/projekt.docx
+++ b/projekt.docx
--- a/projekt.ipynb
+++ b/projekt.ipynb
@@ -0,0 +1,146 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\szymo\\AppData\\Local\\Temp\\ipykernel_17472\\3947148253.py:28: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
+      "\n",
+      "\n",
+      "  data = pd.read_csv(f, sep='\\t',error_bad_lines=False,names=['isBall','text'])\n",
+      "b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Naiwny bayes:\n",
+      "recal score =  0.9939463822427212\n",
+      "acuracy score =  0.9889948642699926\n",
+      "precision score =  0.9888156008029825\n",
+      "f score score =  0.9913743530764807\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\szymo\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
+      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
+      "\n",
+      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
+      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
+      "Please also refer to the documentation for alternative solver options:\n",
+      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
+      "  n_iter_i = _check_optimize_result(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Regresja logistyczna:\n",
+      "recal score =  0.9979821274142404\n",
+      "acuracy score =  0.9891782831988262\n",
+      "precision score =  0.9852020489470689\n",
+      "f score score =  0.9915509093512818\n"
+     ]
+    }
+   ],
+   "source": [
+    "import lzma\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import gzip\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.naive_bayes import MultinomialNB\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.metrics import recall_score\n",
+    "from sklearn.metrics import precision_score\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "from sklearn.metrics import f1_score\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "\n",
+    "def readFile(filename):\n",
+    "    X_dev = []\n",
+    "    with open(filename, 'r', encoding=\"utf-8\") as dev_in:\n",
+    "        for line in dev_in:\n",
+    "            text = line.split(\"\\t\")[0].strip()\n",
+    "            X_dev.append(text)\n",
+    "    return X_dev\n",
+    "    \n",
+    "def writePred(filename, predictions):\n",
+    "    with open(filename, \"w\") as out_file:\n",
+    "        for pred in predictions:\n",
+    "            out_file.write(str(pred) + \"\\n\")\n",
+    "\n",
+    "with gzip.open('train.tsv.gz', 'rb') as f:\n",
+    "    data = pd.read_csv(f, sep='\\t',error_bad_lines=False,names=['isBall','text'])\n",
+    "\n",
+    "x = data['text']\n",
+    "y = data['isBall']\n",
+    "\n",
+    "x = np.asarray(x)\n",
+    "y = np.asarray(y)\n",
+    "\n",
+    "model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
+    "model.fit(x,y)\n",
+    "\n",
+    "dev = readFile('in.tsv')\n",
+    "pred = model.predict(dev)\n",
+    "trueClass = readFile('expected.tsv')\n",
+    "\n",
+    "trueClass = [int(x) for x in trueClass]\n",
+    "print('Naiwny bayes:')\n",
+    "print(\"recal score = \",recall_score(trueClass,list(pred)))\n",
+    "print(\"acuracy score = \",accuracy_score(trueClass,list(pred)))\n",
+    "print(\"precision score = \",precision_score(trueClass,list(pred)))\n",
+    "print(\"f score score = \",f1_score(trueClass,list(pred)))\n",
+    "\n",
+    "#x = x[:50000]\n",
+    "#y = y[:50000]\n",
+    "model = make_pipeline(TfidfVectorizer(),LogisticRegression())\n",
+    "model.fit(x,y)\n",
+    "pred = model.predict(dev)\n",
+    "\n",
+    "print('\\nRegresja logistyczna:')\n",
+    "print(\"recal score = \",recall_score(trueClass,list(pred)))\n",
+    "print(\"acuracy score = \",accuracy_score(trueClass,list(pred)))\n",
+    "print(\"precision score = \",precision_score(trueClass,list(pred)))\n",
+    "print(\"f score score = \",f1_score(trueClass,list(pred)))\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "393784674bcf6e74f2fc9b1b5fb3713f9bd5fc6f8172c594e5cfa8e8c12849bc"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.9.2 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.2"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/train.tsv.gz
+++ b/train.tsv.gz