projekt-uczenie/projekt.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\szymo\\AppData\\Local\\Temp\\ipykernel_17472\\3947148253.py:28: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
      "\n",
      "\n",
      "  data = pd.read_csv(f, sep='\\t',error_bad_lines=False,names=['isBall','text'])\n",
      "b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Naiwny bayes:\n",
      "recal score =  0.9939463822427212\n",
      "acuracy score =  0.9889948642699926\n",
      "precision score =  0.9888156008029825\n",
      "f score score =  0.9913743530764807\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\szymo\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
      "\n",
      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
      "Please also refer to the documentation for alternative solver options:\n",
      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
      "  n_iter_i = _check_optimize_result(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Regresja logistyczna:\n",
      "recal score =  0.9979821274142404\n",
      "acuracy score =  0.9891782831988262\n",
      "precision score =  0.9852020489470689\n",
      "f score score =  0.9915509093512818\n"
     ]
    }
   ],
   "source": [
    "import lzma\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import gzip\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.metrics import recall_score\n",
    "from sklearn.metrics import precision_score\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "def readFile(filename):\n",
    "    \"\"\"Return the first tab-separated field of every line of a UTF-8 text file.\n",
    "\n",
    "    Each line is split on tab characters, only the leading field is kept, and\n",
    "    surrounding whitespace (including the trailing newline) is stripped from it.\n",
    "    The result is a list with one string per line, in file order.\n",
    "    \"\"\"\n",
    "    with open(filename, 'r', encoding=\"utf-8\") as handle:\n",
    "        return [row.split(\"\\t\")[0].strip() for row in handle]\n",
    "    \n",
    "def writePred(filename, predictions):\n",
    "    \"\"\"Write every prediction on its own line to a UTF-8 text file.\n",
    "\n",
    "    Each item of `predictions` is converted with str() and followed by a\n",
    "    newline; the target file is created or truncated. The encoding is fixed\n",
    "    to UTF-8 (the same encoding readFile uses) so the output does not depend\n",
    "    on the platform's default encoding.\n",
    "    \"\"\"\n",
    "    with open(filename, \"w\", encoding=\"utf-8\") as out_file:\n",
    "        out_file.writelines(str(pred) + \"\\n\" for pred in predictions)\n",
    "\n",
    "def _print_scores(title, y_true, y_pred):\n",
    "    \"\"\"Print recall, accuracy, precision and F1 of y_pred against y_true.\n",
    "\n",
    "    `title` is printed first as a heading line, followed by one line per score.\n",
    "    \"\"\"\n",
    "    y_pred = list(y_pred)\n",
    "    print(title)\n",
    "    print(\"recal score = \",recall_score(y_true,y_pred))\n",
    "    print(\"acuracy score = \",accuracy_score(y_true,y_pred))\n",
    "    print(\"precision score = \",precision_score(y_true,y_pred))\n",
    "    print(\"f score score = \",f1_score(y_true,y_pred))\n",
    "\n",
    "# Training data: gzipped TSV read as two columns, label first and text second.\n",
    "# on_bad_lines='warn' replaces the deprecated error_bad_lines=False: rows with\n",
    "# an unexpected number of fields are skipped and reported instead of raising.\n",
    "with gzip.open('train.tsv.gz', 'rb') as f:\n",
    "    data = pd.read_csv(f, sep='\\t', on_bad_lines='warn', names=['isBall','text'])\n",
    "\n",
    "x = np.asarray(data['text'])\n",
    "y = np.asarray(data['isBall'])\n",
    "\n",
    "# Evaluation data: texts come from in.tsv, labels from expected.tsv (parsed as ints).\n",
    "dev = readFile('in.tsv')\n",
    "trueClass = [int(label) for label in readFile('expected.tsv')]\n",
    "\n",
    "model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
    "model.fit(x,y)\n",
    "pred = model.predict(dev)\n",
    "_print_scores('Naiwny bayes:', trueClass, pred)\n",
    "\n",
    "# max_iter is raised above scikit-learn's default because lbfgs stopped at the\n",
    "# iteration limit before converging on this data (ConvergenceWarning).\n",
    "model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))\n",
    "model.fit(x,y)\n",
    "pred = model.predict(dev)\n",
    "_print_scores('\\nRegresja logistyczna:', trueClass, pred)\n"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "393784674bcf6e74f2fc9b1b5fb3713f9bd5fc6f8172c594e5cfa8e8c12849bc"
  },
  "kernelspec": {
   "display_name": "Python 3.9.2 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
projekt 2022-06-18 14:48:44 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 2,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			`"C:\\Users\\szymo\\AppData\\Local\\Temp\\ipykernel_17472\\3947148253.py:28: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",`
			`"\n",`
			`"\n",`
			`" data = pd.read_csv(f, sep='\\t',error_bad_lines=False,names=['isBall','text'])\n",`
			`"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"`
			`]`
			`},`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Naiwny bayes:\n",`
			`"recal score = 0.9939463822427212\n",`
			`"acuracy score = 0.9889948642699926\n",`
			`"precision score = 0.9888156008029825\n",`
			`"f score score = 0.9913743530764807\n"`
			`]`
			`},`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			`"c:\\Users\\szymo\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",`
			`"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",`
			`"\n",`
			`"Increase the number of iterations (max_iter) or scale the data as shown in:\n",`
			`" https://scikit-learn.org/stable/modules/preprocessing.html\n",`
			`"Please also refer to the documentation for alternative solver options:\n",`
			`" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",`
			`" n_iter_i = _check_optimize_result(\n"`
			`]`
			`},`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"\n",`
			`"Regresja logistyczna:\n",`
			`"recal score = 0.9979821274142404\n",`
			`"acuracy score = 0.9891782831988262\n",`
			`"precision score = 0.9852020489470689\n",`
			`"f score score = 0.9915509093512818\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"import lzma\n",`
			`"import pandas as pd\n",`
			`"import numpy as np\n",`
			`"import gzip\n",`
			`"from sklearn.feature_extraction.text import TfidfVectorizer\n",`
			`"from sklearn.naive_bayes import MultinomialNB\n",`
			`"from sklearn.pipeline import make_pipeline\n",`
			`"from sklearn.metrics import recall_score\n",`
			`"from sklearn.metrics import precision_score\n",`
			`"from sklearn.metrics import accuracy_score\n",`
			`"from sklearn.metrics import f1_score\n",`
			`"from sklearn.linear_model import LogisticRegression\n",`
			`"\n",`
			`"def readFile(filename):\n",`
			`" X_dev = []\n",`
			`" with open(filename, 'r', encoding=\"utf-8\") as dev_in:\n",`
			`" for line in dev_in:\n",`
			`" text = line.split(\"\\t\")[0].strip()\n",`
			`" X_dev.append(text)\n",`
			`" return X_dev\n",`
			`" \n",`
			`"def writePred(filename, predictions):\n",`
			`" with open(filename, \"w\") as out_file:\n",`
			`" for pred in predictions:\n",`
			`" out_file.write(str(pred) + \"\\n\")\n",`
			`"\n",`
			`"with gzip.open('train.tsv.gz', 'rb') as f:\n",`
			`" data = pd.read_csv(f, sep='\\t',error_bad_lines=False,names=['isBall','text'])\n",`
			`"\n",`
			`"x = data['text']\n",`
			`"y = data['isBall']\n",`
			`"\n",`
			`"x = np.asarray(x)\n",`
			`"y = np.asarray(y)\n",`
			`"\n",`
			`"model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",`
			`"model.fit(x,y)\n",`
			`"\n",`
			`"dev = readFile('in.tsv')\n",`
			`"pred = model.predict(dev)\n",`
			`"trueClass = readFile('expected.tsv')\n",`
			`"\n",`
			`"trueClass = [int(x) for x in trueClass]\n",`
			`"print('Naiwny bayes:')\n",`
			`"print(\"recal score = \",recall_score(trueClass,list(pred)))\n",`
			`"print(\"acuracy score = \",accuracy_score(trueClass,list(pred)))\n",`
			`"print(\"precision score = \",precision_score(trueClass,list(pred)))\n",`
			`"print(\"f score score = \",f1_score(trueClass,list(pred)))\n",`
			`"\n",`
			`"#x = x[:50000]\n",`
			`"#y = y[:50000]\n",`
			`"model = make_pipeline(TfidfVectorizer(),LogisticRegression())\n",`
			`"model.fit(x,y)\n",`
			`"pred = model.predict(dev)\n",`
			`"\n",`
			`"print('\\nRegresja logistyczna:')\n",`
			`"print(\"recal score = \",recall_score(trueClass,list(pred)))\n",`
			`"print(\"acuracy score = \",accuracy_score(trueClass,list(pred)))\n",`
			`"print(\"precision score = \",precision_score(trueClass,list(pred)))\n",`
			`"print(\"f score score = \",f1_score(trueClass,list(pred)))\n"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"interpreter": {`
			`"hash": "393784674bcf6e74f2fc9b1b5fb3713f9bd5fc6f8172c594e5cfa8e8c12849bc"`
			`},`
			`"kernelspec": {`
			`"display_name": "Python 3.9.2 64-bit",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.9.2"`
			`},`
			`"orig_nbformat": 4`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`