Zadanie naiwny bayes

2021-05-09 18:26:42 +02:00 · 2021-05-09 18:26:42 +02:00 · 621d3c74f4
commit 621d3c74f4
parent 756ef4277a
4 changed files with 10639 additions and 0 deletions
--- a/Naiwny_bayes.ipynb
+++ b/Naiwny_bayes.ipynb
@ -0,0 +1,139 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sklearn\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "import numpy as np\n",
+    "from sklearn.naive_bayes import MultinomialNB\n",
+    "from sklearn.preprocessing import LabelEncoder "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def getInput(path):\n",
+    "    with open(path,encoding='utf-8') as f:\n",
+    "          return f.readlines()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/c/Users/mkoci/Desktop/naiwny_bayes\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_in=getInput('./train/in.tsv')\n",
+    "train_expected=getInput('./train/expected.tsv')\n",
+    "test_in=getInput('./test-A/in.tsv')\n",
+    "dev_in=getInput('./dev-0/in.tsv')\n",
+    "dev_expected=getInput('./dev-0/expected.tsv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline = make_pipeline(TfidfVectorizer(),MultinomialNB())\n",
+    "encTransform = LabelEncoder().fit_transform(train_expected)\n",
+    "model = pipeline.fit(train_in, encTransform)\n",
+    "dev_predicted = model.predict(dev_in)\n",
+    "test_predicted = model.predict(test_in)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('./dev-0/out.tsv', \"w\") as result:\n",
+    "            for out in dev_predicted:\n",
+    "                result.write(str(out) + '\\n')\n",
+    "with open('./test-A/out.tsv', \"w\") as result:\n",
+    "            for out in test_predicted:\n",
+    "                result.write(str(out) + '\\n')                "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[NbConvertApp] Converting notebook Naiwny_bayes.ipynb to script\n",
+      "[NbConvertApp] Writing 1337 bytes to Naiwny_bayes.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "!jupyter nbconvert --to script Naiwny_bayes.ipynb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/Naiwny_bayes.py
+++ b/Naiwny_bayes.py
@ -0,0 +1,76 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[46]:
+
+
+import sklearn
+from sklearn.pipeline import make_pipeline
+from sklearn.feature_extraction.text import TfidfVectorizer
+import numpy as np
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.preprocessing import LabelEncoder 
+
+
+# In[47]:
+
+
+def getInput(path):
+    with open(path,encoding='utf-8') as f:
+          return f.readlines()
+
+
+# In[48]:
+
+
+get_ipython().system('pwd')
+
+
+# In[49]:
+
+
+train_in=getInput('./train/in.tsv')
+train_expected=getInput('./train/expected.tsv')
+test_in=getInput('./test-A/in.tsv')
+dev_in=getInput('./dev-0/in.tsv')
+dev_expected=getInput('./dev-0/expected.tsv')
+
+
+# In[50]:
+
+
+pipeline = make_pipeline(TfidfVectorizer(),MultinomialNB())
+encTransform = LabelEncoder().fit_transform(train_expected)
+model = pipeline.fit(train_in, encTransform)
+dev_predicted = model.predict(dev_in)
+test_predicted = model.predict(test_in)
+
+
+# In[ ]:
+
+
+
+
+
+# In[54]:
+
+
+with open('./dev-0/out.tsv', "w") as result:
+            for out in dev_predicted:
+                result.write(str(out) + '\n')
+with open('./test-A/out.tsv', "w") as result:
+            for out in test_predicted:
+                result.write(str(out) + '\n')                
+
+
+# In[55]:
+
+
+get_ipython().system('jupyter nbconvert --to script Naiwny_bayes.ipynb')
+
+
+# In[ ]:
+
+
+
+
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv