update

2021-04-20 18:55:51 +02:00 · 2021-04-20 18:55:51 +02:00 · 8967a904f8
commit 8967a904f8
parent 756ef4277a
10 changed files with 605411 additions and 284307 deletions
--- a/.ipynb_checkpoints/Untitled-checkpoint.ipynb
+++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb
@ -0,0 +1,103 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'text_data' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-3-d179e01d96de>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mw2v_model\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mWord2Vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m300\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmin_count\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwindow\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     16\u001b[0m \u001b[0mw2v_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'word'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'text_data' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
+    "import warnings\n",
+    "  \n",
+    "warnings.filterwarnings(action = 'ignore')\n",
+    "  \n",
+    "import gensim\n",
+    "from gensim.models import Word2Vec\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "import numpy as np\n",
+    "from sklearn.datasets import load_iris\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.naive_bayes import GaussianNB\n",
+    "\n",
+    "sample = open(\"/train/in.tsv\", \"r\")\n",
+    "s = sample.read()\n",
+    "  \n",
+    "# Replace newline characters with spaces\n",
+    "f = s.replace(\"\\n\", \" \")\n",
+    "  \n",
+    "data = []\n",
+    "  \n",
+    "# iterate through each sentence in the file\n",
+    "for i in sent_tokenize(f):\n",
+    "    temp = []\n",
+    "      \n",
+    "    # tokenize the sentence into words\n",
+    "    for j in word_tokenize(i):\n",
+    "        temp.append(j.lower())\n",
+    "  \n",
+    "    data.append(temp)\n",
+    "  \n",
+    "# Create CBOW model\n",
+    "model1 = gensim.models.Word2Vec(data, min_count = 1, \n",
+    "                              size = 100, window = 5)\n",
+    "  \n",
+    "w2v_model = gensim.models.Word2Vec(text_data, size=300, min_count=1, window=5, iter=50)\n",
+    "w2v_model.wv['word']\n",
+    "\n",
+    "with open(\"train/in.tsv\") as f:\n",
+    "    content = f.readlines()\n",
+    "    with open(\"train/expected.tsv\") as ff:\n",
+    "        y = ff.readlines()\n",
+    "        vectorizer = TfidfVectorizer(ngram_range=(1,2), use_idf = False)\n",
+    "        vectorizer = TfidfVectorizer()\n",
+    "        x = vectorizer.fit_transform(content)\n",
+    "        x=x.toarray()\n",
+    "        y=y.toarray()\n",
+    "        model = GaussianNB()\n",
+    "        model.fit(x,y)\n",
+    "        y_pred = model.predict([[0,1]])\n",
+    "        print(y_pred)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/program.py
+++ b/program.py
@ -0,0 +1,30 @@
+import numpy as np
+from sklearn import preprocessing
+from sklearn.naive_bayes import GaussianNB
+from sklearn.feature_extraction.text import TfidfVectorizer
+le=preprocessing.LabelEncoder()
+
+with open("train/in.tsv") as f:
+    data = f.readlines() 
+    vectorizer = TfidfVectorizer(ngram_range=(1,2), use_idf = False)
+    vectorizer = TfidfVectorizer()
+    x = vectorizer.fit_transform(data)
+    X=x.toarray()
+    with open("train/expected.tsv") as ff:
+        Y = ff.readlines() 
+        Y=le.fit_transform(Y)
+        with open("dev-0/in.tsv") as d:
+            fil = d.readlines() 
+            vectorizer = TfidfVectorizer(ngram_range=(1,2), use_idf = False)
+            vectorizer = TfidfVectorizer()
+            r=vectorizer.fit_transform(fil)
+            r=r.toarray()
+            r=r.reshape(-1,1)
+            gnb = GaussianNB()
+            model=gnb.fit(X, Y)
+            y_pred=model.predict(X)
+            print(y_pred)
+            y_pred=np.array(y_pred)
+            t=np.array2string(y_pred, precision=2, separator='\n',suppress_small=True)
+            f = open("dev-0/out.tsv", "a")
+            f.write(t)
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/ex.tsv
+++ b/train/ex.tsv
--- a/train/expected.tsv
+++ b/train/expected.tsv
--- a/train/in.tsv
+++ b/train/in.tsv
--- a/train/innn.tsv
+++ b/train/innn.tsv