diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb deleted file mode 100644 index 236a5ae..0000000 --- a/.ipynb_checkpoints/Untitled-checkpoint.ipynb +++ /dev/null @@ -1,103 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'text_data' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mw2v_model\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mWord2Vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m300\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmin_count\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwindow\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mw2v_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'word'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'text_data' is not defined" - ] - } - ], - "source": [ - "from nltk.tokenize import sent_tokenize, word_tokenize\n", - "import warnings\n", - " \n", - "warnings.filterwarnings(action = 'ignore')\n", - " \n", - "import gensim\n", - "from gensim.models import Word2Vec\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "import numpy as np\n", - "from sklearn.datasets import load_iris\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.naive_bayes import GaussianNB\n", - "\n", - "sample = open(\"/train/in.tsv\", \"r\")\n", - "s = sample.read()\n", - " \n", - "# Replaces escape character with space\n", - "f = s.replace(\"\\n\", \" \")\n", - " \n", - "data = []\n", - " \n", - "# iterate through each sentence in the file\n", - "for i in sent_tokenize(f):\n", - " temp = []\n", - " \n", - " # tokenize the sentence into words\n", - " for j in word_tokenize(i):\n", - " temp.append(j.lower())\n", - " \n", - " data.append(temp)\n", - " \n", - "# Create CBOW model\n", - "model1 = gensim.models.Word2Vec(data, min_count = 1, \n", - " size = 100, window = 5)\n", - " \n", - "w2v_model = gensim.models.Word2Vec(text_data, size=300, min_count=1, window=5, iter=50)\n", - "w2v_model.wv['word']\n", - "\n", - "with open(\"train/in.tsv\") as f:\n", - " content = f.readlines()\n", - " with open(\"train/expected.tsv\") as ff:\n", - " y = ff.readlines()\n", - " vectorizer = TfidfVectorizer(ngram_range=(1,2), use_idf = False)\n", - " vectorizer = TfidfVectorizer()\n", - " x = vectorizer.fit_transform(content)\n", - " x=x.toarray()\n", - " y=y.toarray()\n", - " model = GaussianNB()\n", - " model.fit(x,y)\n", - " y_pred = model.predict([[0,1]])\n", - " print(y_pred)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}