{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'text_data' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mw2v_model\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mWord2Vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m300\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmin_count\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwindow\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mw2v_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'word'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'text_data' is not defined" ] } ], "source": [ "from nltk.tokenize import sent_tokenize, word_tokenize\n", "import warnings\n", " \n", "warnings.filterwarnings(action = 'ignore')\n", " \n", "import gensim\n", "from gensim.models import Word2Vec\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "import numpy as np\n", "from sklearn.datasets import load_iris\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.naive_bayes import GaussianNB\n", "\n", "sample = open(\"/train/in.tsv\", \"r\")\n", "s = sample.read()\n", " \n", "# Replaces escape character with space\n", "f = s.replace(\"\\n\", \" \")\n", " \n", "data = []\n", " \n", "# iterate through each sentence in the file\n", "for i in sent_tokenize(f):\n", " temp = []\n", " \n", " # tokenize the sentence into words\n", " for j in word_tokenize(i):\n", " temp.append(j.lower())\n", " \n", " data.append(temp)\n", " \n", "# Create CBOW model\n", "model1 = gensim.models.Word2Vec(data, min_count = 1, \n", " size = 100, window = 5)\n", " \n", "w2v_model = gensim.models.Word2Vec(text_data, size=300, min_count=1, window=5, iter=50)\n", "w2v_model.wv['word']\n", "\n", "with open(\"train/in.tsv\") as f:\n", " content = f.readlines()\n", " with open(\"train/expected.tsv\") as ff:\n", " y = ff.readlines()\n", " vectorizer = TfidfVectorizer(ngram_range=(1,2), use_idf = False)\n", " vectorizer = TfidfVectorizer()\n", " x = vectorizer.fit_transform(content)\n", " x=x.toarray()\n", " y=y.toarray()\n", " model = GaussianNB()\n", " model.fit(x,y)\n", " y_pred = model.predict([[0,1]])\n", " print(y_pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", 
"pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }