paranormal-or-skeptic-ISI-p.../.ipynb_checkpoints/Untitled-checkpoint.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "from pathlib import Path\n",
    "\n",
    "# Silence library warnings; kept ahead of the third-party imports, as in the original cell.\n",
    "warnings.filterwarnings(action=\"ignore\")\n",
    "\n",
    "import gensim\n",
    "import numpy as np\n",
    "from gensim.models import Word2Vec\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from sklearn.datasets import load_iris\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "# Both input files are resolved relative to the notebook's working directory.\n",
    "DATA_DIR = Path(\"train\")\n",
    "TRAIN_IN = DATA_DIR / \"in.tsv\"\n",
    "TRAIN_EXPECTED = DATA_DIR / \"expected.tsv\"\n",
    "\n",
    "# Read the corpus and replace newlines with spaces before sentence splitting.\n",
    "# read_text() opens and closes the file, so no handle is left dangling.\n",
    "corpus_text = TRAIN_IN.read_text(encoding=\"utf-8\").replace(\"\\n\", \" \")\n",
    "\n",
    "# One list of lower-cased word tokens per sentence.\n",
    "data = [\n",
    "    [token.lower() for token in word_tokenize(sentence)]\n",
    "    for sentence in sent_tokenize(corpus_text)\n",
    "]\n",
    "\n",
    "# Create CBOW model\n",
    "# NOTE: `size` / `iter` are the gensim 3.x keyword names; gensim >= 4 renamed\n",
    "# them to `vector_size` / `epochs`.\n",
    "model1 = gensim.models.Word2Vec(data, min_count=1, size=100, window=5)\n",
    "\n",
    "# Second, larger embedding trained on the same tokenised sentences\n",
    "# (the original referenced an undefined name `text_data` here).\n",
    "w2v_model = gensim.models.Word2Vec(data, size=300, min_count=1, window=5, iter=50)\n",
    "\n",
    "# Guarded lookup: indexing a token that is not in the vocabulary raises KeyError.\n",
    "word_vector = w2v_model.wv[\"word\"] if \"word\" in w2v_model.wv else None\n",
    "\n",
    "# TF-IDF features: one document per line of in.tsv, one label per line of expected.tsv.\n",
    "with open(TRAIN_IN, encoding=\"utf-8\") as docs_file:\n",
    "    content = docs_file.readlines()\n",
    "with open(TRAIN_EXPECTED, encoding=\"utf-8\") as labels_file:\n",
    "    y = np.array([line.strip() for line in labels_file])\n",
    "\n",
    "# Fail early with a clear message instead of a shape error inside fit().\n",
    "if len(content) != len(y):\n",
    "    raise ValueError(f\"{len(content)} documents but {len(y)} labels\")\n",
    "\n",
    "vectorizer = TfidfVectorizer()\n",
    "# GaussianNB needs a dense array, so the sparse TF-IDF matrix is densified.\n",
    "x = vectorizer.fit_transform(content).toarray()\n",
    "\n",
    "model = GaussianNB()\n",
    "model.fit(x, y)\n",
    "\n",
    "# Sanity check on the first training rows; predict() needs inputs with the\n",
    "# same number of features as the matrix passed to fit().\n",
    "y_pred = model.predict(x[:5])\n",
    "print(y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
update 2021-04-20 18:55:51 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import warnings\n",`
			`"from pathlib import Path\n",`
			`"\n",`
			`"# Silence library warnings; kept ahead of the third-party imports, as in the original cell.\n",`
			`"warnings.filterwarnings(action=\"ignore\")\n",`
			`"\n",`
			`"import gensim\n",`
			`"import numpy as np\n",`
			`"from gensim.models import Word2Vec\n",`
			`"from nltk.tokenize import sent_tokenize, word_tokenize\n",`
			`"from sklearn.datasets import load_iris\n",`
			`"from sklearn.feature_extraction.text import TfidfVectorizer\n",`
			`"from sklearn.model_selection import train_test_split\n",`
			`"from sklearn.naive_bayes import GaussianNB\n",`
			`"\n",`
			`"# Both input files are resolved relative to the notebook's working directory.\n",`
			`"DATA_DIR = Path(\"train\")\n",`
			`"TRAIN_IN = DATA_DIR / \"in.tsv\"\n",`
			`"TRAIN_EXPECTED = DATA_DIR / \"expected.tsv\"\n",`
			`"\n",`
			`"# Read the corpus and replace newlines with spaces before sentence splitting.\n",`
			`"# read_text() opens and closes the file, so no handle is left dangling.\n",`
			`"corpus_text = TRAIN_IN.read_text(encoding=\"utf-8\").replace(\"\\n\", \" \")\n",`
			`"\n",`
			`"# One list of lower-cased word tokens per sentence.\n",`
			`"data = [\n",`
			`"    [token.lower() for token in word_tokenize(sentence)]\n",`
			`"    for sentence in sent_tokenize(corpus_text)\n",`
			`"]\n",`
			`"\n",`
			`"# Create CBOW model\n",`
			`"# NOTE: `size` / `iter` are the gensim 3.x keyword names; gensim >= 4 renamed\n",`
			`"# them to `vector_size` / `epochs`.\n",`
			`"model1 = gensim.models.Word2Vec(data, min_count=1, size=100, window=5)\n",`
			`"\n",`
			`"# Second, larger embedding trained on the same tokenised sentences\n",`
			`"# (the original referenced an undefined name `text_data` here).\n",`
			`"w2v_model = gensim.models.Word2Vec(data, size=300, min_count=1, window=5, iter=50)\n",`
			`"\n",`
			`"# Guarded lookup: indexing a token that is not in the vocabulary raises KeyError.\n",`
			`"word_vector = w2v_model.wv[\"word\"] if \"word\" in w2v_model.wv else None\n",`
			`"\n",`
			`"# TF-IDF features: one document per line of in.tsv, one label per line of expected.tsv.\n",`
			`"with open(TRAIN_IN, encoding=\"utf-8\") as docs_file:\n",`
			`"    content = docs_file.readlines()\n",`
			`"with open(TRAIN_EXPECTED, encoding=\"utf-8\") as labels_file:\n",`
			`"    y = np.array([line.strip() for line in labels_file])\n",`
			`"\n",`
			`"# Fail early with a clear message instead of a shape error inside fit().\n",`
			`"if len(content) != len(y):\n",`
			`"    raise ValueError(f\"{len(content)} documents but {len(y)} labels\")\n",`
			`"\n",`
			`"vectorizer = TfidfVectorizer()\n",`
			`"# GaussianNB needs a dense array, so the sparse TF-IDF matrix is densified.\n",`
			`"x = vectorizer.fit_transform(content).toarray()\n",`
			`"\n",`
			`"model = GaussianNB()\n",`
			`"model.fit(x, y)\n",`
			`"\n",`
			`"# Sanity check on the first training rows; predict() needs inputs with the\n",`
			`"# same number of features as the matrix passed to fit().\n",`
			`"y_pred = model.predict(x[:5])\n",`
			`"print(y_pred)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": []`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.8.3"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 4`
			`}`