paranormal-or-skeptic-ISI-p.../.ipynb_checkpoints/Untitled-checkpoint.ipynb
Jan Przybylski 8967a904f8 update
2021-04-20 18:55:51 +02:00

4.1 KiB
Raw Blame History

from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
  
warnings.filterwarnings(action = 'ignore')
  
import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Read the whole training file into one string. The context manager closes the
# handle as soon as the read is done (a bare open() would leave it open).
# NOTE(review): this path is absolute ("/train/in.tsv") while the classifier
# code further down reads the relative "train/in.tsv" -- confirm which one is
# intended for this dataset.
with open("/train/in.tsv", "r") as sample:
    s = sample.read()

# Replace newlines with spaces so the sentence tokenizer sees one continuous text
f = s.replace("\n", " ")

# Build the training corpus: one list of lower-cased word tokens per sentence.
data = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(f)
]

# Create CBOW model
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim >= 4.0
# renamed them to `vector_size` and `epochs`. Adjust if the installed gensim
# is newer.
model1 = gensim.models.Word2Vec(data, min_count=1, size=100, window=5)

# Second, larger model trained on the same tokenized corpus. It must be given
# `data` (built above); no `text_data` variable exists in this script.
w2v_model = gensim.models.Word2Vec(data, size=300, min_count=1, window=5, iter=50)

# Look up the embedding of the token 'word'; this raises KeyError when that
# token never occurs in the training corpus.
w2v_model.wv['word']

# Load the training documents and their labels, one entry per line. Each file
# is read and closed on its own, before any modelling starts.
with open("train/in.tsv") as f:
    content = f.readlines()
with open("train/expected.tsv") as ff:
    y = ff.readlines()

# TF-IDF features with default settings. Only one vectorizer is created: a
# previously assigned TfidfVectorizer(ngram_range=(1,2), use_idf=False) was
# overwritten before use, so the default configuration is the effective one.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(content)

# fit_transform returns a sparse matrix; GaussianNB is fitted on a dense array.
# NOTE(review): densifying can need a lot of memory for a large vocabulary.
x = x.toarray()

# readlines() yields a plain list of strings (no .toarray()); convert it to a
# NumPy array and drop the trailing newline from every label.
y = np.array([label.strip() for label in y])

model = GaussianNB()
model.fit(x, y)

# Predict on a row that has the same number of features as the training
# matrix; an arbitrary 2-element vector does not match the fitted vocabulary.
y_pred = model.predict(x[:1])
print(y_pred)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-d179e01d96de> in <module>
     13 
     14 
---> 15 w2v_model = gensim.models.Word2Vec(text_data, size=300, min_count=1, window=5, iter=50)
     16 w2v_model.wv['word']
     17 

NameError: name 'text_data' is not defined