from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
warnings.filterwarnings(action = 'ignore')
import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# Read the raw training text
with open("train/in.tsv", "r") as sample:
    s = sample.read()
# Replace newlines with spaces so the whole file tokenizes as one text
f = s.replace("\n", " ")
data = []
# Iterate over each sentence in the text
for i in sent_tokenize(f):
    temp = []
    # Tokenize the sentence into lower-cased words
    for j in word_tokenize(i):
        temp.append(j.lower())
    data.append(temp)
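# Quick sanity check on the tokenized corpus (a minimal sketch; the printed
# values depend on what train/in.tsv actually contains)
print("sentences:", len(data))
if data:
    print("first sentence tokens:", data[0][:10])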
# Create CBOW Word2Vec models from the tokenized sentences.
# Parameter names follow gensim 3.x (`size`, `iter`); gensim 4.x renamed them
# to `vector_size` and `epochs`.
model1 = gensim.models.Word2Vec(data, min_count=1, size=100, window=5)
w2v_model = gensim.models.Word2Vec(data, size=300, min_count=1, window=5, iter=50)
w2v_model.wv['word']  # look up the 300-dimensional embedding of a single token
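# Minimal usage sketch for the trained embeddings. 'word' is a placeholder
# token; replace it with a token that actually occurs in the corpus.
if 'word' in w2v_model.wv:
    print(w2v_model.wv.most_similar('word', topn=5))  # nearest neighbours by cosine similarity
    print(w2v_model.wv['word'].shape)                 # (300,) with the settings above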
with open("train/in.tsv") as f:
content = f.readlines()
with open("train/expected.tsv") as ff:
y = ff.readlines()
vectorizer = TfidfVectorizer(ngram_range=(1,2), use_idf = False)
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(content)
x=x.toarray()
y=y.toarray()
model = GaussianNB()
model.fit(x,y)
y_pred = model.predict([[0,1]])
print(y_pred)
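# Hedged sketch: scoring an unseen document with the fitted pipeline.
# The example sentence is a made-up placeholder, not taken from the dataset.
new_doc = ["an example document to classify"]
new_x = vectorizer.transform(new_doc).toarray()  # reuse the fitted TF-IDF vocabulary
print(model.predict(new_x))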