diff --git a/main.py b/main.py
index 8253a72..a1329ab 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import numpy as np
-import gzip
-
+from gensim import downloader
+from nltk.tokenize import word_tokenize
 
 x_labels = (pd.read_csv('in-header.tsv', sep='\t')).columns
 y_labels = (pd.read_csv('out-header.tsv', sep='\t')).columns
@@ -16,3 +16,32 @@ x_test = pd.read_table('test-A/in.tsv', error_bad_lines=False,
                        header=None, quoting=3, names=x_labels)
 
 print(x_train)
+
+# Keep only the first input column, lower-cased. Missing cells are replaced
+# with '' first so that word_tokenize always receives a string.
+x_train = x_train[x_labels[0]].fillna('').str.lower()
+x_dev = x_dev[x_labels[0]].fillna('').str.lower()
+x_test = x_test[x_labels[0]].fillna('').str.lower()
+y_train = y_train[y_labels[0]]
+
+# Split every document into a list of word tokens.
+x_train = [word_tokenize(x) for x in x_train]
+x_dev = [word_tokenize(x) for x in x_dev]
+x_test = [word_tokenize(x) for x in x_test]
+
+print(x_train)
+
+
+# TODO: embed each document as the mean of its GloVe word vectors.
+# Not enabled yet. The zero-vector fallback for documents without any known
+# word must have the same length as the model's vectors (200).
+#
+# w2v = downloader.load('glove-wiki-gigaword-200')
+#
+# def document_vector(doc):
+#     return np.mean([w2v[word] for word in doc if word in w2v]
+#                    or [np.zeros(200)], axis=0)
+#
+# x_train = [document_vector(doc) for doc in x_train]
+# x_dev = [document_vector(doc) for doc in x_dev]
+# x_test = [document_vector(doc) for doc in x_test]