tokenize words

Maciej Sobkowiak 2021-05-25 22:06:25 +02:00
parent 2e150d9a9a
commit 894a4fbebb

main.py (27 changed lines)

@@ -1,7 +1,7 @@
import pandas as pd
import numpy as np
import gzip
from gensim import downloader
from nltk.tokenize import word_tokenize
x_labels = (pd.read_csv('in-header.tsv', sep='\t')).columns
y_labels = (pd.read_csv('out-header.tsv', sep='\t')).columns
@@ -16,3 +16,26 @@ x_test = pd.read_table('test-A/in.tsv', error_bad_lines=False,
header=None, quoting=3, names=x_labels)
print(x_train)
x_train = x_train[x_labels[0]].str.lower()
x_dev = x_dev[x_labels[0]].str.lower()
x_test = x_test[x_labels[0]].str.lower()
y_train = y_train[y_labels[0]]
x_train = [word_tokenize(x) for x in x_train]
x_dev = [word_tokenize(x) for x in x_dev]
x_test = [word_tokenize(x) for x in x_test]
print(x_train)
# w2v = downloader.load('glove-wiki-gigaword-200')
# def document_vector(doc):
#     # average the 200-dimensional GloVe vectors of in-vocabulary tokens
#     return np.mean([w2v[word] for word in doc if word in w2v] or [np.zeros(200)], axis=0)
# x_train = [document_vector(doc) for doc in x_train]
# x_dev = [document_vector(doc) for doc in x_dev]
# x_test = [document_vector(doc) for doc in x_test]