tokenize words
This commit is contained in:
parent
2e150d9a9a
commit
894a4fbebb
27
main.py
27
main.py
@ -1,7 +1,7 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import gzip
|
||||
|
||||
from gensim import downloader
|
||||
from nltk.tokenize import word_tokenize
|
||||
|
||||
# Column labels of the tab-separated files in-header.tsv and out-header.tsv;
# only the resulting column indexes are kept.
in_header = pd.read_csv('in-header.tsv', sep='\t')
x_labels = in_header.columns
out_header = pd.read_csv('out-header.tsv', sep='\t')
y_labels = out_header.columns
|
||||
@ -16,3 +16,26 @@ x_test = pd.read_table('test-A/in.tsv', error_bad_lines=False,
|
||||
header=None, quoting=3, names=x_labels)
|
||||
|
||||
# Debug dump of the training frame as loaded.
print(x_train)

# Reduce every input split to its first labelled column, lower-cased, and the
# training target to its first labelled column.
text_col = x_labels[0]
target_col = y_labels[0]
x_train, x_dev, x_test = [
    frame[text_col].str.lower() for frame in (x_train, x_dev, x_test)
]
y_train = y_train[target_col]
|
||||
|
||||
# Split each document of every split into a list of tokens with NLTK's
# word_tokenize; the splits become plain Python lists of token lists.
x_train = list(map(word_tokenize, x_train))
x_dev = list(map(word_tokenize, x_dev))
x_test = list(map(word_tokenize, x_test))

# Debug dump of the tokenised training documents.
print(x_train)
|
||||
|
||||
|
||||
# w2v = downloader.load('glove-wiki-gigaword-200')
|
||||
|
||||
# def document_vector(doc):
|
||||
#     return np.mean([w2v[word] for word in doc if word in w2v] or [np.zeros(200)], axis=0)
|
||||
|
||||
# for doc in x_train:
|
||||
|
||||
# x_train = [document_vector(doc) for doc in x_train]
|
||||
# x_dev = [document_vector(doc) for doc in x_dev]
|
||||
# x_test = [document_vector(doc) for doc in x_test]
|
||||
|
Loading…
Reference in New Issue
Block a user