tokenize words
This commit is contained in:
parent
2e150d9a9a
commit
894a4fbebb
27
main.py
27
main.py
@ -1,7 +1,7 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import gzip
|
from gensim import downloader
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
|
||||||
x_labels = (pd.read_csv('in-header.tsv', sep='\t')).columns
|
x_labels = (pd.read_csv('in-header.tsv', sep='\t')).columns
|
||||||
y_labels = (pd.read_csv('out-header.tsv', sep='\t')).columns
|
y_labels = (pd.read_csv('out-header.tsv', sep='\t')).columns
|
||||||
@ -16,3 +16,26 @@ x_test = pd.read_table('test-A/in.tsv', error_bad_lines=False,
|
|||||||
header=None, quoting=3, names=x_labels)
|
header=None, quoting=3, names=x_labels)
|
||||||
|
|
||||||
print(x_train)
|
print(x_train)
|
||||||
|
|
||||||
|
x_train = x_train[x_labels[0]].str.lower()
|
||||||
|
x_dev = x_dev[x_labels[0]].str.lower()
|
||||||
|
x_test = x_test[x_labels[0]].str.lower()
|
||||||
|
y_train = y_train[y_labels[0]]
|
||||||
|
|
||||||
|
x_train = [word_tokenize(x) for x in x_train]
|
||||||
|
x_dev = [word_tokenize(x) for x in x_dev]
|
||||||
|
x_test = [word_tokenize(x) for x in x_test]
|
||||||
|
|
||||||
|
print(x_train)
|
||||||
|
|
||||||
|
|
||||||
|
# w2v = downloader.load('glove-wiki-gigaword-200')
|
||||||
|
|
||||||
|
# def document_vector(doc):
|
||||||
|
# return np.mean([w2v[word] for word in doc if word in w2v] or [np.zeros(w2v.vector_size)], axis=0)
|
||||||
|
|
||||||
|
# for doc in x_train:
|
||||||
|
|
||||||
|
# x_train = [document_vector(doc) for doc in x_train]
|
||||||
|
# x_dev = [document_vector(doc) for doc in x_dev]
|
||||||
|
# x_test = [document_vector(doc) for doc in x_test]
|
||||||
|
Loading…
Reference in New Issue
Block a user