52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
import pandas as pd
|
|
import spacy
|
|
from ffn import FFN
|
|
import numpy as np
|
|
from nltk.tokenize import word_tokenize
|
|
# Load the 'en_core_web_sm' spaCy pipeline once at module level;
# word2vec() below calls this shared object for every input it embeds.
sp = spacy.load('en_core_web_sm')
|
|
|
|
|
|
def word2vec(word):
    """Return the spaCy vector for *word*.

    The text is run through the module-level pipeline ``sp`` and the
    ``.vector`` attribute of the resulting document is returned. Despite
    the parameter name, this script also passes whole post texts here.
    """
    doc = sp(word)
    return doc.vector
|
|
|
|
|
|
def create_embeddings_file(data, path, func):
    """Embed every item of *data* with *func* and save the rows as CSV.

    Args:
        data: Iterable of inputs (this script passes a column of post texts).
        path: Destination CSV path.
        func: Callable mapping one item of *data* to one row of values.

    Returns:
        None. The result is written to *path*.
    """
    rows = [func(item) for item in data]
    # index=False: the loader in this file reads the CSV back with a plain
    # pd.read_csv(path), so a written row index would come back as an extra
    # "Unnamed: 0" column and end up as a bogus feature after .to_numpy().
    pd.DataFrame(rows).to_csv(path, index=False)
|
|
|
|
|
|
def load_embeddings_file(path):
    """Read the CSV at *path* and return its contents as a DataFrame."""
    frame = pd.read_csv(path)
    return frame
|
|
|
|
|
|
# --- Load the raw splits ----------------------------------------------------
# NOTE(review): read_csv is called without header=None, so the first row of
# each TSV is consumed as a header and then overwritten by the column names
# assigned below. Confirm the files really start with a header row.
train_data = pd.read_csv("train/in.tsv", sep='\t')
train_data.columns = ['PostText', 'Timestamp']
train_expected = pd.read_csv("train/expected.tsv", sep='\t')
train_expected.columns = ['Label']

test_data = pd.read_csv("test-A/in.tsv", sep='\t')
test_data.columns = ['PostText', 'Timestamp']

dev_data = pd.read_csv('dev-0/in.tsv', sep='\t')
dev_data.columns = ['PostText', 'Timestamp']
dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\t')
dev_expected.columns = ['Label']

# --- Compute and cache one embedding row per post ---------------------------
create_embeddings_file(dev_data['PostText'], 'dev-0/embeddings.csv', word2vec)
create_embeddings_file(test_data['PostText'], 'test-A/embeddings.csv', word2vec)
create_embeddings_file(train_data['PostText'], 'train/embeddings.csv', word2vec)

# Keep the embedding arrays under their own names. Rebinding train_data /
# dev_data / test_data to NumPy arrays made the later train_data['PostText']
# lookups raise IndexError, because an ndarray cannot be indexed by a column
# name.
train_embeddings = load_embeddings_file('train/embeddings.csv').to_numpy()
dev_embeddings = load_embeddings_file('dev-0/embeddings.csv').to_numpy()
test_embeddings = load_embeddings_file('test-A/embeddings.csv').to_numpy()

# --- Train and evaluate -----------------------------------------------------
# NOTE(review): the meaning of FFN's positional arguments is defined in the
# ffn module, which is not visible here.
# NOTE(review): the model is fed tokenized post texts, not the embedding
# arrays loaded above; confirm which input FFN actually expects.
model = FFN(300, 1, 300, 300, 0.01, 4, 100)
model.double()
model.train([np.asarray(word_tokenize(x)) for x in train_data['PostText']], train_expected['Label'])
model.load()
model.double()
# NOTE(review): evaluation runs on the training split and writes train/out.tsv;
# the dev and test splits are never scored in this script.
model.test([np.asarray(word_tokenize(x)) for x in train_data['PostText']], train_expected['Label'], "train/out.tsv")
|