# logistic_regression/main.py
# Last modified: 2021-09-20 22:20:34 +02:00
# (repository-viewer metadata: 52 lines, 1.6 KiB, Python)

import pandas as pd
import spacy
from ffn import FFN
import numpy as np
from nltk.tokenize import word_tokenize
# spaCy English pipeline; its word vectors are what we embed posts with.
sp = spacy.load('en_core_web_sm')


def word2vec(word):
    """Return the spaCy embedding vector for the given text string."""
    doc = sp(word)
    return doc.vector
def create_embeddings_file(data, path, func):
    """Apply *func* to every entry of *data* and cache the results as CSV.

    Args:
        data: iterable of items (e.g. a pandas Series of post texts).
        path: destination CSV file path.
        func: callable mapping one item to a feature vector / sequence.
    """
    rows = [func(line) for line in data]
    # index=False: without it pandas writes the row index as an extra first
    # column, so reading the file back produces N+1 columns and corrupts the
    # embedding matrix downstream.
    pd.DataFrame(rows).to_csv(path, index=False)
def load_embeddings_file(path):
    """Read a previously cached embeddings CSV back into a DataFrame."""
    frame = pd.read_csv(path)
    return frame
# ---------------------------------------------------------------------------
# Load the challenge data. The GEval-style *.tsv files carry no header row,
# so header=None is required — otherwise pandas consumes the first record as
# the column names and silently drops one sample per split.
# ---------------------------------------------------------------------------
train_data = pd.read_csv("train/in.tsv", sep='\t', header=None)
train_data.columns = ['PostText', 'Timestamp']
train_expected = pd.read_csv("train/expected.tsv", sep='\t', header=None)
train_expected.columns = ['Label']
test_data = pd.read_csv("test-A/in.tsv", sep='\t', header=None)
test_data.columns = ['PostText', 'Timestamp']
dev_data = pd.read_csv('dev-0/in.tsv', sep='\t', header=None)
dev_data.columns = ['PostText', 'Timestamp']
dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None)
dev_expected.columns = ['Label']

# Pre-compute spaCy embeddings for every split and cache them on disk.
create_embeddings_file(dev_data['PostText'], 'dev-0/embeddings.csv', word2vec)
create_embeddings_file(test_data['PostText'], 'test-A/embeddings.csv', word2vec)
create_embeddings_file(train_data['PostText'], 'train/embeddings.csv', word2vec)

# Keep the embedding matrices in their own variables. The previous code
# rebound train_data/dev_data/test_data here, which destroyed the raw-text
# DataFrames and made the train_data['PostText'] lookups below crash on a
# plain numpy array.
train_embeddings = load_embeddings_file('train/embeddings.csv').to_numpy()
dev_embeddings = load_embeddings_file('dev-0/embeddings.csv').to_numpy()
test_embeddings = load_embeddings_file('test-A/embeddings.csv').to_numpy()

# FFN hyper-parameters: 300 matches the en_core_web_sm vector size, 1 output
# class. NOTE(review): the meaning of the remaining positional arguments
# (300, 300, 0.01, 4, 100 — presumably hidden sizes, learning rate, epochs,
# batch size) should be confirmed against ffn.FFN's signature.
model = FFN(300, 1, 300, 300, 0.01, 4, 100)
model.double()
model.train([np.asarray(word_tokenize(x)) for x in train_data['PostText']],
            train_expected['Label'])
model.load()
model.double()
model.test([np.asarray(word_tokenize(x)) for x in train_data['PostText']],
           train_expected['Label'], "train/out.tsv")