# TAU_22_sane_words_torch_nn/s.py
import torch
from torch import nn
from torch import optim
import pandas
import numpy as np
import re
import timeit
import subprocess
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from torch.utils.data import Dataset, DataLoader
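# Binary "sane word" classifier: each word is represented by 48 features
# (normalized frequency, vowel ratio, Polish-diacritic ratio, word length,
# one-hot domain, bag-of-characters) and fed to a 48-24-24-1 sigmoid MLP
# trained with BCELoss; the decision threshold is tuned against geval on dev-0.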
# 48 input features: 4 numeric + 6 one-hot domain labels + 38 one-hot character labels
model = nn.Sequential(
    nn.Linear(48, 24, bias=True),
    nn.ReLU(),
    nn.Linear(24, 24, bias=True),
    nn.ReLU(),
    nn.Linear(24, 1, bias=True),
    nn.Sigmoid())
criterion = nn.BCELoss(reduction='sum')
optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
#optimizer = optim.Adam(model.parameters())
minibatch_size = 200
def count_polish_diacritics(x):
    """Ratio of Polish diacritic characters (ąćęłńóśźż) to word length, per word."""
    x_counts = []
    for i, word in x.iteritems():
        c = len(re.findall(r'[ąćęłńóśźż]', str(word)))
        c2 = c / len(str(word))
        x_counts.append(c2)
    return x_counts
def count_vowels(x):
    """Ratio of vowels (including Polish ą, ę, ó) to word length, per word."""
    out = []
    for index, row in x.iteritems():
        vowel_len = len(re.findall(r'[aąeęioóuy]', str(row)))
        word_len = len(str(row))
        out.append(vowel_len / word_len)  # rate, not raw count
    return out
def Normalize(data, d=None):
    """Min-max normalize data; an optional d supplies the reference min/max."""
    if d is None:
        d = data
    r = data - d.min()
    return r / (d.max() - d.min())
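# The dev-0 and test-A features below pass the matching training tensor as d,
# so they are scaled with the training set's min/max rather than their own split's range.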
def f1_score(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    F1 = 2 * (precision * recall) / (precision + recall)
    return F1
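# Note: this should coincide with sklearn.metrics.f1_score(y_true, y_pred, average='micro'),
# since micro-averaged F1 is exactly the harmonic mean of micro precision and recall.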
# Transforms a categorical column into one-hot format; also returns the label order
def ToOneHot(df_col):
    out = []
    df_labels = pandas.unique(df_col)
    l_count = len(df_labels)
    for index, row in df_col.iteritems():
        blank_one_hot = np.full(l_count, 0)
        for i in range(0, l_count):
            if df_labels[i] == row:
                blank_one_hot[i] = 1
        out.append(blank_one_hot)
    out_df = pandas.DataFrame(out, columns=df_labels)
    return out_df, df_labels
# Same as ToOneHot, but encodes against the label order learned on the training set
def ToOneHot_preproces(df_col, df_labels):
    out = []
    l_count = len(df_labels)
    for index, row in df_col.iteritems():
        blank_one_hot = np.full(l_count, 0)
        for i in range(0, l_count):
            if df_labels[i] == row:
                blank_one_hot[i] = 1
        out.append(blank_one_hot)
    out_df = pandas.DataFrame(out, columns=df_labels)
    return out_df
def getAllchars(df_col):
    """Collect every character of every word in the column (for the char vocabulary)."""
    chars = []
    for index, row in df_col.iteritems():
        chars = chars + list(row)
    return chars
def wordToOneHot(df_col, ch_labels):
    """Bag-of-characters: one flag per known character, set if it occurs in the word."""
    out = []
    l_count = len(ch_labels)
    for index, row in df_col.iteritems():
        blank_one_hot = np.full(l_count, 0)
        for ch in list(str(row)):
            for i in range(0, l_count):
                if ch_labels[i] == ch:
                    blank_one_hot[i] = 1
        out.append(blank_one_hot)
    out_df = pandas.DataFrame(out, columns=ch_labels)
    return out_df
class TrainDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
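# TrainDataset is wrapped in a DataLoader further down to draw shuffled
# minibatches of minibatch_size (200) rows during training.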
#Load data:
#Train
train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)
char_labels = pandas.unique(getAllchars(train_data['Word']))
#print(char_labels)
#print(len(char_labels))  # 38 letters
#debug_fq = train_data['Frequency']
x1 = Normalize(torch.tensor(train_data['Frequency'], dtype=torch.float))
x2 = Normalize(torch.tensor(count_vowels(train_data['Word']), dtype=torch.float))
domain_onehot, domain_labels = ToOneHot(train_data['Domain'])
x3 = torch.tensor(domain_onehot.values, dtype=torch.float)
x4 = Normalize(torch.tensor(count_polish_diacritics(train_data['Word']), dtype=torch.float))
x5 = Normalize(torch.tensor(train_data['Word'].str.len(), dtype=torch.float))
df_words_onehot = wordToOneHot(train_data['Word'], char_labels)
x_words_onehot = torch.tensor(df_words_onehot.values, dtype=torch.float)
x_temp1 = torch.stack((x1, x2, x4, x5), 0)
x_temp2 = torch.cat([x_temp1.transpose(1, 0), x3], 1)
x = torch.cat([x_temp2, x_words_onehot], 1)
l = list(["Freq", "Vowels", "pol_dia", "Len"]) + list(domain_labels) + list(char_labels)
print(l)
print(len(l))
#debug_x = pandas.DataFrame(x.numpy(), columns=l)
y = torch.tensor(train_data['Sane'], dtype=torch.float)
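# Final design matrix: one row per word, 48 columns in the order
# [Freq, Vowels, pol_dia, Len] + 6 domain one-hot flags + 38 character one-hot flags.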
#dev0
dev_y_test = pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None)  # read_csv already returns a DataFrame
dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
dev_x1 = Normalize(torch.tensor(dev_data['Frequency'], dtype=torch.float), x1)
dev_x2 = Normalize(torch.tensor(count_vowels(dev_data['Word']), dtype=torch.float), x2)
dev_x3 = torch.tensor(ToOneHot_preproces(dev_data['Domain'], domain_labels).values, dtype=torch.float)
dev_x4 = Normalize(torch.tensor(count_polish_diacritics(dev_data['Word']), dtype=torch.float), x4)
dev_x5 = Normalize(torch.tensor(dev_data['Word'].str.len(), dtype=torch.float), x5)
dev_df_words_onehot = wordToOneHot(dev_data['Word'], char_labels)
dev_x_words_onehot = torch.tensor(dev_df_words_onehot.values, dtype=torch.float)
dev_x_temp = torch.stack((dev_x1, dev_x2, dev_x4, dev_x5), 0)
dev_x_temp2 = torch.cat([dev_x_temp.transpose(1,0), dev_x3], 1)
dev_x = torch.cat([dev_x_temp2, dev_x_words_onehot], 1)
#test-A
testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
testA_x1 = Normalize(torch.tensor(testA_data['Frequency'], dtype=torch.float), x1)
testA_x2 = Normalize(torch.tensor(count_vowels(testA_data['Word']), dtype=torch.float), x2)
testA_x3 = torch.tensor(ToOneHot_preproces(testA_data['Domain'], domain_labels).values, dtype=torch.float)
testA_x4 = Normalize(torch.tensor(count_polish_diacritics(testA_data['Word']), dtype=torch.float), x4)
testA_x5 = Normalize(torch.tensor(testA_data['Word'].str.len(), dtype=torch.float), x5)
testA_df_words_onehot = wordToOneHot(testA_data['Word'], char_labels)
testA_x_words_onehot = torch.tensor(testA_df_words_onehot.values, dtype=torch.float)
testA_x_temp = torch.stack((testA_x1, testA_x2, testA_x4, testA_x5), 0)
testA_x_temp2 = torch.cat([testA_x_temp.transpose(1, 0), testA_x3], 1)
testA_x = torch.cat([testA_x_temp2, testA_x_words_onehot], 1)
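# test-A ships without expected.tsv, so its features are built exactly like dev-0's,
# reusing the training normalization ranges, domain label order and character labels.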
"""
def pred_save_dev():
dev_y = model(dev_x)
file = open("dev-0/out.tsv", "w")
file2 = open("dev-0/out_float.tsv", "w")
for i in range(0, 11026):
file2.write(str(dev_y[i].data.item()) + "\n")
var = dev_y[i].data.item()
if var > threshold:
file.write(f'{1}\n')
else:
file.write(f'{0}\n')
file.close()
file2.close()
"""
def pred_save(name, data_train_x, f_threshold):
    """Write float scores to <name>/out_float.tsv and thresholded 0/1 labels to <name>/out.tsv."""
    pred_y = model(data_train_x)
    file = open(name + "/out.tsv", "w")
    file2 = open(name + "/out_float.tsv", "w")
    for i in range(0, len(data_train_x)):
        file2.write(str(pred_y[i].data.item()) + "\n")
        var = pred_y[i].data.item()
        if var > f_threshold:
            file.write(f'{1}\n')
        else:
            file.write(f'{0}\n')
    file.close()
    file2.close()
def optim_threshold(min_thr, step=0.01):
    """Sweep decision thresholds from min_thr up to 1.0, keeping the best geval score on dev-0."""
    best_thr = min_thr
    best_geval = 0.1
    while min_thr < 1:
        pred_save("dev-0", dev_x, min_thr)
        metric = float(subprocess.check_output(["/home/students/s452101/TAU/geval/geval", "-t", "dev-0"]))
        if metric > best_geval:
            best_geval = metric
            best_thr = min_thr
        min_thr += step
        print("optimTHR; geval metric: ", metric, "\tbest: ", best_geval, "\tthreshold: ", min_thr, "\tbest_thr: ", best_thr)
    return best_thr
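# geval here is presumably the Gonito/GEval challenge evaluation tool; it is invoked
# through an absolute lab-machine path, so the sweep only runs as-is in that environment.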
dataset_train = TrainDataset(x, y)
trainloader = DataLoader(dataset=dataset_train, batch_size=minibatch_size, shuffle=True)
def train_loop(epochs=500, best=0.1, threshold=0.25):
    for epoch in range(epochs):
        for xb, yb_expected in trainloader:
            optimizer.zero_grad()
            yp = model(xb)
            # debug
            """
            debug_xb = pandas.DataFrame(xb.numpy())
            debug_yb_expected = pandas.DataFrame(yb_expected.numpy())
            """
            #debug_yp = pandas.DataFrame(yp.detach().numpy())
            loss = criterion(torch.squeeze(yp), yb_expected)
            """
            dev_y_pred_float_tensor = model(dev_x)
            dev_y_pred_float_df = pandas.DataFrame(dev_y_pred_float_tensor.detach().numpy())
            auc_score = roc_auc_score(dev_y_test, dev_y_pred_float_df)
            print("auc:\t", auc_score, "\tloss:\t", loss.item())
            if auc_score > 0.90:
                break
            """
            #metric = float(subprocess.check_output(["/home/students/s452101/TAU/geval/geval", "-t", "dev-0"]))
            loss.backward()
            optimizer.step()
        pred_save("dev-0", dev_x, threshold)
        metric = float(subprocess.check_output(["/home/students/s452101/TAU/geval/geval", "-t", "dev-0"]))
        print("geval metric: ", metric, "\tbest: ", best, "\tLoss: ", loss.item(), "\tthr: ", threshold)
        if metric > best:
            threshold = optim_threshold(threshold - 0.2)
            metric = float(subprocess.check_output(["/home/students/s452101/TAU/geval/geval", "-t", "dev-0"]))
            best = metric
            pred_save("dev-0/best", dev_x, threshold)
            pred_save("test-A/best", testA_x, threshold)
#4 200 ~7h
#elapsed_time = timeit.timeit(train_loop, number=1)
#print("Training time: ", elapsed_time, "seconds")
train_loop()
#saving results:
#dev0: dev_y_test (the expected labels) was already loaded above; ROC AUC is
#computed from the float scores in out_float.tsv, not the thresholded 0/1 labels.
dev_y_pred = pandas.read_csv('dev-0/out.tsv', encoding="utf-8", delimiter='\t', header=None)
dev_y_pred_float = pandas.read_csv('dev-0/out_float.tsv', encoding="utf-8", delimiter='\t', header=None)
score = f1_score(dev_y_test, dev_y_pred)
print("f1_score_dev0 after training: ", score,
      "\nAcc: ", accuracy_score(dev_y_test, dev_y_pred),
      "\nroc_auc: ", roc_auc_score(dev_y_test, dev_y_pred_float))
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#testA:
testA_y = model(testA_x)
file = open("test-A/out.tsv", "w")
file2 = open("test-A/out_float.tsv", "w")
for i in range(0, len(testA_x)):
    file2.write(str(testA_y[i].data.item()) + "\n")
    if testA_y[i].data.item() > 0.25:  # fixed default threshold; the tuned one is saved under test-A/best
        file.write(f'{1}\n')
    else:
        file.write(f'{0}\n')
file.close()
file2.close()