TAU_22_sane_words_torch_nn_.../s.py
2019-12-04 00:20:52 +01:00

254 lines
8.5 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import torch
import random
from torch import nn
from torch import optim
import pandas
import numpy as np
import re
import timeit
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from torch.utils.data import Dataset, DataLoader
# Network input width, per the original note: 4 scalar features
# + 6 domain one-hot columns + 38 character-presence columns = 48.
_INPUT_WIDTH = 48
_HIDDEN_WIDTH = 96

# Feed-forward binary classifier: 48 -> 96 -> 48 -> 1, with a sigmoid on the
# output so it can be paired with BCELoss below.
model = nn.Sequential(
    nn.Linear(_INPUT_WIDTH, _HIDDEN_WIDTH, bias=True),
    nn.ReLU(),
    nn.Linear(_HIDDEN_WIDTH, _INPUT_WIDTH, bias=True),
    nn.ReLU(),
    nn.Linear(_INPUT_WIDTH, 1, bias=True),
    nn.Sigmoid(),
)

# Binary cross-entropy on the sigmoid probabilities.
criterion = nn.BCELoss()

# Adam with library-default hyper-parameters.
# Alternative left by the author:
#   optim.SGD(model.parameters(), lr=0.00001, momentum=0.9)
optimizer = optim.Adam(model.parameters())

# Number of samples per training mini-batch.
minibatch_size = 200
def count_polish_diacritics(x):
    """Return the share of Polish diacritic characters in each word of ``x``.

    Args:
        x: Iterable of words (e.g. a pandas Series). Each element is
            converted with ``str()`` before counting.

    Returns:
        A list with one float per element: the number of characters matching
        ``[ąćęłńóśźż]`` divided by the word length. Only the lowercase
        letters listed in the pattern are counted. An empty word yields 0.0.
    """
    rates = []
    # Iterate values directly: Series.iteritems() no longer exists in
    # pandas >= 2.0, and the index was never used here.
    for word in x:
        text = str(word)
        if not text:
            # Avoid ZeroDivisionError on an empty string.
            rates.append(0.0)
            continue
        hits = len(re.findall(r'[ąćęłńóśźż]', text))
        rates.append(hits / len(text))
    return rates
def count_vowels(x):
    """Return the vowel rate (vowels / length) of each word in ``x``.

    Args:
        x: Iterable of words (e.g. a pandas Series). Each element is
            converted with ``str()`` before counting.

    Returns:
        A list with one float per element: the number of characters matching
        ``[aąeęioóuy]`` divided by the word length. Only the lowercase
        letters listed in the pattern are counted. An empty word yields 0.0.
    """
    out = []
    # Iterate values directly: Series.iteritems() no longer exists in
    # pandas >= 2.0, and the index was never used here.
    for word in x:
        text = str(word)
        if not text:
            # Avoid ZeroDivisionError on an empty string.
            out.append(0.0)
            continue
        vowel_len = len(re.findall(r'[aąeęioóuy]', text))
        out.append(vowel_len / len(text))  # a rate, not a raw count
    return out
def Normalize(data, d=None):
    """Min-max scale ``data`` using the minimum and maximum of ``d``.

    Args:
        data: Values to scale; must support ``-`` and ``/`` with the results
            of ``d.min()`` / ``d.max()`` (the script passes 1-D float tensors).
        d: Reference whose ``min()`` / ``max()`` define the scale. Defaults
            to ``data`` itself, which maps ``data`` onto [0, 1].

    Returns:
        ``(data - d.min()) / (d.max() - d.min())``. When the reference is
        constant (zero range) the shifted values ``data - d.min()`` are
        returned unscaled instead of dividing by zero, which would otherwise
        yield NaN/inf values.
    """
    if d is None:
        d = data
    low = d.min()
    span = d.max() - low
    shifted = data - low
    if span == 0:
        # Constant reference column: no meaningful scale exists.
        return shifted
    return shifted / span
def f1_score(y_true, y_pred):
    """Return the F1 score built from micro-averaged precision and recall.

    Args:
        y_true: Ground-truth labels, in any form accepted by
            ``sklearn.metrics.precision_score``.
        y_pred: Predicted labels, same form as ``y_true``.

    Returns:
        ``2 * P * R / (P + R)``, or 0.0 when both precision and recall are
        zero (the harmonic mean is undefined there and would be 0/0).
    """
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * (precision * recall) / total
def ToOneHot(df_col):
    """Convert a categorical column to one-hot format.

    Args:
        df_col: Column of categorical values (a pandas Series in this script).

    Returns:
        A tuple ``(out_df, df_labels)``: ``df_labels`` is
        ``pandas.unique(df_col)`` (labels in order of first appearance) and
        ``out_df`` is a DataFrame with one column per label and one row per
        input value, holding 1 where the value equals the label, else 0.
    """
    df_labels = pandas.unique(df_col)
    # Iterate values directly: Series.iteritems() no longer exists in
    # pandas >= 2.0, and the index was never used here.
    out = [
        [1 if label == value else 0 for label in df_labels]
        for value in df_col
    ]
    out_df = pandas.DataFrame(out, columns=df_labels)
    return out_df, df_labels
def ToOneHot_preproces(df_col, df_labels):
    """One-hot encode ``df_col`` against an existing, fixed label set.

    Args:
        df_col: Column of categorical values (a pandas Series in this script).
        df_labels: Sequence of labels defining the output columns and their
            order (the script passes the labels found in the training data).

    Returns:
        A DataFrame with one column per label and one row per input value,
        holding 1 where the value equals the label, else 0. A value that
        matches no label produces an all-zero row.
    """
    # Iterate values directly: Series.iteritems() no longer exists in
    # pandas >= 2.0, and the index was never used here.
    out = [
        [1 if label == value else 0 for label in df_labels]
        for value in df_col
    ]
    return pandas.DataFrame(out, columns=df_labels)
def getAllchars(df_col):
    """Return every character of every word in ``df_col`` as one flat list.

    Args:
        df_col: Iterable of words (a pandas Series in this script). Each
            element is converted with ``str()`` first, matching how
            ``wordToOneHot`` reads the same column, so a non-string cell
            contributes the characters of its string form instead of raising.

    Returns:
        A list of single-character strings in input order, duplicates kept.
    """
    chars = []
    # Iterate values directly: Series.iteritems() no longer exists in
    # pandas >= 2.0. extend() keeps this linear instead of re-copying the
    # accumulated list on every word.
    for word in df_col:
        chars.extend(str(word))
    return chars
def wordToOneHot(df_col, ch_labels):
    """Encode each word as a character-presence vector over ``ch_labels``.

    Args:
        df_col: Iterable of words (a pandas Series in this script). Each
            element is converted with ``str()`` before inspection.
        ch_labels: Sequence of characters defining the output columns and
            their order.

    Returns:
        A DataFrame with one column per label and one row per word, holding 1
        where the word contains that character at least once, else 0.
        Characters not present in ``ch_labels`` are ignored.
    """
    out = []
    # Iterate values directly: Series.iteritems() no longer exists in
    # pandas >= 2.0, and the index was never used here.
    for word in df_col:
        present = set(str(word))
        out.append([1 if label in present else 0 for label in ch_labels])
    return pandas.DataFrame(out, columns=ch_labels)
class TrainDataset(Dataset):
    """Dataset pairing each row of ``X`` with the matching entry of ``y``."""

    def __init__(self, X, y):
        """Store the feature container ``X`` and the target container ``y``."""
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from the first axis of ``X``."""
        n_rows = self.X.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target
# Load data and build the feature matrices for train, dev-0 and test-A.


def _scalar_features(frame):
    """Return the raw scalar feature columns of ``frame`` as float tensors.

    Order: frequency, vowel rate, Polish-diacritic rate, word length.
    """
    return [
        torch.tensor(frame['Frequency'].values, dtype=torch.float),
        torch.tensor(count_vowels(frame['Word']), dtype=torch.float),
        torch.tensor(count_polish_diacritics(frame['Word']), dtype=torch.float),
        torch.tensor(frame['Word'].str.len().values, dtype=torch.float),
    ]


def _feature_matrix(frame, reference, domain_labels, char_labels):
    """Build the model input matrix for ``frame``.

    Columns are, in order: the four scalar features min-max scaled with the
    statistics of ``reference`` (a list of raw columns as produced by
    ``_scalar_features``), the domain one-hot columns, and the
    character-presence columns.
    """
    scaled = [
        Normalize(column, ref)
        for column, ref in zip(_scalar_features(frame), reference)
    ]
    domains = torch.tensor(
        ToOneHot_preproces(frame['Domain'], domain_labels).values,
        dtype=torch.float)
    chars = torch.tensor(
        wordToOneHot(frame['Word'], char_labels).values, dtype=torch.float)
    # stack(..., 1) puts one sample per row, one scalar feature per column.
    return torch.cat([torch.stack(scaled, 1), domains, chars], 1)


# Train
train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)
# Wrap the list in a Series: pandas.unique on a plain list is deprecated.
char_labels = pandas.unique(pandas.Series(getAllchars(train_data['Word'])))
domain_labels = pandas.unique(train_data['Domain'])

# Raw (unscaled) train columns are the reference for every split. The
# previous code passed the already-normalized train columns as the reference,
# whose min/max are 0 and 1, so dev/test features were effectively left
# unscaled while the train features were scaled.
train_reference = _scalar_features(train_data)
x = _feature_matrix(train_data, train_reference, domain_labels, char_labels)

feature_names = list(["Freq", "Vovels", "pol_dia", "Len"]) + list(domain_labels) + list(char_labels)
print(feature_names)
print(len(feature_names))

y = torch.tensor(train_data['Sane'].values, dtype=torch.float)

# dev-0
dev_y_test = pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None)
dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
dev_x = _feature_matrix(dev_data, train_reference, domain_labels, char_labels)

# test-A
testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
testA_x = _feature_matrix(testA_data, train_reference, domain_labels, char_labels)

dataset_train = TrainDataset(x, y)
trainloader = DataLoader(dataset=dataset_train, batch_size=minibatch_size, shuffle=True)
def train_loop(i=20):
    """Train the global ``model`` for up to ``i`` epochs with early stopping.

    For every mini-batch from ``trainloader`` the loss is computed, the model
    is scored on the dev set (ROC AUC against ``dev_y_test``), and both values
    are printed. As soon as the dev AUC exceeds 0.9 training stops, before the
    current batch's gradient step is applied.

    Args:
        i: Maximum number of passes over ``trainloader``.
    """
    for _epoch in range(i):
        for xb, yb_expected in trainloader:
            optimizer.zero_grad()
            yp = model(xb)
            # Drop only the trailing feature axis: a bare squeeze() would also
            # collapse a final batch of one sample to a 0-dim tensor, which no
            # longer matches the shape of the target.
            loss = criterion(yp.squeeze(1), yb_expected)

            # Dev scoring needs no gradients; skip building the autograd graph.
            with torch.no_grad():
                dev_scores = model(dev_x).squeeze(1).numpy()
            auc_score = roc_auc_score(dev_y_test, dev_scores)
            print("auc:\t", auc_score, "\tloss:\t", loss.item())
            if auc_score > 0.9:
                return

            loss.backward()
            optimizer.step()


# Author's timing note, kept verbatim: "4 200 ~7h"
# (presumably 4 epochs at batch size 200 took about 7 hours - unconfirmed).
def _write_predictions(scores, labels_path, scores_path, threshold=0.999):
    """Write one raw score per line to ``scores_path`` and the thresholded
    0/1 label per line to ``labels_path`` (1 when score > ``threshold``)."""
    with open(labels_path, "w") as labels_file, open(scores_path, "w") as scores_file:
        for value in scores:
            scores_file.write(str(value) + "\n")
            labels_file.write(("1" if value > threshold else "0") + "\n")


elapsed_time = timeit.timeit(train_loop, number=1)
print("Training time: ", elapsed_time, "seconds")

# Saving results.
# NOTE(review): the 0.999 decision threshold is kept from the original code;
# it is unusually strict for a sigmoid output - confirm it is intentional.

# dev-0: inference only, so no gradients are needed.
with torch.no_grad():
    dev_y = model(dev_x)
# Use every row the model produced instead of a hard-coded row count.
dev_y_pred_float = dev_y.reshape(-1).tolist()
_write_predictions(dev_y_pred_float, "dev-0/out.tsv", "dev-0/out_float.tsv")

y_test = pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None)
dev_y_pred = pandas.read_csv('dev-0/out.tsv', encoding="utf-8", delimiter='\t', header=None)
score = f1_score(y_test, dev_y_pred)
print("f1_score_dev0 after training: ", score,"\nAcc: ", accuracy_score(dev_y_test, dev_y_pred),
      "\nroc_auc: ", roc_auc_score(dev_y_test,dev_y_pred_float ))
print(dev_y_pred_float)

# test-A
with torch.no_grad():
    testA_y = model(testA_x)
_write_predictions(testA_y.reshape(-1).tolist(), "test-A/out.tsv", "test-A/out_float.tsv")