2019-12-02 14:41:07 +01:00
|
|
|
|
import torch
|
|
|
|
|
import random
|
|
|
|
|
from torch import nn
|
|
|
|
|
from torch import optim
|
|
|
|
|
import pandas
|
|
|
|
|
import numpy as np
|
|
|
|
|
import re
|
|
|
|
|
import timeit
|
2019-12-03 21:35:28 +01:00
|
|
|
|
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
|
2019-12-02 14:41:07 +01:00
|
|
|
|
from torch.utils.data import Dataset, DataLoader
|
|
|
|
|
|
2019-12-03 21:35:28 +01:00
|
|
|
|
# 48 input features: 4 scalar features + 6 domain one-hot columns
# + 38 character-presence columns (counts as noted by the original author).
_layers = [
    nn.Linear(48, 96),   # bias is enabled by default
    nn.ReLU(),
    nn.Linear(96, 48),
    nn.ReLU(),
    nn.Linear(48, 1),
    nn.Sigmoid(),        # squashes the single output into (0, 1) for BCELoss
]
model = nn.Sequential(*_layers)

# Binary cross-entropy on the sigmoid output.
criterion = nn.BCELoss()

# Alternative (disabled): optim.SGD(model.parameters(), lr=0.00001, momentum=0.9)
optimizer = optim.Adam(model.parameters())

# Number of training rows served per optimisation step.
minibatch_size = 200
|
2019-12-02 14:41:07 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def count_polish_diacritics(x):
    """Return, for every entry of *x*, the share of Polish diacritic letters.

    Each value is converted with ``str()`` and the number of characters
    matching ``[ąćęłńóśźż]`` (lowercase only) is divided by the string length.

    Args:
        x: object exposing ``.items()`` yielding ``(index, value)`` pairs,
            e.g. a pandas Series of words.

    Returns:
        list of float ratios, one per entry, in iteration order. An empty
        string yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    # Compile once instead of on every iteration.
    diacritics = re.compile(r'[ąćęłńóśźż]')
    x_counts = []
    # ``Series.iteritems()`` was removed in pandas 2.0; ``items()`` is the
    # long-standing equivalent.
    for _, word in x.items():
        text = str(word)
        if not text:
            x_counts.append(0.0)
            continue
        x_counts.append(len(diacritics.findall(text)) / len(text))
    return x_counts
|
|
|
|
|
|
|
|
|
|
def count_vowels(x):
|
|
|
|
|
out = []
|
|
|
|
|
for index,row in x.iteritems():
|
|
|
|
|
vowel_len = len(re.findall(r'[aąeęioóuy]', str(row)))
|
|
|
|
|
word_len = len(str(row))
|
|
|
|
|
out.append(vowel_len / word_len) #RATE
|
|
|
|
|
return out
|
|
|
|
|
|
|
|
|
|
def Normalize(data, d = None):
    """Min-max scale *data* using the extremes of a reference collection.

    Args:
        data: values to scale; must support subtraction and division with
            the results of ``min()`` / ``max()`` (e.g. a tensor or array).
        d: optional reference whose ``min()`` and ``max()`` define the
            scale. When omitted, *data* is its own reference.

    Returns:
        ``(data - min) / (max - min)`` computed against the reference.

    Note:
        There is no guard for a reference whose maximum equals its minimum;
        the division is then by zero.
    """
    reference = data if d is None else d
    lowest = reference.min()
    span = reference.max() - lowest
    return (data - lowest) / span
|
|
|
|
|
|
|
|
|
|
def f1_score(y_true, y_pred):
    """Return the F1 score built from micro-averaged precision and recall.

    Args:
        y_true: ground-truth labels, passed unchanged to scikit-learn.
        y_pred: predicted labels, passed unchanged to scikit-learn.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        both precision and recall are zero (previously a division by zero).
    """
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    denominator = precision + recall
    if denominator == 0:
        # Harmonic mean is undefined here; report the worst score instead.
        return 0.0
    return 2 * (precision * recall) / denominator
|
|
|
|
|
|
|
|
|
|
# Transforms a column of categorical values to one-hot format.
def ToOneHot(df_col):
    """One-hot encode *df_col*, deriving the label set from the column itself.

    Args:
        df_col: pandas Series of categorical values.

    Returns:
        tuple ``(out_df, df_labels)``: ``out_df`` is a DataFrame with one
        integer 0/1 column per label (columns named after the labels, rows in
        the iteration order of *df_col*); ``df_labels`` is the result of
        ``pandas.unique(df_col)``, i.e. labels in order of first appearance.
        A value that compares equal to no label yields an all-zero row.
    """
    out = []
    df_labels = pandas.unique(df_col)
    l_count = len(df_labels)
    # ``Series.iteritems()`` was removed in pandas 2.0; use ``items()``.
    for _, row in df_col.items():
        blank_one_hot = np.zeros(l_count, dtype=int)
        for i, label in enumerate(df_labels):
            if label == row:
                blank_one_hot[i] = 1
        out.append(blank_one_hot)
    out_df = pandas.DataFrame(out, columns=df_labels)
    return out_df, df_labels
|
|
|
|
|
|
|
|
|
|
def ToOneHot_preproces(df_col, df_labels):
    """One-hot encode *df_col* against an already known label set.

    Args:
        df_col: pandas Series of categorical values.
        df_labels: sequence of labels defining the output columns and their
            order (e.g. the labels returned by ``ToOneHot``).

    Returns:
        DataFrame with one integer 0/1 column per label. A value that equals
        none of the labels yields an all-zero row.
    """
    out = []
    l_count = len(df_labels)
    # ``Series.iteritems()`` was removed in pandas 2.0; use ``items()``.
    for _, row in df_col.items():
        blank_one_hot = np.zeros(l_count, dtype=int)
        for i, label in enumerate(df_labels):
            if label == row:
                blank_one_hot[i] = 1
        out.append(blank_one_hot)
    out_df = pandas.DataFrame(out, columns=df_labels)
    return out_df
|
|
|
|
|
|
2019-12-03 21:35:28 +01:00
|
|
|
|
def getAllchars(df_col):
    """Return every character of every entry of *df_col* as one flat list.

    Args:
        df_col: pandas Series of words.

    Returns:
        list of single-character strings, in row order then character order;
        duplicates are kept. Entries are converted with ``str()`` first (as
        ``wordToOneHot`` does), so a non-string cell no longer raises
        ``TypeError``.
    """
    chars = []  # renamed from ``all`` to stop shadowing the builtin
    # ``Series.iteritems()`` was removed in pandas 2.0; use ``items()``.
    for _, row in df_col.items():
        # extend() appends in place; ``all = all + list(row)`` re-copied the
        # whole list for every row.
        chars.extend(str(row))
    return chars
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def wordToOneHot(df_col, ch_labels):
    """Encode each word as a multi-hot vector of the characters it contains.

    Args:
        df_col: pandas Series of words; each entry is converted with ``str()``.
        ch_labels: sequence of characters defining the output columns and
            their order.

    Returns:
        DataFrame with one integer 0/1 column per entry of *ch_labels*; a
        cell is 1 when that character occurs at least once in the word.
        Characters absent from *ch_labels* are ignored.
    """
    out = []
    l_count = len(ch_labels)
    # ``Series.iteritems()`` was removed in pandas 2.0; use ``items()``.
    for _, row in df_col.items():
        blank_one_hot = np.zeros(l_count, dtype=int)
        # Only presence matters, so collapse the word to its distinct
        # characters once instead of rescanning the labels per character.
        present = set(str(row))
        for i, label in enumerate(ch_labels):
            if label in present:
                blank_one_hot[i] = 1
        out.append(blank_one_hot)

    out_df = pandas.DataFrame(out, columns=ch_labels)
    return out_df
|
|
|
|
|
|
|
|
|
|
|
2019-12-02 14:41:07 +01:00
|
|
|
|
class TrainDataset(Dataset):
    """Pairs a feature container with a target container for a DataLoader.

    Sample ``idx`` is ``(X[idx], y[idx])``; the dataset length is the size of
    the first dimension of ``X``.
    """

    def __init__(self, X, y):
        # Stored as given; no copy or validation is performed.
        self.X, self.y = X, y

    def __len__(self):
        n_rows = self.X.shape[0]
        return n_rows

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.y[idx]
        return features, target
|
|
|
|
|
|
|
|
|
|
# Load data:
# Train
train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)

# Distinct characters seen in the training words, in order of first appearance.
# pandas.unique() deprecates plain-list input (pandas >= 2.1), so hand it an
# object ndarray; the returned array is the same as before.
char_labels = pandas.unique(np.array(getAllchars(train_data['Word']), dtype=object))
# Original author's note: 38 distinct characters.

# Scalar features, each min-max scaled over the training set.
x1 = Normalize(torch.tensor(train_data['Frequency'], dtype=torch.float))
x2 = Normalize(torch.tensor(count_vowels(train_data['Word']), dtype=torch.float))

# One-hot encoded domain; domain_labels fixes the column order for other splits.
domain_onehot, domain_labels = ToOneHot(train_data['Domain'])
x3 = torch.tensor(domain_onehot.values, dtype=torch.float)

x4 = Normalize(torch.tensor(count_polish_diacritics(train_data['Word']),dtype=torch.float))
x5 = Normalize(torch.tensor(train_data['Word'].str.len(), dtype=torch.float))

# Character-presence (multi-hot) encoding of every word.
df_words_onehot = wordToOneHot(train_data['Word'], char_labels)
x_words_onehot = torch.tensor(df_words_onehot.values, dtype=torch.float)

# Assemble the feature matrix: stack the four scalar features as rows,
# transpose to one row per sample, then append domain and character columns.
x_temp1 = torch.stack((x1,x2,x4, x5),0)
x_temp2 = torch.cat([x_temp1.transpose(1,0), x3], 1)
x = torch.cat([x_temp2, x_words_onehot], 1)

# Column names in the same order as the columns of x (diagnostic output only).
l = list(["Freq", "Vovels", "pol_dia", "Len"])+list(domain_labels)+list(char_labels)
print(l)
print(len(l))

# Training targets.
y = torch.tensor(train_data['Sane'], dtype=torch.float)
|
|
|
|
|
|
|
|
|
|
# dev-0
dev_y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)

# Scaling reference: the RAW training features. Passing the already
# normalised training tensors (x1, x2, x4, x5 — min 0, max 1) to Normalize()
# made it a no-op, leaving dev features unscaled while training features were
# scaled. Recomputing the raw tensors here costs one extra pass over the
# training words but keeps this section self-contained.
train_ref_freq = torch.tensor(train_data['Frequency'], dtype=torch.float)
train_ref_vowels = torch.tensor(count_vowels(train_data['Word']), dtype=torch.float)
train_ref_diacritics = torch.tensor(count_polish_diacritics(train_data['Word']), dtype=torch.float)
train_ref_len = torch.tensor(train_data['Word'].str.len(), dtype=torch.float)

dev_x1 = Normalize(torch.tensor(dev_data['Frequency'], dtype=torch.float), train_ref_freq)
dev_x2 = Normalize(torch.tensor(count_vowels(dev_data['Word']), dtype=torch.float), train_ref_vowels)

# Domain one-hot uses the training label order so columns line up with x.
dev_x3 = torch.tensor(ToOneHot_preproces(dev_data['Domain'], domain_labels).values, dtype=torch.float)
dev_x4 = Normalize(torch.tensor(count_polish_diacritics(dev_data['Word']), dtype=torch.float), train_ref_diacritics)
dev_x5 = Normalize(torch.tensor(dev_data['Word'].str.len(), dtype=torch.float), train_ref_len)

# Character-presence encoding against the training character set.
dev_df_words_onehot = wordToOneHot(dev_data['Word'], char_labels)
dev_x_words_onehot = torch.tensor(dev_df_words_onehot.values, dtype=torch.float)

# Same column layout as the training matrix: scalars, domain, characters.
dev_x_temp = torch.stack((dev_x1, dev_x2, dev_x4, dev_x5), 0)
dev_x_temp2 = torch.cat([dev_x_temp.transpose(1,0), dev_x3], 1)
dev_x = torch.cat([dev_x_temp2, dev_x_words_onehot], 1)
|
2019-12-02 14:41:07 +01:00
|
|
|
|
# test-A
testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)

# Scaling reference: the RAW training features. Passing the already
# normalised training tensors (x1, x2, x4, x5 — min 0, max 1) to Normalize()
# made it a no-op, leaving test features unscaled while training features
# were scaled. Recomputing the raw tensors here costs one extra pass over the
# training words but keeps this section self-contained.
train_ref_freq = torch.tensor(train_data['Frequency'], dtype=torch.float)
train_ref_vowels = torch.tensor(count_vowels(train_data['Word']), dtype=torch.float)
train_ref_diacritics = torch.tensor(count_polish_diacritics(train_data['Word']), dtype=torch.float)
train_ref_len = torch.tensor(train_data['Word'].str.len(), dtype=torch.float)

testA_x1 = Normalize(torch.tensor(testA_data['Frequency'], dtype=torch.float), train_ref_freq)
testA_x2 = Normalize(torch.tensor(count_vowels(testA_data['Word']), dtype=torch.float), train_ref_vowels)

# Domain one-hot uses the training label order so columns line up with x.
testA_x3 = torch.tensor(ToOneHot_preproces(testA_data['Domain'], domain_labels).values, dtype=torch.float)
testA_x4 = Normalize(torch.tensor(count_polish_diacritics(testA_data['Word']),dtype=torch.float), train_ref_diacritics)
testA_x5 = Normalize(torch.tensor(testA_data['Word'].str.len(), dtype=torch.float), train_ref_len)

# Character-presence encoding against the training character set.
testA_df_words_onehot = wordToOneHot(testA_data['Word'], char_labels)
testA_x_words_onehot = torch.tensor(testA_df_words_onehot.values, dtype=torch.float)

# Same column layout as the training matrix: scalars, domain, characters.
testA_x_temp = torch.stack((testA_x1,testA_x2,testA_x4, testA_x5),0)
testA_x_temp2 = torch.cat([testA_x_temp.transpose(1,0), testA_x3], 1)
testA_x = torch.cat([testA_x_temp2, testA_x_words_onehot], 1)
|
2019-12-02 14:41:07 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Wrap the training tensors so they can be served as shuffled mini-batches.
dataset_train = TrainDataset(x, y)
trainloader = DataLoader(
    dataset_train,
    shuffle=True,
    batch_size=minibatch_size,
)
|
2019-12-02 14:41:07 +01:00
|
|
|
|
|
2019-12-03 23:08:49 +01:00
|
|
|
|
def train_loop(i = 100, auc_threshold = 0.80):
    """Train the module-level ``model`` with early stopping on dev-0 ROC AUC.

    For every mini-batch from ``trainloader`` the loss is computed, the whole
    dev set is scored and its ROC AUC printed together with the batch loss.
    As soon as the AUC exceeds *auc_threshold* training stops, before the
    current batch's gradient step is applied.

    Args:
        i: maximum number of passes (epochs) over ``trainloader``.
        auc_threshold: dev-0 ROC AUC above which training stops.

    Side effects:
        Updates ``model`` through ``optimizer`` and prints one line per batch.
    """
    # Defined up front so the epoch-level check works even if the loader
    # yields no batches.
    auc_score = 0.0
    for epoch in range(i):  # no longer shadows the parameter ``i``
        for xb, yb_expected in trainloader:
            optimizer.zero_grad()
            yp = model(xb)

            # squeeze(1) drops only the output-feature dimension; a bare
            # squeeze() would also drop the batch dimension of a size-1 batch
            # and make BCELoss reject the shape mismatch.
            loss = criterion(yp.squeeze(1), yb_expected)

            # Evaluation only: no autograd graph needed for the dev forward pass.
            with torch.no_grad():
                dev_y_pred_float_tensor = model(dev_x)
            dev_y_pred_float_df = pandas.DataFrame(dev_y_pred_float_tensor.numpy())
            auc_score = roc_auc_score(dev_y_test, dev_y_pred_float_df)
            print("auc:\t", auc_score, "\tloss:\t", loss.item())
            if auc_score > auc_threshold:
                break

            loss.backward()
            optimizer.step()

        # Propagate the early stop out of the epoch loop.
        if auc_score > auc_threshold:
            break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Original author's note: "4 200 ~7h" (presumably settings and duration of an
# earlier run — unverified).
# Run the training exactly once and report its wall-clock duration.
_training_timer = timeit.Timer(train_loop)
elapsed_time = _training_timer.timeit(number=1)
print("Training time: ", elapsed_time, "seconds")
|
|
|
|
|
|
|
|
|
|
# Saving results:
# dev-0:
# Inference only, so skip building the autograd graph.
with torch.no_grad():
    dev_y = model(dev_x)

# One probability per dev row. The row count is taken from the model output
# instead of the previously hard-coded 11026.
dev_y_pred_float = dev_y.reshape(-1).tolist()

# out.tsv gets the 0/1 decision (threshold 0.5), out_float.tsv the raw score.
# ``with`` closes both files even if a write fails.
with open("dev-0/out.tsv", "w") as file, open("dev-0/out_float.tsv", "w") as file2:
    for var in dev_y_pred_float:
        file2.write(str(var) + "\n")
        if var < 0.5:
            file.write("0" + "\n")
        else:
            file.write("1" + "\n")

# Score the labels just written against the expected dev-0 labels.
y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
dev_y_pred = pandas.DataFrame(pandas.read_csv('dev-0/out.tsv', encoding="utf-8", delimiter='\t', header=None))

score = f1_score(y_test, dev_y_pred)
print("f1_score_dev0 after training: ", score,"\nAcc: ", accuracy_score(dev_y_test, dev_y_pred),
      "\nroc_auc: ", roc_auc_score(dev_y_test,dev_y_pred_float ))
print(dev_y_pred_float)
|
2019-12-02 14:41:07 +01:00
|
|
|
|
|
|
|
|
|
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# test-A:
# Inference only, so skip building the autograd graph.
with torch.no_grad():
    testA_y = model(testA_x)

# out.tsv gets the 0/1 decision (threshold 0.5), out_float.tsv the raw score.
# Every output row is written; the row count is no longer hard-coded to 11061.
# ``with`` closes both files even if a write fails.
with open("test-A/out.tsv", "w") as file, open("test-A/out_float.tsv", "w") as file2:
    for testA_score in testA_y.reshape(-1).tolist():
        file2.write(str(testA_score) + "\n")
        if testA_score < 0.5:
            file.write("0" + "\n")
        else:
            file.write("1" + "\n")
|