"""Binary classifier for Polish word "sanity" (train/dev-0/test-A layout).

48 input features per word: 4 scalar features (frequency, vowel ratio,
diacritic ratio, length) + 6 domain one-hot columns + 38 character-presence
indicators. Scores on dev-0 are computed with an external `geval` binary.
"""

import torch
import random
import re
import string
import subprocess
import timeit

import numpy as np
import pandas
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

# External scorer used for dev-0 evaluation (site-specific path).
GEVAL = "/home/students/s452101/TAU/geval/geval"

# 48 features: 4 scalar + 6 from domain one-hot + 38 char indicators.
model = nn.Sequential(
    nn.Linear(48, 24, bias=True),
    nn.ReLU(),
    nn.Linear(24, 24, bias=True),
    nn.ReLU(),
    nn.Linear(24, 1, bias=True),
    nn.Sigmoid(),
)

criterion = nn.BCELoss(reduction='sum')
optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
# optimizer = optim.Adam(model.parameters())
minibatch_size = 200


def count_polish_diacritics(x):
    """Per-word ratio of Polish diacritic characters to word length.

    x: pandas Series of words. Returns a list of floats (0.0 for empty words).
    """
    ratios = []
    for _, word in x.items():  # .items(): iteritems() was removed in pandas 2.0
        text = str(word)
        if not text:  # avoid ZeroDivisionError on empty cells
            ratios.append(0.0)
            continue
        ratios.append(len(re.findall(r'[ąćęłńóśźż]', text)) / len(text))
    return ratios


def count_vowels(x):
    """Per-word ratio of (Polish) vowels to word length, as a list of floats."""
    ratios = []
    for _, word in x.items():
        text = str(word)
        if not text:  # avoid ZeroDivisionError on empty cells
            ratios.append(0.0)
            continue
        ratios.append(len(re.findall(r'[aąeęioóuy]', text)) / len(text))
    return ratios


def Normalize(data, d=None):
    """Min-max scale `data` using the min/max of reference tensor `d`.

    When `d` is omitted, `data` is its own reference (scaled into [0, 1]).
    Pass the RAW training tensor as `d` to put dev/test features on the
    training scale.
    """
    if d is None:
        d = data
    return (data - d.min()) / (d.max() - d.min())


def f1_score(y_true, y_pred):
    """Micro-averaged F1. Returns 0.0 when precision + recall == 0."""
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


def ToOneHot(df_col):
    """One-hot encode a categorical Series.

    Returns (DataFrame with one column per distinct value, array of labels).
    """
    df_labels = pandas.unique(df_col)
    rows = [(df_labels == row).astype(int) for _, row in df_col.items()]
    return pandas.DataFrame(rows, columns=df_labels), df_labels


def ToOneHot_preproces(df_col, df_labels):
    """One-hot encode `df_col` against a FIXED label set (from training).

    Values absent from `df_labels` produce an all-zero row, as in training.
    """
    labels = np.asarray(df_labels)
    rows = [(labels == row).astype(int) for _, row in df_col.items()]
    return pandas.DataFrame(rows, columns=df_labels)


def getAllchars(df_col):
    """Concatenate the characters of every word in the Series into one list."""
    chars = []
    for _, row in df_col.items():
        chars.extend(row)
    return chars


def wordToOneHot(df_col, ch_labels):
    """Per-word character-presence indicators (1 if the char occurs at all).

    Returns a DataFrame with one column per character in `ch_labels`.
    """
    index_of = {ch: i for i, ch in enumerate(ch_labels)}  # O(1) lookup per char
    rows = []
    for _, row in df_col.items():
        indicator = np.zeros(len(ch_labels), dtype=int)
        for ch in str(row):
            pos = index_of.get(ch)
            if pos is not None:  # chars unseen in training are ignored
                indicator[pos] = 1
        rows.append(indicator)
    return pandas.DataFrame(rows, columns=ch_labels)


class TrainDataset(Dataset):
    """Minimal (X, y) tensor dataset for DataLoader."""

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


def _raw_scalar_features(data):
    """The 4 UNnormalized scalar feature tensors: Freq, vowel ratio, diacritic ratio, length."""
    return (
        torch.tensor(data['Frequency'], dtype=torch.float),
        torch.tensor(count_vowels(data['Word']), dtype=torch.float),
        torch.tensor(count_polish_diacritics(data['Word']), dtype=torch.float),
        torch.tensor(data['Word'].str.len(), dtype=torch.float),
    )


def build_features(data, domain_labels, ch_labels, refs=None):
    """Assemble the 48-column feature matrix for a data split.

    refs: the raw training scalar tensors returned by this function for the
    training split, used as the min/max reference so dev/test features are on
    the training scale. (The original passed the already-normalized training
    tensors here, which made Normalize a no-op on dev/test — fixed.)
    Returns (feature tensor, raw scalar tensors of this split).
    """
    raw = _raw_scalar_features(data)
    if refs is None:
        refs = raw  # training split: self-normalize into [0, 1]
    scalars = [Normalize(f, ref) for f, ref in zip(raw, refs)]
    domains = torch.tensor(
        ToOneHot_preproces(data['Domain'], domain_labels).values, dtype=torch.float)
    chars = torch.tensor(
        wordToOneHot(data['Word'], ch_labels).values, dtype=torch.float)
    scalar_block = torch.stack(scalars, 0).transpose(1, 0)
    return torch.cat([scalar_block, domains, chars], 1), raw


# ---------------------------------------------------------------- load data
# Train
train_data = pandas.read_csv('train/train.tsv', sep='\t',
                             names=['Sane', 'Domain', 'Word', 'Frequency'],
                             header=None)
char_labels = pandas.unique(getAllchars(train_data['Word']))  # 38 letters
domain_labels = pandas.unique(train_data['Domain'])           # 6 domains

x, train_refs = build_features(train_data, domain_labels, char_labels)
y = torch.tensor(train_data['Sane'], dtype=torch.float)

feature_names = (["Freq", "Vovels", "pol_dia", "Len"]
                 + list(domain_labels) + list(char_labels))
print(feature_names)
print(len(feature_names))

# dev-0
dev_y_test = pandas.read_csv('dev-0/expected.tsv', encoding="utf-8",
                             delimiter='\t', header=None)
dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t',
                           names=['Domain', 'Word', 'Frequency'], header=None)
dev_x, _ = build_features(dev_data, domain_labels, char_labels, train_refs)

# test-A
testA_data = pandas.read_csv('test-A/in.tsv', sep='\t',
                             names=['Domain', 'Word', 'Frequency'], header=None)
testA_x, _ = build_features(testA_data, domain_labels, char_labels, train_refs)


def pred_save(name, data_train_x, f_threshold):
    """Write predictions for `data_train_x` into directory `name`.

    out.tsv gets 0/1 labels at `f_threshold`; out_float.tsv gets raw scores.
    """
    pred_y = model(data_train_x)
    with open(name + "/out.tsv", "w") as f_bin, \
            open(name + "/out_float.tsv", "w") as f_float:
        for i in range(len(data_train_x)):  # was hard-coded row counts
            score = pred_y[i].item()
            f_float.write(str(score) + "\n")
            f_bin.write('1\n' if score > f_threshold else '0\n')


def optim_threshold(min_thr, step=0.01):
    """Grid-search the decision threshold on dev-0, scored by geval.

    Scans [min_thr, 1) in `step` increments and returns the best threshold.
    """
    best_thr = min_thr
    best_geval = 0.1
    while min_thr < 1:
        pred_save("dev-0", dev_x, min_thr)
        metric = float(subprocess.check_output([GEVAL, "-t", "dev-0"]))
        if metric > best_geval:
            best_geval = metric
            best_thr = min_thr
        min_thr += step
        print("optimTHR; geval metric: ", metric, "\tbest: ", best_geval,
              "\tthreshold: ", min_thr, "\tbest_thr: ", best_thr)
    return best_thr


dataset_train = TrainDataset(x, y)
trainloader = DataLoader(dataset=dataset_train, batch_size=minibatch_size,
                         shuffle=True)


def train_loop(i=500, best=0.1, threshold=0.25):
    """Train for `i` epochs; after each epoch score dev-0 with geval.

    When the geval metric improves on `best`, re-tune the threshold and save
    the current best predictions for dev-0 and test-A.
    """
    loss = None
    for _epoch in range(i):
        for xb, yb_expected in trainloader:
            optimizer.zero_grad()
            yp = model(xb)
            loss = criterion(torch.squeeze(yp), yb_expected)
            loss.backward()
            optimizer.step()
        # NOTE(review): the original file's indentation was lost; this
        # evaluation is assumed to run once per epoch — confirm against the
        # original run logs ("#4 200 ~7h" timing note supports per-epoch).
        pred_save("dev-0", dev_x, threshold)
        metric = float(subprocess.check_output([GEVAL, "-t", "dev-0"]))
        print("geval metric: ", metric, "\tbest: ", best,
              "\tLoss: ", loss.item(), "\tthr: ", threshold)
        if metric > best:
            threshold = optim_threshold(float(threshold - 0.2))
            metric = float(subprocess.check_output([GEVAL, "-t", "dev-0"]))
            best = metric
            pred_save("dev-0/best", dev_x, threshold)
            pred_save("test-A/best", testA_x, threshold)


# 4 200 ~7h
# elapsed_time = timeit.timeit(train_loop, number=1)
# print("Training time: ", elapsed_time, "seconds")
train_loop()

# ------------------------------------------------------------ save results
# dev-0 evaluation
y_test = pandas.read_csv('dev-0/expected.tsv', encoding="utf-8",
                         delimiter='\t', header=None)
dev_y_pred = pandas.read_csv('dev-0/out.tsv', encoding="utf-8",
                             delimiter='\t', header=None)
dev_y_pred_float = pandas.read_csv('dev-0/out_float.tsv', encoding="utf-8",
                                   delimiter='\t', header=None)
print("f1_score_dev0 after training: ", f1_score(y_test, dev_y_pred),
      "\nAcc: ", accuracy_score(dev_y_test, dev_y_pred),
      # original printed an empty roc_auc slot; compute it from raw scores
      "\nroc_auc: ", roc_auc_score(dev_y_test, dev_y_pred_float))

# test-A predictions (same format as pred_save; threshold fixed at 0.25)
pred_save("test-A", testA_x, 0.25)