"""Train a small feed-forward classifier on word features and write predictions.

Reads ``train/train.tsv`` (columns: Sane, Domain, Word, Frequency) plus the
``dev-0`` and ``test-A`` inputs, builds a feature matrix (four normalized
numeric features, a one-hot of the domain, and a multi-hot of the characters
occurring in the word), trains the network, and writes hard labels and raw
scores to ``out.tsv`` / ``out_float.tsv`` in each evaluation directory.
"""
import random
import re
import timeit

import numpy as np
import pandas
import torch
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

# Compiled once; both patterns list lowercase letters only.
_DIACRITICS_RE = re.compile(r'[ąćęłńóśźż]')
_VOWELS_RE = re.compile(r'[aąeęioóuy]')


def _char_rate(words, pattern):
    """Return, per word, the share of characters of ``str(word)`` matching *pattern*."""
    rates = []
    for word in words:
        text = str(word)
        # An empty string has no characters to rate; avoid ZeroDivisionError.
        rates.append(len(pattern.findall(text)) / len(text) if text else 0.0)
    return rates


def count_polish_diacritics(x):
    """Return a list with the fraction of Polish diacritic letters in each word of *x*."""
    return _char_rate(x, _DIACRITICS_RE)


def count_vowels(x):
    """Return a list with the fraction of vowels in each word of *x*."""
    return _char_rate(x, _VOWELS_RE)


def Normalize(data, d=None):
    """Min-max scale *data* using the minimum and maximum of *d*.

    Args:
        data: values to scale (tensor, array or Series).
        d: reference values supplying min/max; defaults to *data* itself.

    Returns:
        ``(data - d.min()) / (d.max() - d.min())``; when the reference is
        constant the un-divided offset is returned instead of dividing by zero.
    """
    if d is None:
        d = data
    low = d.min()
    span = d.max() - low
    shifted = data - low
    if span == 0:
        return shifted
    return shifted / span


def f1_score(y_true, y_pred):
    """Return the F1 score built from micro-averaged precision and recall.

    Returns 0.0 when precision and recall are both zero.
    """
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


def ToOneHot_preproces(df_col, df_labels):
    """One-hot encode *df_col* against a fixed list of labels.

    Values that match none of *df_labels* produce an all-zero row.

    Returns:
        DataFrame with one 0/1 column per label, one row per value.
    """
    rows = [
        [1 if label == value else 0 for label in df_labels]
        for value in df_col
    ]
    return pandas.DataFrame(rows, columns=df_labels)


def ToOneHot(df_col):
    """One-hot encode a categorical column.

    Returns:
        ``(one_hot_frame, labels)`` where *labels* are the unique values of
        *df_col* in order of first appearance.
    """
    df_labels = pandas.unique(df_col)
    return ToOneHot_preproces(df_col, df_labels), df_labels


def getAllchars(df_col):
    """Return a flat list (with repeats) of every character of every word in *df_col*."""
    chars = []
    for word in df_col:
        # str() keeps this consistent with the other helpers, which also
        # stringify each word before inspecting it.
        chars.extend(str(word))
    return chars


def wordToOneHot(df_col, ch_labels):
    """Multi-hot encode each word: column is 1 when that character occurs in the word.

    Returns:
        DataFrame with one 0/1 column per entry of *ch_labels*.
    """
    rows = []
    for word in df_col:
        present = set(str(word))
        rows.append([1 if ch in present else 0 for ch in ch_labels])
    return pandas.DataFrame(rows, columns=ch_labels)


class TrainDataset(Dataset):
    """Dataset pairing row ``idx`` of the feature tensor with target ``idx``."""

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


def _raw_numeric_features(data):
    """Return the four un-normalized numeric feature tensors for *data*.

    Order: frequency, vowel rate, diacritic rate, word length.
    """
    words = data['Word']
    return (
        torch.tensor(data['Frequency'].to_numpy(), dtype=torch.float),
        torch.tensor(count_vowels(words), dtype=torch.float),
        torch.tensor(count_polish_diacritics(words), dtype=torch.float),
        torch.tensor([len(str(word)) for word in words], dtype=torch.float),
    )


def _build_features(data, domain_labels, char_labels, reference=None):
    """Assemble the full feature matrix for *data*.

    Args:
        data: frame with 'Domain', 'Word' and 'Frequency' columns.
        domain_labels: domain categories defining the one-hot columns.
        char_labels: characters defining the multi-hot columns.
        reference: raw numeric feature tensors whose min/max drive the
            normalization; defaults to the raw features of *data* itself.

    Returns:
        ``(features, raw)``: the ``(rows, 4 + domains + chars)`` float tensor
        and the raw numeric tensors (to be reused as *reference* later).
    """
    raw = _raw_numeric_features(data)
    if reference is None:
        reference = raw
    # Scale with the *raw* reference statistics so that training and
    # evaluation data end up on the same scale.
    numeric = torch.stack(
        [Normalize(col, ref) for col, ref in zip(raw, reference)], dim=1)
    domains = torch.tensor(
        ToOneHot_preproces(data['Domain'], domain_labels).values, dtype=torch.float)
    chars = torch.tensor(
        wordToOneHot(data['Word'], char_labels).values, dtype=torch.float)
    return torch.cat([numeric, domains, chars], dim=1), raw


def _predict_scores(features):
    """Return the model outputs for *features* as a flat list of Python floats."""
    with torch.no_grad():
        return model(features).reshape(-1).tolist()


def _write_predictions(scores, labels_path, scores_path):
    """Write raw scores and thresholded labels, one per line; return the labels.

    A score below 0.5 is written as label 0, anything else as label 1.
    """
    labels = [0 if score < 0.5 else 1 for score in scores]
    with open(labels_path, "w", encoding="utf-8") as labels_file, \
            open(scores_path, "w", encoding="utf-8") as scores_file:
        scores_file.writelines(f"{score}\n" for score in scores)
        labels_file.writelines(f"{label}\n" for label in labels)
    return labels


def train_loop(i=3):
    """Train the global model for up to *i* passes over ``trainloader``.

    After every batch the dev-set ROC AUC is computed and printed together
    with the batch loss; training stops as soon as that AUC exceeds 0.9
    (the batch that triggers the stop is not back-propagated).
    """
    auc_score = 0.0  # bound up front so an empty loader cannot leave it undefined
    for _epoch in range(i):
        # Each iteration takes one batch of samples from the loader.
        for xb, yb_expected in trainloader:
            # (batch, 1) -> (batch,) so the loss compares prediction and
            # target element-wise instead of broadcasting to (batch, batch).
            yp = model(xb).squeeze(1)
            loss = criterion(yp, yb_expected)
            optimizer.zero_grad()
            # Evaluation only: no autograd graph is needed for the dev pass.
            with torch.no_grad():
                dev_scores = model(dev_x).reshape(-1).numpy()
            auc_score = roc_auc_score(dev_y_test, dev_scores)
            print("auc: ", auc_score, "loss: ", loss.item())
            if auc_score > 0.9:
                break
            loss.backward()
            optimizer.step()
        if auc_score > 0.9:
            break


# Load data:
# Train
train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)
# Wrapped in a Series because pandas.unique on a plain list is deprecated.
char_labels = pandas.unique(pandas.Series(getAllchars(train_data['Word'])))
domain_labels = pandas.unique(train_data['Domain'])

x, train_raw = _build_features(train_data, domain_labels, char_labels)
feature_names = ["Freq", "Vovels", "pol_dia", "Len"] + list(domain_labels) + list(char_labels)
print(feature_names)
print(len(feature_names))
y = torch.tensor(train_data['Sane'].to_numpy(), dtype=torch.float)

# dev-0
dev_y_test = pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None)
dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
dev_x, _ = _build_features(dev_data, domain_labels, char_labels, reference=train_raw)

# test-A
testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
testA_x, _ = _build_features(testA_data, domain_labels, char_labels, reference=train_raw)

# Input width = 4 numeric features + one column per domain + one per character
# (the original author's note recorded 4 + 6 + 38 = 48 for their data).
model = nn.Sequential(
    nn.Linear(x.shape[1], 16),
    nn.ReLU(),
    nn.Linear(16, 1),
    nn.Sigmoid())
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.000001, momentum=0.9)
# optimizer = optim.Adam(model.parameters())
minibatch_size = 1000  # NOTE(review): not used below; the loader uses batch_size=5

dataset_train = TrainDataset(x, y)
trainloader = DataLoader(dataset=dataset_train, batch_size=5)

# Original author's note: "4 200 ~7h" -- presumably an observed training time; unverified.
elapsed_time = timeit.timeit(train_loop, number=1)
print("Training time: ", elapsed_time, "seconds")

# Saving results:
# dev-0
dev_y_pred_float = _predict_scores(dev_x)
dev_y_pred = _write_predictions(dev_y_pred_float, "dev-0/out.tsv", "dev-0/out_float.tsv")

score = f1_score(dev_y_test, dev_y_pred)
print("f1_score_dev0 after training: ", score,"\nAcc: ", accuracy_score(dev_y_test, dev_y_pred),
      "\nroc_auc: ", roc_auc_score(dev_y_test, dev_y_pred_float))
print(dev_y_pred_float)

# test-A
_write_predictions(_predict_scores(testA_x), "test-A/out.tsv", "test-A/out_float.tsv")