"""Binary word-'sanity' classifier.

Trains a small feed-forward net on 10 hand-crafted features
(4 normalized numeric features + 6 one-hot 'Domain' columns), then
writes soft scores (out_float.tsv) and thresholded 0/1 decisions
(out.tsv) for the dev-0 and test-A splits, plus a results.txt summary.
"""

import torch
import random
import re
import timeit

import numpy as np
import pandas
from sklearn.metrics import precision_score, recall_score, accuracy_score
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

# 10 features: 4 normalized numeric + 6 one-hot domain columns.
model = nn.Sequential(
    nn.Linear(10, 16),
    nn.ReLU(),
    nn.Linear(16, 1),
    nn.Sigmoid(),
)
# NOTE(review): BCELoss is the conventional pairing with a final Sigmoid;
# MSELoss is kept to preserve the original training behavior.
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# optimizer = optim.Adam(model.parameters())

minibatch_size = 5

# Compile the character classes once instead of on every word.
_DIACRITICS_RE = re.compile(r'[ąćęłńóśźż]')
_VOWELS_RE = re.compile(r'[aąeęioóuy]')


def count_polish_diacritics(x):
    """Return, per entry of Series ``x``, the fraction of Polish diacritic
    characters in ``str(entry)`` (0.0 for an empty string)."""
    out = []
    for _, word in x.items():  # .iteritems() was removed in pandas 2.0
        word = str(word)
        out.append(len(_DIACRITICS_RE.findall(word)) / len(word) if word else 0.0)
    return out


def count_vowels(x):
    """Return, per entry of Series ``x``, the fraction of (Polish) vowels
    in ``str(entry)`` (0.0 for an empty string)."""
    out = []
    for _, word in x.items():
        word = str(word)
        out.append(len(_VOWELS_RE.findall(word)) / len(word) if word else 0.0)
    return out


def Normalize(data, d=None):
    """Min-max scale ``data`` using the statistics of reference ``d``
    (defaults to ``data`` itself, i.e. scale to [0, 1])."""
    if d is None:
        d = data
    return (data - d.min()) / (d.max() - d.min())


def f1_score(y_true, y_pred):
    """Micro-averaged F1 score; returns 0.0 when precision + recall == 0
    (the original divided unconditionally)."""
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


def ToOneHot_preproces(df_col, df_labels):
    """One-hot encode Series ``df_col`` against the fixed label order
    ``df_labels``; values not in ``df_labels`` yield an all-zero row."""
    rows = []
    for _, value in df_col.items():
        row = np.zeros(len(df_labels), dtype=int)
        for i, label in enumerate(df_labels):
            if label == value:
                row[i] = 1
        rows.append(row)
    return pandas.DataFrame(rows, columns=df_labels)


def ToOneHot(df_col):
    """One-hot encode categorical Series ``df_col``.

    Returns ``(one_hot_df, labels)`` so the same label order can be
    reused for the dev/test columns via :func:`ToOneHot_preproces`.
    """
    df_labels = pandas.unique(df_col)
    return ToOneHot_preproces(df_col, df_labels), df_labels


class TrainDataset(Dataset):
    """Minimal (X, y) tensor dataset for DataLoader."""

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


# ----- Load data -----
# Train.  Raw (un-normalized) feature tensors are kept so dev/test can be
# scaled with the *training* statistics.  (BUGFIX: the original passed the
# already-normalized train tensors as the Normalize reference, whose
# min/max are 0/1, which left the dev/test features effectively unscaled.)
train_data = pandas.read_csv('train/train.tsv', sep='\t',
                             names=['Sane', 'Domain', 'Word', 'Frequency'],
                             header=None)
freq_raw = torch.tensor(train_data['Frequency'].values, dtype=torch.float)
vowels_raw = torch.tensor(count_vowels(train_data['Word']), dtype=torch.float)
diacritics_raw = torch.tensor(count_polish_diacritics(train_data['Word']),
                              dtype=torch.float)
length_raw = torch.tensor(train_data['Word'].str.len().values,
                          dtype=torch.float)
domain_onehot, domain_labels = ToOneHot(train_data['Domain'])

x1 = Normalize(freq_raw)
x2 = Normalize(vowels_raw)
x3 = torch.tensor(domain_onehot.values, dtype=torch.float)
x4 = Normalize(diacritics_raw)
x5 = Normalize(length_raw)
x = torch.cat([torch.stack((x1, x2, x4, x5), 0).transpose(1, 0), x3], 1)
y = torch.tensor(train_data['Sane'].values, dtype=torch.float)


def _feature_matrix(df):
    """Build the 10-column feature matrix for a dev/test frame,
    min-max scaled with the training statistics captured above."""
    f1 = Normalize(torch.tensor(df['Frequency'].values, dtype=torch.float),
                   freq_raw)
    f2 = Normalize(torch.tensor(count_vowels(df['Word']), dtype=torch.float),
                   vowels_raw)
    f3 = torch.tensor(ToOneHot_preproces(df['Domain'], domain_labels).values,
                      dtype=torch.float)
    f4 = Normalize(torch.tensor(count_polish_diacritics(df['Word']),
                                dtype=torch.float), diacritics_raw)
    f5 = Normalize(torch.tensor(df['Word'].str.len().values,
                                dtype=torch.float), length_raw)
    return torch.cat([torch.stack((f1, f2, f4, f5), 0).transpose(1, 0), f3], 1)


# dev-0 and test-A share the same layout (no 'Sane' column).
dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t',
                           names=['Domain', 'Word', 'Frequency'], header=None)
dev_x = _feature_matrix(dev_data)

testA_data = pandas.read_csv('test-A/in.tsv', sep='\t',
                             names=['Domain', 'Word', 'Frequency'], header=None)
testA_x = _feature_matrix(testA_data)

# ----- Training -----
dataset_train = TrainDataset(x, y)
trainloader = DataLoader(dataset=dataset_train, batch_size=minibatch_size)
for epoch in range(2):
    # Each iteration draws one minibatch from the loader.
    for xb, yb_expected in trainloader:
        yp = model(xb)
        loss = criterion(torch.squeeze(yp), yb_expected)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


def _write_predictions(scores, out_path, float_path):
    """Write soft scores to *float_path* and 0/1 decisions (threshold 0.5)
    to *out_path*, one per line.  Iterates the tensor itself instead of a
    hard-coded row count (the original used fixed 11026/11061 ranges)."""
    with open(out_path, "w") as hard, open(float_path, "w") as soft:
        for row in scores:
            v = row.item()
            soft.write(str(v) + "\n")
            hard.write(("0" if v < 0.5 else "1") + "\n")


# ----- Saving results -----
# dev-0: predictions plus metrics against the gold labels.
dev_y = model(dev_x)
_write_predictions(dev_y, "dev-0/out.tsv", "dev-0/out_float.tsv")

y_test = pandas.read_csv('dev-0/expected.tsv', encoding="utf-8",
                         delimiter='\t', header=None)
dev_y_pred = pandas.read_csv('dev-0/out.tsv', encoding="utf-8",
                             delimiter='\t', header=None)
score = f1_score(y_test, dev_y_pred)
print("f1_score_dev0 after training: ", score,
      "\nAcc: ", accuracy_score(y_test, dev_y_pred))

# test-A: predictions only (no gold labels available here).
testA_y = model(testA_x)
_write_predictions(testA_y, "test-A/out.tsv", "test-A/out_float.tsv")

with open("results.txt", "w") as f:
    f.write("loss = " + str(loss.item()) + "\n")
    f.write("f1: " + str(score) + "\n")