"""Train a small two-layer network on word features and export predictions.

The script reads ``train/train.tsv``, ``dev-0/in.tsv`` and ``test-A/in.tsv``,
trains the network with full-batch gradient descent, and writes ``out.tsv``
(0/1 labels) plus ``out_float.tsv`` (raw sigmoid outputs) into ``dev-0`` and
``test-A``.
"""
import re

import numpy as np
import pandas
import torch
from sklearn.metrics import precision_score, recall_score, accuracy_score

learning_rate = torch.tensor(0.00005, dtype=torch.float)

# Compiled once; both patterns match lower-case characters only.
_DIACRITICS = re.compile(r'[ąćęłńóśźż]')
_VOWELS = re.compile(r'[aąeęioóuy]')


def f1_score(y_true, y_pred):
    """Return the micro-averaged F1 of *y_pred* against *y_true*.

    Returns 0.0 when precision and recall are both zero instead of
    dividing by zero.
    """
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


# Network parameters: 5 input features -> 16 hidden units -> 1 output.
W1 = torch.rand([5, 16], dtype=torch.float, requires_grad=True)
b1 = torch.rand(16, dtype=torch.float, requires_grad=True)
W2 = torch.rand(16, dtype=torch.float, requires_grad=True)
b2 = torch.rand(1, dtype=torch.float, requires_grad=True)


def _match_rate(x, pattern):
    """Return, per element of *x*, the share of characters matching *pattern*.

    Every element is converted with ``str`` first; an empty string yields 0.0.
    """
    rates = []
    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    for _, word in x.items():
        text = str(word)
        rates.append(len(pattern.findall(text)) / len(text) if text else 0.0)
    return rates


def count_polish_diacritics(x):
    """Return the per-word fraction of characters in ``ąćęłńóśźż``."""
    return _match_rate(x, _DIACRITICS)


def count_vowels(x):
    """Return the per-word fraction of characters in ``aąeęioóuy``."""
    return _match_rate(x, _VOWELS)


def Normalize(data, d=None):
    """Min-max scale *data* using the minimum and maximum of *d*.

    *d* defaults to *data* itself.  When *d* is constant (max == min) the
    shifted data is returned unscaled rather than dividing by zero.
    """
    if d is None:
        d = data
    low = d.min()
    span = d.max() - low
    shifted = data - low
    if span == 0:
        return shifted
    return shifted / span


def model(data_x):
    """Run the forward pass on a (features, samples) tensor.

    The input is transposed so each row is one sample, passed through a ReLU
    hidden layer and a sigmoid output; returns one value per sample.
    """
    h1 = torch.relu(data_x.transpose(1, 0) @ W1 + b1)
    m_y = torch.sigmoid(h1 @ W2 + b2)
    return m_y


def _raw_features(frame, domain_categories):
    """Return the five un-normalised feature tensors for *frame*.

    Order: frequency, vowel rate, domain code, diacritic rate, word length.
    Domain codes are taken from *domain_categories* so one domain maps to
    the same code in every data set; a domain not in that list gets -1.
    """
    codes = pandas.Categorical(frame['Domain'], categories=domain_categories).codes
    return (
        torch.tensor(frame['Frequency'].to_numpy(), dtype=torch.float),
        torch.tensor(count_vowels(frame['Word']), dtype=torch.float),
        torch.tensor(codes.astype('float32'), dtype=torch.float),
        torch.tensor(count_polish_diacritics(frame['Word']), dtype=torch.float),
        torch.tensor(frame['Word'].str.len().to_numpy(), dtype=torch.float),
    )


def _model_inputs(raw, reference):
    """Stack *raw* features into a (5, samples) tensor scaled by *reference*.

    *reference* holds the raw training features.  Scaling against raw
    training statistics applies the same transformation to every data set;
    scaling against already-normalised tensors (min 0, max 1) would leave
    the data unchanged.  The domain code is passed through unscaled, which
    is how the training input is built.
    """
    frequency, vowels, domain, diacritics, length = raw
    ref_frequency, ref_vowels, _, ref_diacritics, ref_length = reference
    return torch.stack((
        Normalize(frequency, ref_frequency),
        Normalize(vowels, ref_vowels),
        domain,
        Normalize(diacritics, ref_diacritics),
        Normalize(length, ref_length),
    ), 0)


def _write_predictions(scores, label_path, score_path):
    """Write one raw score per line to *score_path* and its 0/1 label to *label_path*.

    A score below 0.5 is written as ``0``, anything else as ``1``.  The
    number of lines follows the number of predictions.
    """
    values = scores.detach().tolist()
    with open(label_path, "w", encoding="utf-8") as label_file, \
            open(score_path, "w", encoding="utf-8") as score_file:
        for value in values:
            score_file.write(str(value) + "\n")
            label_file.write(("0" if value < 0.5 else "1") + "\n")


train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)
domain_categories = train_data['Domain'].astype('category').cat.categories
train_raw = _raw_features(train_data, domain_categories)
x = _model_inputs(train_raw, train_raw)
y = torch.tensor(train_data['Sane'].to_numpy(), dtype=torch.float)

# NOTE(review): the counter starts at 1, so this prints the number of rows
# with Sane > 0 plus one -- confirm whether the offset is intended.
count = 1 + int((train_data['Sane'] > 0).sum())
print(count)
print(y)

print("Training...")
criterion = torch.nn.MSELoss(reduction='sum')

dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
dev_x = _model_inputs(_raw_features(dev_data, domain_categories), train_raw)

# NOTE(review): the in-training score is computed against the dev-0/out.tsv
# that already exists on disk, not against dev-0/expected.tsv -- confirm
# that this is the intended reference.
dev_y_pred = pandas.read_csv('dev-0/out.tsv', encoding="utf-8", delimiter='\t', header=None)

for i in range(80):
    for j in range(1000):
        y_predicted = model(x)
        cost = criterion(y_predicted, y)
        cost.backward()
        # Once the loss is low enough, continue with a smaller step size.
        if cost.item() < 40000:
            learning_rate = torch.tensor(0.00001, dtype=torch.float)
        with torch.no_grad():
            # In-place step; clearing .grad stops gradients accumulating
            # across iterations.
            for param in (W1, b1, W2, b2):
                param -= learning_rate * param.grad
                param.grad = None
            dev_y_test_f = model(dev_x).numpy()
        dev_y_test = np.where(dev_y_test_f > 0.5, 1, 0)
        print(dev_y_test)
        score = f1_score(dev_y_test, dev_y_pred)
    # NOTE(review): stops training at epoch level when the score is low;
    # confirm that the break belongs to the outer loop.
    if score < 0.35:
        break
    print(score)
    print(str(i), " ; ", cost)

print(str(i), " ; ", cost)

print("Dev0 pred...")
with torch.no_grad():
    dev_y = model(dev_x)
_write_predictions(dev_y, "dev-0/out.tsv", "dev-0/out_float.tsv")

y_test = pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None)
dev_y_pred = pandas.read_csv('dev-0/out.tsv', encoding="utf-8", delimiter='\t', header=None)
score = f1_score(y_test, dev_y_pred)
print("f1_score_dev0 after training: ", score, "\nAcc: ", accuracy_score(y_test, dev_y_pred))

print("TestA pred...")
testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
testA_x = _model_inputs(_raw_features(testA_data, domain_categories), train_raw)
with torch.no_grad():
    testA_y = model(testA_x)
_write_predictions(testA_y, "test-A/out.tsv", "test-A/out_float.tsv")