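# Predicts the binary 'Sane' label from four word features (frequency,
# word length, domain id, Polish-diacritic count) with a small two-layer
# sigmoid network trained by hand-rolled gradient descent.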
import torch
import pandas
import re
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, accuracy_score

learning_rate = torch.tensor(0.00001, dtype=torch.float)


def f1_score(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    F1 = 2 * (precision * recall) / (precision + recall)
    return F1
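
# Note: with micro-averaging on single-label targets, precision and recall
# both equal plain accuracy, so this helper effectively reports accuracy.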

# Model parameters: 4 input features -> 16 hidden units -> 1 output.
W = torch.rand([4, 16], dtype=torch.float, requires_grad=True)
b = torch.rand(16, dtype=torch.float, requires_grad=True)
U = torch.rand(16, dtype=torch.float, requires_grad=True)
c = torch.rand(1, dtype=torch.float, requires_grad=True)
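
# For reference, an equivalent torch.nn formulation of the same model
# (a sketch only; the script keeps the explicit tensors above):
#   model = torch.nn.Sequential(
#       torch.nn.Linear(4, 16), torch.nn.Sigmoid(),
#       torch.nn.Linear(16, 1), torch.nn.Sigmoid(),
#   )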


def count_polish_diacritics(x):
    # Count Polish diacritic characters in each word of a pandas Series.
    x_counts = []
    for i, word in x.items():  # Series.iteritems() was removed in pandas 2.0
        c = len(re.findall(r'[ąćęłńóśźż]', str(word)))
        x_counts.append(c)
    return x_counts
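
# Illustrative (not executed):
#   count_polish_diacritics(pandas.Series(['żółty', 'kot'])) -> [3, 0]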


def Normalize(data, d=None):
    # Min-max scaling. Pass d to reuse another tensor's range, e.g. to scale
    # dev/test features with the training data's min and max.
    if d is None:
        d = data
    r = data - d.min()
    return r / (d.max() - d.min())
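
# Illustrative (not executed):
#   Normalize(torch.tensor([2., 4., 6.]))                       -> tensor([0.0, 0.5, 1.0])
#   Normalize(torch.tensor([8.]), d=torch.tensor([2., 4., 6.])) -> tensor([1.5])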


# Training features: frequency, word length, domain id, diacritic count.
train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)
x1 = Normalize(torch.tensor(train_data['Frequency'], dtype=torch.float))
x2 = Normalize(torch.tensor(train_data['Word'].str.len(), dtype=torch.float))
le = LabelEncoder()
le.fit(train_data['Domain'])
encoded_domain_col = le.transform(train_data['Domain'])
x3 = torch.tensor(encoded_domain_col, dtype=torch.float)  # raw label id, not scaled
x4 = Normalize(torch.tensor(count_polish_diacritics(train_data['Word']), dtype=torch.float))
x = torch.stack((x1, x2, x3, x4), 0)  # shape (4, n_samples)
y = torch.tensor(train_data['Sane'], dtype=torch.float)
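
# The network consumes x transposed to (n_samples, 4); the forward pass is
#   y_hat = sigmoid(sigmoid(x.T @ W + b) @ U + c)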

# Dev data: same features, scaled with the training ranges. Note that
# le.transform() raises on domain values unseen during training.
dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
dev_x1 = Normalize(torch.tensor(dev_data['Frequency'], dtype=torch.float), x1)
dev_x2 = Normalize(torch.tensor(dev_data['Word'].str.len(), dtype=torch.float), x2)

dev_encoded_domain_col = le.transform(dev_data['Domain'])
dev_x3 = torch.tensor(dev_encoded_domain_col, dtype=torch.float)
dev_x4 = Normalize(torch.tensor(count_polish_diacritics(dev_data['Word']), dtype=torch.float), x4)
dev_x = torch.stack((dev_x1, dev_x2, dev_x3, dev_x4), 0)
dev_y_test = pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None)

print("Training...")

# Manual full-batch gradient descent: 500 epochs of 1000 steps each, with a
# dev-0 evaluation after every epoch.
for _ in range(500):
    W.requires_grad_(True)
    b.requires_grad_(True)
    c.requires_grad_(True)
    U.requires_grad_(True)
    for _ in range(1000):
        h = torch.sigmoid(x.transpose(1, 0) @ W + b)
        y_predicted = torch.sigmoid(h @ U + c)
        cost = torch.sum((y_predicted - y) ** 2)
        cost.backward()
        with torch.no_grad():
            # Reassignment creates fresh tensors, so no grad zeroing is
            # needed, but requires_grad must be switched back on below.
            W = W - learning_rate * W.grad
            b = b - learning_rate * b.grad
            c = c - learning_rate * c.grad
            U = U - learning_rate * U.grad
        W.requires_grad_(True)
        b.requires_grad_(True)
        c.requires_grad_(True)
        U.requires_grad_(True)
    # Freeze parameters for the in-training dev evaluation.
    W.requires_grad_(False)
    b.requires_grad_(False)
    c.requires_grad_(False)
    U.requires_grad_(False)
    print("Dev0 pred...")
    # dev
    dev_h = torch.sigmoid(dev_x.transpose(1, 0) @ W + b)
    dev_y = torch.sigmoid(dev_h @ U + c)
    dev_y = dev_y.numpy()
    dev_y_pred = np.where(dev_y > 0.5, 1, 0)
    score = f1_score(dev_y_test, dev_y_pred)
    print("f1_score_dev0 within training: ", score, "\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))

# Final dev-0 predictions and scores.
W.requires_grad_(False)
b.requires_grad_(False)
c.requires_grad_(False)
U.requires_grad_(False)

print("Dev0 pred...")
# dev
dev_h = torch.sigmoid(dev_x.transpose(1, 0) @ W + b)
dev_y = torch.sigmoid(dev_h @ U + c)
dev_y = dev_y.numpy()
dev_y_pred = np.where(dev_y > 0.5, 1, 0)
# np.savetxt('./dev-0/out_float.tsv', dev_y, '%.f')
with open('dev-0/out.tsv', 'w') as output_file:
    for out in dev_y_pred:
        print('%s' % out, file=output_file)
with open('dev-0/out_float.tsv', 'w') as output_file:
    for out in dev_y:
        print('%s' % out, file=output_file)

score = f1_score(dev_y_test, dev_y_pred)
print("f1_score_dev0 after training: ", score, "\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))

print("TestA pred...")
# test-A
testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
testA_x1 = Normalize(torch.tensor(testA_data['Frequency'], dtype=torch.float), x1)
testA_x2 = Normalize(torch.tensor(testA_data['Word'].str.len(), dtype=torch.float), x2)

testA_encoded_domain_col = le.transform(testA_data['Domain'])
testA_x3 = torch.tensor(testA_encoded_domain_col, dtype=torch.float)
testA_x4 = Normalize(torch.tensor(count_polish_diacritics(testA_data['Word']), dtype=torch.float), x4)
testA_x = torch.stack((testA_x1, testA_x2, testA_x3, testA_x4), 0)

testA_h = torch.sigmoid(testA_x.transpose(1, 0) @ W + b)
testA_y = torch.sigmoid(testA_h @ U + c)
testA_y = testA_y.numpy()
testA_y_pred = np.where(testA_y > 0.5, 1, 0)

# np.savetxt('./test-A/out_float.tsv', testA_y)
with open('test-A/out.tsv', 'w') as output_file:
    for out in testA_y_pred:
        print('%s' % out, file=output_file)
with open('test-A/out_float.tsv', 'w') as output_file:
    for out in testA_y:
        print('%s' % out, file=output_file)