TAU_21_sane_words/solution2.py

161 lines
5.5 KiB
Python
Raw Normal View History

2019-11-27 03:15:05 +01:00
import torch
import pandas
import re
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score
2019-11-29 14:28:11 +01:00
# Initial gradient-descent step size, held as a 0-dim float tensor.
learning_rate = torch.tensor(5e-5, dtype=torch.float32)
2019-11-27 03:15:05 +01:00
def f1_score(y_true, y_pred):
    """Return the F1 score built from micro-averaged precision and recall.

    Args:
        y_true: Ground-truth labels, in any form accepted by
            ``precision_score`` / ``recall_score``.
        y_pred: Predicted labels, same layout as ``y_true``.

    Returns:
        The harmonic mean of precision and recall, or ``0.0`` when both
        are zero.
    """
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    total = precision + recall
    # Without this guard the harmonic mean is 0/0 (NaN or ZeroDivisionError)
    # when nothing was predicted correctly.
    if total == 0:
        return 0.0
    return 2 * (precision * recall) / total
2019-11-29 14:28:11 +01:00
# Trainable parameters of the two-layer network used by `model`, each drawn
# uniformly from [0, 1): 5 input features -> 16 hidden units -> 1 output.
# The draw order (W1, b1, W2, b2) is kept so seeded runs stay reproducible.
_param_options = dict(dtype=torch.float32, requires_grad=True)
W1 = torch.rand(5, 16, **_param_options)
b1 = torch.rand(16, **_param_options)
W2 = torch.rand(16, **_param_options)
b2 = torch.rand(1, **_param_options)
2019-11-27 03:15:05 +01:00
def count_polish_diacritics(x):
    """Return, for every word in ``x``, the share of Polish diacritic letters.

    Args:
        x: Iterable of words (e.g. a pandas Series). Each item is converted
            with ``str()`` before counting, so non-string values are
            measured by their string representation.

    Returns:
        A list of floats, one per item: the number of lowercase characters
        from ``ąćęłńóśźż`` divided by the word length. Empty words yield 0.0.
    """
    x_counts = []
    # Plain iteration yields the values of a Series and also works for lists;
    # Series.iteritems() no longer exists in pandas 2.x.
    for word in x:
        text = str(word)
        if not text:
            # Avoid dividing by zero for an empty word.
            x_counts.append(0.0)
            continue
        hits = len(re.findall(r'[ąćęłńóśźż]', text))
        x_counts.append(hits / len(text))
    return x_counts
2019-11-29 14:28:11 +01:00
def count_vowels(x):
    """Return, for every word in ``x``, the share of vowel characters.

    Args:
        x: Iterable of words (e.g. a pandas Series). Each item is converted
            with ``str()`` before counting.

    Returns:
        A list of floats, one per item: the number of lowercase characters
        from ``aąeęioóuy`` divided by the word length. Empty words yield 0.0.
    """
    out = []
    # Plain iteration yields the values of a Series and also works for lists;
    # Series.iteritems() no longer exists in pandas 2.x.
    for row in x:
        text = str(row)
        if not text:
            # Avoid dividing by zero for an empty word.
            out.append(0.0)
            continue
        vowel_len = len(re.findall(r'[aąeęioóuy]', text))
        out.append(vowel_len / len(text))  # rate, not absolute count
    return out
2019-11-27 03:15:05 +01:00
def Normalize(data, d = None):
    """Min-max scale ``data`` using the range of a reference array.

    Args:
        data: Values to scale; must support ``-``, ``/`` and ``*`` with
            scalars (the script passes torch tensors).
        d: Reference values providing ``min()`` and ``max()``. Defaults to
            ``data`` itself, which maps ``data`` onto [0, 1].

    Returns:
        ``(data - d.min()) / (d.max() - d.min())``. When the reference is
        constant (zero range) an all-zero result of the same shape is
        returned instead of NaN/inf values.
    """
    if d is None:
        d = data
    low = d.min()
    span = d.max() - low
    shifted = data - low
    # A constant reference column carries no information; dividing by its
    # zero range would poison the features with NaN/inf.
    if span == 0:
        return shifted * 0
    return shifted / span
2019-11-29 14:28:11 +01:00
def model(data_x):
    """Run the forward pass of the two-layer network.

    Args:
        data_x: Feature tensor whose dims 0 and 1 are swapped before the
            first matrix product (the script stacks features along dim 0).

    Returns:
        Sigmoid activations computed from a ReLU hidden layer, using the
        module-level parameters ``W1``, ``b1``, ``W2`` and ``b2``.
    """
    samples = torch.transpose(data_x, 1, 0)
    hidden = torch.relu(torch.matmul(samples, W1) + b1)
    logits = torch.matmul(hidden, W2) + b2
    return torch.sigmoid(logits)
2019-11-27 03:15:05 +01:00
# Load the training set: label, domain, word and frequency columns.
train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)

# x1..x5 hold the RAW training features. Later code calls
# `Normalize(<new raw feature>, x1)` etc. and relies on these tensors for the
# training min/max, so they must not be normalised here; storing them
# already scaled to [0, 1] would leave dev/test features unscaled.
x1 = torch.tensor(train_data['Frequency'], dtype=torch.float)
x2 = torch.tensor(count_vowels(train_data['Word']), dtype=torch.float)
x3 = torch.tensor(train_data['Domain'].astype('category').cat.codes, dtype=torch.float)
x4 = torch.tensor(count_polish_diacritics(train_data['Word']), dtype=torch.float)
# astype(str) mirrors the str(word) conversion used by the counting helpers,
# so a word parsed as a missing value cannot turn the whole column into NaN.
x5 = torch.tensor(train_data['Word'].astype(str).str.len(), dtype=torch.float)

# Every feature, including the domain code, is min-max scaled for training so
# that training inputs match what is fed to the model at inference time.
x = torch.stack([Normalize(feature) for feature in (x1, x2, x3, x4, x5)], 0)
y = torch.tensor(train_data['Sane'], dtype=torch.float)
2019-11-29 14:28:11 +01:00
# Diagnostic output before training.
# NOTE(review): the printed value is (number of rows with Sane > 0) + 1, kept
# as-is; confirm whether the extra 1 is intentional.
count = int((train_data['Sane'] > 0).sum()) + 1
print(count)
print(y)
print("Training...")

# Summed (not averaged) squared error over the whole training set.
criterion = torch.nn.MSELoss(reduction='sum')

# Full-batch gradient descent: at most 80 * 1000 steps.
converged = False
for i in range(80):
    for j in range(1000):
        y_predicted = model(x)
        cost = criterion(y_predicted, y)
        cost.backward()
        cost_value = cost.item()

        # Switch to a smaller step once the loss is low enough.
        if cost_value < 40000:
            learning_rate = torch.tensor(0.00001, dtype=torch.float)

        # Update the parameters in place so they stay the same leaf tensors,
        # then clear the gradients, which would otherwise accumulate.
        with torch.no_grad():
            for param in (W1, b1, W2, b2):
                param -= learning_rate * param.grad
                param.grad.zero_()

        # Reaching the target loss ends training entirely, not just the
        # current inner loop.
        if cost_value < 1700:
            converged = True
            break
    if converged:
        break
print("Dev0 pred...")

# dev-0 inputs: domain, word and frequency columns (no label column).
dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)

# Encode domains with the categories seen in training so that equal domains
# get equal codes in both sets; domains unseen in training become -1.
dev_domain_codes = pandas.Categorical(
    dev_data['Domain'],
    categories=train_data['Domain'].astype('category').cat.categories,
).codes

# Each feature is scaled with the min/max of its training counterpart.
dev_x1 = Normalize(torch.tensor(dev_data['Frequency'], dtype=torch.float), x1)
dev_x2 = Normalize(torch.tensor(count_vowels(dev_data['Word']), dtype=torch.float), x2)
dev_x3 = Normalize(torch.tensor(dev_domain_codes, dtype=torch.float), x3)
dev_x4 = Normalize(torch.tensor(count_polish_diacritics(dev_data['Word']), dtype=torch.float), x4)
# astype(str) mirrors the str(word) conversion used by the counting helpers.
dev_x5 = Normalize(torch.tensor(dev_data['Word'].astype(str).str.len(), dtype=torch.float), x5)
dev_x = torch.stack((dev_x1, dev_x2, dev_x3, dev_x4, dev_x5), 0)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    dev_y = model(dev_x)

# One score per input row; the row count comes from the data rather than a
# hard-coded constant.
dev_scores = dev_y.tolist()
# A score below 0.5 maps to class 0, anything else to class 1.
dev_labels = [0 if score < 0.5 else 1 for score in dev_scores]

# `with` closes both files even if a write fails.
with open("dev-0/out.tsv", "w") as labels_file, open("dev-0/out_float.tsv", "w") as scores_file:
    scores_file.writelines(str(score) + "\n" for score in dev_scores)
    labels_file.writelines(str(label) + "\n" for label in dev_labels)

# Expected labels are parsed once; both names are kept for later use.
dev_y_test = pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None)
y_test = dev_y_test
# Same layout as reading out.tsv back: one integer column named 0.
dev_y_pred = pandas.DataFrame(dev_labels)

score = f1_score(y_test, dev_y_pred)
print("f1_score_dev0 after training: ", score,"\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))
print("TestA pred...")

# test-A inputs: domain, word and frequency columns (no label column).
testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)

# Encode domains with the categories seen in training so that equal domains
# get equal codes in both sets; domains unseen in training become -1.
testA_domain_codes = pandas.Categorical(
    testA_data['Domain'],
    categories=train_data['Domain'].astype('category').cat.categories,
).codes

# Each feature is scaled with the min/max of its training counterpart.
testA_x1 = Normalize(torch.tensor(testA_data['Frequency'], dtype=torch.float), x1)
testA_x2 = Normalize(torch.tensor(count_vowels(testA_data['Word']), dtype=torch.float), x2)
testA_x3 = Normalize(torch.tensor(testA_domain_codes, dtype=torch.float), x3)
testA_x4 = Normalize(torch.tensor(count_polish_diacritics(testA_data['Word']), dtype=torch.float), x4)
# astype(str) mirrors the str(word) conversion used by the counting helpers.
testA_x5 = Normalize(torch.tensor(testA_data['Word'].astype(str).str.len(), dtype=torch.float), x5)
testA_x = torch.stack((testA_x1, testA_x2, testA_x3, testA_x4, testA_x5), 0)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    testA_y = model(testA_x)

# One score per input row; the row count comes from the data rather than a
# hard-coded constant.
testA_scores = testA_y.tolist()

# `with` closes both files even if a write fails.
with open("test-A/out.tsv", "w") as labels_file, open("test-A/out_float.tsv", "w") as scores_file:
    scores_file.writelines(str(score) + "\n" for score in testA_scores)
    # A score below 0.5 maps to class 0, anything else to class 1.
    labels_file.writelines(("0" if score < 0.5 else "1") + "\n" for score in testA_scores)