added onehot words

This commit is contained in:
ksanu 2019-12-03 21:35:28 +01:00
parent 8b4b2a5232
commit 270eab1358
6 changed files with 28027 additions and 27951 deletions

View File

@ -1,7 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="d25a65da-2ba0-4272-a0a5-c59cbecb6088" name="Default Changelist" comment="" />
<list default="true" id="d25a65da-2ba0-4272-a0a5-c59cbecb6088" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/dev-0/out.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/dev-0/out.tsv" afterDir="false" />
<change beforePath="$PROJECT_DIR$/dev-0/out_float.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/dev-0/out_float.tsv" afterDir="false" />
<change beforePath="$PROJECT_DIR$/s.py" beforeDir="false" afterPath="$PROJECT_DIR$/s.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/test-A/out.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/test-A/out.tsv" afterDir="false" />
<change beforePath="$PROJECT_DIR$/test-A/out_float.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/test-A/out_float.tsv" afterDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
@ -126,17 +133,27 @@
<map>
<entry key="MAIN">
<value>
<State />
<State>
<option name="COLUMN_ORDER" />
</State>
</value>
</entry>
</map>
</option>
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<breakpoints>
<line-breakpoint suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/s.py</url>
<line>139</line>
<option name="timeStamp" value="3" />
</line-breakpoint>
</breakpoints>
</breakpoint-manager>
<watches-manager>
<configuration name="PythonConfigurationType">
<watch expression="dev_y" />
<watch expression="debug_yp" />
</configuration>
</watches-manager>
</component>

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

93
s.py
View File

@ -6,21 +6,21 @@ import pandas
import numpy as np
import re
import timeit
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from torch.utils.data import Dataset, DataLoader
#10 features: 4 normal + 6 from domain_onehot
#10 features: 4 normal + 6 from domain_onehot + 38 char labels
model = nn.Sequential(
nn.Linear(10, 16),
nn.Linear(48, 16),
nn.ReLU(),
nn.Linear(16,1),
nn.Sigmoid())
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.000001, momentum=0.8)
optimizer = optim.SGD(model.parameters(), lr=0.000001, momentum=0.9)
#optimizer = optim.Adam(model.parameters())
minibatch_size = 5
minibatch_size = 1000
def count_polish_diacritics(x):
@ -77,6 +77,28 @@ def ToOneHot_preproces(df_col, df_labels):
out_df = pandas.DataFrame(out, columns=df_labels)
return out_df
def getAllchars(df_col):
    """Collect every character occurring in the string values of a Series.

    Parameters:
        df_col: pandas.Series whose values are strings (or str-convertible).

    Returns:
        list of single-character strings, with duplicates, in input order.
        (Callers pass this through pandas.unique() to get the label set.)
    """
    chars = []
    # Iterate values directly: the original used deprecated iteritems() and
    # discarded the index.  extend() is O(total chars); the original
    # `all = all + list(row)` rebuilt the list each step (quadratic) and
    # shadowed the builtin `all`.
    for value in df_col:
        chars.extend(str(value))
    return chars
def wordToOneHot(df_col, ch_labels):
    """Encode each value of a Series as a multi-hot character vector.

    Parameters:
        df_col: pandas.Series of values convertible to str.
        ch_labels: sequence of single characters defining column order
            (e.g. the output of pandas.unique(getAllchars(...))).

    Returns:
        pandas.DataFrame with one row per input value and one column per
        label; a cell is 1 if that character occurs anywhere in the value,
        else 0.  Characters not present in ch_labels are ignored, matching
        the original behavior.
    """
    # O(1) dict lookup per character instead of scanning ch_labels for
    # every character of every word (was O(len(word) * len(labels))).
    label_index = {ch: i for i, ch in enumerate(ch_labels)}
    n_labels = len(ch_labels)
    rows = []
    for value in df_col:  # index was unused; deprecated iteritems() dropped
        vec = np.zeros(n_labels, dtype=int)
        for ch in str(value):
            i = label_index.get(ch)
            if i is not None:
                vec[i] = 1
        rows.append(vec)
    return pandas.DataFrame(rows, columns=ch_labels)
class TrainDataset(Dataset):
def __init__(self, X, y):
self.X = X
@ -91,6 +113,10 @@ class TrainDataset(Dataset):
#Load data:
#Train
train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)
char_labels = pandas.unique(getAllchars(train_data['Word']))
#print(char_labels)
#print(len(char_labels)) 38 liter
#debug_fq = train_data['Frequency']
x1 = Normalize(torch.tensor(train_data['Frequency'], dtype=torch.float))
x2 = Normalize(torch.tensor(count_vowels(train_data['Word']), dtype=torch.float))
@ -100,12 +126,21 @@ x3 = torch.tensor(domain_onehot.values, dtype=torch.float)
x4 = Normalize(torch.tensor(count_polish_diacritics(train_data['Word']),dtype=torch.float))
x5 = Normalize(torch.tensor(train_data['Word'].str.len(), dtype=torch.float))
x_temp = torch.stack((x1,x2,x4, x5),0)
x = torch.cat([x_temp.transpose(1,0), x3], 1)
#debug_x = pandas.DataFrame(x.numpy())
df_words_onehot = wordToOneHot(train_data['Word'], char_labels)
x_words_onehot = torch.tensor(df_words_onehot.values, dtype=torch.float)
x_temp1 = torch.stack((x1,x2,x4, x5),0)
x_temp2 = torch.cat([x_temp1.transpose(1,0), x3], 1)
x = torch.cat([x_temp2, x_words_onehot], 1)
l = list(["Freq", "Vovels", "pol_dia", "Len"])+list(domain_labels)+list(char_labels)
print(l)
print(len(l))
#debug_x = pandas.DataFrame(x.numpy(), columns=l)
y = torch.tensor(train_data['Sane'], dtype=torch.float)
#dev0
dev_y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
dev_x1 = Normalize(torch.tensor(dev_data['Frequency'], dtype=torch.float), x1)
dev_x2 = Normalize(torch.tensor(count_vowels(dev_data['Word']), dtype=torch.float), x2)
@ -114,9 +149,13 @@ dev_x3 = torch.tensor(ToOneHot_preproces(dev_data['Domain'], domain_labels).valu
dev_x4 = Normalize(torch.tensor(count_polish_diacritics(dev_data['Word']), dtype=torch.float), x4)
dev_x5 = Normalize(torch.tensor(dev_data['Word'].str.len(), dtype=torch.float), x5)
dev_x_temp = torch.stack((dev_x1, dev_x2, dev_x4, dev_x5), 0)
dev_x = torch.cat([dev_x_temp.transpose(1,0), dev_x3], 1)
dev_df_words_onehot = wordToOneHot(dev_data['Word'], char_labels)
dev_x_words_onehot = torch.tensor(dev_df_words_onehot.values, dtype=torch.float)
dev_x_temp = torch.stack((dev_x1, dev_x2, dev_x4, dev_x5), 0)
dev_x_temp2 = torch.cat([dev_x_temp.transpose(1,0), dev_x3], 1)
dev_x = torch.cat([dev_x_temp2, dev_x_words_onehot], 1)
#test-A
testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
testA_x1 = Normalize(torch.tensor(testA_data['Frequency'], dtype=torch.float), x1)
@ -125,14 +164,18 @@ testA_x3 = torch.tensor(ToOneHot_preproces(testA_data['Domain'], domain_labels).
testA_x4 = Normalize(torch.tensor(count_polish_diacritics(testA_data['Word']),dtype=torch.float), x4)
testA_x5 = Normalize(torch.tensor(testA_data['Word'].str.len(), dtype=torch.float), x5)
testA_df_words_onehot = wordToOneHot(testA_data['Word'], char_labels)
testA_x_words_onehot = torch.tensor(testA_df_words_onehot.values, dtype=torch.float)
testA_x_temp = torch.stack((testA_x1,testA_x2,testA_x4, testA_x5),0)
testA_x = torch.cat([testA_x_temp.transpose(1,0), testA_x3], 1)
testA_x_temp2 = torch.cat([testA_x_temp.transpose(1,0), testA_x3], 1)
testA_x = torch.cat([testA_x_temp2, testA_x_words_onehot], 1)
dataset_train = TrainDataset(x, y)
trainloader=DataLoader(dataset=dataset_train,batch_size=5)
def train_loop(i = 4200): #~7h
def train_loop(i = 3):
for i in range(i):
for xb, yb_expected in trainloader: # for each iteration a bach of samples is taken from loader(currently batch_size=5)
yp = model(xb)
@ -146,23 +189,36 @@ def train_loop(i = 4200): #~7h
loss = criterion(yp, yb_expected)
optimizer.zero_grad()
dev_y_pred_float_tensor = model(dev_x)
dev_y_pred_float_df = pandas.DataFrame(dev_y_pred_float_tensor.detach().numpy())
auc_score = roc_auc_score(dev_y_test, dev_y_pred_float_df)
print("auc: ", auc_score, "loss: ", loss.item())
if(auc_score > 0.9):
break
loss.backward()
optimizer.step()
print(loss)
#4 200
if (auc_score > 0.9):
break
#print(loss)
#4 200 ~7h
elapsed_time = timeit.timeit(train_loop, number=1)
print("Training time: ", elapsed_time, "seconds")
#saving results:
#dev0:
dev_y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
dev_y = model(dev_x)
file=open("dev-0/out.tsv","w")
file2=open("dev-0/out_float.tsv","w")
dev_y_pred_float=[]
for i in range(0,11026):
file2.write(str(dev_y[i].data.item()) + "\n")
dev_y_pred_float.append(dev_y[i].data.item())
var = dev_y[i].data.item()
if var < 0.5:
file.write("0" + "\n")
@ -173,8 +229,11 @@ file2.close()
y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
dev_y_pred = pandas.DataFrame(pandas.read_csv('dev-0/out.tsv', encoding="utf-8", delimiter='\t', header=None))
score = f1_score(y_test, dev_y_pred)
print("f1_score_dev0 after training: ", score,"\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))
print("f1_score_dev0 after training: ", score,"\nAcc: ", accuracy_score(dev_y_test, dev_y_pred),
"\nroc_auc: ", roc_auc_score(dev_y_test,dev_y_pred_float ))
print(dev_y_pred_float)
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#testA:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff