added onehot words
This commit is contained in:
parent
8b4b2a5232
commit
270eab1358
@ -1,7 +1,14 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="d25a65da-2ba0-4272-a0a5-c59cbecb6088" name="Default Changelist" comment="" />
|
||||
<list default="true" id="d25a65da-2ba0-4272-a0a5-c59cbecb6088" name="Default Changelist" comment="">
|
||||
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/dev-0/out.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/dev-0/out.tsv" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/dev-0/out_float.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/dev-0/out_float.tsv" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/s.py" beforeDir="false" afterPath="$PROJECT_DIR$/s.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/test-A/out.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/test-A/out.tsv" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/test-A/out_float.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/test-A/out_float.tsv" afterDir="false" />
|
||||
</list>
|
||||
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
||||
<option name="SHOW_DIALOG" value="false" />
|
||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||
@ -126,17 +133,27 @@
|
||||
<map>
|
||||
<entry key="MAIN">
|
||||
<value>
|
||||
<State />
|
||||
<State>
|
||||
<option name="COLUMN_ORDER" />
|
||||
</State>
|
||||
</value>
|
||||
</entry>
|
||||
</map>
|
||||
</option>
|
||||
</component>
|
||||
<component name="XDebuggerManager">
|
||||
<breakpoint-manager>
|
||||
<breakpoints>
|
||||
<line-breakpoint suspend="THREAD" type="python-line">
|
||||
<url>file://$PROJECT_DIR$/s.py</url>
|
||||
<line>139</line>
|
||||
<option name="timeStamp" value="3" />
|
||||
</line-breakpoint>
|
||||
</breakpoints>
|
||||
</breakpoint-manager>
|
||||
<watches-manager>
|
||||
<configuration name="PythonConfigurationType">
|
||||
<watch expression="dev_y" />
|
||||
<watch expression="debug_yp" />
|
||||
</configuration>
|
||||
</watches-manager>
|
||||
</component>
|
||||
|
366
dev-0/out.tsv
366
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
22052
dev-0/out_float.tsv
22052
dev-0/out_float.tsv
File diff suppressed because it is too large
Load Diff
93
s.py
93
s.py
@ -6,21 +6,21 @@ import pandas
|
||||
import numpy as np
|
||||
import re
|
||||
import timeit
|
||||
from sklearn.metrics import precision_score, recall_score, accuracy_score
|
||||
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
#10 features: 4 normal + 6 from domain_onehot
|
||||
#10 features: 4 normal + 6 from domain_onehot + 38 char labels
|
||||
model = nn.Sequential(
|
||||
nn.Linear(10, 16),
|
||||
nn.Linear(48, 16),
|
||||
nn.ReLU(),
|
||||
nn.Linear(16,1),
|
||||
nn.Sigmoid())
|
||||
criterion = nn.MSELoss()
|
||||
optimizer = optim.SGD(model.parameters(), lr=0.000001, momentum=0.8)
|
||||
optimizer = optim.SGD(model.parameters(), lr=0.000001, momentum=0.9)
|
||||
#optimizer = optim.Adam(model.parameters())
|
||||
|
||||
|
||||
minibatch_size = 5
|
||||
minibatch_size = 1000
|
||||
|
||||
|
||||
def count_polish_diacritics(x):
|
||||
@ -77,6 +77,28 @@ def ToOneHot_preproces(df_col, df_labels):
|
||||
out_df = pandas.DataFrame(out, columns=df_labels)
|
||||
return out_df
|
||||
|
||||
def getAllchars(df_col):
    """Return a flat list of every character occurring in the Series' strings.

    Args:
        df_col: pandas Series of (string-like) words.

    Returns:
        list[str]: all characters, in order of appearance, duplicates kept
        (caller deduplicates with pandas.unique).
    """
    chars = []
    # Series.items() replaces the removed iteritems() (pandas >= 2.0);
    # extend() avoids the quadratic `chars = chars + list(row)` pattern.
    for _, row in df_col.items():
        chars.extend(list(row))
    return chars
|
||||
|
||||
|
||||
def wordToOneHot(df_col, ch_labels):
    """Multi-hot encode the characters of each word in a Series.

    For each word, produces a 0/1 vector of length len(ch_labels) where
    position i is 1 iff ch_labels[i] occurs anywhere in the word.
    Characters not present in ch_labels are silently ignored (matches the
    original behavior of the linear label scan).

    Args:
        df_col: pandas Series of words (values are passed through str()).
        ch_labels: sequence of single characters; assumed unique
            (produced by pandas.unique upstream).

    Returns:
        pandas.DataFrame with one row per word and ch_labels as columns.
    """
    n_labels = len(ch_labels)
    # Precompute char -> column index: O(1) lookup per character instead of
    # the original O(n_labels) scan for every character of every word.
    index_of = {ch: i for i, ch in enumerate(ch_labels)}
    rows = []
    for _, word in df_col.items():  # items() replaces removed iteritems()
        vec = np.zeros(n_labels, dtype=int)
        for ch in str(word):
            i = index_of.get(ch)
            if i is not None:
                vec[i] = 1
        rows.append(vec)
    return pandas.DataFrame(rows, columns=ch_labels)
|
||||
|
||||
|
||||
class TrainDataset(Dataset):
|
||||
def __init__(self, X, y):
|
||||
self.X = X
|
||||
@ -91,6 +113,10 @@ class TrainDataset(Dataset):
|
||||
#Load data:
|
||||
#Train
|
||||
train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)
|
||||
char_labels = pandas.unique(getAllchars(train_data['Word']))
|
||||
#print(char_labels)
|
||||
#print(len(char_labels)) 38 liter
|
||||
#debug_fq = train_data['Frequency']
|
||||
x1 = Normalize(torch.tensor(train_data['Frequency'], dtype=torch.float))
|
||||
x2 = Normalize(torch.tensor(count_vowels(train_data['Word']), dtype=torch.float))
|
||||
|
||||
@ -100,12 +126,21 @@ x3 = torch.tensor(domain_onehot.values, dtype=torch.float)
|
||||
x4 = Normalize(torch.tensor(count_polish_diacritics(train_data['Word']),dtype=torch.float))
|
||||
x5 = Normalize(torch.tensor(train_data['Word'].str.len(), dtype=torch.float))
|
||||
|
||||
x_temp = torch.stack((x1,x2,x4, x5),0)
|
||||
x = torch.cat([x_temp.transpose(1,0), x3], 1)
|
||||
#debug_x = pandas.DataFrame(x.numpy())
|
||||
df_words_onehot = wordToOneHot(train_data['Word'], char_labels)
|
||||
x_words_onehot = torch.tensor(df_words_onehot.values, dtype=torch.float)
|
||||
|
||||
x_temp1 = torch.stack((x1,x2,x4, x5),0)
|
||||
x_temp2 = torch.cat([x_temp1.transpose(1,0), x3], 1)
|
||||
x = torch.cat([x_temp2, x_words_onehot], 1)
|
||||
l = list(["Freq", "Vovels", "pol_dia", "Len"])+list(domain_labels)+list(char_labels)
|
||||
print(l)
|
||||
print(len(l))
|
||||
#debug_x = pandas.DataFrame(x.numpy(), columns=l)
|
||||
|
||||
y = torch.tensor(train_data['Sane'], dtype=torch.float)
|
||||
|
||||
#dev0
|
||||
dev_y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
|
||||
dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
|
||||
dev_x1 = Normalize(torch.tensor(dev_data['Frequency'], dtype=torch.float), x1)
|
||||
dev_x2 = Normalize(torch.tensor(count_vowels(dev_data['Word']), dtype=torch.float), x2)
|
||||
@ -114,9 +149,13 @@ dev_x3 = torch.tensor(ToOneHot_preproces(dev_data['Domain'], domain_labels).valu
|
||||
dev_x4 = Normalize(torch.tensor(count_polish_diacritics(dev_data['Word']), dtype=torch.float), x4)
|
||||
dev_x5 = Normalize(torch.tensor(dev_data['Word'].str.len(), dtype=torch.float), x5)
|
||||
|
||||
dev_x_temp = torch.stack((dev_x1, dev_x2, dev_x4, dev_x5), 0)
|
||||
dev_x = torch.cat([dev_x_temp.transpose(1,0), dev_x3], 1)
|
||||
dev_df_words_onehot = wordToOneHot(dev_data['Word'], char_labels)
|
||||
dev_x_words_onehot = torch.tensor(dev_df_words_onehot.values, dtype=torch.float)
|
||||
|
||||
|
||||
dev_x_temp = torch.stack((dev_x1, dev_x2, dev_x4, dev_x5), 0)
|
||||
dev_x_temp2 = torch.cat([dev_x_temp.transpose(1,0), dev_x3], 1)
|
||||
dev_x = torch.cat([dev_x_temp2, dev_x_words_onehot], 1)
|
||||
#test-A
|
||||
testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
|
||||
testA_x1 = Normalize(torch.tensor(testA_data['Frequency'], dtype=torch.float), x1)
|
||||
@ -125,14 +164,18 @@ testA_x3 = torch.tensor(ToOneHot_preproces(testA_data['Domain'], domain_labels).
|
||||
testA_x4 = Normalize(torch.tensor(count_polish_diacritics(testA_data['Word']),dtype=torch.float), x4)
|
||||
testA_x5 = Normalize(torch.tensor(testA_data['Word'].str.len(), dtype=torch.float), x5)
|
||||
|
||||
testA_df_words_onehot = wordToOneHot(testA_data['Word'], char_labels)
|
||||
testA_x_words_onehot = torch.tensor(testA_df_words_onehot.values, dtype=torch.float)
|
||||
|
||||
testA_x_temp = torch.stack((testA_x1,testA_x2,testA_x4, testA_x5),0)
|
||||
testA_x = torch.cat([testA_x_temp.transpose(1,0), testA_x3], 1)
|
||||
testA_x_temp2 = torch.cat([testA_x_temp.transpose(1,0), testA_x3], 1)
|
||||
testA_x = torch.cat([testA_x_temp2, testA_x_words_onehot], 1)
|
||||
|
||||
|
||||
dataset_train = TrainDataset(x, y)
|
||||
trainloader=DataLoader(dataset=dataset_train,batch_size=5)
|
||||
|
||||
def train_loop(i = 4200): #~7h
|
||||
def train_loop(i = 3):
|
||||
for i in range(i):
|
||||
for xb, yb_expected in trainloader: # for each iteration a bach of samples is taken from loader(currently batch_size=5)
|
||||
yp = model(xb)
|
||||
@ -146,23 +189,36 @@ def train_loop(i = 4200): #~7h
|
||||
|
||||
loss = criterion(yp, yb_expected)
|
||||
optimizer.zero_grad()
|
||||
|
||||
dev_y_pred_float_tensor = model(dev_x)
|
||||
dev_y_pred_float_df = pandas.DataFrame(dev_y_pred_float_tensor.detach().numpy())
|
||||
auc_score = roc_auc_score(dev_y_test, dev_y_pred_float_df)
|
||||
print("auc: ", auc_score, "loss: ", loss.item())
|
||||
if(auc_score > 0.9):
|
||||
break
|
||||
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
print(loss)
|
||||
|
||||
#4 200
|
||||
if (auc_score > 0.9):
|
||||
break
|
||||
#print(loss)
|
||||
|
||||
|
||||
|
||||
#4 200 ~7h
|
||||
elapsed_time = timeit.timeit(train_loop, number=1)
|
||||
print("Training time: ", elapsed_time, "seconds")
|
||||
|
||||
#saving results:
|
||||
#dev0:
|
||||
dev_y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
|
||||
dev_y = model(dev_x)
|
||||
file=open("dev-0/out.tsv","w")
|
||||
file2=open("dev-0/out_float.tsv","w")
|
||||
|
||||
dev_y_pred_float=[]
|
||||
for i in range(0,11026):
|
||||
file2.write(str(dev_y[i].data.item()) + "\n")
|
||||
dev_y_pred_float.append(dev_y[i].data.item())
|
||||
var = dev_y[i].data.item()
|
||||
if var < 0.5:
|
||||
file.write("0" + "\n")
|
||||
@ -173,8 +229,11 @@ file2.close()
|
||||
|
||||
y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
|
||||
dev_y_pred = pandas.DataFrame(pandas.read_csv('dev-0/out.tsv', encoding="utf-8", delimiter='\t', header=None))
|
||||
|
||||
score = f1_score(y_test, dev_y_pred)
|
||||
print("f1_score_dev0 after training: ", score,"\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))
|
||||
print("f1_score_dev0 after training: ", score,"\nAcc: ", accuracy_score(dev_y_test, dev_y_pred),
|
||||
"\nroc_auc: ", roc_auc_score(dev_y_test,dev_y_pred_float ))
|
||||
print(dev_y_pred_float)
|
||||
|
||||
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
|
||||
#testA:
|
||||
|
11324
test-A/out.tsv
11324
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
22120
test-A/out_float.tsv
22120
test-A/out_float.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user