ksanu 2019-12-04 22:52:59 +01:00
parent 08fe617c0f
commit c84faa55ea
13 changed files with 70722 additions and 26006 deletions

4
.idea/encodings.xml Normal file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" addBOMForNewFiles="with NO BOM" />
</project>

.idea/workspace.xml

@@ -2,9 +2,8 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="d25a65da-2ba0-4272-a0a5-c59cbecb6088" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/.idea/TAU_21_sane_words.iml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/TAU_21_sane_words.iml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/misc.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/misc.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/dev-0/expected.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/dev-0/expected.tsv" afterDir="false" />
<change beforePath="$PROJECT_DIR$/dev-0/out.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/dev-0/out.tsv" afterDir="false" />
<change beforePath="$PROJECT_DIR$/dev-0/out_float.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/dev-0/out_float.tsv" afterDir="false" />
<change beforePath="$PROJECT_DIR$/s.py" beforeDir="false" afterPath="$PROJECT_DIR$/s.py" afterDir="false" />
@@ -18,15 +17,15 @@
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="CoverageDataManager">
<SUITE FILE_PATH="coverage/TAU_22_sane_words_torch_nn$s.coverage" NAME="s Coverage Results" MODIFIED="1575414386650" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/TAU_22_sane_words_torch_nn$s.coverage" NAME="s Coverage Results" MODIFIED="1575494879139" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
</component>
<component name="FileEditorManager">
<leaf>
<file pinned="false" current-in-tab="false">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/s.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="980">
<caret line="225" column="18" selection-start-line="225" selection-start-column="18" selection-end-line="225" selection-end-column="18" />
<state relative-caret-position="191">
<caret line="269" column="15" selection-start-line="269" selection-start-column="15" selection-end-line="269" selection-end-column="15" />
<folding>
<element signature="e#0#12#0" expanded="true" />
</folding>
@@ -35,30 +34,32 @@
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/dev-0/expected.tsv">
<entry file="file://$PROJECT_DIR$/s2.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-196952" />
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/dev-0/out.tsv">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="342">
<caret line="19" selection-start-line="19" selection-end-line="20" />
<state relative-caret-position="121">
<caret line="304" column="13" selection-end-line="304" selection-end-column="13" />
<folding>
<element signature="e#0#12#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/test-A/out.tsv">
<provider selected="true" editor-type-id="text-editor" />
<entry file="file:///usr/lib/python3.6/subprocess.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="352">
<caret line="437" selection-start-line="437" selection-end-line="437" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/dev-0/out_float.tsv">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-198268" />
<state relative-caret-position="36">
<caret line="2" column="3" lean-forward="true" selection-start-line="2" selection-start-column="3" selection-end-line="2" selection-end-column="3" />
</state>
</provider>
</entry>
</file>
@@ -77,15 +78,17 @@
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/new_lab.py" />
<option value="$PROJECT_DIR$/s2.py" />
<option value="$PROJECT_DIR$/s.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds">
<option name="x" value="147" />
<option name="y" value="16" />
<option name="width" value="1320" />
<option name="height" value="724" />
<option name="x" value="66" />
<option name="y" value="117" />
<option name="width" value="1339" />
<option name="height" value="687" />
</component>
<component name="ProjectId" id="1UAXhosCPbReL7U2TCbyyTVGpqs" />
<component name="ProjectLevelVcsManager" settingsEditedManually="true" />
@@ -94,7 +97,6 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<expand>
@@ -116,9 +118,11 @@
<select />
</subPane>
</pane>
<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="SHARE_PROJECT_CONFIGURATION_FILES" value="true" />
<property name="WebServerToolWindowFactoryState" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
@@ -136,6 +140,28 @@
</option>
</component>
<component name="RunManager" selected="Python.s">
<configuration name="new_lab" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="TAU_21_sane_words" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/new_lab.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="s" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="TAU_21_sane_words" />
<option name="INTERPRETER_OPTIONS" value="" />
@@ -205,6 +231,7 @@
<recent_temporary>
<list>
<item itemvalue="Python.s" />
<item itemvalue="Python.new_lab" />
<item itemvalue="Python.solution2" />
<item itemvalue="Python.solution" />
</list>
@@ -220,38 +247,48 @@
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1574800494334</updated>
<workItem from="1575412092709" duration="2059000" />
<workItem from="1575412092709" duration="3487000" />
<workItem from="1575450164911" duration="5424000" />
<workItem from="1575467930210" duration="14040000" />
</task>
<task id="LOCAL-00001" summary="test3">
<created>1575414898244</created>
<option name="number" value="00001" />
<option name="presentableId" value="LOCAL-00001" />
<option name="project" value="LOCAL" />
<updated>1575414898244</updated>
</task>
<option name="localTasksCounter" value="2" />
<servers />
</component>
<component name="TimeTrackingManager">
<option name="totallyTimeSpent" value="2059000" />
<option name="totallyTimeSpent" value="22951000" />
</component>
<component name="ToolWindowManager">
<frame x="147" y="16" width="1320" height="724" extended-state="0" />
<frame x="66" y="117" width="1339" height="687" extended-state="0" />
<editor active="true" />
<layout>
<window_info id="Favorites" side_tool="true" />
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.24405706" />
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.22092116" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info anchor="bottom" id="Docker" show_stripe_button="false" />
<window_info anchor="bottom" id="Database Changes" />
<window_info anchor="bottom" id="Version Control" />
<window_info anchor="bottom" id="Python Console" />
<window_info anchor="bottom" id="Terminal" />
<window_info anchor="bottom" id="Event Log" side_tool="true" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" />
<window_info anchor="bottom" id="Run" order="2" visible="true" weight="0.33625218" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info active="true" anchor="bottom" id="Run" order="2" visible="true" weight="0.25093633" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.35018727" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" />
<window_info anchor="right" id="SciView" />
<window_info anchor="right" id="Database" />
<window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
<window_info anchor="bottom" id="Version Control" order="8" />
<window_info anchor="bottom" id="Database Changes" order="9" />
<window_info anchor="bottom" id="Event Log" order="10" side_tool="true" />
<window_info anchor="bottom" id="Terminal" order="11" />
<window_info anchor="bottom" id="Python Console" order="12" weight="0.07116105" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
<window_info anchor="right" id="SciView" order="3" />
<window_info anchor="right" id="Database" order="4" />
</layout>
</component>
<component name="TypeScriptGeneratedFilesManager">
@@ -290,18 +327,22 @@
</map>
</option>
</component>
<component name="VcsManagerConfiguration">
<MESSAGE value="test3" />
<option name="LAST_COMMIT_MESSAGE" value="test3" />
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<breakpoints>
<line-breakpoint suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/s.py</url>
<line>141</line>
<line>142</line>
<option name="timeStamp" value="3" />
</line-breakpoint>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/s.py</url>
<line>193</line>
<option name="timeStamp" value="8" />
<line>226</line>
<option name="timeStamp" value="16" />
</line-breakpoint>
</breakpoints>
</breakpoint-manager>
@@ -312,24 +353,45 @@
</configuration>
</watches-manager>
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/TAU_22_sane_words_torch_nn$new_lab.coverage" NAME="new_lab Coverage Results" MODIFIED="1575455391198" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/test-A/out.tsv">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file:///usr/lib/python3/dist-packages/pandas/core/frame.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="113850">
<caret line="6419" selection-start-line="6419" selection-end-line="6419" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/test-A/in.tsv">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-198895" />
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/dev-0/expected.tsv">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/new_lab.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-196952" />
<state relative-caret-position="18">
<caret line="48" selection-start-line="48" selection-end-line="48" />
<folding>
<element signature="e#0#10#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/dev-0/out_float.tsv">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-198268" />
</provider>
<entry file="file://$PROJECT_DIR$/test-A/out_float.tsv">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/s.py">
<entry file="file://$PROJECT_DIR$/s2.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="980">
<caret line="225" column="18" selection-start-line="225" selection-start-column="18" selection-end-line="225" selection-end-column="18" />
<state relative-caret-position="121">
<caret line="304" column="13" selection-end-line="304" selection-end-column="13" />
<folding>
<element signature="e#0#12#0" expanded="true" />
</folding>
@@ -338,8 +400,32 @@
</entry>
<entry file="file://$PROJECT_DIR$/dev-0/out.tsv">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="342">
<caret line="19" selection-start-line="19" selection-end-line="20" />
<state relative-caret-position="18">
<caret line="1" column="1" selection-start-line="1" selection-start-column="1" selection-end-line="1" selection-end-column="1" />
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3.6/subprocess.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="352">
<caret line="437" selection-start-line="437" selection-end-line="437" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/dev-0/out_float.tsv">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="36">
<caret line="2" column="3" lean-forward="true" selection-start-line="2" selection-start-column="3" selection-end-line="2" selection-end-column="3" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/s.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="191">
<caret line="269" column="15" selection-start-line="269" selection-start-column="15" selection-end-line="269" selection-end-column="15" />
<folding>
<element signature="e#0#12#0" expanded="true" />
</folding>
</state>
</provider>
</entry>

11026
dev-0/best/out.tsv Normal file

File diff suppressed because it is too large

11026
dev-0/best/out_float.tsv Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

94
new_lab.py Normal file

@@ -0,0 +1,94 @@
import sys
import torch
from torch import nn
from torch import optim

history_length = 32
nb_of_char_codes = 128
history_encoded = [ord("\n")] * history_length
embedding_size = 10
hidden_size = 100
device = torch.device('cpu')
print(history_encoded)


def char_source():
    # stream character codes below 128 from stdin
    for line in sys.stdin:
        for char in line:
            if ord(char) < nb_of_char_codes:
                yield ord(char)


class NGramLanguageModel(nn.Module):
    def __init__(self, nb_of_char_codes, history_length, embedding_size, hidden_size):
        super(NGramLanguageModel, self).__init__()
        self.embeddings = nn.Embedding(nb_of_char_codes, embedding_size).to(device)
        self.model = nn.Sequential(
            nn.Linear(history_length * embedding_size, hidden_size),
            nn.Linear(hidden_size, nb_of_char_codes),
            nn.LogSoftmax(dim=0)
        ).to(device)

    def forward(self, inputs):
        embedded_inputs = self.embeddings(inputs)
        return self.model(embedded_inputs.view(-1))  # view(-1) flattens the embedded history

    def generate(self, to_be_continued, n):
        t = (" " * history_length + to_be_continued)[-history_length:]
        history = [ord(c) for c in t]
        with torch.no_grad():
            for _ in range(n):
                x = torch.tensor(history, dtype=torch.long)
                y = torch.exp(self(x))
                # keep only the 4 most probable characters
                best = sorted(range(nb_of_char_codes), key=lambda i: -y[i])[0:4]
                yb = torch.tensor([
                    y[ix].item() if ix in best else 0.0
                    for ix in range(nb_of_char_codes)
                ])
                # sample the next character from the truncated distribution
                c = torch.multinomial(yb, 1)[0].item()
                t += chr(c)
                history.pop(0)
                history.append(c)
        return t


model = NGramLanguageModel(nb_of_char_codes, history_length, embedding_size, hidden_size)
counter = 0
step = 1000
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters())
losses = []

for c in char_source():
    x = torch.tensor(history_encoded, dtype=torch.long, device=device)
    model.zero_grad()
    y = model(x)
    loss = criterion(y.view(1, -1), torch.tensor([c], dtype=torch.long, device=device))
    losses.append(loss.item())
    if len(losses) > step:
        losses.pop(0)
    if counter % step == 0:
        avg_loss = sum(losses) / len(losses)
        print(avg_loss)
        print(loss)
        print(model.generate("Machine translation is", 200))
    loss.backward()
    optimizer.step()
    # print(y)
    history_encoded.pop(0)
    history_encoded.append(c)
    counter += 1

"""
TODO: turn this into a generator
Nucleus sampling - for generation
"""

119
s.py

@@ -6,25 +6,26 @@ import pandas
import numpy as np
import re
import timeit
import subprocess
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from torch.utils.data import Dataset, DataLoader
import string
# 48 input features: 4 numeric + 6 domain one-hot + 38 character labels
model = nn.Sequential(
    nn.Linear(48, 96, bias=True),
    nn.Linear(48, 24, bias=True),
    nn.ReLU(),
    nn.Linear(96, 48, bias=True),
    nn.Linear(24, 24, bias=True),
    nn.ReLU(),
    nn.Linear(48, 1, bias=True),
    nn.Linear(24, 1, bias=True),
    nn.Sigmoid())
criterion = nn.BCELoss()
#optimizer = optim.SGD(model.parameters(), lr=0.00001, momentum=0.9)
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss(reduction='sum')
optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
#optimizer = optim.Adam(model.parameters())
minibatch_size = 200
def count_polish_diacritics(x):
    x_counts = []
    for i, word in x.iteritems():
@@ -101,6 +102,9 @@ def wordToOneHot(df_col, ch_labels):
    return out_df
class TrainDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
@@ -174,10 +178,60 @@ testA_x_temp2 = torch.cat([testA_x_temp.transpose(1,0), testA_x3], 1)
testA_x = torch.cat([testA_x_temp2, testA_x_words_onehot], 1)
"""
def pred_save_dev():
    dev_y = model(dev_x)
    file = open("dev-0/out.tsv", "w")
    file2 = open("dev-0/out_float.tsv", "w")
    for i in range(0, 11026):
        file2.write(str(dev_y[i].data.item()) + "\n")
        var = dev_y[i].data.item()
        if var > threshold:
            file.write(f'{1}\n')
        else:
            file.write(f'{0}\n')
    file.close()
    file2.close()
"""
def pred_save(name, data_train_x, f_threshold):
    pred_y = model(data_train_x)
    file = open(name + "/out.tsv", "w")
    file2 = open(name + "/out_float.tsv", "w")
    for i in range(0, len(data_train_x)):
        file2.write(str(pred_y[i].data.item()) + "\n")
        var = pred_y[i].data.item()
        if var > f_threshold:
            file.write(f'{1}\n')
        else:
            file.write(f'{0}\n')
    file.close()
    file2.close()
def optim_threshold(min_thr, step = 0.01):
    best_thr = min_thr
    best_geval = 0.1
    while min_thr < 1:
        pred_save("dev-0", dev_x, min_thr)
        metric = float(subprocess.check_output(["/home/students/s452101/TAU/geval/geval", "-t", "dev-0"]))
        if float(metric) > best_geval:
            best_geval = float(metric)
            best_thr = min_thr
        min_thr += step
        print("optimTHR; geval metric: ", float(metric), "\tbest: ", best_geval, "\tthreshold: ", min_thr, "\tbest_thr: ", best_thr)
    return best_thr
dataset_train = TrainDataset(x, y)
trainloader = DataLoader(dataset=dataset_train, batch_size=minibatch_size, shuffle=True)
def train_loop(i = 20):
def train_loop(i = 500, best = 0.1, threshold = 0.25):
    for i in range(i):
        for xb, yb_expected in trainloader:
            optimizer.zero_grad()
@@ -188,55 +242,54 @@ def train_loop(i = 20):
            debug_xb = pandas.DataFrame(xb.numpy())
            debug_yb_expected = pandas.DataFrame(yb_expected.numpy())
            """
            debug_yp = pandas.DataFrame(yp.detach().numpy())
            #debug_yp = pandas.DataFrame(yp.detach().numpy())
            loss = criterion(torch.squeeze(yp), yb_expected)
            """
            dev_y_pred_float_tensor = model(dev_x)
            dev_y_pred_float_df = pandas.DataFrame(dev_y_pred_float_tensor.detach().numpy())
            auc_score = roc_auc_score(dev_y_test, dev_y_pred_float_df)
            print("auc:\t", auc_score, "\tloss:\t", loss.item())
            if ((auc_score > 0.90)):
                break
            """
            #metric = float(subprocess.check_output(["/home/students/s452101/TAU/geval/geval", "-t", "dev-0"]))
            loss.backward()
            optimizer.step()
            if ((auc_score > 0.90)):
                break
            #print(loss)
        pred_save("dev-0", dev_x, threshold)
        metric = float(subprocess.check_output(["/home/students/s452101/TAU/geval/geval", "-t", "dev-0"]))
        print("geval metric: ", float(metric), "\tbest: ", best, "\tLoss: ", loss.item(), "\tthr: ", threshold)
        if float(metric) > best:
            best_threshold = optim_threshold(float(threshold - 0.2))
            threshold = best_threshold
            metric = float(subprocess.check_output(["/home/students/s452101/TAU/geval/geval", "-t", "dev-0"]))
            best = float(metric)
            pred_save("dev-0/best", dev_x, threshold)
            pred_save("test-A/best", testA_x, threshold)
#4 200 ~7h
elapsed_time = timeit.timeit(train_loop, number=1)
print("Training time: ", elapsed_time, "seconds")
#elapsed_time = timeit.timeit(train_loop, number=1)
#print("Training time: ", elapsed_time, "seconds")
train_loop()
#saving results:
#dev0:
dev_y = model(dev_x)
file = open("dev-0/out.tsv", "w")
file2 = open("dev-0/out_float.tsv", "w")
dev_y_pred_float = []
for i in range(0, 11026):
    file2.write(str(dev_y[i].data.item()) + "\n")
    dev_y_pred_float.append(dev_y[i].data.item())
    var = dev_y[i].data.item()
    if var > 0.999:
        file.write(f'{1}\n')
    else:
        file.write(f'{0}\n')
file.close()
file2.close()
y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
dev_y_pred = pandas.DataFrame(pandas.read_csv('dev-0/out.tsv', encoding="utf-8", delimiter='\t', header=None))
score = f1_score(y_test, dev_y_pred)
print("f1_score_dev0 after training: ", score, "\nAcc: ", accuracy_score(dev_y_test, dev_y_pred),
      "\nroc_auc: ", roc_auc_score(dev_y_test, dev_y_pred_float))
print(dev_y_pred_float)
      "\nroc_auc: ", )
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#testA:
@@ -246,7 +299,7 @@ file2=open("test-A/out_float.tsv","w")
for i in range(0, 11061):
    file2.write(str(testA_y[i].data.item()) + "\n")
    if testA_y[i].data.item() > 0.999:
    if testA_y[i].data.item() > 0.25:
        file.write(f'{1}\n')
    else:
        file.write(f'{0}\n')
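
The updated train_loop and optim_threshold above tune the decision threshold by rewriting dev-0/out.tsv and shelling out to geval once per candidate. As a rough alternative, assuming the geval metric on this task tracks F1, the sweep can be done in-process over the float scores that pred_save already writes; best_threshold, its default paths, and the grid are illustrative assumptions, not what the commit runs.

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

def best_threshold(expected_path="dev-0/expected.tsv", float_path="dev-0/out_float.tsv"):
    # gold labels and raw sigmoid scores written by pred_save
    y_true = pd.read_csv(expected_path, sep='\t', header=None)[0].values
    y_score = pd.read_csv(float_path, sep='\t', header=None)[0].values
    # score a grid of candidate thresholds and keep the best (threshold, F1) pair
    grid = np.linspace(0.05, 0.95, 91)
    scored = [(thr, f1_score(y_true, (y_score > thr).astype(int))) for thr in grid]
    return max(scored, key=lambda pair: pair[1])

This trades one external geval call per candidate threshold for a few numpy operations per candidate.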

305
s2.py Normal file

@@ -0,0 +1,305 @@
import torch
import random
from torch import nn
from torch import optim
import pandas
import numpy as np
import re
import timeit
import subprocess
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from torch.utils.data import Dataset, DataLoader
import string

# 48 input features: 4 numeric + 6 domain one-hot + 38 character labels
model = nn.Sequential(
    nn.Linear(48, 40, bias=True),
    nn.ReLU(),
    nn.Linear(40, 24, bias=True),
    nn.ReLU(),
    nn.Linear(24, 1, bias=True),
    nn.Sigmoid())
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.00001, momentum=0.7)
#optimizer = optim.Adam(model.parameters())
minibatch_size = 200

def count_polish_diacritics(x):
    # rate of Polish diacritic characters per word
    x_counts = []
    for i, word in x.iteritems():
        c = len(re.findall(r'[ąćęłńóśźż]', str(word)))
        c2 = c / len(str(word))
        x_counts.append(c2)
    return x_counts

def count_vowels(x):
    out = []
    for index, row in x.iteritems():
        vowel_len = len(re.findall(r'[aąeęioóuy]', str(row)))
        word_len = len(str(row))
        out.append(vowel_len / word_len)  # rate, not count
    return out

def Normalize(data, d = None):
    if (d is None):
        d = data
    r = data - d.min()
    return r / (d.max() - d.min())

def f1_score(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    F1 = 2 * (precision * recall) / (precision + recall)
    return F1

# Transforms a df column with categorical values to one-hot format
def ToOneHot(df_col):
    out = []
    df_labels = pandas.unique(df_col)
    l_count = len(df_labels)
    for index, row in df_col.iteritems():
        blank_one_hot = np.full(l_count, 0)
        for i in range(0, l_count):
            if df_labels[i] == row:
                blank_one_hot[i] = 1
        out.append(blank_one_hot)
    out_df = pandas.DataFrame(out, columns=df_labels)
    return out_df, df_labels

def ToOneHot_preproces(df_col, df_labels):
    out = []
    l_count = len(df_labels)
    for index, row in df_col.iteritems():
        blank_one_hot = np.full(l_count, 0)
        for i in range(0, l_count):
            if df_labels[i] == row:
                blank_one_hot[i] = 1
        out.append(blank_one_hot)
    out_df = pandas.DataFrame(out, columns=df_labels)
    return out_df

def getAllchars(df_col):
    all = []
    for index, row in df_col.iteritems():
        all = all + list(row)
    return all

def wordToOneHot(df_col, ch_labels):
    out = []
    l_count = len(ch_labels)
    for index, row in df_col.iteritems():
        blank_one_hot = np.full(l_count, 0)
        for ch in list(str(row)):
            for i in range(0, l_count):
                if ch_labels[i] == ch:
                    blank_one_hot[i] = 1
        out.append(blank_one_hot)
    out_df = pandas.DataFrame(out, columns=ch_labels)
    return out_df

class TrainDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

#Load data:
#Train
train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)
char_labels = pandas.unique(getAllchars(train_data['Word']))
#print(char_labels)
#print(len(char_labels))  # 38 letters
#debug_fq = train_data['Frequency']
x1 = Normalize(torch.tensor(train_data['Frequency'], dtype=torch.float))
x2 = Normalize(torch.tensor(count_vowels(train_data['Word']), dtype=torch.float))
domain_onehot, domain_labels = ToOneHot(train_data['Domain'])
x3 = torch.tensor(domain_onehot.values, dtype=torch.float)
x4 = Normalize(torch.tensor(count_polish_diacritics(train_data['Word']), dtype=torch.float))
x5 = Normalize(torch.tensor(train_data['Word'].str.len(), dtype=torch.float))
df_words_onehot = wordToOneHot(train_data['Word'], char_labels)
x_words_onehot = torch.tensor(df_words_onehot.values, dtype=torch.float)
x_temp1 = torch.stack((x1, x2, x4, x5), 0)
x_temp2 = torch.cat([x_temp1.transpose(1,0), x3], 1)
x = torch.cat([x_temp2, x_words_onehot], 1)
l = list(["Freq", "Vowels", "pol_dia", "Len"]) + list(domain_labels) + list(char_labels)
print(l)
print(len(l))
#debug_x = pandas.DataFrame(x.numpy(), columns=l)
y = torch.tensor(train_data['Sane'], dtype=torch.float)

#dev0
dev_y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
dev_x1 = Normalize(torch.tensor(dev_data['Frequency'], dtype=torch.float), x1)
dev_x2 = Normalize(torch.tensor(count_vowels(dev_data['Word']), dtype=torch.float), x2)
dev_x3 = torch.tensor(ToOneHot_preproces(dev_data['Domain'], domain_labels).values, dtype=torch.float)
dev_x4 = Normalize(torch.tensor(count_polish_diacritics(dev_data['Word']), dtype=torch.float), x4)
dev_x5 = Normalize(torch.tensor(dev_data['Word'].str.len(), dtype=torch.float), x5)
dev_df_words_onehot = wordToOneHot(dev_data['Word'], char_labels)
dev_x_words_onehot = torch.tensor(dev_df_words_onehot.values, dtype=torch.float)
dev_x_temp = torch.stack((dev_x1, dev_x2, dev_x4, dev_x5), 0)
dev_x_temp2 = torch.cat([dev_x_temp.transpose(1,0), dev_x3], 1)
dev_x = torch.cat([dev_x_temp2, dev_x_words_onehot], 1)

#test-A
testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
testA_x1 = Normalize(torch.tensor(testA_data['Frequency'], dtype=torch.float), x1)
testA_x2 = Normalize(torch.tensor(count_vowels(testA_data['Word']), dtype=torch.float), x2)
testA_x3 = torch.tensor(ToOneHot_preproces(testA_data['Domain'], domain_labels).values, dtype=torch.float)
testA_x4 = Normalize(torch.tensor(count_polish_diacritics(testA_data['Word']), dtype=torch.float), x4)
testA_x5 = Normalize(torch.tensor(testA_data['Word'].str.len(), dtype=torch.float), x5)
testA_df_words_onehot = wordToOneHot(testA_data['Word'], char_labels)
testA_x_words_onehot = torch.tensor(testA_df_words_onehot.values, dtype=torch.float)
testA_x_temp = torch.stack((testA_x1, testA_x2, testA_x4, testA_x5), 0)
testA_x_temp2 = torch.cat([testA_x_temp.transpose(1,0), testA_x3], 1)
testA_x = torch.cat([testA_x_temp2, testA_x_words_onehot], 1)

threshold = 0.25
"""
def pred_save_dev():
    dev_y = model(dev_x)
    file = open("dev-0/out.tsv", "w")
    file2 = open("dev-0/out_float.tsv", "w")
    for i in range(0, 11026):
        file2.write(str(dev_y[i].data.item()) + "\n")
        var = dev_y[i].data.item()
        if var > threshold:
            file.write(f'{1}\n')
        else:
            file.write(f'{0}\n')
    file.close()
    file2.close()
"""

def pred_save(name, data_train_x, f_threshold):
    pred_y = model(data_train_x)
    file = open(name + "/out.tsv", "w")
    file2 = open(name + "/out_float.tsv", "w")
    for i in range(0, len(data_train_x)):
        file2.write(str(pred_y[i].data.item()) + "\n")
        var = pred_y[i].data.item()
        if var > f_threshold:
            file.write(f'{1}\n')
        else:
            file.write(f'{0}\n')
    file.close()
    file2.close()

def optim_threshold(min_thr, step = 0.01):
    best_thr = min_thr
    best = 0.1
    while min_thr < 1:
        pred_save("dev-0", dev_x, min_thr)
        metric = float(subprocess.check_output(["/home/students/s452101/TAU/geval/geval", "-t", "dev-0"]))
        print("optimTHR; geval metric: ", float(metric), "\tbest: ", best, "\tthreshold: ", min_thr)
        if float(metric) > best:
            best = float(metric)
            best_thr = min_thr
        min_thr += step
    return best_thr

dataset_train = TrainDataset(x, y)
trainloader = DataLoader(dataset=dataset_train, batch_size=minibatch_size, shuffle=True)

def train_loop(i = 50, best = 0.01, threshold = 0.5):
    for i in range(i):
        for xb, yb_expected in trainloader:
            optimizer.zero_grad()
            yp = model(xb)
            # debug
            """
            debug_xb = pandas.DataFrame(xb.numpy())
            debug_yb_expected = pandas.DataFrame(yb_expected.numpy())
            """
            #debug_yp = pandas.DataFrame(yp.detach().numpy())
            loss = criterion(torch.squeeze(yp), yb_expected)
            """
            dev_y_pred_float_tensor = model(dev_x)
            dev_y_pred_float_df = pandas.DataFrame(dev_y_pred_float_tensor.detach().numpy())
            auc_score = roc_auc_score(dev_y_test, dev_y_pred_float_df)
            print("auc:\t", auc_score, "\tloss:\t", loss.item())
            if ((auc_score > 0.90)):
                break
            """
            #metric = float(subprocess.check_output(["/home/students/s452101/TAU/geval/geval", "-t", "dev-0"]))
            loss.backward()
            optimizer.step()
        pred_save("dev-0", dev_x, threshold)
        metric = float(subprocess.check_output(["/home/students/s452101/TAU/geval/geval", "-t", "dev-0"]))
        print("geval metric: ", float(metric), "\tbest: ", best, "\tLoss: ", loss.item())
        if float(metric) > best:
            best_threshold = optim_threshold(float(torch.min(yp)))
            threshold = best_threshold
            best = float(metric)
            pred_save("dev-0/best", dev_x, threshold)
            pred_save("test-A/best", testA_x, threshold)

#4 200 ~7h
#elapsed_time = timeit.timeit(train_loop, number=1)
#print("Training time: ", elapsed_time, "seconds")
train_loop()

#saving results:
#dev0:
y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
dev_y_pred = pandas.DataFrame(pandas.read_csv('dev-0/out.tsv', encoding="utf-8", delimiter='\t', header=None))
score = f1_score(y_test, dev_y_pred)
print("f1_score_dev0 after training: ", score, "\nAcc: ", accuracy_score(dev_y_test, dev_y_pred),
      "\nroc_auc: ", )
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#testA:
testA_y = model(testA_x)
file = open("test-A/out.tsv", "w")
file2 = open("test-A/out_float.tsv", "w")
for i in range(0, 11061):
    file2.write(str(testA_y[i].data.item()) + "\n")
    if testA_y[i].data.item() > threshold:
        file.write(f'{1}\n')
    else:
        file.write(f'{0}\n')
file.close()
file2.close()
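
pred_save in both s.py and s2.py writes predictions one row at a time. Below is a vectorized sketch of the same two output files, assuming the model, feature tensors, and directory layout defined above; pred_save_vec is a hypothetical name and the fixed-precision float format only approximates the original str() output.

import numpy as np
import torch

def pred_save_vec(name, data_x, threshold):
    # one forward pass instead of a per-row Python loop
    with torch.no_grad():
        scores = model(data_x).squeeze(1).numpy()
    # same layout as pred_save: raw scores and thresholded 0/1 labels
    np.savetxt(name + "/out_float.tsv", scores, fmt="%.6f")
    np.savetxt(name + "/out.tsv", (scores > threshold).astype(int), fmt="%d")

For example, pred_save_vec("dev-0", dev_x, threshold) mirrors pred_save("dev-0", dev_x, threshold).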

11061
test-A/best/out.tsv Normal file

File diff suppressed because it is too large

11061
test-A/best/out_float.tsv Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large