TAU_22_sane_words_torch_nn
This commit is contained in:
commit
8b4b2a5232
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
solution2.py
|
||||
solution2_p.py
|
11
.idea/TAU_21_sane_words.iml
Normal file
11
.idea/TAU_21_sane_words.iml
Normal file
@ -0,0 +1,11 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.7" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="TestRunnerService">
|
||||
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
|
||||
</component>
|
||||
</module>
|
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
4
.idea/misc.xml
Normal file
4
.idea/misc.xml
Normal file
@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
|
||||
</project>
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/TAU_21_sane_words.iml" filepath="$PROJECT_DIR$/.idea/TAU_21_sane_words.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
6
.idea/vcs.xml
Normal file
6
.idea/vcs.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
143
.idea/workspace.xml
Normal file
143
.idea/workspace.xml
Normal file
@ -0,0 +1,143 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="d25a65da-2ba0-4272-a0a5-c59cbecb6088" name="Default Changelist" comment="" />
|
||||
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
||||
<option name="SHOW_DIALOG" value="false" />
|
||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
||||
<option name="LAST_RESOLUTION" value="IGNORE" />
|
||||
</component>
|
||||
<component name="FileTemplateManagerImpl">
|
||||
<option name="RECENT_TEMPLATES">
|
||||
<list>
|
||||
<option value="Python Script" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="Git.Settings">
|
||||
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
||||
</component>
|
||||
<component name="ProjectId" id="1UAXhosCPbReL7U2TCbyyTVGpqs" />
|
||||
<component name="ProjectLevelVcsManager" settingsEditedManually="true" />
|
||||
<component name="PropertiesComponent">
|
||||
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
|
||||
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
|
||||
</component>
|
||||
<component name="RunDashboard">
|
||||
<option name="ruleStates">
|
||||
<list>
|
||||
<RuleState>
|
||||
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
|
||||
</RuleState>
|
||||
<RuleState>
|
||||
<option name="name" value="StatusDashboardGroupingRule" />
|
||||
</RuleState>
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="RunManager" selected="Python.s">
|
||||
<configuration name="s" type="PythonConfigurationType" factoryName="Python" temporary="true">
|
||||
<module name="TAU_21_sane_words" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/s.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="solution" type="PythonConfigurationType" factoryName="Python" temporary="true">
|
||||
<module name="TAU_21_sane_words" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/solution.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="solution2" type="PythonConfigurationType" factoryName="Python" temporary="true">
|
||||
<module name="TAU_21_sane_words" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/solution2.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<recent_temporary>
|
||||
<list>
|
||||
<item itemvalue="Python.s" />
|
||||
<item itemvalue="Python.solution2" />
|
||||
<item itemvalue="Python.solution" />
|
||||
</list>
|
||||
</recent_temporary>
|
||||
</component>
|
||||
<component name="SvnConfiguration">
|
||||
<configuration />
|
||||
</component>
|
||||
<component name="TaskManager">
|
||||
<task active="true" id="Default" summary="Default task">
|
||||
<changelist id="d25a65da-2ba0-4272-a0a5-c59cbecb6088" name="Default Changelist" comment="" />
|
||||
<created>1574800494334</created>
|
||||
<option name="number" value="Default" />
|
||||
<option name="presentableId" value="Default" />
|
||||
<updated>1574800494334</updated>
|
||||
</task>
|
||||
<servers />
|
||||
</component>
|
||||
<component name="Vcs.Log.Tabs.Properties">
|
||||
<option name="TAB_STATES">
|
||||
<map>
|
||||
<entry key="MAIN">
|
||||
<value>
|
||||
<State />
|
||||
</value>
|
||||
</entry>
|
||||
</map>
|
||||
</option>
|
||||
</component>
|
||||
<component name="XDebuggerManager">
|
||||
<watches-manager>
|
||||
<configuration name="PythonConfigurationType">
|
||||
<watch expression="dev_y" />
|
||||
<watch expression="debug_yp" />
|
||||
</configuration>
|
||||
</watches-manager>
|
||||
</component>
|
||||
</project>
|
23
README.md
Normal file
23
README.md
Normal file
@ -0,0 +1,23 @@
|
||||
|
||||
Sane words challenge
|
||||
======================
|
||||
|
||||
Guess whether a given word is a correct Polish word in a given domain. Additionally, you are given the reported frequency of the word in the source texts.
|
||||
|
||||
Each entry in training data set is of the form: __Sane (0 or 1), Domain, Word, Frequency__.
|
||||
Evaluation metric is F2-score.
|
||||
|
||||
|
||||
Directory structure
|
||||
-------------------
|
||||
|
||||
* `README.md` — this file
|
||||
* `config.txt` — configuration file
|
||||
* `train/` — directory with training data
|
||||
* `train/train.tsv` — train set
|
||||
* `dev-0/` — directory with dev (test) data
|
||||
* `dev-0/in.tsv` — input data for the dev set
|
||||
* `dev-0/expected.tsv` — expected (reference) data for the dev set
|
||||
* `test-A` — directory with test data
|
||||
* `test-A/in.tsv` — input data for the test set
|
||||
* `test-A/expected.tsv` — expected (reference) data for the test set
|
1
config.txt
Normal file
1
config.txt
Normal file
@ -0,0 +1 @@
|
||||
--metric F2 --precision 4
|
11026
dev-0/expected.tsv
Normal file
11026
dev-0/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11026
dev-0/in.tsv
Normal file
11026
dev-0/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11026
dev-0/out.tsv
Normal file
11026
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11026
dev-0/out_float.tsv
Normal file
11026
dev-0/out_float.tsv
Normal file
File diff suppressed because it is too large
Load Diff
192
s.py
Normal file
192
s.py
Normal file
@ -0,0 +1,192 @@
|
||||
import torch
|
||||
import random
|
||||
from torch import nn
|
||||
from torch import optim
|
||||
import pandas
|
||||
import numpy as np
|
||||
import re
|
||||
import timeit
|
||||
from sklearn.metrics import precision_score, recall_score, accuracy_score
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
# The network takes 10 inputs per word: 4 scalar features followed by the
# 6 columns of the one-hot encoded domain.
model = nn.Sequential(
    nn.Linear(in_features=10, out_features=16),
    nn.ReLU(),
    nn.Linear(in_features=16, out_features=1),
    nn.Sigmoid(),  # squashes the single output into (0, 1)
)
criterion = nn.MSELoss()
optimizer = optim.SGD(params=model.parameters(), lr=1e-6, momentum=0.8)
# Alternative optimizer that was tried:
# optimizer = optim.Adam(model.parameters())

# Number of samples per training minibatch.
minibatch_size = 5
|
||||
|
||||
|
||||
def count_polish_diacritics(x):
    """Return the share of Polish diacritic letters in every word of *x*.

    Args:
        x: a pandas Series (or any mapping exposing ``.items()``) of words.
            Each value is converted with ``str()`` before counting.

    Returns:
        A list of floats, one per entry of *x* in iteration order: the number
        of lowercase Polish diacritic letters divided by the word length.
        An empty string yields ``0.0``.
    """
    diacritics = re.compile(r'[ąćęłńóśźż]')
    x_counts = []
    # ``Series.iteritems`` was removed in pandas 2.0; ``items`` is the
    # equivalent that exists in every pandas version.
    for _, word in x.items():
        text = str(word)
        if not text:
            # Nothing to rate; avoid dividing by a zero length.
            x_counts.append(0.0)
            continue
        x_counts.append(len(diacritics.findall(text)) / len(text))
    return x_counts
|
||||
|
||||
def count_vowels(x):
|
||||
out = []
|
||||
for index,row in x.iteritems():
|
||||
vowel_len = len(re.findall(r'[aąeęioóuy]', str(row)))
|
||||
word_len = len(str(row))
|
||||
out.append(vowel_len / word_len) #RATE
|
||||
return out
|
||||
|
||||
def Normalize(data, d=None):
    """Min-max scale *data* using the minimum and maximum of a reference.

    Args:
        data: array-like supporting ``-``, ``/``, ``.min()`` and ``.max()``
            (e.g. a torch tensor or a numpy array).
        d: optional reference whose minimum and maximum define the scaling.
            When omitted, *data* is its own reference.

    Returns:
        ``(data - d.min()) / (d.max() - d.min())``. When the reference is
        constant (zero range) the divisor is treated as 1, so the result is
        ``data - d.min()`` instead of NaN/inf.
    """
    reference = data if d is None else d
    low = reference.min()
    span = reference.max() - low
    if span == 0:
        # A constant reference has no range to scale by; dividing by 1 keeps
        # the values finite and maps the reference itself to zeros.
        span = 1
    return (data - low) / span
|
||||
|
||||
def f1_score(y_true, y_pred, beta=1.0):
    """Return the F-beta score of the positive class (F1 by default).

    Args:
        y_true: reference 0/1 labels.
        y_pred: predicted 0/1 labels.
        beta: weight of recall relative to precision; ``1.0`` gives F1,
            ``2.0`` gives the F2 score.

    Returns:
        ``(1 + beta**2) * P * R / (beta**2 * P + R)`` as a float, or ``0.0``
        when both precision and recall are zero.
    """
    # 'micro' averaging on a single-label binary problem counts both classes,
    # which makes precision == recall == accuracy. 'binary' scores only the
    # positive class, which is what an F-score is meant to report here.
    precision = precision_score(y_true, y_pred, average='binary')
    recall = recall_score(y_true, y_pred, average='binary')
    beta_sq = beta * beta
    denominator = beta_sq * precision + recall
    if denominator == 0:
        # No true positives at all: define the score as 0 instead of 0/0.
        return 0.0
    return (1 + beta_sq) * (precision * recall) / denominator
|
||||
|
||||
def ToOneHot(df_col):
    """Transform a column of categorical values into one-hot format.

    Args:
        df_col: pandas Series of categorical values.

    Returns:
        A tuple ``(out_df, df_labels)``. ``df_labels`` is the result of
        ``pandas.unique(df_col)`` (labels in order of first appearance) and
        ``out_df`` is a DataFrame with one 0/1 integer column per label and
        one row per entry of *df_col*.
    """
    df_labels = pandas.unique(df_col)
    l_count = len(df_labels)
    out = []
    # ``Series.iteritems`` was removed in pandas 2.0; ``items`` is the
    # equivalent that exists in every pandas version.
    for _, value in df_col.items():
        encoded = np.zeros(l_count, dtype=int)
        for position in range(l_count):
            if df_labels[position] == value:
                encoded[position] = 1
        out.append(encoded)
    out_df = pandas.DataFrame(out, columns=df_labels)
    return out_df, df_labels
|
||||
|
||||
def ToOneHot_preproces(df_col, df_labels):
    """One-hot encode *df_col* against an already known list of labels.

    Args:
        df_col: pandas Series of categorical values.
        df_labels: indexable sequence of labels defining the output columns
            and their order (e.g. the labels returned by ``ToOneHot``).

    Returns:
        A DataFrame with one 0/1 integer column per label and one row per
        entry of *df_col*. A value that matches no label gives an all-zero
        row.
    """
    l_count = len(df_labels)
    out = []
    # ``Series.iteritems`` was removed in pandas 2.0; ``items`` is the
    # equivalent that exists in every pandas version.
    for _, value in df_col.items():
        encoded = np.zeros(l_count, dtype=int)
        for position in range(l_count):
            if df_labels[position] == value:
                encoded[position] = 1
        out.append(encoded)
    out_df = pandas.DataFrame(out, columns=df_labels)
    return out_df
|
||||
|
||||
class TrainDataset(Dataset):
    """Dataset that pairs each feature entry of ``X`` with its target in ``y``."""

    def __init__(self, X, y):
        # Both containers are stored as given; they are indexed in parallel.
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from the feature container."""
        sample_count = len(self.X)
        return sample_count

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position *idx*."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target
|
||||
|
||||
# Load data and build the feature matrices.
def _raw_features(data):
    """Return the four un-normalised scalar feature tensors for *data*.

    Order: frequency, vowel rate, diacritic rate, word length.
    """
    words = data['Word']
    return (
        torch.tensor(data['Frequency'].values, dtype=torch.float),
        torch.tensor(count_vowels(words), dtype=torch.float),
        torch.tensor(count_polish_diacritics(words), dtype=torch.float),
        torch.tensor(words.str.len().values, dtype=torch.float),
    )


def _evaluation_matrix(data, reference_columns, labels):
    """Build the model input for an evaluation set, scaled like the train set.

    *reference_columns* must be the RAW training columns. An already scaled
    column has min 0 and max 1, which would turn ``Normalize`` into a no-op
    and leave the evaluation features on a different scale than the training
    features.
    """
    scaled = [
        Normalize(column, reference)
        for column, reference in zip(_raw_features(data), reference_columns)
    ]
    onehot = torch.tensor(ToOneHot_preproces(data['Domain'], labels).values, dtype=torch.float)
    return torch.cat([torch.stack(scaled, 0).transpose(1, 0), onehot], 1)


# Train
train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)
train_raw = _raw_features(train_data)
x1, x2, x4, x5 = (Normalize(column) for column in train_raw)

domain_onehot, domain_labels = ToOneHot(train_data['Domain'])
x3 = torch.tensor(domain_onehot.values, dtype=torch.float)

# Scalar features become columns; the one-hot domain columns follow them.
x_temp = torch.stack((x1, x2, x4, x5), 0)
x = torch.cat([x_temp.transpose(1, 0), x3], 1)
y = torch.tensor(train_data['Sane'].values, dtype=torch.float)

# dev0
dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
dev_x = _evaluation_matrix(dev_data, train_raw, domain_labels)

# test-A
testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
testA_x = _evaluation_matrix(testA_data, train_raw, domain_labels)

dataset_train = TrainDataset(x, y)
# Use the configured minibatch size instead of repeating the literal.
trainloader = DataLoader(dataset=dataset_train, batch_size=minibatch_size)
|
||||
|
||||
def train_loop(i=4200):  # ~7h
    """Train the module-level ``model`` for *i* passes over ``trainloader``.

    Uses the module-level ``criterion`` and ``optimizer``. After every pass
    the loss of the last minibatch is printed.

    Args:
        i: number of passes (epochs) over the training loader.
    """
    for _ in range(i):
        # Each iteration takes one minibatch of samples from the loader.
        for xb, yb_expected in trainloader:
            yp = model(xb)
            # The model emits one column per sample while the loader yields
            # flat targets. Without matching the shapes, MSELoss broadcasts
            # the pair to (batch, batch) and optimises the wrong objective.
            loss = criterion(yp, yb_expected.reshape(yp.shape))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(loss)
|
||||
|
||||
#4 200
|
||||
# Run the whole training exactly once and report how long it took.
elapsed_time = timeit.Timer(train_loop).timeit(number=1)
print("Training time: ", elapsed_time, "seconds")
|
||||
|
||||
# Saving results.
def _write_predictions(predictions, labels_path, scores_path):
    """Write one raw score and one thresholded 0/1 label per prediction.

    Scores below 0.5 are written as ``0``, everything else as ``1``. The
    number of rows follows the number of predictions, and both files are
    closed even if writing fails.
    """
    with open(labels_path, "w") as labels_file, open(scores_path, "w") as scores_file:
        for prediction in predictions:
            score_value = prediction.item()
            scores_file.write(str(score_value) + "\n")
            labels_file.write(("0" if score_value < 0.5 else "1") + "\n")


# dev0:
dev_y_test = pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None)
with torch.no_grad():  # inference only, no gradients needed
    dev_y = model(dev_x)
_write_predictions(dev_y, "dev-0/out.tsv", "dev-0/out_float.tsv")

y_test = dev_y_test
dev_y_pred = pandas.read_csv('dev-0/out.tsv', encoding="utf-8", delimiter='\t', header=None)
score = f1_score(y_test, dev_y_pred)
print("f1_score_dev0 after training: ", score,"\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))

# testA:
with torch.no_grad():
    testA_y = model(testA_x)
_write_predictions(testA_y, "test-A/out.tsv", "test-A/out_float.tsv")
|
11061
test-A/in.tsv
Normal file
11061
test-A/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11061
test-A/out.tsv
Normal file
11061
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11061
test-A/out_float.tsv
Normal file
11061
test-A/out_float.tsv
Normal file
File diff suppressed because it is too large
Load Diff
44344
train/train.tsv
Normal file
44344
train/train.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user