This commit is contained in:
ksanu 2019-11-27 03:15:05 +01:00
commit 8836e1e6e0
16 changed files with 121855 additions and 0 deletions

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- IDE module descriptor: declares a Python module whose content root is the module directory and whose interpreter is the SDK named "Python 3.7". -->
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.7" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<!-- Test runner selected for this module. -->
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>

View File

@ -0,0 +1,6 @@
<!-- Inspection profile settings: USE_PROJECT_PROFILE is set to false, so no project-level inspection profile is selected here. -->
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Project-level SDK selection: the SDK named "Python 3.7" of type "Python SDK". -->
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Module registry: lists the single module file of this project, .idea/TAU_21_sane_words.iml. -->
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/TAU_21_sane_words.iml" filepath="$PROJECT_DIR$/.idea/TAU_21_sane_words.iml" />
</modules>
</component>
</project>

105
.idea/workspace.xml Normal file
View File

@ -0,0 +1,105 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- IDE workspace state: changelists, recent templates, run configurations, task list and debugger watches. -->
<!-- NOTE(review): this looks like per-user IDE state; confirm whether it is meant to be kept under version control. -->
<project version="4">
<component name="ChangeListManager">
<list default="true" id="d25a65da-2ba0-4272-a0a5-c59cbecb6088" name="Default Changelist" comment="" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="ProjectId" id="1UAXhosCPbReL7U2TCbyyTVGpqs" />
<component name="PropertiesComponent">
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<!-- Two temporary Python run configurations, one for solution.py and one for solution2.py; "Python.solution2" is the selected one. -->
<component name="RunManager" selected="Python.solution2">
<configuration name="solution" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="TAU_21_sane_words" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/solution.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="solution2" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="TAU_21_sane_words" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/solution2.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.solution2" />
<item itemvalue="Python.solution" />
</list>
</recent_temporary>
</component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="d25a65da-2ba0-4272-a0a5-c59cbecb6088" name="Default Changelist" comment="" />
<created>1574800494334</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1574800494334</updated>
</task>
<servers />
</component>
<!-- Debugger watch list for Python run configurations: a single watch on the expression "dev_y". -->
<component name="XDebuggerManager">
<watches-manager>
<configuration name="PythonConfigurationType">
<watch expression="dev_y" />
</configuration>
</watches-manager>
</component>
</project>

23
README.md Normal file
View File

@ -0,0 +1,23 @@
Sane words challenge
======================
Guess whether a given word is a correct Polish word in a given domain. Additionally, you are given the reported frequency of the word in the source texts.
Each entry in the training data set has the form: __Sane (0 or 1), Domain, Word, Frequency__.
The evaluation metric is the F2-score.
Directory structure
-------------------
* `README.md` — this file
* `config.txt` — configuration file
* `train/` — directory with training data
* `train/train.tsv` — train set
* `dev-0/` — directory with dev (test) data
* `dev-0/in.tsv` — input data for the dev set
* `dev-0/expected.tsv` — expected (reference) data for the dev set
* `test-A/` — directory with test data
* `test-A/in.tsv` — input data for the test set
* `test-A/expected.tsv` — expected (reference) data for the test set

1
config.txt Normal file
View File

@ -0,0 +1 @@
--metric F2 --precision 4

11026
dev-0/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

11026
dev-0/in.tsv Normal file

File diff suppressed because it is too large Load Diff

11026
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

11026
dev-0/out_float.tsv Normal file

File diff suppressed because it is too large Load Diff

136
solution2.py Normal file
View File

@ -0,0 +1,136 @@
import torch
import pandas
import re
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, accuracy_score
# Step size applied to every gradient-descent update of W, b, U and c.
learning_rate = torch.tensor(1e-5, dtype=torch.float32)
def f1_score(y_true, y_pred):
    """Return the micro-averaged F1 score of ``y_pred`` against ``y_true``.

    Precision and recall are both computed by scikit-learn with
    ``average='micro'`` and then combined as their harmonic mean.

    NOTE(review): the challenge README names F2 as the evaluation metric,
    while this helper reports F1 - confirm that this is intended.

    Args:
        y_true: Reference labels, in any form scikit-learn accepts.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        The F1 score, or ``0.0`` when precision and recall are both zero.
    """
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    denominator = precision + recall
    if denominator == 0:
        # Both terms are zero: report the worst score instead of dividing 0 by 0.
        return 0.0
    return 2 * (precision * recall) / denominator
# Trainable parameters of the two-layer network, each drawn with torch.rand.
# W: 4 x 16 weights applied to the 4 stacked input features.
W = torch.rand([4,16],dtype=torch.float, requires_grad=True)
# b: 16-element bias added to the hidden layer.
b = torch.rand(16,dtype=torch.float, requires_grad=True)
# U: 16-element weight vector that reduces the hidden layer to one output.
U = torch.rand(16,dtype=torch.float, requires_grad=True)
# c: single-element bias added to the output.
c = torch.rand(1,dtype=torch.float, requires_grad=True)
def count_polish_diacritics(x):
    """Count the lowercase Polish diacritic letters in every word of ``x``.

    Only the lowercase letters listed in the pattern are matched; uppercase
    variants are not counted.

    Args:
        x: Iterable of words, for example a pandas Series or a list. Every
            item is passed through ``str`` first, so non-string values
            (numbers, missing values) are accepted.

    Returns:
        A list holding one integer count per item of ``x``, in iteration
        order.
    """
    # Iterate over the values directly: ``Series.iteritems`` no longer exists
    # in pandas 2.x, and the index it yielded was never used.
    return [len(re.findall(r'[ąćęłńóśźż]', str(word))) for word in x]
def Normalize(data, d = None):
    """Min-max scale ``data`` with the minimum and maximum of ``d``.

    Args:
        data: Values to scale; must support ``-``, ``*`` and ``/`` with the
            results of ``d.min()`` and ``d.max()``.
        d: Reference values that supply the minimum and maximum. When
            omitted, ``data`` is scaled against its own range.

    Returns:
        ``(data - d.min()) / (d.max() - d.min())``. When the reference range
        is zero (all reference values equal) a zero-filled result of the same
        shape is returned instead of dividing by zero.
    """
    if d is None:
        d = data
    low = d.min()
    span = d.max() - low
    shifted = data - low
    if span == 0:
        # A constant reference carries no scale; avoid producing NaN/inf.
        return shifted * 0
    return shifted / span
# ---------------------------------------------------------------------------
# Data loading and feature construction
# ---------------------------------------------------------------------------
train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)

# The domain encoder is fitted on the training domains only and reused for
# every other data set.
le = LabelEncoder()
le.fit(train_data['Domain'])


def raw_features(frame):
    """Return the unscaled feature tensors of ``frame``.

    Args:
        frame: DataFrame with 'Frequency', 'Word' and 'Domain' columns.

    Returns:
        Tuple ``(frequency, word_length, encoded_domain, diacritic_count)``
        of float tensors, one value per row of ``frame``.
    """
    frequency = torch.tensor(frame['Frequency'], dtype=torch.float)
    word_length = torch.tensor(frame['Word'].str.len(), dtype=torch.float)
    domain = torch.tensor(le.transform(frame['Domain']), dtype=torch.float)
    diacritics = torch.tensor(count_polish_diacritics(frame['Word']), dtype=torch.float)
    return frequency, word_length, domain, diacritics


# Unscaled training features: the scaling reference for every data set.
train_frequency, train_length, _train_domain, train_diacritics = raw_features(train_data)


def design_matrix(frame):
    """Build the 4 x N input matrix for ``frame``.

    Frequency, word length and diacritic count are min-max scaled against
    the *unscaled* training features, so train, dev and test inputs share
    one scale. (Scaling against already-normalised training tensors, whose
    minimum is 0 and maximum is 1, would leave the other sets unscaled.)
    The encoded domain is used as is.
    """
    frequency, word_length, domain, diacritics = raw_features(frame)
    return torch.stack((
        Normalize(frequency, train_frequency),
        Normalize(word_length, train_length),
        domain,
        Normalize(diacritics, train_diacritics),
    ), 0)


def predict(features):
    """Run the network on a 4 x N feature matrix using the current W, b, U, c."""
    hidden = torch.sigmoid(features.transpose(1, 0) @ W + b)
    return torch.sigmoid(hidden @ U + c)


def set_trainable(flag):
    """Switch gradient tracking on or off for all four parameters."""
    for parameter in (W, b, c, U):
        parameter.requires_grad_(flag)


def write_lines(path, values):
    """Write every item of ``values`` on its own line of ``path``."""
    with open(path, 'w') as output_file:
        for out in values:
            print('%s' % out, file=output_file)


x = design_matrix(train_data)
y = torch.tensor(train_data['Sane'], dtype=torch.float)

#dev data:
dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
dev_x = design_matrix(dev_data)
dev_y_test = pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None)

# ---------------------------------------------------------------------------
# Training: 500 rounds of 1000 full-batch gradient-descent steps, with a dev
# evaluation after every round.
# ---------------------------------------------------------------------------
print("Training...")
for epoch in range(500):
    set_trainable(True)
    for step in range(1000):
        y_predicted = predict(x)
        cost = torch.sum((y_predicted - y) ** 2)
        cost.backward()
        with torch.no_grad():
            # Each update creates fresh tensors, so no gradient accumulates
            # from one step to the next.
            W = W - learning_rate * W.grad
            b = b - learning_rate * b.grad
            c = c - learning_rate * c.grad
            U = U - learning_rate * U.grad
        # The freshly created tensors do not track gradients yet.
        set_trainable(True)

    # Gradient tracking must be off before converting outputs with .numpy().
    set_trainable(False)
    print("Dev0 pred...")
    # dev
    dev_y = predict(dev_x).numpy()
    dev_y_pred = np.where(dev_y > 0.5, 1, 0)
    score = f1_score(dev_y_test, dev_y_pred)
    print("f1_score_dev0 within training: ", score, "\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))

# ---------------------------------------------------------------------------
# Final dev-0 predictions
# ---------------------------------------------------------------------------
set_trainable(False)
print("Dev0 pred...")
#dev
dev_y = predict(dev_x).numpy()
dev_y_pred = np.where(dev_y > 0.5, 1, 0)
write_lines('dev-0/out.tsv', dev_y_pred)
write_lines('dev-0/out_float.tsv', dev_y)
# The reference labels loaded before training are reused here.
score = f1_score(dev_y_test, dev_y_pred)
print("f1_score_dev0 after training: ", score,"\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))

# ---------------------------------------------------------------------------
# test-A predictions
# ---------------------------------------------------------------------------
print("TestA pred...")
#test-A
# Read the test-A input itself, so that the files written under test-A/
# hold predictions for the test set rather than for dev-0.
# NOTE(review): le.transform raises on a domain that is absent from the
# training data - confirm test-A only contains known domains.
testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
testA_x = design_matrix(testA_data)
testA_y = predict(testA_x).numpy()
testA_y_pred = np.where(testA_y > 0.5, 1, 0)
write_lines('test-A/out.tsv', testA_y_pred)
write_lines('test-A/out_float.tsv', testA_y)

11061
test-A/in.tsv Normal file

File diff suppressed because it is too large Load Diff

11026
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

11026
test-A/out_float.tsv Normal file

File diff suppressed because it is too large Load Diff

44344
train/train.tsv Normal file

File diff suppressed because it is too large Load Diff