init

2019-11-27 03:15:05 +01:00 · 2019-11-27 03:15:05 +01:00 · 8836e1e6e0
commit 8836e1e6e0
16 changed files with 121855 additions and 0 deletions
--- a/.idea/TAU_21_sane_words.iml
+++ b/.idea/TAU_21_sane_words.iml
@ -0,0 +1,11 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.7" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
  </component>
 </module>
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@ -0,0 +1,6 @@
 <component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
 </component>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -0,0 +1,4 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
 </project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -0,0 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/TAU_21_sane_words.iml" filepath="$PROJECT_DIR$/.idea/TAU_21_sane_words.iml" />
    </modules>
  </component>
 </project>
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@ -0,0 +1,105 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ChangeListManager">
    <list default="true" id="d25a65da-2ba0-4272-a0a5-c59cbecb6088" name="Default Changelist" comment="" />
    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
    <option name="SHOW_DIALOG" value="false" />
    <option name="HIGHLIGHT_CONFLICTS" value="true" />
    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
    <option name="LAST_RESOLUTION" value="IGNORE" />
  </component>
  <component name="FileTemplateManagerImpl">
    <option name="RECENT_TEMPLATES">
      <list>
        <option value="Python Script" />
      </list>
    </option>
  </component>
  <component name="ProjectId" id="1UAXhosCPbReL7U2TCbyyTVGpqs" />
  <component name="PropertiesComponent">
    <property name="last_opened_file_path" value="$PROJECT_DIR$" />
    <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
  </component>
  <component name="RunDashboard">
    <option name="ruleStates">
      <list>
        <RuleState>
          <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
        </RuleState>
        <RuleState>
          <option name="name" value="StatusDashboardGroupingRule" />
        </RuleState>
      </list>
    </option>
  </component>
  <component name="RunManager" selected="Python.solution2">
    <configuration name="solution" type="PythonConfigurationType" factoryName="Python" temporary="true">
      <module name="TAU_21_sane_words" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/solution.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="solution2" type="PythonConfigurationType" factoryName="Python" temporary="true">
      <module name="TAU_21_sane_words" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/solution2.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <recent_temporary>
      <list>
        <item itemvalue="Python.solution2" />
        <item itemvalue="Python.solution" />
      </list>
    </recent_temporary>
  </component>
  <component name="SvnConfiguration">
    <configuration />
  </component>
  <component name="TaskManager">
    <task active="true" id="Default" summary="Default task">
      <changelist id="d25a65da-2ba0-4272-a0a5-c59cbecb6088" name="Default Changelist" comment="" />
      <created>1574800494334</created>
      <option name="number" value="Default" />
      <option name="presentableId" value="Default" />
      <updated>1574800494334</updated>
    </task>
    <servers />
  </component>
  <component name="XDebuggerManager">
    <watches-manager>
      <configuration name="PythonConfigurationType">
        <watch expression="dev_y" />
      </configuration>
    </watches-manager>
  </component>
 </project>
--- a/README.md
+++ b/README.md
@ -0,0 +1,23 @@
 Sane words challenge
 ======================
 Guess whether a given word is a correct Polish word in a given domain. Additionally, the reported frequency of the word in the source texts is provided.
 Each entry in training data set is of the form: __Sane (0 or 1), Domain, Word, Frequency__.
 Evaluation metric is F2-score.
 Directory structure
 -------------------
 * `README.md` — this file
 * `config.txt` — configuration file
 * `train/` — directory with training data
 * `train/train.tsv` — train set
 * `dev-0/` — directory with dev (test) data
 * `dev-0/in.tsv` — input data for the dev set
 * `dev-0/expected.tsv` — expected (reference) data for the dev set
 * `test-A/` — directory with test data
 * `test-A/in.tsv` — input data for the test set
 * `test-A/expected.tsv` — expected (reference) data for the test set
--- a/config.txt
+++ b/config.txt
@ -0,0 +1 @@
 --metric F2 --precision 4
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/dev-0/out_float.tsv
+++ b/dev-0/out_float.tsv
--- a/solution2.py
+++ b/solution2.py
@ -0,0 +1,136 @@
 import torch
 import pandas
 import re
 import numpy as np
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import precision_score, recall_score, accuracy_score
 # Step size applied to every parameter gradient in the training loop below.
 learning_rate = torch.tensor(1e-5, dtype=torch.float32)
 def f1_score(y_true, y_pred):
    """Return the F1 score computed from micro-averaged precision and recall.

    Precision and recall come from scikit-learn's ``precision_score`` and
    ``recall_score`` with ``average='micro'`` and are combined as their
    harmonic mean.

    Args:
        y_true: Reference labels, in any form the scikit-learn metrics accept.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        ``2 * P * R / (P + R)``, or ``0.0`` when ``P + R`` is zero.
    """
    # NOTE(review): with micro averaging on a single-label problem precision
    # and recall are expected to coincide with accuracy, and the README names
    # F2 as the evaluation metric - confirm this is the intended score.
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    denominator = precision + recall
    if denominator == 0:
        # No correct prediction at all: report 0.0 instead of the NaN that
        # 0/0 would produce.
        return 0.0
    return 2 * (precision * recall) / denominator
 # Network parameters, drawn by torch.rand: W and b feed the 16-unit hidden
 # layer from the 4 input features, U and c map the hidden layer to one output.
 W = torch.rand((4, 16), dtype=torch.float32, requires_grad=True)
 b = torch.rand((16,), dtype=torch.float32, requires_grad=True)
 U = torch.rand((16,), dtype=torch.float32, requires_grad=True)
 c = torch.rand((1,), dtype=torch.float32, requires_grad=True)
 # Polish letters that carry a diacritic mark, in both cases. Compiled once so
 # the pattern is not looked up again for every word.
 _POLISH_DIACRITICS = re.compile(r'[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]')
 def count_polish_diacritics(x):
    """Count Polish diacritic letters in every element of ``x``.

    Args:
        x: Iterable of words, e.g. a pandas Series or a list. Each element is
            converted with ``str()`` first, so non-string entries are counted
            on their string form.

    Returns:
        A list with one integer per element of ``x``, in iteration order.
    """
    # Iterate the values directly: Series.iteritems() no longer exists in
    # pandas 2.x, and plain iteration also accepts lists and tuples.
    return [len(_POLISH_DIACRITICS.findall(str(word))) for word in x]
 def Normalize(data, d = None):
    """Min-max scale ``data`` using the minimum and maximum of ``d``.

    Args:
        data: Values to scale; must support subtraction and division by the
            results of ``d.min()`` / ``d.max()``.
        d: Reference values providing the minimum and maximum. Defaults to
            ``data`` itself. ``d.min()`` and ``d.max()`` are expected to be
            scalars, as they are for the 1-D tensors this script passes in.

    Returns:
        ``(data - d.min()) / (d.max() - d.min())``. When the reference is
        constant (maximum equals minimum) the values are only shifted by the
        minimum, so a constant input maps to zeros rather than NaN.
    """
    if d is None:
        d = data
    low = d.min()
    span = d.max() - low
    shifted = data - low
    if span == 0:
        # A constant reference has no range to divide by; skip the division
        # instead of producing 0/0.
        return shifted
    return shifted / span
 # Training set: one row per word with its label, domain and frequency.
 train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)
 # Keep the unscaled training features: they are the reference whose min/max
 # must be reused when scaling the dev features further down.
 train_frequency = torch.tensor(train_data['Frequency'], dtype=torch.float)
 train_word_length = torch.tensor(train_data['Word'].str.len(), dtype=torch.float)
 train_diacritics = torch.tensor(count_polish_diacritics(train_data['Word']), dtype=torch.float)
 x1 = Normalize(train_frequency)
 x2 = Normalize(train_word_length)
 # The domain is label-encoded and used as-is (not min-max scaled).
 le = LabelEncoder()
 le.fit(train_data['Domain'])
 encoded_domain_col = le.transform(train_data['Domain'])
 x3 = torch.tensor(encoded_domain_col, dtype=torch.float)
 x4 = Normalize(train_diacritics)
 # Feature matrix with one row per feature; it is transposed before use.
 x = torch.stack((x1, x2, x3, x4), 0)
 y = torch.tensor(train_data['Sane'], dtype=torch.float)
 # Dev set, built with the same four features.
 dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
 # Scale against the raw training tensors. Passing x1/x2/x4 here would be a
 # no-op, because those already span [0, 1], and would leave the dev features
 # on a different scale than the training features.
 dev_x1 = Normalize(torch.tensor(dev_data['Frequency'], dtype=torch.float), train_frequency)
 dev_x2 = Normalize(torch.tensor(dev_data['Word'].str.len(), dtype=torch.float), train_word_length)
 dev_encoded_domain_col = le.transform(dev_data['Domain'])
 dev_x3 = torch.tensor(dev_encoded_domain_col, dtype=torch.float)
 dev_x4 = Normalize(torch.tensor(count_polish_diacritics(dev_data['Word']), dtype=torch.float), train_diacritics)
 dev_x = torch.stack((dev_x1, dev_x2, dev_x3, dev_x4), 0)
 # Reference labels for the dev set, used to score predictions.
 dev_y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
 print("Training...")
 # All trainable tensors, so they can be toggled and updated in one place.
 params = (W, b, c, U)
 for _epoch in range(500):
    for p in params:
        p.requires_grad_(True)
    for _step in range(1000):
        # Forward pass over the full training set: sigmoid hidden layer
        # followed by a sigmoid output unit.
        h = torch.sigmoid(x.transpose(1, 0) @ W + b)
        y_predicted = torch.sigmoid(h @ U + c)
        # Sum of squared errors against the training labels.
        cost = torch.sum((y_predicted - y) ** 2)
        cost.backward()
        with torch.no_grad():
            for p in params:
                # Plain gradient-descent step, applied in place; the gradient
                # is cleared so the next backward() starts from zero.
                p -= learning_rate * p.grad
                p.grad.zero_()
    # Gradient tracking is switched off so the dev predictions can be
    # converted to NumPy.
    for p in params:
        p.requires_grad_(False)
    print("Dev0 pred...")
    # Progress report on the dev set after each block of 1000 steps.
    dev_h = torch.sigmoid(dev_x.transpose(1, 0) @ W + b)
    dev_y = torch.sigmoid(dev_h @ U + c).numpy()
    dev_y_pred = np.where(dev_y > 0.5, 1, 0)
    score = f1_score(dev_y_test, dev_y_pred)
    print("f1_score_dev0 within training: ", score, "\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))
 # Make sure no parameter tracks gradients before predicting, so the outputs
 # can be converted to NumPy.
 for _param in (W, b, c, U):
    _param.requires_grad_(False)
 print("Dev0 pred...")
 # Final dev-set predictions: probabilities, then labels thresholded at 0.5.
 dev_h = torch.sigmoid(dev_x.transpose(1, 0) @ W + b)
 dev_y = torch.sigmoid(dev_h @ U + c).numpy()
 dev_y_pred = np.where(dev_y > 0.5, 1, 0)
 # One value per line: hard labels first, then the raw probabilities.
 with open('dev-0/out.tsv', 'w') as output_file:
    for out in dev_y_pred:
        print(out, file=output_file)
 with open('dev-0/out_float.tsv', 'w') as output_file:
    for out in dev_y:
        print(out, file=output_file)
 # Score the final predictions against the dev reference labels.
 y_test = pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None)
 score = f1_score(y_test, dev_y_pred)
 print("f1_score_dev0 after training: ", score, "\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))
 print("TestA pred...")
 # test-A: read the test input itself (the dev input was read here before).
 testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
 # Unscaled training features, rebuilt from train_data, supply the min/max for
 # scaling. The already-normalised x1/x2/x4 span [0, 1], so using them as the
 # reference would leave the test features unscaled.
 testA_ref_frequency = torch.tensor(train_data['Frequency'], dtype=torch.float)
 testA_ref_word_length = torch.tensor(train_data['Word'].str.len(), dtype=torch.float)
 testA_ref_diacritics = torch.tensor(count_polish_diacritics(train_data['Word']), dtype=torch.float)
 testA_x1 = Normalize(torch.tensor(testA_data['Frequency'], dtype=torch.float), testA_ref_frequency)
 testA_x2 = Normalize(torch.tensor(testA_data['Word'].str.len(), dtype=torch.float), testA_ref_word_length)
 testA_encoded_domain_col = le.transform(testA_data['Domain'])
 testA_x3 = torch.tensor(testA_encoded_domain_col, dtype=torch.float)
 testA_x4 = Normalize(torch.tensor(count_polish_diacritics(testA_data['Word']), dtype=torch.float), testA_ref_diacritics)
 testA_x = torch.stack((testA_x1, testA_x2, testA_x3, testA_x4), 0)
 # Same forward pass as for the dev set; labels are thresholded at 0.5.
 testA_h = torch.sigmoid(testA_x.transpose(1, 0) @ W + b)
 testA_y = torch.sigmoid(testA_h @ U + c)
 testA_y = testA_y.numpy()
 testA_y_pred = np.where(testA_y > 0.5, 1, 0)
 # One value per line: hard labels first, then the raw probabilities.
 with open('test-A/out.tsv', 'w') as output_file:
    for out in testA_y_pred:
        print(out, file=output_file)
 with open('test-A/out_float.tsv', 'w') as output_file:
    for out in testA_y:
        print(out, file=output_file)
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/test-A/out_float.tsv
+++ b/test-A/out_float.tsv
--- a/train/train.tsv
+++ b/train/train.tsv