Sane words, early stopping

2019-11-29 14:28:11 +01:00 · 2019-11-29 14:28:11 +01:00 · f0970031a0
commit f0970031a0
parent 6487254f7d
6 changed files with 40289 additions and 40137 deletions
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@ -3,10 +3,11 @@
  <component name="ChangeListManager">
    <list default="true" id="d25a65da-2ba0-4272-a0a5-c59cbecb6088" name="Default Changelist" comment="">
      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/dev-0/out.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/dev-0/out.tsv" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/dev-0/out.tsv" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/dev-0/out_float.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/dev-0/out_float.tsv" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/dev-0/out_float.tsv" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/test-A/out.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/test-A/out.tsv" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/solution2.py" beforeDir="false" afterPath="$PROJECT_DIR$/solution2.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/test-A/out_float.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/test-A/out_float.tsv" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/test-A/out.tsv" beforeDir="false" />
      <change beforePath="$PROJECT_DIR$/test-A/out_float.tsv" beforeDir="false" />
    </list>
    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
    <option name="SHOW_DIALOG" value="false" />
@ -27,7 +28,7 @@
  <component name="ProjectId" id="1UAXhosCPbReL7U2TCbyyTVGpqs" />
  <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
  <component name="PropertiesComponent">
-    <property name="last_opened_file_path" value="$PROJECT_DIR$" />
+    <property name="last_opened_file_path" value="$PROJECT_DIR$/../../Systemy_informatyczne/merged_master/BestNotes" />
    <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
  </component>
  <component name="RunDashboard">
@ -43,6 +44,48 @@
    </option>
  </component>
  <component name="RunManager" selected="Python.solution2">
    <configuration name="S" type="PythonConfigurationType" factoryName="Python" temporary="true">
      <module name="TAU_21_sane_words" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/S.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="s2" type="PythonConfigurationType" factoryName="Python" temporary="true">
      <module name="TAU_21_sane_words" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/s2.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="solution" type="PythonConfigurationType" factoryName="Python" temporary="true">
      <module name="TAU_21_sane_words" />
      <option name="INTERPRETER_OPTIONS" value="" />
@ -88,6 +131,8 @@
    <recent_temporary>
      <list>
        <item itemvalue="Python.solution2" />
        <item itemvalue="Python.s2" />
        <item itemvalue="Python.S" />
        <item itemvalue="Python.solution" />
      </list>
    </recent_temporary>
@ -110,16 +155,28 @@
      <map>
        <entry key="MAIN">
          <value>
-            <State />
+            <State>
              <option name="COLUMN_ORDER" />
            </State>
          </value>
        </entry>
      </map>
    </option>
  </component>
  <component name="XDebuggerManager">
    <breakpoint-manager>
      <default-breakpoints>
        <breakpoint type="python-exception">
          <properties notifyOnTerminate="true" exception="BaseException">
            <option name="notifyOnTerminate" value="true" />
          </properties>
        </breakpoint>
      </default-breakpoints>
    </breakpoint-manager>
    <watches-manager>
      <configuration name="PythonConfigurationType">
        <watch expression="dev_y" />
        <watch expression="var" />
      </configuration>
    </watches-manager>
  </component>
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/dev-0/out_float.tsv
+++ b/dev-0/out_float.tsv
--- a/solution2.py
+++ b/solution2.py
@ -2,135 +2,160 @@ import torch
 import pandas
 import re
 import numpy as np
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import precision_score, recall_score, accuracy_score
-learning_rate = torch.tensor(0.00001, dtype=torch.float)
+learning_rate = torch.tensor(0.00005, dtype=torch.float)
 def f1_score(y_true, y_pred):
    # Micro-averaged F1: the harmonic mean of the micro-averaged precision and
    # recall returned by scikit-learn's precision_score / recall_score.
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    # NOTE(review): the denominator is 0 when precision and recall are both 0;
    # confirm how callers expect that case to be reported.
    F1 = 2 * (precision * recall) / (precision + recall)
    return F1
-W = torch.rand([4,16],dtype=torch.float, requires_grad=True)
+W1 = torch.rand([5,16],dtype=torch.float, requires_grad=True)
-b = torch.rand(16,dtype=torch.float, requires_grad=True)
+b1 = torch.rand(16,dtype=torch.float, requires_grad=True)
-U = torch.rand(16,dtype=torch.float, requires_grad=True)
+W2 = torch.rand(16,dtype=torch.float, requires_grad=True)
-c = torch.rand(1,dtype=torch.float, requires_grad=True)
+b2 = torch.rand(1,dtype=torch.float, requires_grad=True)
 def count_polish_diacritics(x):
    # Builds one value per entry of x (iterated with .iteritems()): the number
    # of characters matching [ąćęłńóśźż] in str(word).
    # In this diff the removed ('-') line appended the raw count, while the
    # added ('+') line divides the count by len(str(word)) and that ratio is
    # what gets appended.
    # NOTE(review): the character class is lowercase only, and the division
    # fails for an empty string - confirm neither occurs in the input data.
    x_counts = []
    for i, word in x.iteritems():
        c = len(re.findall(r'[ąćęłńóśźż]', str(word)))
-        x_counts.append(c)
+        c2 = c / len(str(word))
        x_counts.append(c2)
    return x_counts
-
+def count_vowels(x):
    # Returns a list with one value per entry of x: the share of characters in
    # str(row) that match the vowel class [aąeęioóuy].
    # NOTE(review): Series.iteritems() was removed in pandas 2.0 (Series.items()
    # is the replacement) - confirm the pandas version this script targets.
    # NOTE(review): word_len is 0 for an empty string, which would make the
    # division below fail - confirm the input never contains one.
    out = []
    for index,row in x.iteritems():
        vowel_len = len(re.findall(r'[aąeęioóuy]', str(row)))
        word_len = len(str(row))
        out.append(vowel_len / word_len) #RATE
    return out
 def Normalize(data, d = None):
    # Min-max scaling of `data` using the minimum and maximum of `d`.
    # When `d` is omitted, the statistics are taken from `data` itself; passing
    # a separate `d` lets another data set be scaled with the same reference
    # range (values outside that range then fall outside [0, 1]).
    if (d is None):
        d = data
    r = data - d.min()
    # NOTE(review): the denominator is 0 when every value in `d` is equal.
    return r/(d.max() - d.min())
 def model(data_x):
    # Forward pass of a two-layer network using the module-level parameters
    # W1, b1 (hidden layer, ReLU) and W2, b2 (output, sigmoid).
    # data_x is transposed before the first matmul, so it is expected with the
    # feature axis first, matching the torch.stack(..., 0) inputs built in this
    # script and the [5, 16] shape of W1.
    h1=torch.relu(data_x.transpose(1,0) @ W1 + b1)
    m_y = torch.sigmoid(h1 @ W2 + b2)
    return m_y
 train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)
 x1 = Normalize(torch.tensor(train_data['Frequency'], dtype=torch.float))
-x2 = Normalize(torch.tensor(train_data['Word'].str.len(), dtype=torch.float))
+x2 = Normalize(torch.tensor(count_vowels(train_data['Word']), dtype=torch.float))
-le = LabelEncoder()
+x3 = torch.tensor(train_data['Domain'].astype('category').cat.codes, dtype=torch.float)
-le.fit(train_data['Domain'])
+
 encoded_domain_col= le.transform(train_data['Domain'])
 x3 = torch.tensor(encoded_domain_col, dtype=torch.float)
 x4 = Normalize(torch.tensor(count_polish_diacritics(train_data['Word']),dtype=torch.float))
-x = torch.stack((x1,x2,x3,x4),0)
+x5 = Normalize(torch.tensor(train_data['Word'].str.len(), dtype=torch.float))
 x = torch.stack((x1,x2,x3,x4, x5),0)
 y = torch.tensor(train_data['Sane'], dtype=torch.float)
 count=1
 for index, row in train_data['Sane'].iteritems():
    if row > 0:
        count += 1
 print(count)
 print(y)
 print("Training...")
 # Full-batch gradient descent on the summed squared error between model(x) and y.
 criterion = torch.nn.MSELoss(reduction='sum')
 # At most 80 rounds of 1000 update steps each.
 for i in range(80):
    for j in range(1000):
        y_predicted = model(x)
        cost = criterion(y_predicted, y)
        cost.backward()
        #print(str(i), " ; ", cost)
        # Once the loss drops below 40000, continue with the smaller step size.
        if (cost.item() < 40000):
            learning_rate = torch.tensor(0.00001, dtype=torch.float)
        #if (cost.item() < 1614):
        #    learning_rate = torch.tensor(0.000001, dtype=torch.float)
        with torch.no_grad():
            # Each assignment rebinds the name to a new tensor that is created
            # without autograd tracking.
            W1 = W1 - learning_rate * W1.grad
            b1 = b1 - learning_rate * b1.grad
            W2 = W2 - learning_rate * W2.grad
            b2 = b2 - learning_rate * b2.grad
        # Turn gradient tracking back on for the freshly created parameters.
        W1.requires_grad_(True)
        b1.requires_grad_(True)
        W2.requires_grad_(True)
        b2.requires_grad_(True)
    # Early stopping: evaluated once per round, on the cost of the round's
    # last step, so training can overshoot the threshold by up to 999 steps.
    if (cost.item() < 1700):
        break
    #print("Dev0 pred...")
    # dev
 print("Dev0 pred...")
 #dev data:
 dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
 dev_x1 = Normalize(torch.tensor(dev_data['Frequency'], dtype=torch.float), x1)
-dev_x2 = Normalize(torch.tensor(dev_data['Word'].str.len(), dtype=torch.float), x2)
+dev_x2 = Normalize(torch.tensor(count_vowels(dev_data['Word']), dtype=torch.float), x2)
-dev_encoded_domain_col = le.transform(dev_data['Domain'])
+dev_x3 = Normalize(torch.tensor(dev_data['Domain'].astype('category').cat.codes, dtype=torch.float), x3)
 dev_x3 = torch.tensor(dev_encoded_domain_col, dtype=torch.float)
 dev_x4 = Normalize(torch.tensor(count_polish_diacritics(dev_data['Word']), dtype=torch.float), x4)
-dev_x = torch.stack((dev_x1, dev_x2, dev_x3, dev_x4), 0)
+dev_x5 = Normalize(torch.tensor(dev_data['Word'].str.len(), dtype=torch.float), x5)
 dev_x = torch.stack((dev_x1, dev_x2, dev_x3, dev_x4, dev_x5), 0)
 dev_y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
-print("Training...")
+dev_y = model(dev_x)
 #dev_y_pred = np.where(dev_y > 0.5, 1, 0)
 #np.savetxt(f'./dev-0/out.tsv', dev_y_pred, '%d')
-for _ in range(500):
+file=open("dev-0/out.tsv","w")
-    W.requires_grad_(True)
+file2=open("dev-0/out_float.tsv","w")
    b.requires_grad_(True)
    c.requires_grad_(True)
    U.requires_grad_(True)
    for _ in range(1000):
        h = torch.sigmoid(x.transpose(1, 0) @ W + b)
        y_predicted = torch.sigmoid(h @ U + c)
        cost = torch.sum((y_predicted - y) ** 2)
        cost.backward()
        with torch.no_grad():
            W = W - learning_rate * W.grad
            b = b - learning_rate * b.grad
            c = c - learning_rate * c.grad
            U = U - learning_rate * U.grad
            W.requires_grad_(True)
            b.requires_grad_(True)
            c.requires_grad_(True)
            U.requires_grad_(True)
    W.requires_grad_(False)
    b.requires_grad_(False)
    c.requires_grad_(False)
    U.requires_grad_(False)
    print("Dev0 pred...")
    # dev
    dev_h = torch.sigmoid(dev_x.transpose(1, 0) @ W + b)
    dev_y = torch.sigmoid(dev_h @ U + c)
    dev_y = dev_y.numpy()
    dev_y_pred = np.where(dev_y > 0.5, 1, 0)
    score = f1_score(dev_y_test, dev_y_pred)
    print("f1_score_dev0 within training: ", score, "\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))
-W.requires_grad_(False)
+for i in range(0,11026):
-b.requires_grad_(False)
+    file2.write(str(dev_y[i].data.item()) + "\n")
-c.requires_grad_(False)
+    var = dev_y[i].data.item()
-U.requires_grad_(False)
+    if var < 0.5:
-
+        file.write("0" + "\n")
-print("Dev0 pred...")
+    else:
-#dev
+        file.write("1" + "\n")
 file.close()
 file2.close()
 dev_h = torch.sigmoid(dev_x.transpose(1, 0) @ W + b)
 dev_y = torch.sigmoid(dev_h @ U + c)
 dev_y = dev_y.numpy()
 dev_y_pred = np.where(dev_y > 0.5, 1, 0)
 #np.savetxt(f'./dev-0/out_float.tsv', dev_y, '%.f')
 with open('dev-0/out.tsv', 'w') as output_file:
    for out in dev_y_pred:
        print('%s' % out, file=output_file)
 with open('dev-0/out_float.tsv', 'w') as output_file:
    for out in dev_y:
        print('%s' % out, file=output_file)
 y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
 dev_y_pred = pandas.DataFrame(pandas.read_csv('dev-0/out.tsv', encoding="utf-8", delimiter='\t', header=None))
 score = f1_score(y_test, dev_y_pred)
 print("f1_score_dev0 after training: ", score,"\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))
 print("TestA pred...")
 #test-A
-testA_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
+testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
 testA_x1 = Normalize(torch.tensor(testA_data['Frequency'], dtype=torch.float), x1)
-testA_x2 = Normalize(torch.tensor(testA_data['Word'].str.len(), dtype=torch.float), x2)
+testA_x2 = Normalize(torch.tensor(count_vowels(testA_data['Word']), dtype=torch.float), x2)
-
+testA_x3 = Normalize(torch.tensor(testA_data['Domain'].astype('category').cat.codes, dtype=torch.float), x3)
 testA_encoded_domain_col= le.transform(testA_data['Domain'])
 testA_x3 = torch.tensor(testA_encoded_domain_col, dtype=torch.float)
 testA_x4 = Normalize(torch.tensor(count_polish_diacritics(testA_data['Word']),dtype=torch.float), x4)
-testA_x = torch.stack((testA_x1,testA_x2,testA_x3,testA_x4),0)
+testA_x5 = Normalize(torch.tensor(testA_data['Word'].str.len(), dtype=torch.float), x5)
-testA_h = torch.sigmoid(testA_x.transpose(1, 0) @ W + b)
+testA_x = torch.stack((testA_x1,testA_x2,testA_x3,testA_x4, testA_x5),0)
-testA_y = torch.sigmoid(testA_h @ U + c)
+
-testA_y = testA_y.numpy()
+testA_y = model(testA_x)
-testA_y_pred = np.where(testA_y > 0.5, 1, 0)
+
-np.savetxt(f'./test-A/out_float.tsv', testA_y)
+#np.savetxt(f'./test-A/out.tsv', testA_y_pred, '%d')
-with open('test-A/out.tsv', 'w') as output_file:
+
-    for out in testA_y_pred:
+
-        print('%s' % out, file=output_file)
+file=open("test-A/out.tsv","w")
-with open('test-A/out_float.tsv', 'w') as output_file:
+file2=open("test-A/out_float.tsv","w")
-    for out in testA_y:
+
-        print('%s' % out, file=output_file)
+for i in range(0,11061):
    file2.write(str(testA_y[i].data.item()) + "\n")
    if testA_y[i].data.item() < 0.5:
        file.write("0" + "\n")
    else:
        file.write("1" + "\n")
 file.close()
 file2.close()
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/test-A/out_float.tsv
+++ b/test-A/out_float.tsv