Sane words, early stopping

2019-11-29 14:28:11 +01:00 · 2019-11-29 14:28:11 +01:00 · f0970031a0
commit f0970031a0
parent 6487254f7d
6 changed files with 40289 additions and 40137 deletions
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@ -3,10 +3,11 @@
  <component name="ChangeListManager">
    <list default="true" id="d25a65da-2ba0-4272-a0a5-c59cbecb6088" name="Default Changelist" comment="">
      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/dev-0/out.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/dev-0/out.tsv" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/dev-0/out_float.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/dev-0/out_float.tsv" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/test-A/out.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/test-A/out.tsv" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/test-A/out_float.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/test-A/out_float.tsv" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/dev-0/out.tsv" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/dev-0/out_float.tsv" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/solution2.py" beforeDir="false" afterPath="$PROJECT_DIR$/solution2.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/test-A/out.tsv" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/test-A/out_float.tsv" beforeDir="false" />
    </list>
    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
    <option name="SHOW_DIALOG" value="false" />
@ -27,7 +28,7 @@
  <component name="ProjectId" id="1UAXhosCPbReL7U2TCbyyTVGpqs" />
  <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
  <component name="PropertiesComponent">
-    <property name="last_opened_file_path" value="$PROJECT_DIR$" />
+    <property name="last_opened_file_path" value="$PROJECT_DIR$/../../Systemy_informatyczne/merged_master/BestNotes" />
    <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
  </component>
  <component name="RunDashboard">
@ -43,6 +44,48 @@
    </option>
  </component>
  <component name="RunManager" selected="Python.solution2">
+    <configuration name="S" type="PythonConfigurationType" factoryName="Python" temporary="true">
+      <module name="TAU_21_sane_words" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/S.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="s2" type="PythonConfigurationType" factoryName="Python" temporary="true">
+      <module name="TAU_21_sane_words" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/s2.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
    <configuration name="solution" type="PythonConfigurationType" factoryName="Python" temporary="true">
      <module name="TAU_21_sane_words" />
      <option name="INTERPRETER_OPTIONS" value="" />
@ -88,6 +131,8 @@
    <recent_temporary>
      <list>
        <item itemvalue="Python.solution2" />
+        <item itemvalue="Python.s2" />
+        <item itemvalue="Python.S" />
        <item itemvalue="Python.solution" />
      </list>
    </recent_temporary>
@ -110,16 +155,28 @@
      <map>
        <entry key="MAIN">
          <value>
-            <State />
+            <State>
+              <option name="COLUMN_ORDER" />
+            </State>
          </value>
        </entry>
      </map>
    </option>
  </component>
  <component name="XDebuggerManager">
+    <breakpoint-manager>
+      <default-breakpoints>
+        <breakpoint type="python-exception">
+          <properties notifyOnTerminate="true" exception="BaseException">
+            <option name="notifyOnTerminate" value="true" />
+          </properties>
+        </breakpoint>
+      </default-breakpoints>
+    </breakpoint-manager>
    <watches-manager>
      <configuration name="PythonConfigurationType">
        <watch expression="dev_y" />
+        <watch expression="var" />
      </configuration>
    </watches-manager>
  </component>
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/dev-0/out_float.tsv
+++ b/dev-0/out_float.tsv
--- a/solution2.py
+++ b/solution2.py
@ -2,135 +2,160 @@ import torch
 import pandas
 import re
 import numpy as np
-from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import precision_score, recall_score, accuracy_score

-learning_rate = torch.tensor(0.00001, dtype=torch.float)
+learning_rate = torch.tensor(0.00005, dtype=torch.float)
 def f1_score(y_true, y_pred):
    """Return the micro-averaged F1 score of ``y_pred`` against ``y_true``.

    Precision and recall are both computed by scikit-learn with
    ``average='micro'`` and combined as their harmonic mean.  When both are
    zero the harmonic mean is undefined, so 0.0 is returned instead of
    dividing by zero.
    """
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2 * (precision * recall) / denominator

-W = torch.rand([4,16],dtype=torch.float, requires_grad=True)
-b = torch.rand(16,dtype=torch.float, requires_grad=True)
-U = torch.rand(16,dtype=torch.float, requires_grad=True)
-c = torch.rand(1,dtype=torch.float, requires_grad=True)
+W1 = torch.rand([5,16],dtype=torch.float, requires_grad=True)
+b1 = torch.rand(16,dtype=torch.float, requires_grad=True)
+W2 = torch.rand(16,dtype=torch.float, requires_grad=True)
+b2 = torch.rand(1,dtype=torch.float, requires_grad=True)
+
+


 def count_polish_diacritics(x):
    """Return, for every word in ``x``, the share of its characters that are Polish diacritics.

    ``x`` is any iterable of words (the script passes a pandas Series); each
    item is converted with ``str()`` first.  Only the lowercase letters listed
    in the pattern are counted.  The result is a list of floats, one per
    word, in input order.
    """
    x_counts = []
    # Iterate over the values directly: Series.iteritems() no longer exists in
    # pandas 2.x, and the index it yielded was never used.
    for word in x:
        text = str(word)
        c = len(re.findall(r'[ąćęłńóśźż]', text))
-        x_counts.append(c)
+        # An empty string has no characters, so report a rate of 0.0 rather
+        # than dividing by zero.
+        c2 = c / len(text) if text else 0.0
+        x_counts.append(c2)
    return x_counts

-
+def count_vowels(x):
+    """Return, for every word in ``x``, the share of its characters that are vowels.
+
+    ``x`` is any iterable of words (the script passes a pandas Series); each
+    item is converted with ``str()`` first.  Only the lowercase vowels listed
+    in the pattern are counted.  The result is a list of floats, one per
+    word, in input order.
+    """
+    out = []
+    # Iterate over the values directly: Series.iteritems() no longer exists in
+    # pandas 2.x, and the index it yielded was never used.
+    for row in x:
+        word = str(row)
+        vowel_len = len(re.findall(r'[aąeęioóuy]', word))
+        # RATE: vowels per character; an empty string yields 0.0 instead of
+        # dividing by zero.
+        out.append(vowel_len / len(word) if word else 0.0)
+    return out
 def Normalize(data, d = None):
    """Min-max scale ``data`` using the minimum and maximum of ``d``.

    ``d`` is the reference the range is taken from; when it is omitted the
    range of ``data`` itself is used.  Both arguments only need ``min()``,
    ``max()`` and elementwise arithmetic (the script passes torch tensors).

    If the reference is constant (max equals min) the scale is undefined; the
    shifted values ``data - d.min()`` are returned unscaled so the result
    never contains NaN or inf from a division by zero.
    """
    if d is None:
        d = data
    lowest = d.min()
    span = d.max() - lowest
    r = data - lowest
    if span == 0:
        return r
    return r / span

+def model(data_x):
+    """Run the two-layer network on ``data_x`` and return its sigmoid outputs.
+
+    ``data_x`` is transposed (dims 0 and 1 swapped) before being multiplied
+    by the module-level weights ``W1``; the hidden layer uses ReLU and the
+    output layer (``W2``, ``b2``) uses a sigmoid.
+    """
+    samples = data_x.transpose(1, 0)
+    hidden = torch.relu(torch.matmul(samples, W1) + b1)
+    return torch.sigmoid(torch.matmul(hidden, W2) + b2)
+
 train_data = pandas.read_csv('train/train.tsv', sep='\t', names=['Sane', 'Domain', 'Word', 'Frequency'], header=None)
 # x1: the 'Frequency' column, min-max scaled over the training set.
 x1 = Normalize(torch.tensor(train_data['Frequency'], dtype=torch.float))
-x2 = Normalize(torch.tensor(train_data['Word'].str.len(), dtype=torch.float))
-le = LabelEncoder()
-le.fit(train_data['Domain'])
-encoded_domain_col= le.transform(train_data['Domain'])
-x3 = torch.tensor(encoded_domain_col, dtype=torch.float)
+# x2: share of vowel characters in each word, min-max scaled.
+x2 = Normalize(torch.tensor(count_vowels(train_data['Word']), dtype=torch.float))
+# x3: integer category code of the domain; unlike the other features it is
+# not passed through Normalize() here.
+x3 = torch.tensor(train_data['Domain'].astype('category').cat.codes, dtype=torch.float)
+
 # x4: share of Polish diacritic characters in each word, min-max scaled.
 x4 = Normalize(torch.tensor(count_polish_diacritics(train_data['Word']),dtype=torch.float))
-x = torch.stack((x1,x2,x3,x4),0)
+# x5: word length in characters, min-max scaled.
+x5 = Normalize(torch.tensor(train_data['Word'].str.len(), dtype=torch.float))
+# Stack the five per-row feature tensors along dim 0 (one row per feature).
+x = torch.stack((x1,x2,x3,x4, x5),0)
 # Training target: the 'Sane' column as floats.
 y = torch.tensor(train_data['Sane'], dtype=torch.float)

+# Debug output: how many training rows have Sane > 0.
+# Computed with a vectorised comparison because Series.iteritems() no longer
+# exists in pandas 2.x.
+# NOTE(review): the counter has always started at 1, so the printed value is
+# one more than the number of positive rows - confirm whether that is intended.
+count = 1 + int((train_data['Sane'] > 0).sum())
+
+print(count)
+print(y)
+
+# Full-batch gradient descent on the summed squared error between the model
+# output and y: up to 80 rounds of 1000 steps each.
+print("Training...")
+criterion = torch.nn.MSELoss(reduction='sum')
+for i in range(80):
+    for j in range(1000):
+        y_predicted = model(x)
+        cost = criterion(y_predicted, y)
+        cost.backward()
+        #print(str(i), " ; ", cost)
+        # Once the cost drops below 40000, switch to the smaller step size;
+        # nothing in this loop raises it again afterwards.
+        if (cost.item() < 40000):
+            learning_rate = torch.tensor(0.00001, dtype=torch.float)
+        #if (cost.item() < 1614):
+        #    learning_rate = torch.tensor(0.000001, dtype=torch.float)
+        # Plain gradient step.  Each assignment rebinds the name to a new
+        # tensor created under no_grad, so the old gradients are discarded
+        # with the old tensors.
+        with torch.no_grad():
+            W1 = W1 - learning_rate * W1.grad
+            b1 = b1 - learning_rate * b1.grad
+            W2 = W2 - learning_rate * W2.grad
+            b2 = b2 - learning_rate * b2.grad
+
+
+        # Turn gradient tracking back on for the freshly created parameter
+        # tensors so the next backward() call can populate their .grad.
+        W1.requires_grad_(True)
+        b1.requires_grad_(True)
+        W2.requires_grad_(True)
+        b2.requires_grad_(True)
+
+
+
+    # Early stopping: checked once per round of 1000 steps, using the cost of
+    # the last step in that round.
+    if (cost.item() < 1700):
+        break
+    #print("Dev0 pred...")
+    # dev
+
+
+
+print("Dev0 pred...")
 #dev data:
 dev_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
+# Dev features must go through exactly the same transform as the training
+# features.  x1, x2, x4 and x5 are already min-max scaled (min 0, max 1), so
+# using them as the Normalize() reference would leave the dev values
+# unscaled; the reference has to be the raw training column instead.
+raw_train_frequency = torch.tensor(train_data['Frequency'], dtype=torch.float)
+raw_train_vowels = torch.tensor(count_vowels(train_data['Word']), dtype=torch.float)
+raw_train_diacritics = torch.tensor(count_polish_diacritics(train_data['Word']), dtype=torch.float)
+raw_train_length = torch.tensor(train_data['Word'].str.len(), dtype=torch.float)
 dev_x1 = Normalize(torch.tensor(dev_data['Frequency'], dtype=torch.float), raw_train_frequency)
-dev_x2 = Normalize(torch.tensor(dev_data['Word'].str.len(), dtype=torch.float), x2)
+dev_x2 = Normalize(torch.tensor(count_vowels(dev_data['Word']), dtype=torch.float), raw_train_vowels)

-dev_encoded_domain_col = le.transform(dev_data['Domain'])
-dev_x3 = torch.tensor(dev_encoded_domain_col, dtype=torch.float)
+# The model is trained on raw domain category codes (x3 is not scaled), so
+# encode the dev domains with the *training* categories and leave the codes
+# unscaled.  Domains that never occur in the training data get code -1.
+train_domains = train_data['Domain'].astype('category').cat.categories
+dev_x3 = torch.tensor(pandas.Categorical(dev_data['Domain'], categories=train_domains).codes, dtype=torch.float)
 dev_x4 = Normalize(torch.tensor(count_polish_diacritics(dev_data['Word']), dtype=torch.float), raw_train_diacritics)
-dev_x = torch.stack((dev_x1, dev_x2, dev_x3, dev_x4), 0)
+dev_x5 = Normalize(torch.tensor(dev_data['Word'].str.len(), dtype=torch.float), raw_train_length)
+
+dev_x = torch.stack((dev_x1, dev_x2, dev_x3, dev_x4, dev_x5), 0)
 dev_y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))

-print("Training...")
+dev_y = model(dev_x)
+#dev_y_pred = np.where(dev_y > 0.5, 1, 0)
+#np.savetxt(f'./dev-0/out.tsv', dev_y_pred, '%d')

-for _ in range(500):
-    W.requires_grad_(True)
-    b.requires_grad_(True)
-    c.requires_grad_(True)
-    U.requires_grad_(True)
-    for _ in range(1000):
-        h = torch.sigmoid(x.transpose(1, 0) @ W + b)
-        y_predicted = torch.sigmoid(h @ U + c)
-        cost = torch.sum((y_predicted - y) ** 2)
-        cost.backward()
-        with torch.no_grad():
-            W = W - learning_rate * W.grad
-            b = b - learning_rate * b.grad
-            c = c - learning_rate * c.grad
-            U = U - learning_rate * U.grad
-            W.requires_grad_(True)
-            b.requires_grad_(True)
-            c.requires_grad_(True)
-            U.requires_grad_(True)
-    W.requires_grad_(False)
-    b.requires_grad_(False)
-    c.requires_grad_(False)
-    U.requires_grad_(False)
-    print("Dev0 pred...")
-    # dev
-    dev_h = torch.sigmoid(dev_x.transpose(1, 0) @ W + b)
-    dev_y = torch.sigmoid(dev_h @ U + c)
-    dev_y = dev_y.numpy()
-    dev_y_pred = np.where(dev_y > 0.5, 1, 0)
-    score = f1_score(dev_y_test, dev_y_pred)
-    print("f1_score_dev0 within training: ", score, "\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))
+file=open("dev-0/out.tsv","w")
+file2=open("dev-0/out_float.tsv","w")

-W.requires_grad_(False)
-b.requires_grad_(False)
-c.requires_grad_(False)
-U.requires_grad_(False)
-
-print("Dev0 pred...")
-#dev
+# Write one line per dev prediction: the raw sigmoid output goes to file2,
+# the thresholded label (1 when the output is >= 0.5, else 0) goes to file.
+# Iterate over the predictions themselves instead of a hard-coded row count,
+# so the output always has exactly as many lines as the input.
+try:
+    for var in dev_y.detach().tolist():
+        file2.write(str(var) + "\n")
+        file.write(("0" if var < 0.5 else "1") + "\n")
+finally:
+    # Close both handles even if a write fails, so buffered lines are flushed.
+    file.close()
+    file2.close()


-dev_h = torch.sigmoid(dev_x.transpose(1, 0) @ W + b)
-dev_y = torch.sigmoid(dev_h @ U + c)
-dev_y = dev_y.numpy()
-dev_y_pred = np.where(dev_y > 0.5, 1, 0)
-#np.savetxt(f'./dev-0/out_float.tsv', dev_y, '%.f')
-with open('dev-0/out.tsv', 'w') as output_file:
-    for out in dev_y_pred:
-        print('%s' % out, file=output_file)
-with open('dev-0/out_float.tsv', 'w') as output_file:
-    for out in dev_y:
-        print('%s' % out, file=output_file)
 y_test = pandas.DataFrame(pandas.read_csv('dev-0/expected.tsv', encoding="utf-8", delimiter='\t', header=None))
+dev_y_pred = pandas.DataFrame(pandas.read_csv('dev-0/out.tsv', encoding="utf-8", delimiter='\t', header=None))
 score = f1_score(y_test, dev_y_pred)
 print("f1_score_dev0 after training: ", score,"\nAcc: ", accuracy_score(dev_y_test, dev_y_pred))

 print("TestA pred...")
 #test-A
-testA_data = pandas.read_csv('dev-0/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
+testA_data = pandas.read_csv('test-A/in.tsv', sep='\t', names=['Domain', 'Word', 'Frequency'], header=None)
+# Test features must go through exactly the same transform as the training
+# features.  x1, x2, x4 and x5 are already min-max scaled (min 0, max 1), so
+# using them as the Normalize() reference would leave the test values
+# unscaled; the reference has to be the raw training column instead.
+raw_train_frequency = torch.tensor(train_data['Frequency'], dtype=torch.float)
+raw_train_vowels = torch.tensor(count_vowels(train_data['Word']), dtype=torch.float)
+raw_train_diacritics = torch.tensor(count_polish_diacritics(train_data['Word']), dtype=torch.float)
+raw_train_length = torch.tensor(train_data['Word'].str.len(), dtype=torch.float)
 testA_x1 = Normalize(torch.tensor(testA_data['Frequency'], dtype=torch.float), raw_train_frequency)
-testA_x2 = Normalize(torch.tensor(testA_data['Word'].str.len(), dtype=torch.float), x2)
-
-testA_encoded_domain_col= le.transform(testA_data['Domain'])
-testA_x3 = torch.tensor(testA_encoded_domain_col, dtype=torch.float)
+testA_x2 = Normalize(torch.tensor(count_vowels(testA_data['Word']), dtype=torch.float), raw_train_vowels)
+# The model is trained on raw domain category codes (x3 is not scaled), so
+# encode the test domains with the *training* categories and leave the codes
+# unscaled.  Domains that never occur in the training data get code -1.
+train_domains = train_data['Domain'].astype('category').cat.categories
+testA_x3 = torch.tensor(pandas.Categorical(testA_data['Domain'], categories=train_domains).codes, dtype=torch.float)
 testA_x4 = Normalize(torch.tensor(count_polish_diacritics(testA_data['Word']),dtype=torch.float), raw_train_diacritics)
-testA_x = torch.stack((testA_x1,testA_x2,testA_x3,testA_x4),0)
+testA_x5 = Normalize(torch.tensor(testA_data['Word'].str.len(), dtype=torch.float), raw_train_length)

-testA_h = torch.sigmoid(testA_x.transpose(1, 0) @ W + b)
-testA_y = torch.sigmoid(testA_h @ U + c)
-testA_y = testA_y.numpy()
-testA_y_pred = np.where(testA_y > 0.5, 1, 0)
-np.savetxt(f'./test-A/out_float.tsv', testA_y)
-with open('test-A/out.tsv', 'w') as output_file:
-    for out in testA_y_pred:
-        print('%s' % out, file=output_file)
-with open('test-A/out_float.tsv', 'w') as output_file:
-    for out in testA_y:
-        print('%s' % out, file=output_file)
+testA_x = torch.stack((testA_x1,testA_x2,testA_x3,testA_x4, testA_x5),0)
+
+testA_y = model(testA_x)
+
+#np.savetxt(f'./test-A/out.tsv', testA_y_pred, '%d')
+
+
+# Write one line per test-A prediction: the raw sigmoid output goes to
+# out_float.tsv, the thresholded label (1 when the output is >= 0.5, else 0)
+# goes to out.tsv.  The loop runs over the predictions themselves rather than
+# a hard-coded row count, and the context manager closes both files even if
+# a write fails.
+with open("test-A/out.tsv", "w") as file, open("test-A/out_float.tsv", "w") as file2:
+    for value in testA_y.detach().tolist():
+        file2.write(str(value) + "\n")
+        file.write(("0" if value < 0.5 else "1") + "\n")
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/test-A/out_float.tsv
+++ b/test-A/out_float.tsv