laptop commit linear regression

2020-06-08 19:11:20 +02:00 · 2020-06-08 19:11:20 +02:00 · 53cd39d670
commit 53cd39d670
parent 0c3e331712
5 changed files with 3069 additions and 3008 deletions
--- a/.idea/Pierwsze.iml
+++ b/.idea/Pierwsze.iml
@ -2,7 +2,7 @@
 <module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.7 (PyEnv)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -3,5 +3,5 @@
  <component name="JavaScriptSettings">
    <option name="languageLevel" value="ES6" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (PyEnv)" project-jdk-type="Python SDK" />
 </project>
--- a/61
+++ b/61
@ -0,0 +1,61 @@
+import pickle
+import re
+
+
+def calculate_words(linetxt):
+    """Count how often each space-separated token occurs in ``linetxt``.
+
+    The text is split on single spaces, so consecutive spaces produce
+    empty-string tokens. Whatever the text contains, the returned dict
+    always carries the empty-string key with a value of exactly 1.
+
+    Returns a plain dict mapping token -> occurrence count.
+    """
+    counts = {}
+    for piece in linetxt.split(' '):
+        counts[piece] = counts.get(piece, 0) + 1
+        # Re-pinned on every pass: the empty token never counts above 1.
+        counts[''] = 1
+    return counts
+
+def tokenize_list(string_input):
+    """Clean raw text down to a-z words separated by single spaces.
+
+    Literal backslash-n sequences are turned into spaces first, then a fixed
+    series of regex substitutions runs in order (see the table below).
+    Characters outside a-z, uppercase letters included, are replaced by
+    spaces. Short-word matches are non-overlapping, so a short word that
+    directly follows a removed one, or that ends the text, can survive.
+
+    Returns the cleaned string.
+    """
+    substitutions = (
+        # scheme://host/path style URLs are removed outright
+        (r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ''),
+        # a backslash followed by one or more "n"
+        (r'\\n+', " "),
+        # anything still starting with "http"
+        (r'http\S+', " "),
+        # single-letter path markers such as "/r/"
+        (r'\/[a-z]\/', " "),
+        # every character that is not a lowercase ASCII letter
+        (r'[^a-z]', " "),
+        # runs of two or more whitespace characters
+        (r'\s{2,}', " "),
+        # words of one to three characters bounded by non-word characters
+        (r'\W\w{1,3}\W|\A\w{1,3}\W', " "),
+        # a single leading whitespace character
+        (r'^\s', ""),
+    )
+    cleaned = string_input.replace('\\n', ' ')
+    for pattern, replacement in substitutions:
+        cleaned = re.sub(pattern, replacement, cleaned)
+    return cleaned
+
+def prediction(input, output, model_path='model_linear_reg.pkl', threshold=0.5):
+    """Label every line of a TSV file with the pickled linear model.
+
+    Args:
+        input: Path of a UTF-8 TSV file. Only the text before the first tab
+            is scored; any later columns are ignored. (The name shadows the
+            builtin but is kept so existing keyword callers still work.)
+        output: Path of the file to write. It receives one line per input
+            line: "1" when the score exceeds ``threshold``, "0" otherwise.
+        model_path: Pickle file holding a ``(weights, word, vocabulary)``
+            tuple. ``weights[0]`` starts every score, ``word`` maps a token
+            to its index in ``weights``, and only tokens present in
+            ``vocabulary`` contribute.
+        threshold: Decision boundary; scores strictly above it yield "1".
+
+    Each raw score is also printed to stdout.
+    """
+    # SECURITY: pickle.load can execute arbitrary code while unpickling, so
+    # model_path must only ever point at a trusted, locally produced file.
+    with open(model_path, 'rb') as model_f:
+        weights, word, vocabulary = pickle.load(model_f)
+
+    # Both handles are managed by ``with`` so they are closed (and the output
+    # flushed) even when a line fails part-way through the run.
+    with open(input, encoding='utf-8') as input_f, open(output, 'w') as output_f:
+        for line in input_f:
+            # partition() tolerates lines with no tab or with extra columns,
+            # which a strict two-way unpack of split('\t') would reject.
+            text = line.rstrip('\n').partition('\t')[0]
+            cleaned = tokenize_list(text.lower())
+            line_vocabulary = calculate_words(cleaned)
+
+            y_hat = weights[0]
+            # NOTE(review): a token repeated k times is visited k times and
+            # multiplied by its count on each visit. Left numerically
+            # unchanged here -- confirm it matches how the model was trained.
+            for token in cleaned.split(' '):
+                if token in vocabulary:
+                    y_hat += weights[word[token]] * line_vocabulary[token]
+
+            output_f.write("1\n" if y_hat > threshold else "0\n")
+            print(y_hat)
+
+
+def main():
+    """Write predictions for the dev-0 and test-A splits next to their inputs."""
+    prediction("dev-0/in.tsv", "dev-0/out.tsv")
+    prediction("test-A/in.tsv", "test-A/out.tsv")
+
+
+# Guarded so that importing this module does not start a prediction run;
+# executing the file as a script behaves exactly as before.
+if __name__ == "__main__":
+    main()
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv