laptop commit linear regression

2020-06-08 19:11:20 +02:00 · 2020-06-08 19:11:20 +02:00 · 53cd39d670
commit 53cd39d670
parent 0c3e331712
5 changed files with 3069 additions and 3008 deletions
--- a/.idea/Pierwsze.iml
+++ b/.idea/Pierwsze.iml
@@ -2,7 +2,7 @@
 <module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.7 (PyEnv)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,5 +3,5 @@
  <component name="JavaScriptSettings">
    <option name="languageLevel" value="ES6" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (PyEnv)" project-jdk-type="Python SDK" />
 </project>
--- a/61
+++ b/61
@@ -0,0 +1,61 @@
 import pickle
 import re
 def calculate_words(linetxt):
    """Count how often each space-separated token occurs in ``linetxt``.

    The text is split on single spaces. After each token is counted, the
    entry for the empty string is set to 1, so for any ``str`` input the
    result contains ``''`` with the value 1 no matter how many empty
    tokens the split produced.

    Args:
        linetxt: Text to split; it must provide ``split(' ')``.

    Returns:
        A dict mapping each token to its occurrence count.
    """
    frequencies = {}
    for piece in linetxt.split(' '):
        frequencies[piece] = frequencies.get(piece, 0) + 1
        # Pin the empty-string entry to 1 on every pass through the loop.
        frequencies[''] = 1
    return frequencies
 def tokenize_list(string_input):
    """Normalise raw text into a space-separated string of lowercase words.

    Despite the name, the result is a single string, not a list. The
    cleanup steps run in a fixed order, each on the output of the
    previous one:

    1. literal backslash-n sequences become spaces,
    2. ``scheme://host/path`` style URLs are removed,
    3. any remaining backslash-n runs and ``http...`` fragments become spaces,
    4. ``/x/`` style single-letter path prefixes become spaces,
    5. every character outside ``a-z`` becomes a space (callers are
       expected to lowercase beforehand, since uppercase letters are
       dropped here),
    6. whitespace runs are collapsed,
    7. words of one to three characters delimited by non-word characters
       (or at the very start) are replaced by a space,
    8. one leading whitespace character is stripped.

    NOTE: regex matches cannot overlap, so of two adjacent short words
    sharing a single separator the second one can survive step 7.

    Args:
        string_input: The text to clean.

    Returns:
        The cleaned text.
    """
    # Ordered (pattern, replacement) pairs; the order is significant.
    substitutions = (
        (r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ''),
        (r'\\n+', " "),
        (r'http\S+', " "),
        (r'\/[a-z]\/', " "),
        (r'[^a-z]', " "),
        (r'\s{2,}', " "),
        (r'\W\w{1,3}\W|\A\w{1,3}\W', " "),
        (r'^\s', ""),
    )
    cleaned = string_input.replace('\\n', ' ')
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
 def prediction(input,output):
    """Score every line of a TSV file and write one 0/1 label per line.

    The model is read from ``model_linear_reg.pkl`` in the current working
    directory and unpacked into three items: ``weights`` (indexable
    coefficients; ``weights[0]`` is the starting value of every score,
    presumably the intercept), ``word`` (maps a token to its index in
    ``weights``) and ``vocabulary`` (the tokens the model knows).

    Each input line must contain exactly two tab-separated fields, the text
    and a second field that is not used. The text is lowercased, cleaned
    with ``tokenize_list`` and counted with ``calculate_words``. A score
    above 0.5 is written as ``1``, anything else as ``0``; the raw score is
    also printed to stdout.

    NOTE(review): the inner loop visits every occurrence of a token and
    multiplies its weight by the token's count in the line, so a token
    repeated k times contributes k * k * weight. Left as is because it may
    mirror how the model was trained; confirm against the training code.

    Args:
        input: Path of the UTF-8 encoded TSV file to read.
        output: Path of the file the labels are written to (overwritten).

    Raises:
        ValueError: If a line does not split into exactly two fields.
    """
    # NOTE: unpickling can execute arbitrary code; only load a trusted model file.
    with open('model_linear_reg.pkl', 'rb') as model_f:
        weights, word, vocabulary = pickle.load(model_f)

    # Context managers close both files even when a malformed line raises.
    with open(output, 'w') as output_f, open(input, encoding='utf-8') as input_f:
        for line in input_f:
            # The two-name unpacking enforces the two-column layout.
            text, _timestamp = line.rstrip('\n').split('\t')
            tokens = tokenize_list(text.lower())
            line_vocabulary = calculate_words(tokens)
            y_hat = weights[0]
            for token in tokens.split(' '):
                if token in vocabulary:
                    y_hat += weights[word[token]] * line_vocabulary[token]
            output_f.write("1\n" if y_hat > 0.5 else "0\n")
            print(y_hat)
 def main():
    """Run ``prediction`` on the dev-0 and test-A inputs, in that order."""
    jobs = (
        ("dev-0/in.tsv", "dev-0/out.tsv"),
        ("test-A/in.tsv", "test-A/out.tsv"),
    )
    for source_path, target_path in jobs:
        prediction(source_path, target_path)
 # Script entry point: main() is called unconditionally at module level
 # (there is no __main__ guard), so it also runs if this file is imported.
 main()
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv