Compare commits

..

2 Commits

Author SHA1 Message Date
Artur Dylewski
2787573a2c poprawki 2020-06-08 15:25:08 +02:00
dylodylo
9cb993d397 test commit from Windows 2020-04-06 15:34:10 +02:00
10 changed files with 579795 additions and 580088 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -1,119 +0,0 @@
import csv
import re
import random
import json
from math import sqrt
def make_dict(path):
    """Build an initial word-weight dictionary from a TSV file and dump it to 'dict.txt'.

    Each line's first tab-separated field is treated as the post text; every
    distinct word token gets a small random starting weight.

    Args:
        path: path to a TSV file whose first column is the post text.
    """
    weights = {}  # renamed from `dict` to avoid shadowing the builtin
    with open(path) as in_file:
        for line in in_file:
            post = line.split('\t')[0]
            for word in re.findall(r"[\w']+", post):
                if word not in weights:  # idiomatic `not in` (was `not word in`)
                    # random.random() % 0.2 - 0.1 yields a value in [-0.1, 0.1)
                    weights[word] = round(random.random() % 0.2 - 0.1, 2)
    with open('dict.txt', 'w') as file:
        json.dump(weights, file)
def make_posts_list(in_file):
    """Read a TSV file and return the first column of every line as a list."""
    with open(in_file) as handle:
        return [row.split('\t')[0] for row in handle]
def make_exp_list(exp_file):
    """Read one integer label per line from *exp_file* and return them as a list."""
    with open(exp_file) as handle:
        return [int(label_line) for label_line in handle]
def train_model(in_path, exp_path):
    """Train a bag-of-words linear scorer by gradient steps and dump weights to 'dict2.txt'.

    Reads initial word weights from 'dict.txt' (written by make_dict), then for
    each (post, label) pair updates the bias w0 and every word weight by the
    scaled prediction error.  Stops once the mean squared loss over a pass
    stops improving.

    NOTE(review): original indentation was lost in this diff view; the nesting
    below is a reconstruction — verify against the repository file.
    """
    with open('dict.txt', 'r') as file:
        dict = json.load(file)  # NOTE(review): shadows the builtin `dict`
    posts = make_posts_list(in_path)
    exp = make_exp_list(exp_path)
    w0 = 0.1           # bias term
    lr = 0.00001       # learning rate
    loss_counter = 0   # examples seen since last reset
    loss_sum = 0       # accumulated squared error since last reset
    last_sum = 10      # previous pass's mean loss (sentinel start value)
    while loss_counter < 1000:
        loss_cost = 0  # NOTE(review): assigned but never used
        for in_line, exp_line in zip(posts, exp):
            loss_counter+=1
            # random example from the training set
            #print("new post" + str(random.randint(0,10)))
            post = (in_line.split('\t')[0])
            error_rate = 1  # NOTE(review): assigned but never used
            y = int(exp_line)
            y_hat = w0
            for word in re.findall(r"[\w']+", post):
                y_hat += dict[word]  # KeyError for words absent from dict.txt
            loss = (y_hat - y)**2
            loss_sum += loss
            # learning step: gradient of squared error, scaled by lr
            delta = (y_hat - y) * lr
            w0 = w0 - delta
            for word in re.findall(r"[\w']+", post):
                dict[word] -= delta
        real_loss = loss_sum/loss_counter
        print(real_loss)
        if real_loss > last_sum:
            break  # loss got worse — stop training
        else:
            last_sum = real_loss
            loss_sum = 0
            loss_counter = 0
    dict["w0"] = w0  # store the bias alongside the word weights
    with open('dict2.txt', 'w') as file:
        json.dump(dict, file)
def predict(path):
    """Score posts in <path>/in.tsv with weights from 'dict2.txt' and write 0/1 labels to <path>/out.tsv.

    NOTE(review): original indentation was lost in this diff view; nesting is
    reconstructed.
    """
    results = []
    with open('dict2.txt', 'r') as file:
        dict = json.load(file)  # NOTE(review): shadows the builtin `dict`
    with open(path+"/in.tsv") as in_file:
        for in_line in in_file:
            print("new post" + str(random.randint(0,10)))  # progress/debug print
            post = (in_line.split('\t')[0])
            y=dict["w0"]  # start from the stored bias
            for word in re.findall(r"[\w']+", post):
                if word in dict:  # unseen words contribute nothing
                    y += dict[word]
            # threshold the linear score at 0.5
            if y > 0.5:
                results.append("1")
            else:
                results.append("0")
    with open(path+"/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in results:
            tsv_writer.writerow(i)  # writerow of a 1-char string writes that char as the row
# Build the initial vocabulary, then train on the training split (runs at import).
make_dict("train/in.tsv")
train_model("train/in.tsv", "train/expected.tsv")
def check_dev():
    """Print prediction accuracy on dev-0 by comparing out.tsv with expected.tsv line-by-line."""
    with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
        counter = 0
        positive = 0
        for out_line, exp_line in zip(out_file, exp_file):
            counter+=1
            if out_line == exp_line:  # lines compared with trailing newlines intact
                positive += 1
        print(positive/counter)
# Predict on dev-0, then report accuracy (runs at import).
predict("dev-0")
check_dev()

View File

@ -1,87 +0,0 @@
import csv
import re
import random
import json
# Prints ['Hey', 'you', 'what', 'are', 'you', 'doing', 'here']
def make_dict(path):
    """Build and return a word -> random-initial-weight dictionary from a TSV file.

    Args:
        path: TSV file whose first tab-separated column is the post text.

    Returns:
        dict mapping each distinct word token to a random weight in [-0.1, 0.1].
    """
    weights = {}  # renamed from `dict` to avoid shadowing the builtin
    with open(path) as in_file:
        for line in in_file:
            post = line.split('\t')[0]
            for word in re.findall(r"[\w']+", post):
                if word not in weights:  # idiomatic `not in` (was `not word in`)
                    # random.random() % 0.2 - 0.1 yields a value in [-0.1, 0.1)
                    weights[word] = round(random.random() % 0.2 - 0.1, 2)
    return weights
def train_model(in_path, exp_path):
    """Nudge word weights per post until the absolute error drops below 0.5, then dump to 'dict.txt'.

    For each (post, label) pair, repeatedly tries stepping every word weight by
    +/- lr and keeps whichever direction reduces the absolute error.

    NOTE(review): original indentation was lost in this diff view; nesting is
    reconstructed.  Also note y / y_plus / y_minus are never reset inside the
    while loop, so scores accumulate across iterations — looks like a bug in
    the original; left untouched here.
    """
    dict = make_dict(in_path)  # NOTE(review): shadows the builtin `dict`
    w0 = 0.1     # bias term
    lr = 0.0001  # step size
    with open(in_path) as in_file, open(exp_path) as exp_file:
        for in_line, exp_line in zip(in_file, exp_file):
            print("new post" + str(random.randint(0,10)))  # progress/debug print
            post = (in_line.split('\t')[0])
            delta = 1
            y=0
            y_plus = 0
            y_minus = 0
            while delta > 0.5:
                for word in re.findall(r"[\w']+", post):
                    y += dict[word]
                    y_plus += dict[word] + lr
                    y_minus += dict[word] - lr
                # absolute errors for the current, minus-step and plus-step scores
                delta = abs(int(exp_line) - y+w0)
                delta_minus = abs(int(exp_line) - y_minus+w0)
                delta_plus = abs(int(exp_line) - y_plus+w0)
                if delta_minus < delta:
                    delta = delta_minus
                    for word in re.findall(r"[\w']+", post):
                        dict[word] = dict[word] - lr
                elif delta_plus < delta:
                    delta = delta_plus
                    for word in re.findall(r"[\w']+", post):
                        dict[word] = dict[word] + lr
                else:
                    break  # neither direction improves — move on
    with open('dict.txt', 'w') as file:
        json.dump(dict, file)
def predict(path):
    """Score posts in <path>/in.tsv with weights from 'dict.txt' and write 0/1 labels to <path>/out.tsv.

    NOTE(review): original indentation was lost in this diff view; nesting is
    reconstructed.
    """
    results = []
    with open('dict.txt', 'r') as file:
        dict = json.load(file)  # NOTE(review): shadows the builtin `dict`
    with open(path+"/in.tsv") as in_file:
        for in_line in in_file:
            print("new post" + str(random.randint(0,10)))  # progress/debug print
            post = (in_line.split('\t')[0])
            y=0  # no bias term in this version
            for word in re.findall(r"[\w']+", post):
                if word in dict:  # unseen words contribute nothing
                    y += dict[word]
            # threshold the linear score at 0.5
            if y > 0.5:
                results.append("1")
            else:
                results.append("0")
    with open(path+"/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in results:
            tsv_writer.writerow(i)  # writerow of a 1-char string writes that char as the row
predict("test-A")
def check_dev():
    """Compare dev-0 predictions with the expected labels and print the accuracy."""
    total = 0
    hits = 0
    with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
        for predicted_line, expected_line in zip(out_file, exp_file):
            total += 1
            if predicted_line == expected_line:
                hits += 1
    print(hits/total)

View File

@ -1,42 +0,0 @@
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import csv
def get_test_posts(path):
    """Load post texts from *path*; every line must be 'text<TAB>timestamp'."""
    collected = []
    with open(path) as source:
        for raw_line in source:
            body, stamp = raw_line.rstrip('\n').split('\t')  # raises on malformed lines, as before
            collected.append(body)
    return collected
def get_expected(path):
    """Return the class labels from *path*, one per line, with all spaces removed."""
    with open(path) as handle:
        return [entry.rstrip('\n').replace(" ", "") for entry in handle]
# Fit the bag-of-words Naive Bayes pipeline on the training split (runs at import).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(get_test_posts("train/in.tsv"))
y = get_expected("train/expected.tsv")
clf = MultinomialNB()
clf.fit(X_train_counts, y)
def predict_posts(path, clf):
    """Vectorize posts from <path>/in.tsv with the module-level count_vect, classify with *clf*, write <path>/out.tsv."""
    features = count_vect.transform(get_test_posts(path+'/in.tsv'))
    predicted = clf.predict(features)
    with open(path+"/out.tsv", 'wt') as out_handle:
        writer = csv.writer(out_handle, delimiter='\t')
        for label in predicted:
            writer.writerow(label)
# Write predictions for both evaluation splits (runs at import).
predict_posts("dev-0", clf)
predict_posts("test-A", clf)

View File

@ -1,11 +1,33 @@
import csv
from collections import defaultdict
import math
import pickle
import os
from pathlib import Path
def calc_class_logprob(expected_path):
    """Count documents and per-class labels, then print the smoothed class priors P(S) and P(P).

    NOTE(review): reads hard-coded 'in.tsv' / 'expected.tsv' from the CWD and
    ignores *expected_path* — presumably a leftover; confirm against callers.
    """
    counter = 0
    docs = []
    with open('in.tsv') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            docs.append(row)
            counter+=1
    print(counter)
    pcounter = 0
    scounter = 0
    with open('expected.tsv') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            # labels in expected.tsv carry a leading space (" P" / " S")
            if row[0] == " P":
                pcounter += 1
            if row[0] == " S":
                scounter += 1
    print(pcounter)
    print(scounter)
    # BUG FIX: original printed `scounter + 1/counter + 2` because of missing
    # parentheses; the Laplace-smoothed prior is (count + 1) / (total + 2).
    print("P(S) = " + str((scounter + 1) / (counter + 2)))
    print("P(P) = " + str((pcounter + 1) / (counter + 2)))
def calc_class_logprob(expected_path):
paranoarmal_class_count = 0
skeptic_class_count = 0
with open(expected_path) as f:
@ -21,100 +43,29 @@ def calc_class_logprob(expected_path): #zliczamy ogólne prawdopodobieństwo dla
return math.log(paranormal_class_prob), math.log(skeptic_class_prob)
def calc_word_counts(in_path, expected_path):
    """Tally token occurrences per class ('paranormal' vs 'skeptic') from paired input/label files.

    NOTE(review): this body interleaves TWO revisions of the function as
    captured by a diff view — a second `with` header and a second copy of the
    counting loop (iterating an undefined `f`) follow the working version.
    Only the first loop can run; the trailing `for line in f:` raises
    NameError.  Restore a single revision from version control.
    """
    with open(in_path) as in_file, open(expected_path) as exp_file:
        # NOTE(review): duplicate `with` header from the other diff revision —
        # it rebinds in_file and re-enters exp_file as a context manager.
        with open(in_path), open(expected_path) as in_file, exp_file:
            word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
            for in_line, exp_line in zip(in_file, exp_file):
                class_ = exp_line.rstrip('\n').replace(" ", "")  # strip the leading-space label padding
                text, timestamp = in_line.rstrip('\n').split('\t')
                tokens = text.lower().split(' ')
                for token in tokens:
                    if class_ == 'P':
                        word_counts['paranormal'][token] += 1
                    elif class_ == 'S':
                        word_counts['skeptic'][token] += 1
            # NOTE(review): second copy of the loop from the other revision;
            # `f` is undefined in this scope.
            for line in f:
                class_ = exp_line.rstrip('\n').replace(" ", "")
                text, timestamp = line.rstrip('\n').split('\t')
                tokens = text.lower().split(' ')
                for token in tokens:
                    if class_ == 'P':
                        word_counts['paranormal'][token] += 1
                    elif class_ == 'S':
                        word_counts['skeptic'][token] += 1
    return word_counts
def calc_word_logprobs(word_counts):
    """Return Laplace-smoothed per-class log-probabilities for every counted token.

    The denominator for each class is its total token count plus its vocabulary
    size; each token's probability is (count + 1) / denominator.
    """
    denominators = {
        'skeptic': sum(word_counts['skeptic'].values()) + len(word_counts['skeptic'].keys()),
        'paranormal': sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys()),
    }
    word_logprobs = {'paranormal': {}, 'skeptic': {}}
    for class_ in word_logprobs:
        denom = denominators[class_]
        for token, count in word_counts[class_].items():
            word_logprobs[class_][token] = math.log((count + 1) / denom)
    return word_logprobs
# Fit the hand-rolled Naive Bayes model on the training split (runs at import).
paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")
word_counts = calc_word_counts('train/in.tsv','train/expected.tsv')
word_logprobs = calc_word_logprobs(word_counts)
print(word_logprobs['skeptic']["hair."]) #-12.166205308815476
# Next steps: 1. fetch a post 2. split it into terms 3. compute each term's
# probability 4. sum them 5. compare paranormal vs. skeptic.
def get_test_posts(path):
    """Return the text column of *path*, where each line is 'text<TAB>timestamp'."""
    texts = []
    with open(path) as fh:
        for record in fh:
            content, _timestamp = record.rstrip('\n').split('\t')  # still raises on malformed lines
            texts.append(content)
    return texts
def calc_words_logprobs(words_counts):
    """Compute smoothed per-class totals (apparent stub superseded by calc_word_logprobs).

    BUG FIX: the original had unbalanced/misplaced parentheses on both lines
    (a stray ')' on the skeptic line; the len() call folded inside values() on
    the paranormal line), making the file a SyntaxError as written.
    NOTE(review): the body now reads its *words_counts* parameter instead of
    the module-level word_counts the broken original referenced; it returns
    the two totals so the computation is observable — previously returned None.
    """
    total_skeptic = sum(words_counts['skeptic'].values()) + len(words_counts['skeptic'].keys())
    total_paranormal = sum(words_counts['paranormal'].values()) + len(words_counts['paranormal'].keys())
    return total_skeptic, total_paranormal
def predict_post_class(posts, sprob, pprob, word_logprobs):
    """Classify each post as 'P' or 'S' by summing a class log-prior with per-token log-probs.

    Args:
        posts: list of post texts.
        sprob: skeptic class log-prior.
        pprob: paranormal class log-prior.
        word_logprobs: {'skeptic': {...}, 'paranormal': {...}} token log-probs.

    Returns:
        list of 'P'/'S' labels, one per post.

    NOTE(review): indentation reconstructed from a diff view.  The
    prediction.tsv block below references an undefined `counter` and looks
    like residue of an older revision — it would raise NameError at runtime.
    """
    out_classes = []
    with open('prediction.tsv', 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in range(counter):  # NOTE(review): `counter` is not defined in this scope
            tsv_writer.writerow('S')
    for post in posts:
        total_s_prob = sprob
        total_p_prob = pprob
        tokens = post.lower().split(' ')
        for token in tokens:
            # for skeptic
            if (token in word_logprobs['skeptic'].keys()):
                sceptic_prob = word_logprobs['skeptic'][token]
            else:
                sceptic_prob = 0  # unseen token contributes nothing (no smoothing here)
            # for paranormal
            if (token in word_logprobs['paranormal'].keys()):
                paranormal_prob = word_logprobs['paranormal'][token]
            else:
                paranormal_prob = 0
            total_s_prob += sceptic_prob
            total_p_prob += paranormal_prob
        #print(total_p_prob)
        #print(total_s_prob)
        if total_p_prob > total_s_prob:
            out_classes.append('P')
        else:
            out_classes.append('S')
    return out_classes
def predict_posts(path):
    """Predict classes for <path>/in.tsv with the module-level model and write <path>/out.tsv."""
    posts = get_test_posts(path+'/in.tsv')
    # NOTE(review): passes skeptic logprob as `sprob` and paranormal as `pprob`,
    # matching predict_post_class's parameter order.
    classes = predict_post_class(posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs)
    with open(path+"/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in classes:
            tsv_writer.writerow(i)  # writerow of a 1-char string writes that char as the row
# Write predictions for both evaluation splits (runs at import).
predict_posts("dev-0")
predict_posts("test-A")
# Report dev-0 accuracy: compare each predicted line against the expected line.
# Expected labels carry a leading space, hence the " "+out_line prefix.
with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
    counter = 0
    positive = 0
    for out_line, exp_line in zip(out_file, exp_file):
        counter+=1
        if " "+out_line == exp_line:
            positive += 1
    print(positive/counter)

View File

@ -1,17 +1,22 @@
import csv
import re
def makeoutput(infile, outfile):
    """Label each row of *infile* '1' if it matches the paranormal keyword regex, else '0', and write *outfile*.

    NOTE(review): indentation reconstructed from a diff view.  The
    `range(counter)` loop writing 'S' rows alongside the `for i in output`
    loop appears to be residue of an older revision shown by the diff; as
    written, out.tsv would contain `counter` 'S' rows before the 0/1 labels.
    """
    counter = 0
    output = []
    regex = r'paranormal|ufo|youtube|spirit'
    with open(infile) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            counter+=1
            # str(row) stringifies the parsed row list before the regex search
            if re.search(regex, str(row).lower()):
                output.append('1')
            else:
                output.append('0')
    with open(outfile, 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in range(counter):  # NOTE(review): older-revision residue (see docstring)
            tsv_writer.writerow('S')
        for i in output:
            tsv_writer.writerow(i)
# Generate outputs for two splits (runs at import).
makeoutput("test-A/in.tsv", "test-A/out.tsv")
makeoutput("train/in.tsv", "train/out.tsv")

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

579082
train/out.tsv

File diff suppressed because it is too large Load Diff