Compare commits

...

9 Commits

Author SHA1 Message Date
dylodylo
6eb349dc94 better code 2020-04-20 16:14:42 +02:00
dylodylo
ef7d13af8b updated code, much better 2020-04-14 14:18:59 +02:00
dylodylo
ee6858ae3f add linear regression solution 2020-04-05 16:28:00 +02:00
dylodylo
99d9e8ddb5 v2.0 2020-03-30 23:56:52 +02:00
dylodylo
45f8f65f6c commit 2020-03-30 23:04:51 +02:00
dylodylo
a1f496054d commit 2020-03-30 22:59:26 +02:00
dylodylo
744e5db758 naive-bayess solution 2020-03-29 21:22:20 +02:00
dylodylo
2a9ca866c9 naive-bayess solution 2020-03-29 21:03:04 +02:00
dylodylo
8fd7b62eef naive-bayess solution 2020-03-29 20:58:56 +02:00
8 changed files with 16037 additions and 15739 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

1
dict.txt Normal file

File diff suppressed because one or more lines are too long

119
linear_regression.py Normal file
View File

@ -0,0 +1,119 @@
import csv
import re
import random
import json
from math import sqrt
def make_dict(path):
    """Build a {word: random weight} map from the posts in *path* and save it.

    Each tab-separated line's first column is tokenized into runs of word
    characters/apostrophes; every unique token gets a random starting weight
    in [-0.1, 0.1), rounded to two decimals.  The result is JSON-dumped to
    dict.txt in the working directory.
    """
    weights = {}  # renamed: the original shadowed the builtin `dict`
    with open(path) as in_file:
        for line in in_file:
            post = line.split('\t')[0]
            for word in re.findall(r"[\w']+", post):
                if word not in weights:
                    # random.random() % 0.2 - 0.1 yields a value in [-0.1, 0.1)
                    weights[word] = round(random.random() % 0.2 - 0.1, 2)
    with open('dict.txt', 'w') as out_file:
        json.dump(weights, out_file)
def make_posts_list(in_file):
    """Return the first tab-separated column of every line in *in_file*."""
    with open(in_file) as handle:
        return [line.split('\t')[0] for line in handle]
def make_exp_list(exp_file):
    """Parse one integer label per line from *exp_file* into a list."""
    with open(exp_file) as handle:
        return [int(label_line) for label_line in handle]
def train_model(in_path, exp_path):
    """Train per-word weights with SGD on squared error and save them.

    Reads the initial random weights from dict.txt (written by make_dict),
    trains on column 0 of in_path against integer labels in exp_path, and
    dumps the learned weights plus the bias under key "w0" to dict2.txt.

    NOTE(review): indentation was reconstructed from a flattened diff; the
    placement of the epoch-end bookkeeping should be confirmed against the
    original file.
    """
    with open('dict.txt', 'r') as file:
        dict = json.load(file)  # NOTE(review): shadows the builtin `dict`
    posts = make_posts_list(in_path)
    exp = make_exp_list(exp_path)
    w0 = 0.1          # bias term
    lr = 0.00001      # learning rate
    loss_counter = 0  # examples seen in the current epoch
    loss_sum = 0      # accumulated squared error in the current epoch
    last_sum = 10     # previous epoch's mean loss (sentinel start value)
    while loss_counter < 1000:
        loss_cost = 0  # NOTE(review): assigned but never used
        for in_line, exp_line in zip(posts, exp):
            loss_counter+=1
            # random example from the training set (translated)
            #print("new post" + str(random.randint(0,10)))
            post = (in_line.split('\t')[0])
            error_rate = 1  # NOTE(review): assigned but never used
            y = int(exp_line)
            # prediction: bias plus the weight of every token in the post
            y_hat = w0
            for word in re.findall(r"[\w']+", post):
                y_hat += dict[word]
            loss = (y_hat - y)**2
            loss_sum += loss
            # learning (translated): gradient step on the bias and on each
            # token's weight
            delta = (y_hat - y) * lr
            w0 = w0 - delta
            for word in re.findall(r"[\w']+", post):
                dict[word] -= delta
        real_loss = loss_sum/loss_counter
        print(real_loss)
        # stop as soon as the mean epoch loss gets worse; otherwise remember
        # it and reset the counters for another epoch
        if real_loss > last_sum:
            break
        else:
            last_sum = real_loss
            loss_sum = 0
            loss_counter = 0
    dict["w0"] = w0
    with open('dict2.txt', 'w') as file:
        json.dump(dict, file)
def predict(path):
    """Classify each post in path/in.tsv and write 0/1 labels to path/out.tsv.

    Loads the trained weights (with the bias under key "w0") from dict2.txt,
    scores each post as bias + sum of known token weights, and writes "1"
    when the score exceeds 0.5, else "0".
    """
    results = []
    with open('dict2.txt', 'r') as weight_file:
        weights = json.load(weight_file)  # renamed: original shadowed `dict`
    with open(path+"/in.tsv") as in_file:
        for in_line in in_file:
            # (stray debug print of a random number removed)
            post = in_line.split('\t')[0]
            y = weights["w0"]
            for word in re.findall(r"[\w']+", post):
                if word in weights:
                    y += weights[word]
            results.append("1" if y > 0.5 else "0")
    with open(path+"/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for label in results:
            # wrap in a list so csv writes one field per row instead of
            # iterating the string's characters
            tsv_writer.writerow([label])
# Build the initial random-weight dictionary from the training set, then
# train and save the learned weights to dict2.txt.
make_dict("train/in.tsv")
train_model("train/in.tsv", "train/expected.tsv")
def check_dev():
    """Print dev-set accuracy: the fraction of lines in dev-0/out.tsv that
    are byte-identical to the corresponding lines of dev-0/expected.tsv."""
    with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
        total = 0
        hits = 0
        for predicted, expected in zip(out_file, exp_file):
            total += 1
            hits += predicted == expected
        print(hits / total)
# Score the trained model on the dev split and report its accuracy.
predict("dev-0")
check_dev()

87
linearregression.py Normal file
View File

@ -0,0 +1,87 @@
import csv
import re
import random
import json
# Prints ['Hey', 'you', 'what', 'are', 'you', 'doing', 'here']
def make_dict(path):
    """Return {word: random weight in [-0.1, 0.1)} for every unique word in
    the first tab-separated column of *path*."""
    weights = {}
    with open(path) as source:
        for row in source:
            text = row.split('\t')[0]
            for token in re.findall(r"[\w']+", text):
                if token not in weights:
                    weights[token] = round(random.random() % 0.2 - 0.1, 2)
    return weights
def train_model(in_path, exp_path):
    """Greedy per-post weight tuning: nudge every token weight of a post by
    +/- lr while that reduces the absolute prediction error, then dump the
    weights to dict.txt.

    NOTE(review): indentation was reconstructed from a flattened diff —
    confirm block placement (especially the final json.dump) against the
    original file.
    """
    dict = make_dict(in_path)  # NOTE(review): shadows the builtin `dict`
    w0 = 0.1     # bias term
    lr = 0.0001  # step size
    with open(in_path) as in_file, open(exp_path) as exp_file:
        for in_line, exp_line in zip(in_file, exp_file):
            print("new post" + str(random.randint(0,10)))  # debug output
            post = (in_line.split('\t')[0])
            delta = 1
            y=0
            y_plus = 0
            y_minus = 0
            while delta > 0.5:
                # NOTE(review): y, y_plus and y_minus are never reset
                # between while-iterations, so token weights are
                # re-accumulated on every pass — looks unintended; confirm.
                for word in re.findall(r"[\w']+", post):
                    y += dict[word]
                    y_plus += dict[word] + lr
                    y_minus += dict[word] - lr
                # absolute error of the current, +lr and -lr variants
                delta = abs(int(exp_line) - y+w0)
                delta_minus = abs(int(exp_line) - y_minus+w0)
                delta_plus = abs(int(exp_line) - y_plus+w0)
                if delta_minus < delta:
                    delta = delta_minus
                    for word in re.findall(r"[\w']+", post):
                        dict[word] = dict[word] - lr
                elif delta_plus < delta:
                    delta = delta_plus
                    for word in re.findall(r"[\w']+", post):
                        dict[word] = dict[word] + lr
                else:
                    # neither direction improves the error — next post
                    break
    with open('dict.txt', 'w') as file:
        json.dump(dict, file)
def predict(path):
    """Score each post in path/in.tsv against the weights stored in dict.txt
    and write a "1"/"0" label per post to path/out.tsv."""
    with open('dict.txt', 'r') as weight_file:
        weights = json.load(weight_file)
    labels = []
    with open(path+"/in.tsv") as in_file:
        for in_line in in_file:
            print("new post" + str(random.randint(0,10)))
            text = in_line.split('\t')[0]
            score = sum(weights[w] for w in re.findall(r"[\w']+", text) if w in weights)
            labels.append("1" if score > 0.5 else "0")
    with open(path+"/out.tsv", 'wt') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        for label in labels:
            writer.writerow(label)
predict("test-A")
def check_dev():
    """Print the share of dev-0/out.tsv lines that exactly equal the
    corresponding dev-0/expected.tsv lines."""
    with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
        pairs = list(zip(out_file, exp_file))
        matches = sum(1 for got, want in pairs if got == want)
        print(matches / len(pairs))

42
readymadesolution.py Normal file
View File

@ -0,0 +1,42 @@
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import csv
def get_test_posts(path):
    """Return the text field of each line in *path*.

    Every line must be exactly "text<TAB>timestamp"; the timestamp is
    discarded (a malformed line raises ValueError, as before).
    """
    with open(path) as source:
        rows = (line.rstrip('\n').split('\t') for line in source)
        return [text for text, _timestamp in rows]
def get_expected(path):
    """Return one class label per line of *path*, with all spaces and the
    trailing newline removed."""
    with open(path) as source:
        return [line.rstrip('\n').replace(" ", "") for line in source]
# Build a bag-of-words count matrix from the training posts and fit a
# multinomial Naive Bayes classifier on the expected labels.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(get_test_posts("train/in.tsv"))
y = get_expected("train/expected.tsv")
clf = MultinomialNB()
clf.fit(X_train_counts, y)
def predict_posts(path, clf):
    """Vectorize path/in.tsv with the module-level CountVectorizer and write
    clf's predicted labels to path/out.tsv, one per row."""
    features = count_vect.transform(get_test_posts(path+'/in.tsv'))
    predicted = clf.predict(features)
    with open(path+"/out.tsv", 'wt') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        for label in predicted:
            writer.writerow(label)
# Write predictions for both the dev split and the held-out test split.
predict_posts("dev-0", clf)
predict_posts("test-A", clf)

View File

@ -1,33 +1,11 @@
import csv
from collections import defaultdict
import math
import pickle
import os
from pathlib import Path
# Count the documents and the " P"/" S" labels (note: the labels in
# expected.tsv carry a leading space), then print the Laplace-smoothed
# class priors.
counter = 0
docs = []
with open('in.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        docs.append(row)
        counter+=1
print(counter)
pcounter = 0
scounter = 0
with open('expected.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        if row[0] == " P":
            pcounter += 1
        if row[0] == " S":
            scounter += 1
print(pcounter)
print(scounter)
# BUG FIX: the original `scounter+1/counter+2` evaluated as
# scounter + (1/counter) + 2 due to operator precedence; the smoothed
# prior is (count + 1) / (total + 2).
print("P(S) = " + str((scounter+1)/(counter+2)))
print("P(P) = " + str((pcounter+1)/(counter+2)))
def calc_class_logprob(expected_path):
def calc_class_logprob(expected_path): #zliczamy ogólne prawdopodobieństwo dla klasy (P(c))
paranoarmal_class_count = 0
skeptic_class_count = 0
with open(expected_path) as f:
@ -43,29 +21,100 @@ def calc_class_logprob(expected_path):
return math.log(paranormal_class_prob), math.log(skeptic_class_prob)
def calc_word_counts(in_path, expected_path):
with open(in_path), open(expected_path) as in_file, exp_file:
with open(in_path) as in_file, open(expected_path) as exp_file:
word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
for in_line, exp_line in zip(in_file, exp_file):
for line in f:
class_ = exp_line.rstrip('\n').replace(" ", "")
text, timestamp = line.rstrip('\n').split('\t')
tokens = text.lower().split(' ')
for token in tokens:
if class_ == 'P':
word_counts['paranormal'][token] += 1
elif class_ == 'S':
word_counts['skeptic'][token] += 1
class_ = exp_line.rstrip('\n').replace(" ", "")
text, timestamp = in_line.rstrip('\n').split('\t')
tokens = text.lower().split(' ')
for token in tokens:
if class_ == 'P':
word_counts['paranormal'][token] += 1
elif class_ == 'S':
word_counts['skeptic'][token] += 1
return word_counts
def calc_words_logprobs(words_counts):
total_skeptic = sum(word_counts['skeptic'].values()) + len(word_counts['skeptic'].keys()))
total_paranormal = sum(word_counts['paranormal'].values() + len(word_counts['paranormal'].keys()))
def calc_word_logprobs(word_counts):
    """Laplace-smoothed log P(token | class) for both classes.

    *word_counts* maps 'skeptic'/'paranormal' to {token: count}.  Each
    token's probability is (count + 1) divided by (total count + vocabulary
    size) of its class.
    """
    word_logprobs = {'paranormal': {}, 'skeptic': {}}
    for class_ in word_logprobs:
        counts = word_counts[class_]
        denom = sum(counts.values()) + len(counts)
        for token, count in counts.items():
            word_logprobs[class_][token] = math.log((count + 1) / denom)
    return word_logprobs
# Train the naive-Bayes model: class log-priors, per-class token counts,
# then Laplace-smoothed per-token log-probabilities.
paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")
word_counts = calc_word_counts('train/in.tsv','train/expected.tsv')
word_logprobs = calc_word_logprobs(word_counts)
print(word_logprobs['skeptic']["hair."]) #-12.166205308815476 (sanity-check value)
# now (translated): 1. fetch a post, 2. split it into terms, 3. compute each
# term's probability, 4. sum them, 5. compare paranormal vs skeptic
def get_test_posts(path):
    """Collect the text field (first tab-separated column) of each line in
    *path*; the timestamp field is dropped (a line without exactly two
    fields raises ValueError, as before)."""
    with open(path) as source:
        return [text for text, _ts in (ln.rstrip('\n').split('\t') for ln in source)]
# with open('prediction.tsv', 'wt') as tsvfile:
# tsv_writer = csv.writer(tsvfile, delimiter='\t')
# for i in range(counter):
# tsv_writer.writerow('S')
def predict_post_class(posts, sprob, pprob, word_logprobs):
    """Label each post 'P' (paranormal) or 'S' (skeptic) by naive Bayes.

    *sprob*/*pprob* are the class log-priors; *word_logprobs* holds the
    per-class token log-probabilities.  A token unseen in a class
    contributes 0 to that class's score (no smoothing penalty here).
    Ties go to 'S'.
    """
    out_classes = []
    for post in posts:
        s_score = sprob
        p_score = pprob
        for token in post.lower().split(' '):
            # for skeptic (translated)
            s_score += word_logprobs['skeptic'].get(token, 0)
            # for paranormal (translated)
            p_score += word_logprobs['paranormal'].get(token, 0)
        out_classes.append('P' if p_score > s_score else 'S')
    return out_classes
def predict_posts(path):
    """Classify every post in path/in.tsv with the module-level model state
    and write the labels to path/out.tsv, one per row."""
    posts = get_test_posts(path+'/in.tsv')
    labels = predict_post_class(posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs)
    with open(path+"/out.tsv", 'wt') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        for label in labels:
            writer.writerow(label)
predict_posts("dev-0")
predict_posts("test-A")
# Dev-set accuracy.  The expected file's labels appear to carry a leading
# space (cf. the " P"/" S" comparisons earlier in this history), so the
# predicted line is prefixed with " " before comparing — confirm against
# the data format.
with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
    counter = 0
    positive = 0
    for out_line, exp_line in zip(out_file, exp_file):
        counter+=1
        if " "+out_line == exp_line:
            positive += 1
    print(positive/counter)

File diff suppressed because it is too large Load Diff