Begin lin reg

s426135 2020-04-04 22:07:48 +02:00
parent 4b926f648f
commit 0839c5ca41
3 changed files with 228 additions and 132 deletions

train.py (203 changes)

@@ -1,33 +1,10 @@
 #!/usr/bin/python3
-from collections import defaultdict
-import math
-import pickle
-import re
-import sys
-import nltk
+import re, sys, pickle, nltk, math, random
 from nltk.corpus import stopwords
-
-def calc_class_logprob(expected_path):
-    paranormal_classcount = 0
-    sceptic_classcount = 0
-    with open(expected_path) as f:
-        for line in f:
-            line = line.rstrip('\n').replace(' ', '')
-            if 'P' in line:
-                paranormal_classcount += 1
-            elif 'S' in line:
-                sceptic_classcount += 1
-    paranormal_prob = paranormal_classcount / (paranormal_classcount + sceptic_classcount)
-    sceptic_prob = sceptic_classcount / (paranormal_classcount + sceptic_classcount)
-    return math.log(paranormal_prob), math.log(sceptic_prob)
-
 def clear_post(post):
     post = post.replace('\\n', ' ')
     post = post.lower()
     # delete links
     post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
     post = re.sub(r'[\.\,\/\~]+', ' ', post)
     post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)', '', post)
@@ -40,118 +17,80 @@ def clear_post(post):
     post_no_stop = [w for w in post if not w in stop_words]
     return post_no_stop
-
-#def calc_bigram_count(in_path, expected_path):
-#    bigram_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
-#    with open(in_path) as infile, open(expected_path) as expected_file:
-#        num_of_bigrams = 0
-#        for line, exp in zip(infile, expected_file):
-#            class_ = exp.rstrip('\n').replace(' ', '')
-#            text, timestamp = line.rstrip('\n').split('\t')
-#            tokens = clear_post(text)
-#            #tokens = text.lower().split(' ')
-#            for index in range(len(tokens)-1):
-#                # if there is a next token, join the current and the next one
-#                bigram = tokens[index] + " " + tokens[index + 1]
-#                #print(bigram)
-#                #print(f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
-#                if class_ == 'P':
-#                    bigram_counts['paranormal'][bigram] += 1
-#                elif class_ == 'S':
-#                    bigram_counts['sceptic'][bigram] += 1
-#                num_of_bigrams += 1
-#    #print(f"num of all added bigrams with repetitions {num_of_bigrams}")
-#    #print(f"num of bigrams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
-#    return bigram_counts
-
-def calc_bigram_logprobs(bigram_counts):
-    total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
-    total_paranormal = sum(bigram_counts['paranormal'].values()) + len(bigram_counts['paranormal'].keys())
-    bigram_logprobs = {'paranormal': {}, 'sceptic': {}}
-    for class_ in bigram_counts.keys():
-        for bigram, value in bigram_counts[class_].items():
-            if class_ == "sceptic":
-                bigram_prob = (value + 1) / total_sceptic
-            elif class_ == "paranormal":
-                bigram_prob = (value + 1) / total_paranormal
-            bigram_logprobs[class_][bigram] = math.log(bigram_prob)
-    return bigram_logprobs
-
-#def calc_word_count(in_path, expected_path):
-#    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}  # per-class dicts mapping each word to how many times it occurs
-#    with open(in_path) as infile, open(expected_path) as expectedfile:
-#        for line, exp in zip(infile, expectedfile):
-#            class_ = exp.rstrip('\n').replace(' ', '')
-#            text, timestamp = line.rstrip('\n').split('\t')
-#            #print(f"text {type(text)}")
-#            text = clear_tokens(text, True)
-#            tokens = text.lower().split(' ')
-#            #print(f"tokens {type(tokens)}")
-#            for token in tokens:
-#                clear_tokens(token, False)
-#                if class_ == 'P':
-#                    word_counts['paranormal'][token] += 1
-#                elif class_ == 'S':
-#                    word_counts['sceptic'][token] += 1
-#
-#    return word_counts
-
-def calc_word_logprobs(word_counts):
-    total_sceptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys())
-    total_paranormal = sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys())
-    word_logprobs = {'paranormal': {}, 'sceptic': {}}
-    for class_ in word_counts.keys():  # sceptic, paranormal
-        for token, value in word_counts[class_].items():
-            if class_ == 'sceptic':
-                word_prob = (value + 1) / total_sceptic
-            elif class_ == 'paranormal':
-                word_prob = (value + 1) / total_paranormal
-            #print(token)
-            word_logprobs[class_][token] = math.log(word_prob)
-    return word_logprobs
-
-def launch_bigrams_and_words(in_path, expected_path):
-    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
-    bigram_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
-    with open(in_path) as infile, open(expected_path) as expected_file:
-        for line, exp in zip(infile, expected_file):
-            class_ = exp.rstrip('\n').replace(' ', '')
-            text, timestamp = line.rstrip('\n').split('\t')
-            tokens = clear_post(text)
-            for index in range(len(tokens)-1):
-                # if there is a next token, join the current and the next one
-                bigram = tokens[index] + " " + tokens[index + 1]
-                #print(bigram)
-                #print(f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
-                if class_ == 'P':
-                    bigram_counts['paranormal'][bigram] += 1
-                    word_counts['paranormal'][tokens[index]] += 1
-                elif class_ == 'S':
-                    bigram_counts['sceptic'][bigram] += 1
-                    word_counts['sceptic'][tokens[index]] += 1
-    return bigram_counts, word_counts
+
+# do the words need to be a set?
+def create_vocabulary_and_documents(in_file, expected_file):
+    vocabulary = set()
+    posts = {}
+    with open(in_file) as in_f, open(expected_file) as exp_f:
+        for line, exp in zip(in_f, exp_f):
+            text, timestamp = line.rstrip('\n').split('\t')
+            post = clear_post(text)
+            posts[" ".join(post)] = int(exp)
+            for word in post:
+                vocabulary.add(word)
+    return vocabulary, posts
+
+def create_mappings(vocabulary):
+    word_to_index_mapping = {}
+    index_to_word_mapping = {}
+    xi = 1
+    for word in vocabulary:
+        word_to_index_mapping[word] = xi
+        index_to_word_mapping[xi] = word
+        xi += 1
+    return word_to_index_mapping, index_to_word_mapping
+
 def main():
     if len(sys.argv) != 4:
-        print("syntax is ./train.py expected.tsv in.tsv model.pkl")
+        print("syntax ./train.py model expected_file in_file")
         return
-    expected_file = str(sys.argv[1])
-    in_file = str(sys.argv[2])
-    model = str(sys.argv[3])
-    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
-    #bigrams_count = calc_bigram_count(in_file, expected_file)
-    bigrams_count, words_count = launch_bigrams_and_words(in_file, expected_file)
-    bigram_logprobs = calc_bigram_logprobs(bigrams_count)
-    word_logprobs = calc_word_logprobs(words_count)
-    total_sceptic_bigram = sum(bigrams_count['sceptic'].values()) + len(bigrams_count['sceptic'].keys())
-    total_paranormal_bigram = sum(bigrams_count['paranormal'].values()) + len(bigrams_count['paranormal'].keys())
-    total_sceptic_word = sum(words_count['sceptic'].values()) + len(words_count['sceptic'].keys())
-    total_paranormal_word = sum(words_count['paranormal'].values()) + len(words_count['paranormal'].keys())
-    with open(model, 'wb') as f:
-        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs, word_logprobs, total_sceptic_bigram, total_paranormal_bigram, total_sceptic_word, total_paranormal_word], f)
+    model = str(sys.argv[1])
+    expected_file = str(sys.argv[2])
+    in_file = str(sys.argv[3])
+    vocabulary, posts = create_vocabulary_and_documents(in_file, expected_file)
+    word_to_index_mapping, index_to_word_mapping = create_mappings(vocabulary)
+    weights = []
+    for xi in range(0, len(vocabulary) + 1):
+        weights.append(random.uniform(-0.01, 0.01))
+    learning_rate = 0.000001
+    loss_sum = 0.0
+    loss_sum_counter = 0
+    lowest_loss_sum_weights = []
+    lowest_loss_sum = 10000.0
+    print(f"len of vocabulary {len(vocabulary)}")
+    # the loop bound could be set to a very, very large number instead
+    while True:  # loss_sum_counter != 10:
+        try:
+            d, y = random.choice(list(posts.items()))
+            y_hat = weights[0]
+            tokens = d.split(' ')
+            for word in tokens:
+                # could also rethink how count is used here so this works better
+                #print(f"{d.count(word)} : {word}")
+                y_hat += weights[word_to_index_mapping[word]] * tokens.count(word)
+            loss = (y_hat - y)**2
+            loss_sum += loss
+            delta = (y_hat - y) * learning_rate
+            if loss_sum_counter % 100 == 0:
+                print(f"{loss_sum / 1000} : {loss_sum_counter} : {y_hat} : {delta}")
+                loss_sum_counter = 0
+                loss_sum = 0
+            weights[0] -= delta
+            for word in tokens:
+                weights[word_to_index_mapping[word]] -= tokens.count(word) * delta
+            if lowest_loss_sum > loss_sum and loss_sum != 0:
+                print("it happened")
+                lowest_loss_sum = loss_sum
+                lowest_loss_sum_weights = weights.copy()  # snapshot; plain assignment would alias the live list
+            loss_sum_counter += 1
+        except KeyboardInterrupt:
+            break
+    print(lowest_loss_sum_weights)
 main()
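
The new train.py is a from-scratch linear regression over bag-of-words features, trained with stochastic gradient descent: a post's prediction is the bias weights[0] plus each word's weight times its term frequency, the loss is squared error, and every sampled post nudges the bias and its words' weights by (y_hat - y) * learning_rate scaled by the counts. Below is a minimal self-contained sketch of that update rule; the toy documents, labels, learning rate, and step count are invented for illustration and are not from this repository.

#!/usr/bin/python3
# Minimal sketch of the SGD update train.py performs, on a made-up toy corpus.
import random

docs = {"ghost in the attic": 1, "peer reviewed study": 0,
        "haunted house ghost story": 1, "controlled double blind study": 0}

vocabulary = sorted({w for d in docs for w in d.split(' ')})
word_to_index = {w: i for i, w in enumerate(vocabulary, start=1)}  # index 0 is the bias

weights = [random.uniform(-0.01, 0.01) for _ in range(len(vocabulary) + 1)]
learning_rate = 0.01

def predict(tokens):
    # y_hat = bias + sum over distinct words of weight * term frequency
    return weights[0] + sum(weights[word_to_index[w]] * tokens.count(w)
                            for w in set(tokens))

for _ in range(5000):
    d, y = random.choice(list(docs.items()))       # sample one post per step
    tokens = d.split(' ')
    delta = (predict(tokens) - y) * learning_rate  # the 2 from d/dw (y_hat - y)^2 is folded into the rate
    weights[0] -= delta
    for w in set(tokens):
        weights[word_to_index[w]] -= tokens.count(w) * delta

for d, y in docs.items():
    print(f"{d!r}: expected {y}, predicted {predict(d.split(' ')):.3f}")

One detail worth flagging: train.py's inner loops run over tokens with repetitions, so a word occurring k times effectively contributes k * k times its weight, while the sketch iterates over distinct words, which is the usual term-frequency formulation.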

train_bigram.py (157 additions, new executable file)

@@ -0,0 +1,157 @@
#!/usr/bin/python3
from collections import defaultdict
import math
import pickle
import re
import sys
import nltk
from nltk.corpus import stopwords

def calc_class_logprob(expected_path):
    paranormal_classcount = 0
    sceptic_classcount = 0
    with open(expected_path) as f:
        for line in f:
            line = line.rstrip('\n').replace(' ', '')
            if 'P' in line:
                paranormal_classcount += 1
            elif 'S' in line:
                sceptic_classcount += 1
    paranormal_prob = paranormal_classcount / (paranormal_classcount + sceptic_classcount)
    sceptic_prob = sceptic_classcount / (paranormal_classcount + sceptic_classcount)
    return math.log(paranormal_prob), math.log(sceptic_prob)

def clear_post(post):
    post = post.replace('\\n', ' ')
    post = post.lower()
    # delete links
    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)', '', post)
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\\\!\=\^]+', '', post)
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    post = re.sub(r' +', ' ', post)
    post = post.rstrip(' ')
    post = post.split(' ')
    stop_words = set(stopwords.words('english'))
    post_no_stop = [w for w in post if not w in stop_words]
    return post_no_stop

#def calc_bigram_count(in_path, expected_path):
#    bigram_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
#    with open(in_path) as infile, open(expected_path) as expected_file:
#        num_of_bigrams = 0
#        for line, exp in zip(infile, expected_file):
#            class_ = exp.rstrip('\n').replace(' ', '')
#            text, timestamp = line.rstrip('\n').split('\t')
#            tokens = clear_post(text)
#            #tokens = text.lower().split(' ')
#            for index in range(len(tokens)-1):
#                # if there is a next token, join the current and the next one
#                bigram = tokens[index] + " " + tokens[index + 1]
#                #print(bigram)
#                #print(f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
#                if class_ == 'P':
#                    bigram_counts['paranormal'][bigram] += 1
#                elif class_ == 'S':
#                    bigram_counts['sceptic'][bigram] += 1
#                num_of_bigrams += 1
#    #print(f"num of all added bigrams with repetitions {num_of_bigrams}")
#    #print(f"num of bigrams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
#    return bigram_counts

def calc_bigram_logprobs(bigram_counts):
    total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
    total_paranormal = sum(bigram_counts['paranormal'].values()) + len(bigram_counts['paranormal'].keys())
    bigram_logprobs = {'paranormal': {}, 'sceptic': {}}
    for class_ in bigram_counts.keys():
        for bigram, value in bigram_counts[class_].items():
            if class_ == "sceptic":
                bigram_prob = (value + 1) / total_sceptic
            elif class_ == "paranormal":
                bigram_prob = (value + 1) / total_paranormal
            bigram_logprobs[class_][bigram] = math.log(bigram_prob)
    return bigram_logprobs

#def calc_word_count(in_path, expected_path):
#    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}  # per-class dicts mapping each word to how many times it occurs
#    with open(in_path) as infile, open(expected_path) as expectedfile:
#        for line, exp in zip(infile, expectedfile):
#            class_ = exp.rstrip('\n').replace(' ', '')
#            text, timestamp = line.rstrip('\n').split('\t')
#            #print(f"text {type(text)}")
#            text = clear_tokens(text, True)
#            tokens = text.lower().split(' ')
#            #print(f"tokens {type(tokens)}")
#            for token in tokens:
#                clear_tokens(token, False)
#                if class_ == 'P':
#                    word_counts['paranormal'][token] += 1
#                elif class_ == 'S':
#                    word_counts['sceptic'][token] += 1
#
#    return word_counts

def calc_word_logprobs(word_counts):
    total_sceptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys())
    total_paranormal = sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys())
    word_logprobs = {'paranormal': {}, 'sceptic': {}}
    for class_ in word_counts.keys():  # sceptic, paranormal
        for token, value in word_counts[class_].items():
            if class_ == 'sceptic':
                word_prob = (value + 1) / total_sceptic
            elif class_ == 'paranormal':
                word_prob = (value + 1) / total_paranormal
            #print(token)
            word_logprobs[class_][token] = math.log(word_prob)
    return word_logprobs

def launch_bigrams_and_words(in_path, expected_path):
    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
    bigram_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
    with open(in_path) as infile, open(expected_path) as expected_file:
        for line, exp in zip(infile, expected_file):
            class_ = exp.rstrip('\n').replace(' ', '')
            text, timestamp = line.rstrip('\n').split('\t')
            tokens = clear_post(text)
            for index in range(len(tokens)-1):
                # if there is a next token, join the current and the next one
                bigram = tokens[index] + " " + tokens[index + 1]
                #print(bigram)
                #print(f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
                if class_ == 'P':
                    bigram_counts['paranormal'][bigram] += 1
                    word_counts['paranormal'][tokens[index]] += 1
                elif class_ == 'S':
                    bigram_counts['sceptic'][bigram] += 1
                    word_counts['sceptic'][tokens[index]] += 1
    return bigram_counts, word_counts

def main():
    if len(sys.argv) != 4:
        print("syntax is ./train_bigram.py expected.tsv in.tsv model.pkl")
        return
    expected_file = str(sys.argv[1])
    in_file = str(sys.argv[2])
    model = str(sys.argv[3])
    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
    #bigrams_count = calc_bigram_count(in_file, expected_file)
    bigrams_count, words_count = launch_bigrams_and_words(in_file, expected_file)
    bigram_logprobs = calc_bigram_logprobs(bigrams_count)
    word_logprobs = calc_word_logprobs(words_count)
    total_sceptic_bigram = sum(bigrams_count['sceptic'].values()) + len(bigrams_count['sceptic'].keys())
    total_paranormal_bigram = sum(bigrams_count['paranormal'].values()) + len(bigrams_count['paranormal'].keys())
    total_sceptic_word = sum(words_count['sceptic'].values()) + len(words_count['sceptic'].keys())
    total_paranormal_word = sum(words_count['paranormal'].values()) + len(words_count['paranormal'].keys())
    with open(model, 'wb') as f:
        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs, word_logprobs, total_sceptic_bigram, total_paranormal_bigram, total_sceptic_word, total_paranormal_word], f)

main()
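
Both calc_word_logprobs and calc_bigram_logprobs in train_bigram.py turn raw counts into log-probabilities with add-one (Laplace) smoothing: each count is incremented by 1, and the denominator is the class's total count plus the number of distinct items seen in that class, so math.log is never called on zero. A tiny worked example of the same formula, with invented counts:

#!/usr/bin/python3
# Worked example of the add-one smoothing used in calc_word_logprobs.
# The counts below are invented for illustration.
import math

word_counts = {'sceptic': {'study': 3, 'evidence': 1},
               'paranormal': {'ghost': 4}}

for class_, counts in word_counts.items():
    # denominator: total tokens in the class + number of distinct tokens seen in it
    total = sum(counts.values()) + len(counts)
    for token, value in counts.items():
        word_prob = (value + 1) / total
        print(f"{class_}/{token}: logprob = {math.log(word_prob):.4f}")

Note that each class is normalized over its own observed vocabulary; textbook Laplace smoothing would divide by the total count plus the size of a single shared vocabulary, which keeps the two class distributions on the same support.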