Begin lin reg

2020-04-04 22:07:48 +02:00 · 2020-04-04 22:07:48 +02:00 · 0839c5ca41
commit 0839c5ca41
parent 4b926f648f
3 changed files with 228 additions and 132 deletions
--- a/predict_bigram.py
+++ b/predict_bigram.py
--- a/train.py
+++ b/train.py
@ -1,33 +1,10 @@
 #!/usr/bin/python3
-from collections import defaultdict
+import re, sys, pickle, nltk, math, random
 import math
 import pickle
 import re
 import sys
 import nltk
 from nltk.corpus import stopwords
 def calc_class_logprob(expected_path):
    paranormal_classcount = 0
    sceptic_classcount = 0
    with open(expected_path) as f:
        for line in f:
            line = line.rstrip('\n').replace(' ','')
            if 'P' in line:
                paranormal_classcount +=1
            elif 'S' in line:
                sceptic_classcount +=1
    paranol_prob = paranormal_classcount / (paranormal_classcount + sceptic_classcount)
    sceptic_prob = sceptic_classcount / (paranormal_classcount + sceptic_classcount)
    return math.log(paranol_prob), math.log(sceptic_prob)
 def clear_post(post):
    post = post.replace('\\n', ' ')
    post = post.lower()
    # delete links
    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
@ -40,118 +17,80 @@ def clear_post(post):
    post_no_stop = [w for w in post if not w in stop_words]
    return post_no_stop
-#def calc_bigram_count(in_path, expected_path):
+# czy słowa musza byc setem?
-#    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
+def create_vocabulary_and_documents(in_file, expected_file):
-#    with open(in_path) as infile, open(expected_path) as expected_file:
+    vocabulary = set()
-#        num_of_bigams = 0
+    posts = {}
-#        for line, exp in zip(infile, expected_file):
+    with open(in_file) as in_f, open(expected_file) as exp_f:
-#            class_ = exp.rstrip('\n').replace(' ', '')
+        for line, exp in zip(in_f, exp_f):
 #            text, timestap = line.rstrip('\n').split('\t')
 #            tokens = clear_post(text)
 #            #tokens = text.lower().split(' ')
 #            for index in range(len(tokens)-1):
 #                # if there is next token we append current and next
 #                bigram = tokens[index] + " " + tokens[index + 1]
 #                #print(bigram)
 #                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
 #                if class_ == 'P':
 #                    bigram_counts['paranormal'][bigram] +=1
 #                elif class_ == 'S':
 #                    bigram_counts['sceptic'][bigram] +=1
 #                num_of_bigams +=1
 #    #print(f"num of every added bigams with repetitions {num_of_bigams})")
 #    #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
 #    return bigram_counts
 def calc_bigram_logprobs(bigram_counts):
    total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
    total_paranormal = sum(bigram_counts['paranormal'].values()) + len(bigram_counts['paranormal'].keys())
    bigram_logprobs = {'paranormal' : {}, 'sceptic' : {}}
    for class_ in bigram_counts.keys():
        for bigram, value in bigram_counts[class_].items():
            if class_ == "sceptic":
                bigram_prob = (value + 1) / total_sceptic
            elif class_ == "paranormal":
                bigram_prob = (value + 1) / total_paranormal
            bigram_logprobs[class_][bigram] = math.log(bigram_prob)
    return bigram_logprobs
 #def calc_word_count(in_path, expected_path):
 #    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)} # dzienik zawierajacy slownik w ktorym s slowa i ile razy wystepuja
 #    with open(in_path) as infile, open(expected_path)  as expectedfile:
 #        for line, exp in zip(infile, expectedfile):
 #            class_ = exp.rstrip('\n').replace(' ','')
 #            text, timestap =line.rstrip('\n').split('\t')
 #            #print(f"text  {type(text)}")
 #            text = clear_tokens(text, True)
 #            tokens = text.lower().split(' ')
 #            #print(f"tokens {type(tokens)}")
 #            for token in tokens:
 #                clear_tokens(token,False)
 #                if class_ == 'P':
 #                    word_counts['paranormal'][token] += 1
 #                elif class_ == 'S':
 #                    word_counts['sceptic'][token]+=1
 #
 #    return word_counts
 def calc_word_logprobs(word_counts):
    total_skeptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys())
    total_paranormal = sum(word_counts['paranormal'].values())+ len(word_counts['paranormal'].keys())
    word_logprobs= {'paranormal': {}, 'sceptic': {}}
    for class_ in word_counts.keys(): # sceptic paranormal
        for token, value in word_counts[class_].items():
            if class_ == 'sceptic':
                word_prob = (value +1)/ total_skeptic
            elif class_ == 'paranormal':
                word_prob = (value+1)/ total_paranormal
            #print (token)
            word_logprobs[class_][token] = math.log(word_prob)
    return word_logprobs
 def launch_bigrams_and_words(in_path, expected_path):
    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)}
    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
    with open(in_path) as infile, open(expected_path) as expected_file:
        for line, exp in zip(infile, expected_file):
            class_ = exp.rstrip('\n').replace(' ', '')
            text, timestap = line.rstrip('\n').split('\t')
-            tokens = clear_post(text)
+            post = clear_post(text)
-            for index in range(len(tokens)-1):
+            posts[" ".join(post)] = int(exp)
-                # if there is next token we append current and next
+            for word in post:
-                bigram = tokens[index] + " " + tokens[index + 1]
+                vocabulary.add(word)
-                #print(bigram)
+    return vocabulary, posts
                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
                if class_ == 'P':
                    bigram_counts['paranormal'][bigram] +=1
                    word_counts['paranormal'][tokens[index]] +=1
                elif class_ == 'S':
                    bigram_counts['sceptic'][bigram] +=1
                    word_counts['sceptic'][tokens[index]] +=1
-    return bigram_counts, word_counts
+def create_mappings(vocabulary):
    word_to_index_mapping = {}
    index_to_word_mapping = {}
    xi = 1
    for word in vocabulary:
        word_to_index_mapping[word] = xi
        index_to_word_mapping[xi] = word
        xi += 1
    return word_to_index_mapping, index_to_word_mapping
 def main():
    if len(sys.argv) != 4:
-        print("syntax is ./train.py expected.tsv in.tsv model.pkl")
+        print("syntax ./train.py model expected_file in_file")
        return
-    expected_file = str(sys.argv[1])
+    model = str(sys.argv[1])
-    in_file = str(sys.argv[2])
+    expected_file = str(sys.argv[2])
-    model = str(sys.argv[3])
+    in_file = str(sys.argv[3])
-    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
+    vocabulary, posts = create_vocabulary_and_documents(in_file, expected_file)
-    #bigrams_count = calc_bigram_count(in_file, expected_file)
+    word_to_index_mapping, index_to_word_mapping = create_mappings(vocabulary)
    bigrams_count, words_count = launch_bigrams_and_words(in_file, expected_file)
    bigram_logprobs = calc_bigram_logprobs(bigrams_count)
    word_logprobs = calc_word_logprobs(words_count)
    total_sceptic_bigram = sum(bigrams_count['sceptic'].values()) + len(bigrams_count['sceptic'].keys())
    total_paranormal_bigram = sum(bigrams_count['paranormal'].values()) + len(bigrams_count['paranormal'].keys())
    total_sceptic_word = sum(words_count['sceptic'].values()) + len(words_count['sceptic'].keys())
    total_paranormal_word = sum(words_count['paranormal'].values())+ len(words_count['paranormal'].keys())
    with open(model, 'wb') as f:
        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs, word_logprobs, total_sceptic_bigram, total_paranormal_bigram, total_sceptic_word, total_paranormal_word],f)
 main()
    weights = []
    for xi in range(0, len(vocabulary) + 1):
        weights.append(random.uniform(-0.01,0.01))
    learning_rate = 0.000001
    loss_sum = 0.0
    loss_sum_counter = 0
    lowest_loss_sum_weights = []
    lowest_loss_sum = 10000.0
    print(f"len of vocabulary {len(vocabulary)}")
    # mozna ustawić na bardzo bardzo duzo
    while True: #loss_sum_counter != 10:
        try:
            d, y = random.choice(list(posts.items()))
            y_hat = weights[0]
            tokens = d.split(' ')
            for word in tokens:
                # mozna tez cos pomyslec z count aby lepiej dzialalo
                #print(f"{d.count(word)} : {word}")
                y_hat += weights[word_to_index_mapping[word]] * tokens.count(word)
            loss = (y_hat - y)**2
            loss_sum += loss
            delta = (y_hat - y) * learning_rate
            if loss_sum_counter % 100 == 0:
                print(f"{loss_sum /1000} : {loss_sum_counter} : {y_hat} : {delta}")
                loss_sum_counter = 0
                loss_sum = 0
            weights[0] -= delta
            for word in tokens:
                weights[word_to_index_mapping[word]] -= tokens.count(word) * delta
            if lowest_loss_sum > loss_sum and loss_sum != 0:
                print("it happened")
                lowest_loss_sum = loss_sum
                lowest_loss_sum_weights = weights
            loss_sum_counter +=1
        except KeyboardInterrupt:
            break
    print(lowest_loss_sum_weights)
 main()
--- a/train_bigram.py
+++ b/train_bigram.py
@ -0,0 +1,157 @@
 #!/usr/bin/python3
 from collections import defaultdict
 import math
 import pickle
 import re
 import sys
 import nltk
 from nltk.corpus import stopwords
 def calc_class_logprob(expected_path):
    paranormal_classcount = 0
    sceptic_classcount = 0
    with open(expected_path) as f:
        for line in f:
            line = line.rstrip('\n').replace(' ','')
            if 'P' in line:
                paranormal_classcount +=1
            elif 'S' in line:
                sceptic_classcount +=1
    paranol_prob = paranormal_classcount / (paranormal_classcount + sceptic_classcount)
    sceptic_prob = sceptic_classcount / (paranormal_classcount + sceptic_classcount)
    return math.log(paranol_prob), math.log(sceptic_prob)
 def clear_post(post):
    post = post.replace('\\n', ' ')
    post = post.lower()
    # delete links
    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\–\”\!\=\^]+', '', post)
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    post = re.sub(r' +', ' ', post)
    post = post.rstrip(' ')
    post = post.split(' ')
    stop_words = set(stopwords.words('english'))
    post_no_stop = [w for w in post if not w in stop_words]
    return post_no_stop
 #def calc_bigram_count(in_path, expected_path):
 #    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
 #    with open(in_path) as infile, open(expected_path) as expected_file:
 #        num_of_bigams = 0
 #        for line, exp in zip(infile, expected_file):
 #            class_ = exp.rstrip('\n').replace(' ', '')
 #            text, timestap = line.rstrip('\n').split('\t')
 #            tokens = clear_post(text)
 #            #tokens = text.lower().split(' ')
 #            for index in range(len(tokens)-1):
 #                # if there is next token we append current and next
 #                bigram = tokens[index] + " " + tokens[index + 1]
 #                #print(bigram)
 #                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
 #                if class_ == 'P':
 #                    bigram_counts['paranormal'][bigram] +=1
 #                elif class_ == 'S':
 #                    bigram_counts['sceptic'][bigram] +=1
 #                num_of_bigams +=1
 #    #print(f"num of every added bigams with repetitions {num_of_bigams})")
 #    #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
 #    return bigram_counts
 def calc_bigram_logprobs(bigram_counts):
    total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
    total_paranormal = sum(bigram_counts['paranormal'].values()) + len(bigram_counts['paranormal'].keys())
    bigram_logprobs = {'paranormal' : {}, 'sceptic' : {}}
    for class_ in bigram_counts.keys():
        for bigram, value in bigram_counts[class_].items():
            if class_ == "sceptic":
                bigram_prob = (value + 1) / total_sceptic
            elif class_ == "paranormal":
                bigram_prob = (value + 1) / total_paranormal
            bigram_logprobs[class_][bigram] = math.log(bigram_prob)
    return bigram_logprobs
 #def calc_word_count(in_path, expected_path):
 #    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)} # dzienik zawierajacy slownik w ktorym s slowa i ile razy wystepuja
 #    with open(in_path) as infile, open(expected_path)  as expectedfile:
 #        for line, exp in zip(infile, expectedfile):
 #            class_ = exp.rstrip('\n').replace(' ','')
 #            text, timestap =line.rstrip('\n').split('\t')
 #            #print(f"text  {type(text)}")
 #            text = clear_tokens(text, True)
 #            tokens = text.lower().split(' ')
 #            #print(f"tokens {type(tokens)}")
 #            for token in tokens:
 #                clear_tokens(token,False)
 #                if class_ == 'P':
 #                    word_counts['paranormal'][token] += 1
 #                elif class_ == 'S':
 #                    word_counts['sceptic'][token]+=1
 #
 #    return word_counts
 def calc_word_logprobs(word_counts):
    total_skeptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys())
    total_paranormal = sum(word_counts['paranormal'].values())+ len(word_counts['paranormal'].keys())
    word_logprobs= {'paranormal': {}, 'sceptic': {}}
    for class_ in word_counts.keys(): # sceptic paranormal
        for token, value in word_counts[class_].items():
            if class_ == 'sceptic':
                word_prob = (value +1)/ total_skeptic
            elif class_ == 'paranormal':
                word_prob = (value+1)/ total_paranormal
            #print (token)
            word_logprobs[class_][token] = math.log(word_prob)
    return word_logprobs
 def launch_bigrams_and_words(in_path, expected_path):
    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)}
    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
    with open(in_path) as infile, open(expected_path) as expected_file:
        for line, exp in zip(infile, expected_file):
            class_ = exp.rstrip('\n').replace(' ', '')
            text, timestap = line.rstrip('\n').split('\t')
            tokens = clear_post(text)
            for index in range(len(tokens)-1):
                # if there is next token we append current and next
                bigram = tokens[index] + " " + tokens[index + 1]
                #print(bigram)
                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
                if class_ == 'P':
                    bigram_counts['paranormal'][bigram] +=1
                    word_counts['paranormal'][tokens[index]] +=1
                elif class_ == 'S':
                    bigram_counts['sceptic'][bigram] +=1
                    word_counts['sceptic'][tokens[index]] +=1
    return bigram_counts, word_counts
 def main():
    if len(sys.argv) != 4:
        print("syntax is ./train.py expected.tsv in.tsv model.pkl")
        return
    expected_file = str(sys.argv[1])
    in_file = str(sys.argv[2])
    model = str(sys.argv[3])
    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
    #bigrams_count = calc_bigram_count(in_file, expected_file)
    bigrams_count, words_count = launch_bigrams_and_words(in_file, expected_file)
    bigram_logprobs = calc_bigram_logprobs(bigrams_count)
    word_logprobs = calc_word_logprobs(words_count)
    total_sceptic_bigram = sum(bigrams_count['sceptic'].values()) + len(bigrams_count['sceptic'].keys())
    total_paranormal_bigram = sum(bigrams_count['paranormal'].values()) + len(bigrams_count['paranormal'].keys())
    total_sceptic_word = sum(words_count['sceptic'].values()) + len(words_count['sceptic'].keys())
    total_paranormal_word = sum(words_count['paranormal'].values())+ len(words_count['paranormal'].keys())
    with open(model, 'wb') as f:
        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs, word_logprobs, total_sceptic_bigram, total_paranormal_bigram, total_sceptic_word, total_paranormal_word],f)
 main()