Compare commits

...

10 Commits
master ... w2v

Author           SHA1         Message                Date
Artur Dylewski   4a86e3878e   add w2v                2020-06-03 18:01:38 +02:00
dylodylo         d13443a750   change output          2020-05-02 19:29:47 +02:00
dylodylo         9aea4283bd   change output          2020-05-02 19:27:04 +02:00
dylodylo         599d13bf16   change output          2020-05-02 16:26:34 +02:00
dylodylo         f0b5319f41   change output          2020-05-02 15:24:22 +02:00
dylodylo         fa68a0fe33   change output          2020-05-02 13:47:19 +02:00
dylodylo         dafa49e690   add tokenizer          2020-05-02 13:40:22 +02:00
dylodylo         744e5db758   naive-bayess solution  2020-03-29 21:22:20 +02:00
dylodylo         2a9ca866c9   naive-bayess solution  2020-03-29 21:03:04 +02:00
dylodylo         8fd7b62eef   naive-bayess solution  2020-03-29 20:58:56 +02:00
5 changed files with 10646 additions and 10466 deletions

Binary file not shown.

File diff suppressed because it is too large.


@@ -1,71 +1,150 @@
import csv
from collections import defaultdict
import math
import pickle
import os
from pathlib import Path

counter = 0
docs = []
with open('in.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        docs.append(row)
        counter += 1
print(counter)

pcounter = 0
scounter = 0
with open('expected.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        if row[0] == " P":
            pcounter += 1
        if row[0] == " S":
            scounter += 1
def tokenize(text):
    # expand common contractions, strip punctuation and a few frequent stop words
    text = text.replace("n't", " not")
    text = text.replace("'s", " is")
    text = text.replace("'ve", " have")
    text = text.replace("'", " ")
    text = text.replace("(", " ")
    text = text.replace(")", " ")
    text = text.replace("/", " ")
    text = text.replace("\\n\\n", "")
    text = text.replace(".", "")
    text = text.replace("?", "")
    text = text.replace(",", "")
    text = text.replace("!", "")
    text = text.replace('"', '')
    text = text.replace(" a ", " ")
    text = text.replace(" on ", " ")
    text = text.replace(" the ", " ")
    text = text.replace(" of ", " ")
    text = text.replace(" an ", " ")
    text = text.replace(" to ", " ")
    #text = text.replace("a", "")
    return text

print(pcounter)
print(scounter)
print("P(S) = " + str((scounter + 1) / (counter + 2)))
print("P(P) = " + str((pcounter + 1) / (counter + 2)))
def calc_class_logprob(expected_path):  # overall probability for each class, P(c)
    paranormal_class_count = 0
    skeptic_class_count = 0
    with open(expected_path) as f:
        for line in f:
            if "1" in line:
                paranormal_class_count += 1
            elif "0" in line:
                skeptic_class_count += 1
    paranormal_class_prob = paranormal_class_count / (paranormal_class_count + skeptic_class_count)
    skeptic_class_prob = skeptic_class_count / (paranormal_class_count + skeptic_class_count)
    return paranormal_class_prob, skeptic_class_prob

def calc_word_counts(in_path, expected_path):
    with open(in_path) as in_file, open(expected_path) as exp_file:
        word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
        for in_line, exp_line in zip(in_file, exp_file):
            class_ = exp_line.rstrip('\n').replace(" ", "")
            text, timestamp = in_line.rstrip('\n').split('\t')
            text = tokenize(text)
            tokens = text.lower().split(' ')
            for token in tokens:
                if class_ == '1':
                    word_counts['paranormal'][token] += 1
                elif class_ == '0':
                    word_counts['skeptic'][token] += 1
    return word_counts

def calc_word_logprobs(word_counts):
    total_skeptic = sum(word_counts['skeptic'].values()) + len(word_counts['skeptic'].keys())
    total_paranormal = sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys())
    word_logprobs = {'paranormal': {}, 'skeptic': {}}
    for class_ in word_logprobs.keys():
        for token, value in word_counts[class_].items():
            if class_ == 'skeptic':
                word_prob = (value + 1) / total_skeptic
            else:
                word_prob = (value + 1) / total_paranormal
            word_logprobs[class_][token] = word_prob
    return word_logprobs

paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")
word_counts = calc_word_counts('train/in.tsv', 'train/expected.tsv')
word_logprobs = calc_word_logprobs(word_counts)
#print(word_logprobs['skeptic']["hair."]) #-12.166205308815476
# next: 1. read a post, 2. split it into terms, 3. compute each term's probability,
# 4. add them up, 5. compare the paranormal score with the skeptic score
def get_test_posts(path):
    posts = []
    with open(path) as f:
        for line in f:
            text, timestamp = line.rstrip('\n').split('\t')
            posts.append(text)
    return posts

# with open('prediction.tsv', 'wt') as tsvfile:
#     tsv_writer = csv.writer(tsvfile, delimiter='\t')
#     for i in range(counter):
#         tsv_writer.writerow('S')

def predict_post_class(posts, sprob, pprob, word_logprobs):
    out_classes = []
    for post in posts:
        total_s_prob = math.log(sprob)
        total_p_prob = math.log(pprob)
        post = tokenize(post)
        tokens = post.lower().split(' ')
        for token in tokens:
            # skeptic
            if token in word_logprobs['skeptic'].keys():
                sceptic_prob = word_logprobs['skeptic'][token] + 1 / (len(word_logprobs['skeptic']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))
            else:
                sceptic_prob = 1 / (len(word_logprobs['skeptic']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))
            # paranormal
            if token in word_logprobs['paranormal'].keys():
                paranormal_prob = word_logprobs['paranormal'][token] + 1 / (len(word_logprobs['paranormal']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))
            else:
                paranormal_prob = 1 / (len(word_logprobs['paranormal']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))
            total_s_prob += math.log(sceptic_prob)
            total_p_prob += math.log(paranormal_prob)
        #print(total_p_prob)
        #print(total_s_prob)
        if total_p_prob > total_s_prob:
            out_classes.append(total_p_prob)
        else:
            out_classes.append(total_s_prob)
    return out_classes

def predict_posts(path):
    posts = get_test_posts(path + '/in.tsv')
    classes = predict_post_class(posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs)
    with open(path + "/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        # for i in classes:
        #     tsv_writer.writerow(i)
        tsv_writer.writerows(map(lambda x: [-x], classes))

predict_posts("dev-0")
predict_posts("test-A")

with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
    counter = 0
    positive = 0
    for out_line, exp_line in zip(out_file, exp_file):
        counter += 1
        if out_line == exp_line:
            positive += 1
    print(positive / counter)
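Side note (not part of the diff): calc_word_logprobs and predict_post_class above fold add-one (Laplace) smoothing into the per-token probabilities. A minimal, self-contained sketch of that scoring rule, with made-up class names and counts purely for illustration, could look like this:

import math
from collections import defaultdict

def naive_bayes_score(tokens, class_prior, token_counts, vocab_size):
    # log P(c) + sum of log P(token | c), with add-one smoothing over the class vocabulary
    total = sum(token_counts.values()) + vocab_size
    score = math.log(class_prior)
    for token in tokens:
        score += math.log((token_counts[token] + 1) / total)
    return score

# toy usage with invented counts
counts = {
    'paranormal': defaultdict(int, {'ghost': 3, 'story': 1}),
    'skeptic': defaultdict(int, {'evidence': 4, 'story': 2}),
}
vocab = {t for c in counts.values() for t in c}
scores = {c: naive_bayes_score(['ghost', 'story'], 0.5, counts[c], len(vocab)) for c in counts}
print(max(scores, key=scores.get))  # prints the class with the higher log-probability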

File diff suppressed because it is too large.

101
w2v.py Normal file

@@ -0,0 +1,101 @@
import pandas as pd  # our main data management package
import string  # used for preprocessing
import re  # used for preprocessing
import nltk  # the Natural Language Toolkit, used for preprocessing
import numpy as np  # used for managing NaNs
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  # used for preprocessing
from nltk.stem import WordNetLemmatizer  # used for preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression  # our model
from sklearn.model_selection import train_test_split
import gensim.models

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# remove mentions, URLs and non-alphanumeric characters
def remove_urls(text):
    new_text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split())
    return new_text

# make all text lowercase
def text_lowercase(text):
    return text.lower()

# remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

# remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# tokenize
def tokenize(text):
    text = word_tokenize(text)
    return text

# remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    text = [i for i in text if not i in stop_words]
    return text

# lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    text = [lemmatizer.lemmatize(token) for token in text]
    return text

def preprocessing(text):
    text = text_lowercase(text)
    text = remove_urls(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    #text = ' '.join(text)
    return text

def make_posts_list(in_file):
    posts = []
    with open(in_file) as f:
        for line in f:
            post = line.split('\t')[0]
            posts.append(preprocessing(post))
    return posts

def make_exp_list(exp_file):
    exp_list = []
    with open(exp_file) as f:
        for exp_line in f:
            y = int(exp_line)
            exp_list.append(y)
    return exp_list
tokens = make_posts_list("train/in.tsv")
Y = make_exp_list("train/out.tsv")

model = gensim.models.Word2Vec(tokens, size=300, min_count=1, workers=4)
print("\n Training the word2vec model...\n")
# reducing the epochs will decrease the computation time
model.train(tokens, total_examples=len(tokens), epochs=4000)

clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(model.wv.syn0, Y)

tokens = make_posts_list("dev-0/in.tsv")
# Prediction of the first 15 samples of all features
predict = clf.predict(model.wv.syn0[:15, :])
# Calculating the score of the predictions
score = clf.score(model.wv.syn0, Y)
print("\nPrediction word2vec : \n", predict)
print("Score word2vec : \n", score)