paranormal-or-skeptic/solution.py

150 lines
5.5 KiB
Python
Raw Permalink Normal View History

2020-03-09 14:37:26 +01:00
import csv
2020-03-09 18:30:02 +01:00
from collections import defaultdict
import math
2020-03-29 21:03:04 +02:00
import pickle
import os
from pathlib import Path
2020-03-09 14:37:26 +01:00
2020-05-02 13:40:22 +02:00
def tokenize(text):
    """Normalize raw post text before it is split into tokens.

    Applies a fixed, ordered list of plain substring replacements:
    contractions are expanded, apostrophes/parentheses/slashes become
    spaces, doubled escaped newlines (a literal backslash-n pair repeated
    twice) and common punctuation are removed, and a handful of
    space-delimited stop words are collapsed to a single space.

    The order matters (e.g. "n't" must be handled before the bare
    apostrophe), so the table below must not be reordered.

    Returns the cleaned string; splitting and lower-casing are left to
    the caller.
    """
    substitutions = (
        # Contractions.
        ("n't", " not"),
        ("'s", " is"),
        ("'ve", " have"),
        # Characters turned into separators.
        ("'", " "),
        ("(", " "),
        (")", " "),
        ("/", " "),
        # Escaped blank lines and punctuation, dropped entirely.
        ("\\n\\n", ""),
        (".", ""),
        ("?", ""),
        (",", ""),
        ("!", ""),
        ('"', ''),
        # Stop words (only when surrounded by single spaces).
        (" a ", " "),
        (" on ", " "),
        (" the ", " "),
        (" of ", " "),
        (" an ", " "),
        (" to ", " "),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text
2020-03-29 21:03:04 +02:00
def calc_class_logprob(expected_path):
    """Estimate the prior probability P(c) of each class from a label file.

    Each line of ``expected_path`` is expected to hold one label: ``1``
    for paranormal or ``0`` for skeptic. Labels are normalized the same
    way ``calc_word_counts`` does it (trailing newline and spaces removed)
    and compared exactly, so a line such as ``10`` is ignored instead of
    being counted as paranormal by a substring match.

    Note that, despite the function name, the returned values are plain
    probabilities, not logarithms.

    Returns:
        A tuple ``(paranormal_class_prob, skeptic_class_prob)``.

    Raises:
        ValueError: if the file contains no ``0``/``1`` labels at all.
    """
    paranormal_class_count = 0
    skeptic_class_count = 0
    with open(expected_path) as labels_file:
        for line in labels_file:
            label = line.rstrip('\n').replace(" ", "")
            if label == '1':
                paranormal_class_count += 1
            elif label == '0':
                skeptic_class_count += 1

    total = paranormal_class_count + skeptic_class_count
    if total == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError(
            "no '0'/'1' class labels found in {!r}".format(expected_path)
        )
    return paranormal_class_count / total, skeptic_class_count / total
2020-03-09 18:30:02 +01:00
def calc_word_counts(in_path, expected_path):
    """Count token occurrences per class over a labelled corpus.

    ``in_path`` and ``expected_path`` are read in lockstep. Every input
    line must consist of exactly two tab-separated fields (text and
    timestamp); the matching label line selects the class: ``1`` for
    paranormal, ``0`` for skeptic. Lines with any other label contribute
    nothing.

    The text is passed through ``tokenize``, lower-cased and split on
    single spaces, so empty-string tokens produced by consecutive spaces
    are counted like any other token.

    Returns:
        ``{'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}``
        mapping each token to its number of occurrences in that class.

    Raises:
        ValueError: if an input line does not have exactly two
            tab-separated fields.
    """
    label_to_class = {'1': 'paranormal', '0': 'skeptic'}
    with open(in_path) as in_file, open(expected_path) as exp_file:
        word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
        for in_line, exp_line in zip(in_file, exp_file):
            label = exp_line.rstrip('\n').replace(" ", "")
            text, _timestamp = in_line.rstrip('\n').split('\t')
            tokens = tokenize(text).lower().split(' ')

            class_name = label_to_class.get(label)
            if class_name is None:
                # Unknown label: the line is parsed but not counted.
                continue

            counts = word_counts[class_name]
            for token in tokens:
                counts[token] += 1
    return word_counts
2020-03-29 21:03:04 +02:00
def calc_word_logprobs(word_counts):
    """Turn per-class token counts into add-one smoothed probabilities.

    For each class the denominator is the total number of token
    occurrences in that class plus the number of distinct tokens seen in
    that class; each token's value is ``(count + 1) / denominator``.

    Despite the function name, the stored values are plain probabilities,
    not logarithms.

    Args:
        word_counts: mapping with the keys ``'paranormal'`` and
            ``'skeptic'``, each a mapping of token -> count.

    Returns:
        ``{'paranormal': {token: prob}, 'skeptic': {token: prob}}``.
    """
    denominators = {
        'skeptic': sum(word_counts['skeptic'].values()) + len(word_counts['skeptic'].keys()),
        'paranormal': sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys()),
    }
    return {
        label: {
            token: (count + 1) / denominators[label]
            for token, count in word_counts[label].items()
        }
        for label in ('paranormal', 'skeptic')
    }
2020-03-09 18:30:02 +01:00
2020-03-29 21:03:04 +02:00
# Train on the training split: class priors first, then per-class smoothed
# word probabilities derived from the raw token counts. Despite the
# "logprob" names these hold plain probabilities; the logarithm is taken
# later, inside predict_post_class.
paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob(
    "train/expected.tsv"
)
word_counts = calc_word_counts("train/in.tsv", "train/expected.tsv")
word_logprobs = calc_word_logprobs(word_counts)
2020-05-02 13:40:22 +02:00
#print(word_logprobs['skeptic']["hair."]) #-12.166205308815476
2020-03-29 21:03:04 +02:00
# Next steps: 1. read a post, 2. split it into terms, 3. compute the probability of each term, 4. add them together, 5. compare the paranormal score with the skeptic score.
def get_test_posts(path):
    """Read the post texts from a two-column TSV file.

    Every line of ``path`` must contain exactly two tab-separated fields,
    the post text followed by a timestamp; only the text is kept.

    Returns:
        A list with one text string per line, in file order.

    Raises:
        ValueError: if a line does not have exactly two tab-separated
            fields.
    """
    with open(path) as handle:
        rows = (raw.rstrip('\n').split('\t') for raw in handle)
        # Unpacking into two names enforces the two-field line format.
        return [text for text, _timestamp in rows]
def predict_post_class(posts, sprob, pprob, word_logprobs):
    """Score each post against both classes and keep the higher score.

    For every post two running sums are started at ``log(sprob)`` and
    ``log(pprob)``. The post is normalized with ``tokenize``, lower-cased
    and split on single spaces; for every token the logarithm of a
    per-class token weight is added to the matching sum. The weight is the
    stored value for the token (if present in that class's table) plus a
    constant floor of ``1 / (2 * len(own table) + len(other table))``;
    tokens missing from the table get the floor alone.

    Note that the result holds the winning accumulated log score for each
    post, not a class label.

    Args:
        posts: iterable of raw post texts.
        sprob: prior probability of the skeptic class (must be > 0).
        pprob: prior probability of the paranormal class (must be > 0).
        word_logprobs: mapping with ``'skeptic'`` and ``'paranormal'``
            token -> value tables.

    Returns:
        A list with one float per post: the paranormal score when it is
        strictly greater than the skeptic score, otherwise the skeptic
        score.
    """
    winning_scores = []
    for post in posts:
        skeptic_score = math.log(sprob)
        paranormal_score = math.log(pprob)
        words = tokenize(post).lower().split(' ')

        # These are the same for every token of the post, so compute once.
        skeptic_table = word_logprobs['skeptic']
        paranormal_table = word_logprobs['paranormal']
        n_skeptic = len(skeptic_table)
        n_paranormal = len(paranormal_table)
        skeptic_floor = 1 / (n_skeptic + n_skeptic + n_paranormal)
        paranormal_floor = 1 / (n_paranormal + n_skeptic + n_paranormal)

        for word in words:
            # A missing token contributes 0 + floor, i.e. just the floor.
            skeptic_weight = skeptic_table.get(word, 0) + skeptic_floor
            paranormal_weight = paranormal_table.get(word, 0) + paranormal_floor
            skeptic_score += math.log(skeptic_weight)
            paranormal_score += math.log(paranormal_weight)

        winning_scores.append(
            paranormal_score if paranormal_score > skeptic_score else skeptic_score
        )
    return winning_scores
def predict_posts(path):
    """Score the posts of one data split and write the results to disk.

    Reads the posts from ``<path>/in.tsv``, scores them with
    ``predict_post_class`` using the module-level class priors and word
    table, and writes one row per post to ``<path>/out.tsv``. Each row
    holds the negated value returned by ``predict_post_class`` for that
    post.

    Args:
        path: directory of the split (e.g. ``"dev-0"``), without a
            trailing slash.
    """
    posts = get_test_posts(path + '/in.tsv')
    scores = predict_post_class(
        posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs
    )
    # newline='' is what the csv module requires of files it writes to:
    # the writer emits its own line terminator, and without this flag the
    # text layer would translate it again (yielding "\r\r\n" on Windows).
    with open(path + "/out.tsv", 'wt', newline='') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        tsv_writer.writerows([-score] for score in scores)
2020-03-29 21:03:04 +02:00
# Produce the output files for the development and test splits.
predict_posts("dev-0")
predict_posts("test-A")

# Report the fraction of dev-0 output lines that are textually identical to
# the corresponding line of the expected file.
with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
    line_matches = [
        out_line == exp_line for out_line, exp_line in zip(out_file, exp_file)
    ]
counter = len(line_matches)
positive = sum(line_matches)
print(positive/counter)