Compare commits

...

9 Commits

Author     SHA1        Message                Date
dylodylo   d13443a750  change output          2020-05-02 19:29:47 +02:00
dylodylo   9aea4283bd  change output          2020-05-02 19:27:04 +02:00
dylodylo   599d13bf16  change output          2020-05-02 16:26:34 +02:00
dylodylo   f0b5319f41  change output          2020-05-02 15:24:22 +02:00
dylodylo   fa68a0fe33  change output          2020-05-02 13:47:19 +02:00
dylodylo   dafa49e690  add tokenizer          2020-05-02 13:40:22 +02:00
dylodylo   744e5db758  naive-bayess solution  2020-03-29 21:22:20 +02:00
dylodylo   2a9ca866c9  naive-bayess solution  2020-03-29 21:03:04 +02:00
dylodylo   8fd7b62eef  naive-bayess solution  2020-03-29 20:58:56 +02:00
5 changed files with 15817 additions and 15738 deletions

File diff suppressed because it is too large.

Binary file not shown.

File diff suppressed because it is too large.


@@ -1,71 +1,150 @@
import csv
from collections import defaultdict
import math
import pickle
import os
from pathlib import Path

counter = 0
docs = []
with open('in.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        docs.append(row)
        counter += 1
print(counter)

pcounter = 0
scounter = 0
with open('expected.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        if row[0] == " P":
            pcounter += 1
        if row[0] == " S":
            scounter += 1
def tokenize(text):
    text = text.replace("n't", " not")
    text = text.replace("'s", " is")
    text = text.replace("'ve", " have")
    text = text.replace("'", " ")
    text = text.replace("(", " ")
    text = text.replace(")", " ")
    text = text.replace("/", " ")
    text = text.replace("\\n\\n", "")
    text = text.replace(".", "")
    text = text.replace("?", "")
    text = text.replace(",", "")
    text = text.replace("!", "")
    text = text.replace('"', '')
    text = text.replace(" a ", " ")
    text = text.replace(" on ", " ")
    text = text.replace(" the ", " ")
    text = text.replace(" of ", " ")
    text = text.replace(" an ", " ")
    text = text.replace(" to ", " ")
    #text = text.replace("a", "")
    return text
print(pcounter)
print(scounter)
print("P(S) = " + str((scounter + 1) / (counter + 2)))
print("P(P) = " + str((pcounter + 1) / (counter + 2)))
def calc_class_logprob(expected_path):  # count the overall prior probability for each class (P(c))
    paranoarmal_class_count = 0
    skeptic_class_count = 0
    with open(expected_path) as f:
        for line in f:
            if "1" in line:
                paranoarmal_class_count += 1
            elif "0" in line:
                skeptic_class_count += 1
    paranormal_class_prob = paranoarmal_class_count / (paranoarmal_class_count + skeptic_class_count)
    skeptic_class_prob = skeptic_class_count / (paranoarmal_class_count + skeptic_class_count)
    return paranormal_class_prob, skeptic_class_prob
def calc_word_counts(in_path, expected_path):
    with open(in_path) as in_file, open(expected_path) as exp_file:
        word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
        for in_line, exp_line in zip(in_file, exp_file):
            class_ = exp_line.rstrip('\n').replace(" ", "")
            text, timestamp = in_line.rstrip('\n').split('\t')
            text = tokenize(text)
            tokens = text.lower().split(' ')
            for token in tokens:
                if class_ == '1':
                    word_counts['paranormal'][token] += 1
                elif class_ == '0':
                    word_counts['skeptic'][token] += 1
        return word_counts
def calc_word_logprobs(word_counts):
    total_skeptic = sum(word_counts['skeptic'].values()) + len(word_counts['skeptic'].keys())
    total_paranormal = sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys())
    word_logprobs = {'paranormal': {}, 'skeptic': {}}
    for class_ in word_logprobs.keys():
        for token, value in word_counts[class_].items():
            if class_ == 'skeptic':
                word_prob = (value + 1) / total_skeptic
            else:
                word_prob = (value + 1) / total_paranormal
            word_logprobs[class_][token] = word_prob
    return word_logprobs
paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")
word_counts = calc_word_counts('train/in.tsv', 'train/expected.tsv')
word_logprobs = calc_word_logprobs(word_counts)
#print(word_logprobs['skeptic']["hair."]) #-12.166205308815476
# now: 1. take a post, 2. split it into terms, 3. compute the probability of each term,
# 4. add them up, 5. compare paranormal with skeptic
def get_test_posts(path):
    posts = []
    with open(path) as f:
        for line in f:
            text, timestamp = line.rstrip('\n').split('\t')
            posts.append(text)
    return posts

# with open('prediction.tsv', 'wt') as tsvfile:
#     tsv_writer = csv.writer(tsvfile, delimiter='\t')
#     for i in range(counter):
#         tsv_writer.writerow('S')
def predict_post_class(posts, sprob, pprob, word_logprobs):
    out_classes = []
    for post in posts:
        total_s_prob = math.log(sprob)
        total_p_prob = math.log(pprob)
        post = tokenize(post)
        tokens = post.lower().split(' ')
        for token in tokens:
            # for skeptic
            if token in word_logprobs['skeptic'].keys():
                sceptic_prob = word_logprobs['skeptic'][token] + 1 / (len(word_logprobs['skeptic']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))
            else:
                sceptic_prob = 1 / (len(word_logprobs['skeptic']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))
            # for paranormal
            if token in word_logprobs['paranormal'].keys():
                paranormal_prob = word_logprobs['paranormal'][token] + 1 / (len(word_logprobs['paranormal']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))
            else:
                paranormal_prob = 1 / (len(word_logprobs['paranormal']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))
            total_s_prob += math.log(sceptic_prob)
            total_p_prob += math.log(paranormal_prob)
        #print(total_p_prob)
        #print(total_s_prob)
        if total_p_prob > total_s_prob:
            out_classes.append(total_p_prob)
        else:
            out_classes.append(total_s_prob)
    return out_classes
def predict_posts(path):
    posts = get_test_posts(path + '/in.tsv')
    classes = predict_post_class(posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs)
    with open(path + "/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        # for i in classes:
        #     tsv_writer.writerow(i)
        tsv_writer.writerows(map(lambda x: [-x], classes))

predict_posts("dev-0")
predict_posts("test-A")
with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
counter = 0
positive = 0
for out_line, exp_line in zip(out_file, exp_file):
counter+=1
if out_line == exp_line:
positive += 1
print(positive/counter)
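
For reference, a minimal sketch of the decision rule the script above applies: score each class as log P(c) plus the sum of log P(t|c) over the post's tokens, with add-one smoothing, and pick the higher score. The helper name nb_log_score and the toy counts below are illustrative assumptions, not part of the repository.

import math
from collections import defaultdict

def nb_log_score(tokens, class_prior, token_counts, vocab_size):
    # log P(c) + sum over tokens of log P(t|c), with add-one (Laplace) smoothing
    total = sum(token_counts.values()) + vocab_size
    score = math.log(class_prior)
    for t in tokens:
        score += math.log((token_counts[t] + 1) / total)
    return score

# Toy usage: pick whichever class scores higher for a tokenized post.
counts_p = defaultdict(int, {"ghost": 3, "sighting": 2})
counts_s = defaultdict(int, {"evidence": 4, "hoax": 1})
vocab = len(set(counts_p) | set(counts_s))
post = ["ghost", "evidence"]
label = "P" if nb_log_score(post, 0.5, counts_p, vocab) > nb_log_score(post, 0.5, counts_s, vocab) else "S"
print(label)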

File diff suppressed because it is too large.