Compare commits
9 Commits
master
...
naive-baye
Author | SHA1 | Date | |
---|---|---|---|
|
d13443a750 | ||
|
9aea4283bd | ||
|
599d13bf16 | ||
|
f0b5319f41 | ||
|
fa68a0fe33 | ||
|
dafa49e690 | ||
|
744e5db758 | ||
|
2a9ca866c9 | ||
|
8fd7b62eef |
10544
dev-0/expected.tsv
10544
dev-0/expected.tsv
File diff suppressed because it is too large
Load Diff
BIN
dev-0/in.tsv.xz
BIN
dev-0/in.tsv.xz
Binary file not shown.
10544
dev-0/out.tsv
10544
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
163
solution.py
163
solution.py
@ -1,71 +1,150 @@
|
|||||||
import csv
|
import csv
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import math
|
import math
|
||||||
|
import pickle
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
counter = 0
|
|
||||||
docs = []
|
|
||||||
with open('in.tsv') as tsvfile:
|
|
||||||
reader = csv.reader(tsvfile, delimiter='\t')
|
|
||||||
for row in reader:
|
|
||||||
docs.append(row)
|
|
||||||
counter+=1
|
|
||||||
|
|
||||||
print(counter)
|
def tokenize(text):
    """Normalize a raw post for bag-of-words counting.

    Expands a few contractions, strips punctuation, and removes a small
    set of stop words.  The rules run in a fixed order and that order
    matters: e.g. "'s" must be expanded before the bare "'" pass wipes
    the apostrophes out.
    """
    # (pattern, replacement) pairs, applied sequentially via str.replace.
    rules = [
        ("n't", " not"),
        ("'s", " is"),
        ("'ve", " have"),
        ("'", " "),
        ("(", " "),
        (")", " "),
        ("/", " "),
        ("\\n\\n", ""),  # literal backslash-n pairs left in the raw text
        (".", ""),
        ("?", ""),
        (",", ""),
        ("!", ""),
        ('"', ''),
        (" a ", " "),
        (" on ", " "),
        (" the ", " "),
        (" of ", " "),
        (" an ", " "),
        (" to ", " "),
    ]
    for pattern, replacement in rules:
        text = text.replace(pattern, replacement)
    return text
|
||||||
|
|
||||||
print(pcounter)
|
|
||||||
print(scounter)
|
|
||||||
|
|
||||||
print("P(S) = " + str(scounter+1/counter+2))
|
|
||||||
print("P(P) = " + str(pcounter+1/counter+2))
|
|
||||||
|
|
||||||
def calc_class_logprob(expected_path):
    """Estimate the prior probability P(c) of each class from the labels file.

    A label line containing "1" counts as paranormal, otherwise a line
    containing "0" counts as skeptic; lines with neither are ignored.

    NOTE(review): despite the name, this returns plain probabilities
    (paranormal_prob, skeptic_prob) - callers apply math.log themselves.
    Raises ZeroDivisionError on an empty/label-free file, as before.
    """
    counts = {"paranormal": 0, "skeptic": 0}
    with open(expected_path) as labels:
        for label_line in labels:
            if "1" in label_line:
                counts["paranormal"] += 1
            elif "0" in label_line:
                counts["skeptic"] += 1
    total = counts["paranormal"] + counts["skeptic"]
    return counts["paranormal"] / total, counts["skeptic"] / total
|
||||||
|
|
||||||
def calc_word_counts(in_path, expected_path):
    """Count token occurrences per class over the training corpus.

    Reads the input texts and their labels in lockstep and returns
    {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
    mapping lower-cased token -> occurrence count.  Each input line is
    expected to be "text<TAB>timestamp"; anything else raises, as before.
    """
    word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
    with open(in_path) as in_file, open(expected_path) as exp_file:
        for in_line, exp_line in zip(in_file, exp_file):
            label = exp_line.rstrip('\n').replace(" ", "")
            text, _timestamp = in_line.rstrip('\n').split('\t')
            for token in tokenize(text).lower().split(' '):
                if label == '1':
                    word_counts['paranormal'][token] += 1
                elif label == '0':
                    word_counts['skeptic'][token] += 1
    return word_counts
|
||||||
|
|
||||||
|
|
||||||
def calc_word_logprobs(word_counts):
    """Turn per-class token counts into add-one-smoothed probabilities.

    Denominator per class = total token count + vocabulary size of that
    class; each token's value is (count + 1) / denominator.

    NOTE(review): despite the name the returned values are plain
    probabilities, not logs - the prediction step applies math.log.
    Only tokens seen in a class appear in that class's dict.
    """
    denominators = {
        'skeptic': sum(word_counts['skeptic'].values()) + len(word_counts['skeptic']),
        'paranormal': sum(word_counts['paranormal'].values()) + len(word_counts['paranormal']),
    }
    word_logprobs = {'paranormal': {}, 'skeptic': {}}
    for class_, probs in word_logprobs.items():
        denom = denominators[class_]
        for token, count in word_counts[class_].items():
            probs[token] = (count + 1) / denom
    return word_logprobs
|
||||||
|
|
||||||
|
# --- Training phase: fit the naive Bayes statistics on train/ ---

# Class priors (plain probabilities, despite the function's name).
paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")

# Per-class token counts over the training corpus.
word_counts = calc_word_counts('train/in.tsv', 'train/expected.tsv')

# Add-one-smoothed per-class token probabilities.
word_logprobs = calc_word_logprobs(word_counts)

# Remaining steps: 1. fetch a post  2. split it into terms  3. score each
# term  4. sum the scores  5. compare the paranormal vs skeptic totals.
|
||||||
|
|
||||||
|
def get_test_posts(path):
    """Read a TSV of "text<TAB>timestamp" rows and return the texts.

    Raises ValueError on a line without exactly one tab, as before.
    """
    posts = []
    with open(path) as infile:
        for raw_line in infile:
            text, _timestamp = raw_line.rstrip('\n').split('\t')
            posts.append(text)
    return posts
|
||||||
|
|
||||||
|
|
||||||
def predict_post_class(posts, sprob, pprob, word_logprobs):
    """Score every post against both classes with naive Bayes.

    sprob / pprob are the class priors (plain probabilities); each
    post's score starts at log(prior) and accumulates log of the
    per-token smoothed probability.

    NOTE(review): the larger *score* (a log-probability) is appended to
    the result, not a class label - confirm downstream expects scores.
    NOTE(review): the smoothing is `prob + 1/denom`, NOT `(prob+1)/denom`
    (original operator precedence), and the two fallback denominators
    are asymmetric between classes.  Both quirks preserved as-is.
    """
    skeptic_vocab = word_logprobs['skeptic']
    paranormal_vocab = word_logprobs['paranormal']
    # Denominators exactly as originally written (they are not symmetric).
    s_denom = len(skeptic_vocab) + len(skeptic_vocab) + len(paranormal_vocab)
    p_denom = len(paranormal_vocab) + len(skeptic_vocab) + len(paranormal_vocab)

    out_classes = []
    for post in posts:
        s_score = math.log(sprob)
        p_score = math.log(pprob)
        for token in tokenize(post).lower().split(' '):
            # Skeptic side: known token -> its prob plus smoothing term.
            if token in skeptic_vocab:
                s_term = skeptic_vocab[token] + 1 / s_denom
            else:
                s_term = 1 / s_denom
            # Paranormal side, same shape with its own denominator.
            if token in paranormal_vocab:
                p_term = paranormal_vocab[token] + 1 / p_denom
            else:
                p_term = 1 / p_denom
            s_score += math.log(s_term)
            p_score += math.log(p_term)
        out_classes.append(p_score if p_score > s_score else s_score)
    return out_classes
|
||||||
|
|
||||||
|
|
||||||
|
def predict_posts(path):
    """Score the posts under `path` and write one value per line to out.tsv.

    Reads `path`/in.tsv, scores each post with the module-level model
    globals (priors and token probabilities), and writes the negated
    winning score for each post to `path`/out.tsv.

    NOTE(review): the output is negated scores, not class labels.
    """
    posts = get_test_posts(path + '/in.tsv')
    scores = predict_post_class(posts, skeptic_class_logprob,
                                paranormal_class_logprob, word_logprobs)
    with open(path + "/out.tsv", 'wt') as out_file:
        writer = csv.writer(out_file, delimiter='\t')
        writer.writerows([-score] for score in scores)
|
||||||
|
|
||||||
|
# --- Prediction + a crude self-check against dev-0 ---
predict_posts("dev-0")
predict_posts("test-A")

# Print the fraction of lines in out.tsv that textually equal the
# corresponding line of expected.tsv.
# NOTE(review): out.tsv holds negated scores while expected.tsv holds
# labels, so exact line matches look unlikely to occur - confirm intent.
with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
    counter = 0
    positive = 0
    for out_line, exp_line in zip(out_file, exp_file):
        counter += 1
        if out_line == exp_line:
            positive += 1
    print(positive / counter)
|
10304
test-A/out.tsv
10304
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user