2020-03-09 14:37:26 +01:00
import csv
2020-03-09 18:30:02 +01:00
from collections import defaultdict
import math
2020-03-29 21:03:04 +02:00
import pickle
import os
from pathlib import Path
2020-03-09 14:37:26 +01:00
2020-03-29 21:03:04 +02:00
def calc_class_logprob(expected_path):
    """Compute the log prior P(c) of both classes from a label file.

    A line containing "P" counts as one paranormal example; otherwise a
    line containing "S" counts as one skeptic example. Lines containing
    neither are ignored.

    Args:
        expected_path: Path to the label file, one label per line.

    Returns:
        Tuple ``(paranormal_logprob, skeptic_logprob)`` of natural-log
        priors. A class without any example gets ``float("-inf")``
        (the logarithm of probability zero).

    Raises:
        ValueError: If the file contains no "P" or "S" label at all.
    """
    paranormal_count = 0
    skeptic_count = 0
    with open(expected_path, encoding="utf-8") as labels:
        for line in labels:
            if "P" in line:
                paranormal_count += 1
            elif "S" in line:
                skeptic_count += 1

    labelled = paranormal_count + skeptic_count
    if not labelled:
        # Without this guard the division below fails with an opaque
        # ZeroDivisionError.
        raise ValueError(
            "no 'P' or 'S' labels found in {!r}".format(expected_path)
        )

    def log_prior(count):
        # math.log(0) raises "math domain error"; -inf is the limit value
        # and makes an absent class lose every comparison.
        return math.log(count / labelled) if count else float("-inf")

    return log_prior(paranormal_count), log_prior(skeptic_count)
def calc_word_counts(in_path, expected_path):
    """Count token occurrences per class over a labelled corpus.

    The two files are read in lockstep: line *i* of ``in_path`` is the post
    whose label is line *i* of ``expected_path``. Only the text before the
    first tab of an input line is used. The text is lower-cased and split
    on single spaces (so consecutive spaces yield empty-string tokens).

    Args:
        in_path: Path to the input file; each line is the post text,
            optionally followed by tab-separated extra fields.
        expected_path: Path to the label file; after removing spaces a
            label must be ``P`` (paranormal) or ``S`` (skeptic). Lines with
            any other label are skipped.

    Returns:
        ``{'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}``
        mapping each token to its number of occurrences in that class.
    """
    word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
    label_to_class = {'P': 'paranormal', 'S': 'skeptic'}

    with open(in_path, encoding="utf-8") as in_file, \
            open(expected_path, encoding="utf-8") as exp_file:
        for in_line, exp_line in zip(in_file, exp_file):
            label = exp_line.rstrip('\n').replace(" ", "")
            class_name = label_to_class.get(label)
            if class_name is None:
                continue

            # partition() keeps working when a line has no tab or more than
            # one, where a strict two-value unpack raises ValueError.
            text = in_line.rstrip('\n').partition('\t')[0]

            # The class is fixed for the whole line, so pick its counter once.
            class_counts = word_counts[class_name]
            for token in text.lower().split(' '):
                class_counts[token] += 1

    return word_counts
2020-03-29 21:03:04 +02:00
def calc_word_logprobs(word_counts):
    """Turn per-class token counts into add-one smoothed log-probabilities.

    For every token ``w`` of the shared vocabulary ``V`` (all tokens seen
    in either class) and every class ``c``::

        P(w | c) = (count(w, c) + 1) / (sum of counts in c + |V|)

    Using the shared vocabulary makes each class distribution sum to one
    and gives a token seen only in the other class a small non-zero
    probability instead of leaving it out of the table.

    Args:
        word_counts: Mapping with keys ``'paranormal'`` and ``'skeptic'``,
            each mapping token -> occurrence count. It is not modified.

    Returns:
        ``{'paranormal': {...}, 'skeptic': {...}}`` mapping every
        vocabulary token to its natural-log probability in that class.
    """
    classes = ('paranormal', 'skeptic')

    # dict.fromkeys keeps first-seen order, so the result is deterministic.
    vocabulary = list(dict.fromkeys(
        token for class_ in classes for token in word_counts[class_]
    ))

    word_logprobs = {}
    for class_ in classes:
        class_counts = word_counts[class_]
        total = sum(class_counts.values()) + len(vocabulary)
        # .get() avoids inserting zero entries when the input is a
        # defaultdict.
        word_logprobs[class_] = {
            token: math.log((class_counts.get(token, 0) + 1) / total)
            for token in vocabulary
        }
    return word_logprobs
2020-03-09 18:30:02 +01:00
2020-03-29 21:03:04 +02:00
# Train the model on the training split: class log-priors first, then the
# per-class token counts, then the per-class token log-probabilities.
paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob(
    "train/expected.tsv"
)
word_counts = calc_word_counts('train/in.tsv', 'train/expected.tsv')
word_logprobs = calc_word_logprobs(word_counts)

# Probe one token from the skeptic table; the original author's note
# next to this print was: -12.166205308815476
print(word_logprobs['skeptic']["hair."])

# Next steps: 1. fetch a post, 2. split it into terms, 3. compute the
# probability of each term, 4. add them together, 5. compare the
# paranormal total with the skeptic total.
def get_test_posts(path):
    """Read the post texts from a tab-separated input file.

    Args:
        path: Path to the input file; each line is the post text,
            optionally followed by tab-separated extra fields.

    Returns:
        List with the text before the first tab of every line, in file
        order and without the trailing newline.
    """
    with open(path, encoding="utf-8") as f:
        # partition() tolerates lines with no tab or several tabs, where a
        # strict two-value unpack raises ValueError.
        return [line.rstrip('\n').partition('\t')[0] for line in f]
def _unseen_logprob(class_logprobs):
    """Return the log-probability to charge for a token a class never saw.

    Under add-one smoothing a zero-count token has probability
    ``1 / total`` while the rarest seen token (count >= 1) has at least
    ``2 / total``. Half of the smallest seen probability is therefore an
    upper bound on the zero-count mass, and exact when some token was seen
    exactly once. An empty table yields 0.0 (no information).
    """
    if not class_logprobs:
        return 0.0
    return min(class_logprobs.values()) - math.log(2)


def predict_post_class(posts, sprob, pprob, word_logprobs):
    """Classify each post as paranormal ('P') or skeptic ('S').

    For each post, the class log-prior is added to the log-probabilities
    of the post's tokens (text lower-cased and split on single spaces).
    The class with the larger total wins; ties go to 'S'.

    Tokens absent from both tables are skipped, since they carry no
    evidence. A token present in only one table is charged a small
    smoothed log-probability in the other class. Treating it as
    log-probability 0 (probability 1) would favour the class that never
    saw the token.

    Args:
        posts: Iterable of post texts.
        sprob: Log-prior of the skeptic class.
        pprob: Log-prior of the paranormal class.
        word_logprobs: Mapping with keys ``'skeptic'`` and ``'paranormal'``,
            each mapping token -> log-probability.

    Returns:
        List of 'P' / 'S' labels, one per post, in input order.
    """
    skeptic_logprobs = word_logprobs['skeptic']
    paranormal_logprobs = word_logprobs['paranormal']
    skeptic_floor = _unseen_logprob(skeptic_logprobs)
    paranormal_floor = _unseen_logprob(paranormal_logprobs)

    out_classes = []
    for post in posts:
        total_s_prob = sprob
        total_p_prob = pprob
        for token in post.lower().split(' '):
            skeptic_term = skeptic_logprobs.get(token)
            paranormal_term = paranormal_logprobs.get(token)
            if skeptic_term is None and paranormal_term is None:
                # Out-of-vocabulary token: no evidence for either class.
                continue
            total_s_prob += (
                skeptic_floor if skeptic_term is None else skeptic_term
            )
            total_p_prob += (
                paranormal_floor if paranormal_term is None
                else paranormal_term
            )
        out_classes.append('P' if total_p_prob > total_s_prob else 'S')
    return out_classes
def predict_posts(path):
    """Classify the posts of one data split and write the labels to disk.

    Reads ``<path>/in.tsv``, predicts a label for every post with the
    module-level model (``skeptic_class_logprob``,
    ``paranormal_class_logprob``, ``word_logprobs``) and writes one label
    per row to ``<path>/out.tsv``.

    Args:
        path: Directory of the data split (e.g. ``"dev-0"``).
    """
    split_dir = Path(path)
    posts = get_test_posts(split_dir / 'in.tsv')
    classes = predict_post_class(
        posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs
    )
    # The csv docs require newline='' so that the writer's own line
    # terminator reaches the file untranslated on every platform.
    with open(split_dir / "out.tsv", 'wt', newline='') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        # Wrap each label in a list: writerow() treats a bare string as a
        # sequence of one-character fields.
        tsv_writer.writerows([label] for label in classes)
# Write predictions for the development and the test split.
predict_posts("dev-0")
predict_posts("test-A")

# Accuracy on the development split: share of predicted labels that equal
# the expected label on the same line.
with open("dev-0/out.tsv", encoding="utf-8") as out_file, \
        open("dev-0/expected.tsv", encoding="utf-8") as exp_file:
    counter = 0
    positive = 0
    for out_line, exp_line in zip(out_file, exp_file):
        counter += 1
        # Compare the bare labels so surrounding spaces or a missing final
        # newline cannot turn a correct prediction into a miss.
        if out_line.strip() == exp_line.strip():
            positive += 1
# Guard against empty files, where the division would raise
# ZeroDivisionError.
print(positive / counter if counter else 0.0)