#!/usr/bin/python3
"""Train a naive Bayes model for paranormal-vs-sceptic text classification.

Reads training texts (in.tsv: text<TAB>timestamp) and labels (expected.tsv:
'P' or 'S' per line), estimates class log-priors and Laplace-smoothed
per-word log-likelihoods, and pickles them for use by predict.py, which
applies argmax over  log P(c) + sum log P(w|c).
"""
from collections import defaultdict
import math
import pickle
import re


def calc_class_logprob(expected_path):
    """Return (log P(paranormal), log P(sceptic)) estimated from expected.tsv.

    Each line of the file is stripped of spaces; a line containing 'P'
    counts as paranormal, otherwise one containing 'S' as sceptic.
    Raises ValueError (via math.log) if either class never occurs.
    """
    paranormal_count = 0
    sceptic_count = 0
    with open(expected_path) as f:
        for line in f:
            label = line.rstrip('\n').replace(' ', '')
            if 'P' in label:
                paranormal_count += 1
            elif 'S' in label:
                sceptic_count += 1
    total = paranormal_count + sceptic_count
    paranormal_prob = paranormal_count / total
    sceptic_prob = sceptic_count / total
    return math.log(paranormal_prob), math.log(sceptic_prob)


def clear_tokens(tokens, is_text=True):
    """Clean raw text: drop links, special characters, digits and '\\n'.

    BUG FIX: the original had a `return tokens` right after the first
    replace, which made every cleaning regex below unreachable dead code.

    When is_text is True, runs of spaces collapse to a single space
    (cleaning whole sentences); when False, all spaces are removed
    (cleaning an individual token).
    """
    # literal backslash-n sequences embedded in the TSV text, not newlines
    tokens = tokens.replace('\\n', ' ')
    # delete markdown-style links ending in common extensions/TLDs
    tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)', " ", tokens)
    # drop hyphen/underscore-joined word chains (e.g. some-slug_token)
    tokens = re.sub(r'(|\-|\_)([a-z]+(\-|\_))+[a-z]+(|\-|\_)', ' ', tokens)
    # punctuation and assorted special characters
    tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\’\>\″\±]+', ' ', tokens)
    # runs of dots/dashes (ellipses, separators)
    tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
    tokens = re.sub(r'[0-9]+', ' ', tokens)
    tokens = re.sub(r'œ|·', '', tokens)
    if is_text:
        tokens = re.sub(r' +', ' ', tokens)
    else:
        tokens = re.sub(r' +', '', tokens)
    return tokens


def calc_word_count(in_path, expected_path):
    """Count per-class word occurrences over the training documents.

    Returns {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
    mapping token -> number of occurrences in documents of that class.
    """
    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
    with open(in_path) as infile, open(expected_path) as expectedfile:
        for line, exp in zip(infile, expectedfile):
            class_ = exp.rstrip('\n').replace(' ', '')
            text, timestamp = line.rstrip('\n').split('\t')
            text = clear_tokens(text, True)
            for token in text.lower().split(' '):
                # BUG FIX: the original called clear_tokens(token, False)
                # but discarded the result, so tokens were never cleaned.
                token = clear_tokens(token, False)
                if not token:
                    continue  # cleaning can leave empty strings; don't count them
                if class_ == 'P':
                    word_counts['paranormal'][token] += 1
                elif class_ == 'S':
                    word_counts['sceptic'][token] += 1
    return word_counts


def calc_word_logprobs(word_counts):
    """Compute Laplace-smoothed log P(word|class) for every counted word.

    word_counts: output of calc_word_count.
    Returns {'paranormal': {word: logprob}, 'sceptic': {word: logprob}}.
    Denominator is total occurrences + vocabulary size (add-one smoothing).
    """
    totals = {
        class_: sum(counts.values()) + len(counts)
        for class_, counts in word_counts.items()
    }
    word_logprobs = {'paranormal': {}, 'sceptic': {}}
    for class_, counts in word_counts.items():
        for token, count in counts.items():
            word_logprobs[class_][token] = math.log((count + 1) / totals[class_])
    return word_logprobs


def main():
    """Train on ./train data and pickle (priors, word logprobs) to disk."""
    expected = './train/expected.tsv'
    #expected = './dev-0/expected.tsv'
    in_f = './train/in.tsv'
    #in_f = './dev-0/in.tsv'
    print(f"expected {expected}")
    print(f"in {in_f}")
    paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob(expected)
    wordcounts = calc_word_count(in_f, expected)
    word_logprobs = calc_word_logprobs(wordcounts)
    with open('naive_base_model.pkl', 'wb') as f:
        pickle.dump([paranormal_class_lgprob, skeptic_class_logprob, word_logprobs], f)
    # predict.py applies: argmax_c  log P(c) + sum_w log P(w|c)


# BUG FIX: the original called main() unconditionally at import time,
# which performed file I/O on mere import and crashed without the data.
if __name__ == '__main__':
    main()