2020-03-28 20:40:28 +01:00
|
|
|
from collections import defaultdict
|
|
|
|
import math
|
|
|
|
import pickle
|
2020-05-02 17:08:25 +02:00
|
|
|
import re
|
2020-03-28 20:40:28 +01:00
|
|
|
|
2020-05-02 18:40:08 +02:00
|
|
|
# Load the trained model: two class log-priors and the per-class
# word log-probability tables.
# NOTE(security): pickle.load executes arbitrary code embedded in the file;
# only load 'naive_base_model.pkl' from a trusted source.
with open('naive_base_model.pkl', 'rb') as open_file:
    pickle_loaded = pickle.load(open_file)
paranomal_class_logprob, skeptic_class_logprob, word_logprobs = pickle_loaded

# Some words in the evaluation inputs will not be present in the model
# (original note mentioned dev-0 and dev-A).
|
2020-03-31 17:09:32 +02:00
|
|
|
# Log-probabilities used for tokens that are missing from the model tables.
_UNSEEN_PARANORMAL_LOGPROB = -14.78
_UNSEEN_SKEPTIC_LOGPROB = -15.6


def _tokenize(text):
    """Lower-case and clean *text*, then split it on single spaces."""
    text = text.lower()
    # Literal backslash followed by one or more 'n' (escaped newlines).
    text = re.sub(r'\\n+', " ", text)
    # Anything that starts with "http" up to the next whitespace.
    text = re.sub(r'http\S+', " ", text)
    # A single letter between slashes, e.g. "/r/".
    text = re.sub(r'\/[a-z]\/', " ", text)
    # Every character that is not a lowercase ASCII letter.
    text = re.sub(r'[^a-z]', " ", text)
    # Collapse runs of whitespace.
    text = re.sub(r'\s{2,}', " ", text)
    # Normalise any remaining whitespace to single spaces.
    text = re.sub(r'(\s+|\\n)', ' ', text)
    # Blank out words of 1-3 characters delimited by non-word characters
    # (or starting the text). Matches consume their delimiters, so adjacent
    # short words and a short word at the very end can survive.
    text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
    # Drop one leading whitespace character, if present.
    text = re.sub(r'^\s', "", text)
    return text.split(' ')


def prediction(input, output, model=None):
    """Classify every line of a TSV file and write one label per line.

    Each input line is expected to hold the text in its first tab-separated
    field; any further fields (e.g. a timestamp) are ignored. For every input
    line, '1' is written when the paranormal score is strictly greater than
    the skeptic score, otherwise '0'.

    Args:
        input: Path of the UTF-8 encoded input file. (The name shadows the
            builtin but is kept so existing keyword callers keep working.)
        output: Path of the file to write the labels to (overwritten).
        model: Optional ``(paranormal_prior, skeptic_prior, word_logprobs)``
            tuple, where ``word_logprobs`` has the keys ``'paranormal'`` and
            ``'skeptic'`` mapping tokens to log-probabilities. When omitted,
            the module-level model loaded from the pickle file is used.
    """
    if model is None:
        # Fall back to the values unpickled at module import time.
        model = (paranomal_class_logprob, skeptic_class_logprob, word_logprobs)
    paranormal_prior, skeptic_prior, logprobs = model
    paranormal_words = logprobs['paranormal']
    skeptic_words = logprobs['skeptic']

    # Both files are managed by the with-statement so the output is flushed
    # and closed even if scoring raises part-way through.
    with open(input, encoding='utf-8') as in_file, \
            open(output, 'w') as output_file:
        for line in in_file:
            # Only the first column is used; tolerate missing/extra columns.
            text = line.rstrip('\n').split('\t')[0]

            paranormal_score = paranormal_prior
            skeptic_score = skeptic_prior
            for token in _tokenize(text):
                # NOTE(review): the class prior is added again for every
                # token, so it is weighted by document length; textbook naive
                # Bayes adds it once. Kept as-is so predictions stay
                # identical -- confirm whether this is intended.
                # dict.get avoids inserting unseen tokens into the shared
                # model tables.
                paranormal_score += paranormal_prior + paranormal_words.get(
                    token, _UNSEEN_PARANORMAL_LOGPROB)
                skeptic_score += skeptic_prior + skeptic_words.get(
                    token, _UNSEEN_SKEPTIC_LOGPROB)

            label = '1' if paranormal_score > skeptic_score else '0'
            output_file.write(label + '\n')
|
2020-03-31 17:09:32 +02:00
|
|
|
def main():
    """Write predictions for the dev-0 and test-A input files."""
    # NOTE(review): the test-A input path repeats 'in.tsv' -- confirm it
    # matches the actual on-disk layout.
    jobs = (
        ('dev-0/in.tsv', 'dev-0/out.tsv'),
        ('test-A/in.tsv/in.tsv', 'test-A/out.tsv'),
    )
    for source_path, target_path in jobs:
        prediction(source_path, target_path)
|
|
|
|
|
|
|
|
# Run both prediction jobs. There is no __main__ guard, so this also
# executes when the module is imported.
main()
|
|
|
|
|