diff --git a/code.py b/code.py index 8c61c2c..644c61f 100644 --- a/code.py +++ b/code.py @@ -28,10 +28,17 @@ def calc_word_count(in_path, expected_path): word_counts = {'paranormal':defaultdict(int), 'skeptic': defaultdict(int)} with open(in_path,encoding='utf-8') as in_file, open(expected_path,encoding='utf-8') as expected_file: for line, exp in zip(in_file, expected_file): - class_ = exp.rstrip('\n').replace(' ','') + class_ = exp.rstrip('\n') + #.replace(' ','') text, timestamp = line.rstrip('\n').split('\t') + text = re.sub(r'\\n+', " ", text) + text = re.sub(r'http\S+', " ", text) + text = re.sub(r'\/[a-z]\/', " ", text) + text = re.sub(r'[^a-z]', " ", text) + text = re.sub(r'\s{2,}', " ", text) text = re.sub(r'(\s+|\\n)', ' ', text) - text = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', text) + text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text) + text = re.sub(r'^\s', "", text) tokens = text.lower().split(' ') for token in tokens: if class_ == 'P': diff --git a/code_prediction.py b/code_prediction.py index 2d22cfe..4ea86a8 100644 --- a/code_prediction.py +++ b/code_prediction.py @@ -11,13 +11,21 @@ paranomal_class_logprob, skeptic_class_logprob, word_logprobs = pickle_loaded #Niektórych słów nie bezie w zbiorze treningowym dev-0 i dev-A def prediction(input,output): output_file = open(output,'w') + pickle_load = pickle.load(open('naive_base_model.pkl', 'rb')) + paranormal_class_logprob, skeptic_class_logprob, word_logprob = pickle_load with open(input,encoding='utf-8') as in_file: for line in in_file: temp_paranormal_logprob = paranomal_class_logprob temp_skeptic_logprob = skeptic_class_logprob text, timestamp = line.rstrip('\n').split('\t') + text = re.sub(r'\\n+', " ", text) + text = re.sub(r'http\S+', " ", text) + text = re.sub(r'\/[a-z]\/', " ", text) + text = re.sub(r'[^a-z]', " ", text) + text = re.sub(r'\s{2,}', " ", text) text = re.sub(r'(\s+|\\n)', ' ', text) - text = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', text) + text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text) + text = re.sub(r'^\s', "", text) tokens = text.lower().split(' ') for token in tokens: if token not in word_logprobs['paranormal']: