laptop commit: fixed naive Bayes

2020-05-02 19:26:03 +02:00 · 2020-05-02 19:26:03 +02:00 · 6be8cd183c
commit 6be8cd183c
parent a6694d768d
5 changed files with 457 additions and 452 deletions
--- a/code.py
+++ b/code.py
@ -33,8 +33,13 @@ def calc_word_count(in_path, expected_path):
            text = text.lower()
            text = re.sub(r'\\n+', " ", text)
            text = re.sub(r'http\S+', " ", text)
-            #text = re.sub(r'(\s+|\\n)', ' ', text)
+            text = re.sub(r'\/[a-z]\/', " ", text)
-            #text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
+            text = re.sub(r'[^a-z]', " ", text)
            text = re.sub(r'\s{2,}', " ", text)
            text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
            text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
            text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
            text = re.sub(r'^\s', "", text)
            tokens = text.split(' ')
            for token in tokens:
                if class_ == '1':
--- a/code_prediction.py
+++ b/code_prediction.py
@ -6,13 +6,8 @@ import re
 open_file = open('naive_base_model.pkl', 'rb')
 pickle_loaded = pickle.load(open_file)
 paranomal_class_logprob, skeptic_class_logprob, word_logprobs = pickle_loaded
 #pickle_loaded=pickle.load(open_file)
 #paranomal_class_logprob, skeptic_class_logprob, word_logprobs = pickle_loaded
 #Niektórych słów nie bezie w zbiorze treningowym dev-0 i dev-A
 def prediction(input,output):
    output_file = open(output,'w')
    #pickle_load = pickle.load(open('naive_base_model.pkl', 'rb'))
    #paranormal_class_logprob, skeptic_class_logprob, word_logprob = pickle_load
    with open(input,encoding='utf-8') as in_file:
        for line in in_file:
            temp_paranormal_logprob = paranomal_class_logprob
@ -21,8 +16,13 @@ def prediction(input,output):
            text = text.lower()
            text = re.sub(r'\\n+', " ", text)
            text = re.sub(r'http\S+', " ", text)
-            #text = re.sub(r'(\s+|\\n)', ' ', text)
+            text = re.sub(r'\/[a-z]\/', " ", text)
-            #text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
+            text = re.sub(r'[^a-z]', " ", text)
            text = re.sub(r'\s{2,}', " ", text)
            text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
            text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
            text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
            text = re.sub(r'^\s', "", text)
            tokens = text.split(' ')
            for token in tokens:
                if token not in word_logprobs['paranormal']:
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/naive_base_model.pkl
+++ b/naive_base_model.pkl
--- a/test-A/out.tsv
+++ b/test-A/out.tsv