diff --git a/dev-0/naive_bigram.pkl b/dev-0/naive_bigram.pkl index 65b8291..8aeb34f 100644 Binary files a/dev-0/naive_bigram.pkl and b/dev-0/naive_bigram.pkl differ diff --git a/dev-0/out.tsv b/dev-0/out.tsv index d2c2dda..3201db3 100644 --- a/dev-0/out.tsv +++ b/dev-0/out.tsv @@ -116,7 +116,7 @@ P S S - P + S S P P @@ -1850,7 +1850,7 @@ S S S - P + S S S S @@ -2511,7 +2511,7 @@ P P P - P + S P S S @@ -2788,7 +2788,7 @@ S S P - P + S S S P @@ -2924,7 +2924,7 @@ S P S - P + S P S S @@ -3521,7 +3521,7 @@ S S S - P + S S S S diff --git a/predict.py b/predict.py index 0ed14e7..547d64e 100755 --- a/predict.py +++ b/predict.py @@ -30,11 +30,11 @@ def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigra def clear_post(post): post = post.replace('\\n', ' ') - post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+]+(\)|)', '', post) - post = re.sub(r'[\.\,]+', ' ', post) - post = re.sub(r'(<|>)','',post) - post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%]+', '', post) - post = re.sub(r' \- ', ' ', post) + post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+\%]+(\)|)', '', post) + post = re.sub(r'[\.\,\/\~]+', ' ', post) + post = re.sub(r'(<|>|\@[a-zA-Z0-9]+)','',post) + post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|]+', '', post) + post = re.sub(r'( \- |\-\-+)', ' ', post) post = re.sub(r' +', ' ', post) post = post.rstrip(' ') return post diff --git a/test-A/out.tsv b/test-A/out.tsv index 5c4fe57..e9dedc1 100644 --- a/test-A/out.tsv +++ b/test-A/out.tsv @@ -235,7 +235,7 @@ S S S - P + S S S S @@ -343,7 +343,7 @@ P S S - P + S S S S @@ -357,7 +357,7 @@ P S S - P + S P S S @@ -425,7 +425,7 @@ S P S - S + P S S S @@ -441,7 +441,7 @@ P S S - P + S S P P @@ -456,7 +456,7 @@ S S S - S + P S S S @@ -694,7 +694,7 @@ S P S - S + P S P S @@ -844,7 +844,7 @@ S P S - S + P S P S @@ -998,7 +998,7 @@ S S S - S + P S S S @@ -1268,7 +1268,7 @@ P S S - P + S S S P @@ -1383,7 +1383,7 @@ S S S - S + P S S S @@ -1582,7 +1582,7 @@ S S S - P + S S S S @@ -1788,7 +1788,7 @@ S S P - S + P S S S @@ -1855,7 +1855,7 @@ S S S - S + P S S S @@ -1986,7 +1986,7 @@ S S S - S + P S S S @@ -2217,7 +2217,7 @@ S S S - S + P S S S @@ -2227,7 +2227,7 @@ S S S - P + S S S S @@ -3282,7 +3282,7 @@ S P S - S + P S S S @@ -3378,7 +3378,7 @@ S S P - S + P S S P @@ -3724,7 +3724,7 @@ S P S - P + S S S S @@ -3765,7 +3765,7 @@ P S S - P + S S S S @@ -4111,7 +4111,7 @@ P S P - P + S S S S @@ -4190,7 +4190,7 @@ S S S - P + S P S P @@ -4344,7 +4344,7 @@ S S S - S + P P S S @@ -4421,19 +4421,19 @@ S P P - P S S S S S - P S P S + P S S S + P S S S @@ -4763,7 +4763,7 @@ S S S - S + P S S S diff --git a/train.py b/train.py index d9ea143..7384838 100755 --- a/train.py +++ b/train.py @@ -25,11 +25,11 @@ def calc_class_logprob(expected_path): def clear_post(post): post = post.replace('\\n', ' ') # delete links - post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+]+(\)|)', '', post) - post = re.sub(r'[\.\,\/]+', ' ', post) - post = re.sub(r'(<|>)','',post) - post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%]+', '', post) - post = re.sub(r' \- ', ' ', post) + post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', '', post) + post = re.sub(r'[\.\,\/\~]+', ' ', post) + post = re.sub(r'(<|>|\@[a-zA-Z0-9]+)','',post) + post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|]+', '', post) + post = re.sub(r'( \- |\-\-+)', ' ', post) post = re.sub(r' +', ' ', post) post = post.rstrip(' ') return post @@ -45,7 +45,7 @@ def calc_bigram_count(in_path, expected_path): for index in range(len(tokens)-1): # if there is next token we append current and next bigram = tokens[index] + " " + tokens[index + 1] - #print(bigram) + print(bigram) #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;") if class_ == 'P': bigram_counts['paranormal'][bigram] +=1