Bigram implemented

s426135 2020-03-29 13:39:47 +02:00
parent c267e9e7e0
commit 0b9f952661
16 changed files with 13329 additions and 2739 deletions

BIN
dev-0/naive_bigram.pkl Normal file

Binary file not shown.

File diff suppressed because it is too large

5272
dev-0/out.tsv_baseline Normal file

File diff suppressed because it is too large

10
exp Normal file

@@ -0,0 +1,10 @@
S
P
P
S
S
S
S
P
S
S

10
in Normal file

@@ -0,0 +1,10 @@
In which case, tell them I'm in work, or dead, or down the shops. They can come back in when they've got Professor Brian Cox with them. If only cause I want to meet him and steal his hair. 1328302967
Put me down as another for Mysterious Universe. Those dudes are brilliant. 1347836881
The military of any country would never admit that UFO's have taken down our most powerful weapons. 1331905826
An example would have been more productive than a downvote, I think. 1315584834
sorry, but the authors of this article admit that the study is limited and flawed. Also, upon peer review, the study is found to be fallacious. It says this towards the end of the article, a point to which the majority of people usually fail to read. \n\nalso, 463 people is NOT, in any way, compelling evidence for a causal relationship. 1347389166
"Are you afraid of science in general, or just the kind you read about on the internet?" 1303864529
Well, I know it can decrease intraocular pressure, but as for *cancer...*\n\nThe only data I'm aware of relating marijuana to cancer is that it's an antiemetic and an appetite increasing drug. Which, while very good for patient health, are not directly related to the cancer itself, as much as a treatment of the adverse reactions of chemotherapy/radiation treatment (unless if the cancer is in the GI system, or is making weird hormones...).\n\nIf you have any information indicating that marijuana helps in cancer/HIV treatment *other* than as an antiemetic/appetite increasing medication, could you please provide it? I mean, I'm all for it's responsible use, but I don't want to go around spreading misinformation, especially if I'm going to one day be in the medical field. 1318558797
That could be anything. Why even bother... 1285029343
what was the joke? he deleted it 1337651956
Its the landing and taking off that/'s hard, and those things will do do that for you as well. The pilots, for the most part, are there in case shit happens.\n\nBTW stalling a Cessna is fun as hell. 1336941346

BIN
naive_bigram.pkl Normal file

Binary file not shown.
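
The pickled models can't be displayed here, but their layout is fixed by train.py: a three-element list holding the paranormal class log-prior, the sceptic class log-prior, and the per-class bigram log-probability dictionaries. A minimal inspection sketch (the variable names are illustrative, not from the repo):

import pickle

with open('naive_bigram.pkl', 'rb') as f:
    paranormal_logprior, sceptic_logprior, bigram_logprobs = pickle.load(f)

print(paranormal_logprior, sceptic_logprior)
print(len(bigram_logprobs['sceptic']), len(bigram_logprobs['paranormal']))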

predict.py

@@ -3,69 +3,61 @@
 import pickle
 import math
 import re
+import sys
 
-def clear_tokens(tokens, is_text=True):
-    tokens = tokens.replace('\\n', ' ')
-    return tokens
-    tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
-    tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\\\\±]+', ' ', tokens)
-    tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
-    tokens = re.sub(r'[0-9]+', ' ', tokens)
-    tokens = re.sub(r'œ|·', '', tokens)
-    if is_text:
-        tokens = re.sub(r' +', ' ', tokens)
-    else:
-        tokens = re.sub(r' +', '', tokens)
-    return tokens
-
-def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
-    # for each token of the given post
+def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs):
     text, timestamp = post.rstrip('\n').split('\t')
-    text = clear_tokens(text, True)
+    text = clear_post(text)
     tokens = text.lower().split(' ')
     probs = {}
-    for class_ in word_logprobs.keys():
-        product = 1
-        for token in tokens:
-            token = clear_tokens(token, False)
+    for class_ in bigrams_logprobs.keys():
+        product = 0
+        for index in range(len(tokens) - 1):
+            # we handle bigrams not in the model as neutral
+            bigram = tokens[index] + " " + tokens[index + 1]
             try:
-                product *= word_logprobs[class_][token]
+                product += bigrams_logprobs[class_][bigram]
             except KeyError:
-                product *= 1
-        # apply the formula here
+                product += 0
         if class_ == 'sceptic':
-            product *= sceptic_class_logprob
+            product += sceptic_class_logprob
         elif class_ == 'paranormal':
-            product *= paranormal_class_logprob
-        probs[abs(product)] = class_
-    # could also make it look for keywords and then decide paranormal
-    if search_for_keywords(text):
-        return 'paranormal'
-    return probs[max(probs.keys())]
+            product += paranormal_class_logprob
+        probs[product] = class_
+    # the class with the highest total log-probability wins
+    return probs[max(probs.keys())]
 
-def search_for_keywords(text):
-    keywords = ['paranormal', 'ufo', 'aliens', 'conspiracy', 'aliens', 'atlantis']
-    return any(keyword in text for keyword in keywords)
+def clear_post(post):
+    post = post.replace('\\n', ' ')
+    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+]+(\)|)', '', post)
+    post = re.sub(r'[\.\,]+', ' ', post)
+    post = re.sub(r'(&lt|&gt)', '', post)
+    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%]+', '', post)
+    post = re.sub(r' \- ', ' ', post)
+    post = re.sub(r' +', ' ', post)
+    post = post.rstrip(' ')
+    return post
 
 def main():
-    with open('naive_base_model.pkl', 'rb') as f:
+    if len(sys.argv) != 4:
+        print("syntax is ./predict.py in.tsv out.tsv model.pkl")
+        return
+    in_file = sys.argv[1]
+    out_file = sys.argv[2]
+    model = sys.argv[3]
+    with open(model, 'rb') as f:
         pickle_list = pickle.load(f)
     paranormal_class_logprob = pickle_list[0]
     sceptic_class_logprob = pickle_list[1]
-    word_logprobs = pickle_list[2]
-    in_file = "test-A/in.tsv"
-    #in_file = "dev-0/in.tsv"
-    out_file = "test-A/out.tsv"
-    #out_file = "dev-0/out.tsv"
-    print (f"in {in_file}")
-    print (f"out {out_file}")
+    bigrams_logprobs = pickle_list[2]
     with open(in_file) as in_f, open(out_file, 'w') as out_f:
         for line in in_f:
-            hyp = calc_post_prob(line, paranormal_class_logprob, sceptic_class_logprob, word_logprobs)
+            hyp = calc_post_class(line, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs)
             if hyp == 'sceptic':
-                out_f.write(" S\n")
+                out_f.write(' S\n')
             elif hyp == 'paranormal':
                 out_f.write(' P\n')
 main()
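
Both rewritten scripts now take their paths from sys.argv instead of hard-coded constants. Going by their own usage messages, a plausible end-to-end run looks like this (the concrete paths are the ones appearing elsewhere in this commit, not a recorded invocation):

./train.py train/expected.tsv train/in.tsv naive_bigram.pkl
./predict.py dev-0/in.tsv dev-0/out.tsv naive_bigram.pkl
./predict.py test-A/in.tsv test-A/out.tsv naive_bigram.pkl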

71
predict_baseline.py Executable file

@@ -0,0 +1,71 @@
#!/usr/bin/python3
import pickle
import math
import re

def clear_tokens(tokens, is_text=True):
    # delete links, special characters, dots, digits and \n markers
    tokens = tokens.replace('\\n', ' ')
    tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)', " ", tokens)
    tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\\\\±]+', ' ', tokens)
    tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
    tokens = re.sub(r'[0-9]+', ' ', tokens)
    tokens = re.sub(r'œ|·', '', tokens)
    if is_text:
        tokens = re.sub(r' +', ' ', tokens)
    else:
        tokens = re.sub(r' +', '', tokens)
    return tokens

def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
    # score the post against every class
    text, timestamp = post.rstrip('\n').split('\t')
    text = clear_tokens(text, True)
    tokens = text.lower().split(' ')
    probs = {}
    for class_ in word_logprobs.keys():
        # sum the log-probabilities of all tokens; unseen tokens are neutral
        logprob = 0
        for token in tokens:
            token = clear_tokens(token, False)
            try:
                logprob += word_logprobs[class_][token]
            except KeyError:
                pass
        if class_ == 'sceptic':
            logprob += sceptic_class_logprob
        elif class_ == 'paranormal':
            logprob += paranormal_class_logprob
        probs[logprob] = class_
    # a post mentioning any keyword is classified as paranormal outright
    if search_for_keywords(text):
        return 'paranormal'
    return probs[max(probs.keys())]

def search_for_keywords(text):
    keywords = ['paranormal', 'ufo', 'aliens', 'conspiracy', 'atlantis']
    return any(keyword in text for keyword in keywords)

def main():
    with open('naive_base_model.pkl', 'rb') as f:
        pickle_list = pickle.load(f)
    paranormal_class_logprob = pickle_list[0]
    sceptic_class_logprob = pickle_list[1]
    word_logprobs = pickle_list[2]
    in_file = "test-A/in.tsv"
    #in_file = "dev-0/in.tsv"
    out_file = "test-A/out.tsv"
    #out_file = "dev-0/out.tsv"
    print(f"in {in_file}")
    print(f"out {out_file}")
    with open(in_file) as in_f, open(out_file, 'w') as out_f:
        for line in in_f:
            hyp = calc_post_prob(line, paranormal_class_logprob, sceptic_class_logprob, word_logprobs)
            if hyp == 'sceptic':
                out_f.write(' S\n')
            elif hyp == 'paranormal':
                out_f.write(' P\n')
main()
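
The baseline scorer is standard multinomial Naive Bayes in log space: pick the class maximizing log P(c) + the sum over tokens of log P(w|c), with unseen tokens treated as neutral. A toy check of that decision rule, with invented log-probabilities (none of these numbers come from the real model):

import math

word_logprobs = {
    'sceptic':    {'study': math.log(0.020), 'ufo': math.log(0.001)},
    'paranormal': {'study': math.log(0.004), 'ufo': math.log(0.015)},
}
log_prior = {'sceptic': math.log(0.5), 'paranormal': math.log(0.5)}

def classify(tokens):
    # argmax of log P(c) + sum of log P(w|c); unknown words contribute 0
    scores = {}
    for class_, table in word_logprobs.items():
        scores[class_] = log_prior[class_] + sum(table.get(t, 0.0) for t in tokens)
    return max(scores, key=scores.get)

print(classify(['ufo', 'study']))  # -> 'paranormal'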

File diff suppressed because it is too large

5152
test-A/out.tsv_baseline Normal file

File diff suppressed because it is too large

120
train.py

@@ -3,89 +3,81 @@ from collections import defaultdict
 import math
 import pickle
 import re
+import sys
 
-# in expected.tsv
 def calc_class_logprob(expected_path):
-    paranolal_classcount=0
-    sceptic_classcount=0
+    paranormal_classcount = 0
+    sceptic_classcount = 0
     with open(expected_path) as f:
         for line in f:
             line = line.rstrip('\n').replace(' ','')
             if 'P' in line:
-                paranolal_classcount +=1
+                paranormal_classcount += 1
             elif 'S' in line:
                 sceptic_classcount +=1
-    paranol_prob = paranolal_classcount / (paranolal_classcount + sceptic_classcount)
-    sceptic_prob = sceptic_classcount / (paranolal_classcount + sceptic_classcount)
+    paranol_prob = paranormal_classcount / (paranormal_classcount + sceptic_classcount)
+    sceptic_prob = sceptic_classcount / (paranormal_classcount + sceptic_classcount)
     return math.log(paranol_prob), math.log(sceptic_prob)
 
-def clear_tokens(tokens, is_text=True):
-    tokens = tokens.replace('\\n', ' ')
-    return tokens
-    # delete links, special characters, dots, and \n
-    tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
-    tokens = re.sub(r'(|\-|\_)([a-z]+(\-|\_))+[a-z]+(|\-|\_)', ' ', tokens)
-    tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\\\\±]+', ' ', tokens)
-    tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
-    tokens = re.sub(r'[0-9]+', ' ', tokens)
-    tokens = re.sub(r'œ|·', '', tokens)
-    if is_text:
-        tokens = re.sub(r' +', ' ', tokens)
-    else:
-        tokens = re.sub(r' +', '', tokens)
-    return tokens
+def clear_post(post):
+    post = post.replace('\\n', ' ')
+    # delete links
+    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+]+(\)|)', '', post)
+    post = re.sub(r'[\.\,\/]+', ' ', post)
+    post = re.sub(r'(&lt|&gt)', '', post)
+    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%]+', '', post)
+    post = re.sub(r' \- ', ' ', post)
+    post = re.sub(r' +', ' ', post)
+    post = post.rstrip(' ')
+    return post
 
-# how many times each word occurs in documents of a given class
-def calc_word_count(in_path, expected_path):
-    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)}  # per-class dictionary of word frequencies
-    with open(in_path) as infile, open(expected_path) as expectedfile:
-        for line, exp in zip(infile, expectedfile):
-            class_ = exp.rstrip('\n').replace(' ','')
-            text, timestap =line.rstrip('\n').split('\t')
-            text = clear_tokens(text, True)
+def calc_bigram_count(in_path, expected_path):
+    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
+    with open(in_path) as infile, open(expected_path) as expected_file:
+        for line, exp in zip(infile, expected_file):
+            class_ = exp.rstrip('\n').replace(' ', '')
+            text, timestamp = line.rstrip('\n').split('\t')
+            text = clear_post(text)
             tokens = text.lower().split(' ')
-            for token in tokens:
-                clear_tokens(token,False)
+            for index in range(len(tokens) - 1):
+                # if there is a next token, append the current and the next one
+                bigram = tokens[index] + " " + tokens[index + 1]
                 if class_ == 'P':
-                    word_counts['paranormal'][token] += 1
+                    bigram_counts['paranormal'][bigram] += 1
                 elif class_ == 'S':
-                    word_counts['sceptic'][token]+=1
-    return word_counts
+                    bigram_counts['sceptic'][bigram] += 1
+    return bigram_counts
 
-def calc_word_logprobs(word_counts):
-    total_skeptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys())
-    total_paranormal = sum(word_counts['paranormal'].values())+ len(word_counts['paranormal'].keys())
-    word_logprobs= {'paranormal': {}, 'sceptic': {}}
-    for class_ in word_counts.keys():  # sceptic, paranormal
-        for token, value in word_counts[class_].items():
-            if class_ == 'sceptic':
-                word_prob = (value +1)/ total_skeptic
-            elif class_ == 'paranormal':
-                word_prob = (value+1)/ total_paranormal
-            word_logprobs[class_][token] = math.log(word_prob)
-    return word_logprobs
+def calc_bigram_logprobs(bigram_counts):
+    # add-one smoothing: the vocabulary size is added to each class total
+    total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
+    total_paranormal = sum(bigram_counts['paranormal'].values()) + len(bigram_counts['paranormal'].keys())
+    bigram_logprobs = {'paranormal' : {}, 'sceptic' : {}}
+    for class_ in bigram_counts.keys():
+        for bigram, value in bigram_counts[class_].items():
+            if class_ == "sceptic":
+                bigram_prob = (value + 1) / total_sceptic
+            elif class_ == "paranormal":
+                bigram_prob = (value + 1) / total_paranormal
+            bigram_logprobs[class_][bigram] = math.log(bigram_prob)
+    return bigram_logprobs
 
 def main():
-    expected = './train/expected.tsv'
-    #expected = './dev-0/expected.tsv'
-    in_f = './train/in.tsv'
-    #in_f = './dev-0/in.tsv'
-    print (f"expected {expected}")
-    print (f"in {in_f}")
-    paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob(expected)
-    wordcounts =calc_word_count(in_f,expected)
-    word_logprobs = calc_word_logprobs(wordcounts)
-    with open('naive_base_model.pkl', 'wb') as f:
-        pickle.dump([paranormal_class_lgprob, skeptic_class_logprob, word_logprobs], f)
-    # in predict.py we then apply argmax P(c) * product of P(w|c)
+    if len(sys.argv) != 4:
+        print("syntax is ./train.py expected.tsv in.tsv model.pkl")
+        return
+    expected_file = str(sys.argv[1])
+    in_file = str(sys.argv[2])
+    model = str(sys.argv[3])
+    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
+    bigrams_count = calc_bigram_count(in_file, expected_file)
+    bigram_logprobs = calc_bigram_logprobs(bigrams_count)
+    with open(model, 'wb') as f:
+        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs], f)
 main()
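
To make the add-one smoothing in calc_bigram_logprobs concrete, here is a self-contained sketch on an invented one-post corpus; the arithmetic mirrors the function above:

from collections import defaultdict
import math

# toy 'paranormal' post whose cleaned, lowercased tokens are: a b a b
tokens = ['a', 'b', 'a', 'b']
counts = defaultdict(int)
for i in range(len(tokens) - 1):
    counts[tokens[i] + " " + tokens[i + 1]] += 1  # 'a b' twice, 'b a' once

# total = sum of counts + number of distinct bigrams = 3 + 2 = 5
total = sum(counts.values()) + len(counts)
logprobs = {bg: math.log((c + 1) / total) for bg, c in counts.items()}

print(logprobs['a b'])  # log(3/5): (2 + 1) / 5
print(logprobs['b a'])  # log(2/5): (1 + 1) / 5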

BIN
train.pyc

Binary file not shown.

91
train_baseline.py Executable file

@@ -0,0 +1,91 @@
#!/usr/bin/python3
from collections import defaultdict
import math
import pickle
import re

# class priors are estimated from expected.tsv
def calc_class_logprob(expected_path):
    paranormal_classcount = 0
    sceptic_classcount = 0
    with open(expected_path) as f:
        for line in f:
            line = line.rstrip('\n').replace(' ', '')
            if 'P' in line:
                paranormal_classcount += 1
            elif 'S' in line:
                sceptic_classcount += 1
    paranormal_prob = paranormal_classcount / (paranormal_classcount + sceptic_classcount)
    sceptic_prob = sceptic_classcount / (paranormal_classcount + sceptic_classcount)
    return math.log(paranormal_prob), math.log(sceptic_prob)

def clear_tokens(tokens, is_text=True):
    # delete links, special characters, dots, digits and \n markers
    tokens = tokens.replace('\\n', ' ')
    tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)', " ", tokens)
    tokens = re.sub(r'(|\-|\_)([a-z]+(\-|\_))+[a-z]+(|\-|\_)', ' ', tokens)
    tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\\\\±]+', ' ', tokens)
    tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
    tokens = re.sub(r'[0-9]+', ' ', tokens)
    tokens = re.sub(r'œ|·', '', tokens)
    if is_text:
        tokens = re.sub(r' +', ' ', tokens)
    else:
        tokens = re.sub(r' +', '', tokens)
    return tokens

# how many times each word occurs in documents of a given class
def calc_word_count(in_path, expected_path):
    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}  # per-class word frequencies
    with open(in_path) as infile, open(expected_path) as expectedfile:
        for line, exp in zip(infile, expectedfile):
            class_ = exp.rstrip('\n').replace(' ', '')
            text, timestamp = line.rstrip('\n').split('\t')
            text = clear_tokens(text, True)
            tokens = text.lower().split(' ')
            for token in tokens:
                token = clear_tokens(token, False)
                if class_ == 'P':
                    word_counts['paranormal'][token] += 1
                elif class_ == 'S':
                    word_counts['sceptic'][token] += 1
    return word_counts

def calc_word_logprobs(word_counts):
    # add-one smoothing: the vocabulary size is added to each class total
    total_sceptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys())
    total_paranormal = sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys())
    word_logprobs = {'paranormal': {}, 'sceptic': {}}
    for class_ in word_counts.keys():  # sceptic, paranormal
        for token, value in word_counts[class_].items():
            if class_ == 'sceptic':
                word_prob = (value + 1) / total_sceptic
            elif class_ == 'paranormal':
                word_prob = (value + 1) / total_paranormal
            word_logprobs[class_][token] = math.log(word_prob)
    return word_logprobs

def main():
    expected = './train/expected.tsv'
    #expected = './dev-0/expected.tsv'
    in_f = './train/in.tsv'
    #in_f = './dev-0/in.tsv'
    print(f"expected {expected}")
    print(f"in {in_f}")
    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected)
    word_counts = calc_word_count(in_f, expected)
    word_logprobs = calc_word_logprobs(word_counts)
    with open('naive_base_model.pkl', 'wb') as f:
        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, word_logprobs], f)
    # predict.py then applies argmax P(c) * product of P(w|c) in log space
main()