Updated with stopwords

Author: s426135
Date:   2020-03-29 23:29:19 +02:00
Parent: a3a146a87c
Commit: d1ca0a2ea8

17 changed files with 873 additions and 310901 deletions
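Both training and prediction now import the NLTK English stopword list, so the scripts assume that corpus is already available locally. A minimal one-time setup sketch (standard NLTK calls, not part of this commit):

# Install NLTK (e.g. pip install nltk) and fetch the corpus that
# stopwords.words('english') reads from.
import nltk

nltk.download('stopwords')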

.gitignore (new file)

@@ -0,0 +1,3 @@
+dev-0/in.tsv
+train/in.tsv
+test/in.tsv

File diff suppressed because one or more lines are too long

dev-0/in.tsv.xz (new binary file; content not shown)

(binary file; content not shown)

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

(file name not shown in this view)
@@ -1 +0,0 @@
-e412b617206095df98ac606360b222d0 naive_base_model.pkl

(binary file; content not shown)

(file name not shown in this view)

@@ -4,11 +4,13 @@ import pickle
 import math
 import re
 import sys
+import nltk
+from nltk.corpus import stopwords
 
-def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs):
+def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs, words_logprobs):
     text, timestap = post.rstrip('\n').split('\t')
-    text = clear_post(text)
-    tokens = text.lower().split(' ')
+    tokens = clear_post(text)
+    #tokens = text.lower().split(' ')
     probs = {}
     for class_ in bigrams_logprobs.keys():
         product = 0
@@ -20,16 +22,23 @@ def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs):
                 product += bigrams_logprobs[class_][bigram]
             except KeyError:
                 product += 0
+        for token in tokens:
+            try:
+                product += words_logprobs[class_][token]
+            except KeyError:
+                product += 0
         if class_ == 'sceptic':
             product += sceptic_class_logprob
         elif class_ == 'paranormal':
             product += paranormal_class_logprob
         probs[abs(product)] = class_
     #print(probs)
     return probs[max(probs.keys())]
 
 def clear_post(post):
     post = post.replace('\\n', ' ')
+    post = post.lower()
     post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+\%]+(\)|)', ' internetlink ', post)
     post = re.sub(r'[\.\,\/\~]+', ' ', post)
     post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
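With this change each post is scored by combining the class prior with both bigram and unigram log-probabilities (n-grams missing from the model contribute nothing via the KeyError branches). Schematically, for a class c:

\[
\mathrm{score}(c) = \log P(c) + \sum_{b \in \mathrm{bigrams}(post)} \log P(b \mid c) + \sum_{w \in \mathrm{tokens}(post)} \log P(w \mid c)
\]

calc_post_class then keys the per-class scores by abs(product) and returns the class stored at the largest key.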
@@ -37,7 +46,10 @@ def clear_post(post):
     post = re.sub(r'( \- |\-\-+)', ' ', post)
     post = re.sub(r' +', ' ', post)
     post = post.rstrip(' ')
-    return post
+    post = post.split(' ')
+    stop_words = set(stopwords.words('english'))
+    post_no_stop = [w for w in post if not w in stop_words]
+    return post_no_stop
 
 def main():
     if len(sys.argv) != 4:
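A quick sanity check of the updated clear_post() (the sample sentence is made up; run it wherever clear_post is defined, with the stopword corpus installed):

# clear_post() now lowercases, strips punctuation and links, splits on spaces
# and drops English stopwords, so it returns a token list instead of a string.
print(clear_post("I saw a ghost near the old house!"))
# -> ['saw', 'ghost', 'near', 'old', 'house']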
@@ -52,10 +64,11 @@ def main():
     paranormal_class_logprob = pickle_list[0]
     sceptic_class_logprob = pickle_list[1]
     bigrams_logprobs = pickle_list[2]
+    words_logprobs = pickle_list[3]
     with open(in_file) as in_f, open(out_file, 'w') as out_f:
         for line in in_f:
-            hyp = calc_post_class(line, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs)
+            hyp = calc_post_class(line, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs, words_logprobs)
             if hyp == 'sceptic':
                 out_f.write(' S\n')
             elif hyp == 'paranormal':

(file name not shown in this view)

@@ -69,3 +69,4 @@ def main():
             elif hyp == 'paranormal':
                 out_f.write(' P\n')
 main()
+c

File diff suppressed because one or more lines are too long

test-A/in.tsv.xz (new binary file; content not shown)

File diff suppressed because it is too large

train.py

@@ -4,6 +4,8 @@ import math
 import pickle
 import re
 import sys
+import nltk
+from nltk.corpus import stopwords
 
 def calc_class_logprob(expected_path):
     paranormal_classcount = 0
@@ -24,6 +26,7 @@ def calc_class_logprob(expected_path):
 
 def clear_post(post):
     post = post.replace('\\n', ' ')
+    post = post.lower()
     # delete links
     post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
     post = re.sub(r'[\.\,\/\~]+', ' ', post)
@@ -32,30 +35,33 @@ def clear_post(post):
     post = re.sub(r'( \- |\-\-+)', ' ', post)
     post = re.sub(r' +', ' ', post)
     post = post.rstrip(' ')
-    return post
+    post = post.split(' ')
+    stop_words = set(stopwords.words('english'))
+    post_no_stop = [w for w in post if not w in stop_words]
+    return post_no_stop
 
-def calc_bigram_count(in_path, expected_path):
-    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
-    with open(in_path) as infile, open(expected_path) as expected_file:
-        num_of_bigams = 0
-        for line, exp in zip(infile, expected_file):
-            class_ = exp.rstrip('\n').replace(' ', '')
-            text, timestap = line.rstrip('\n').split('\t')
-            text = clear_post(text)
-            tokens = text.lower().split(' ')
-            for index in range(len(tokens)-1):
-                # if there is next token we append current and next
-                bigram = tokens[index] + " " + tokens[index + 1]
-                #print(bigram)
-                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
-                if class_ == 'P':
-                    bigram_counts['paranormal'][bigram] +=1
-                elif class_ == 'S':
-                    bigram_counts['sceptic'][bigram] +=1
-                num_of_bigams +=1
-        #print(f"num of every added bigams with repetitions {num_of_bigams})")
-        #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
-    return bigram_counts
+#def calc_bigram_count(in_path, expected_path):
+#    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
+#    with open(in_path) as infile, open(expected_path) as expected_file:
+#        num_of_bigams = 0
+#        for line, exp in zip(infile, expected_file):
+#            class_ = exp.rstrip('\n').replace(' ', '')
+#            text, timestap = line.rstrip('\n').split('\t')
+#            tokens = clear_post(text)
+#            #tokens = text.lower().split(' ')
+#            for index in range(len(tokens)-1):
+#                # if there is next token we append current and next
+#                bigram = tokens[index] + " " + tokens[index + 1]
+#                #print(bigram)
+#                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
+#                if class_ == 'P':
+#                    bigram_counts['paranormal'][bigram] +=1
+#                elif class_ == 'S':
+#                    bigram_counts['sceptic'][bigram] +=1
+#                num_of_bigams +=1
+#        #print(f"num of every added bigams with repetitions {num_of_bigams})")
+#        #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
+#    return bigram_counts
 
 def calc_bigram_logprobs(bigram_counts):
     total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
@@ -72,6 +78,63 @@ def calc_bigram_logprobs(bigram_counts):
     return bigram_logprobs
 
+#def calc_word_count(in_path, expected_path):
+#    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)} # per class, a dictionary of words and how many times they occur
+#    with open(in_path) as infile, open(expected_path) as expectedfile:
+#        for line, exp in zip(infile, expectedfile):
+#            class_ = exp.rstrip('\n').replace(' ','')
+#            text, timestap =line.rstrip('\n').split('\t')
+#            #print(f"text {type(text)}")
+#            text = clear_tokens(text, True)
+#            tokens = text.lower().split(' ')
+#            #print(f"tokens {type(tokens)}")
+#            for token in tokens:
+#                clear_tokens(token,False)
+#                if class_ == 'P':
+#                    word_counts['paranormal'][token] += 1
+#                elif class_ == 'S':
+#                    word_counts['sceptic'][token]+=1
+#
+#    return word_counts
+
+def calc_word_logprobs(word_counts):
+    total_skeptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys())
+    total_paranormal = sum(word_counts['paranormal'].values())+ len(word_counts['paranormal'].keys())
+    word_logprobs= {'paranormal': {}, 'sceptic': {}}
+    for class_ in word_counts.keys(): # sceptic, paranormal
+        for token, value in word_counts[class_].items():
+            if class_ == 'sceptic':
+                word_prob = (value +1)/ total_skeptic
+            elif class_ == 'paranormal':
+                word_prob = (value+1)/ total_paranormal
+            #print (token)
+            word_logprobs[class_][token] = math.log(word_prob)
+    return word_logprobs
+
+def launch_bigrams_and_words(in_path, expected_path):
+    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)}
+    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
+    with open(in_path) as infile, open(expected_path) as expected_file:
+        for line, exp in zip(infile, expected_file):
+            class_ = exp.rstrip('\n').replace(' ', '')
+            text, timestap = line.rstrip('\n').split('\t')
+            tokens = clear_post(text)
+            for index in range(len(tokens)-1):
+                # if there is next token we append current and next
+                bigram = tokens[index] + " " + tokens[index + 1]
+                #print(bigram)
+                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
+                if class_ == 'P':
+                    bigram_counts['paranormal'][bigram] +=1
+                    word_counts['paranormal'][tokens[index]] +=1
+                elif class_ == 'S':
+                    bigram_counts['sceptic'][bigram] +=1
+                    word_counts['sceptic'][tokens[index]] +=1
+    return bigram_counts, word_counts
+
 def main():
     if len(sys.argv) != 4:
         print("syntax is ./train.py expected.tsv in.tsv model.pkl")
@@ -80,8 +143,11 @@ def main():
     in_file = str(sys.argv[2])
     model = str(sys.argv[3])
     paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
-    bigrams_count = calc_bigram_count(in_file, expected_file)
+    #bigrams_count = calc_bigram_count(in_file, expected_file)
+    bigrams_count, words_count = launch_bigrams_and_words(in_file, expected_file)
     bigram_logprobs = calc_bigram_logprobs(bigrams_count)
+    word_logprobs = calc_word_logprobs(words_count)
     with open(model, 'wb') as f:
-        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs],f)
+        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs, word_logprobs],f)
 main()
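After this change the model pickle written by train.py carries four entries, and the prediction script reads the new one as pickle_list[3]. A minimal loading sketch ('model.pkl' stands in for whatever path is passed as sys.argv[3]):

import pickle

# Layout written by train.py after this commit:
#   [paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs, word_logprobs]
with open('model.pkl', 'rb') as f:
    paranormal_lp, sceptic_lp, bigram_lp, word_lp = pickle.load(f)

# Note: train.py_only_bi below still dumps the old three-element list, so its
# models are not directly usable with the updated prediction script.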

train.py_only_bi (new executable file)

@@ -0,0 +1,94 @@
#!/usr/bin/python3
from collections import defaultdict
import math
import pickle
import re
import sys
import nltk
from nltk.corpus import stopwords

def calc_class_logprob(expected_path):
    paranormal_classcount = 0
    sceptic_classcount = 0
    with open(expected_path) as f:
        for line in f:
            line = line.rstrip('\n').replace(' ','')
            if 'P' in line:
                paranormal_classcount +=1
            elif 'S' in line:
                sceptic_classcount +=1
    paranol_prob = paranormal_classcount / (paranormal_classcount + sceptic_classcount)
    sceptic_prob = sceptic_classcount / (paranormal_classcount + sceptic_classcount)
    return math.log(paranol_prob), math.log(sceptic_prob)

def clear_post(post):
    post = post.replace('\\n', ' ')
    post = post.lower()
    # delete links
    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\\”\!\=\^]+', '', post)
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    post = re.sub(r' +', ' ', post)
    post = post.rstrip(' ')
    post = post.split(' ')
    stop_words = set(stopwords.words('english'))
    post_no_stop = [w for w in post if not w in stop_words]
    return post_no_stop

def calc_bigram_count(in_path, expected_path):
    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
    with open(in_path) as infile, open(expected_path) as expected_file:
        num_of_bigams = 0
        for line, exp in zip(infile, expected_file):
            class_ = exp.rstrip('\n').replace(' ', '')
            text, timestap = line.rstrip('\n').split('\t')
            tokens = clear_post(text)
            #tokens = text.lower().split(' ')
            for index in range(len(tokens)-1):
                # if there is next token we append current and next
                bigram = tokens[index] + " " + tokens[index + 1]
                #print(bigram)
                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
                if class_ == 'P':
                    bigram_counts['paranormal'][bigram] +=1
                elif class_ == 'S':
                    bigram_counts['sceptic'][bigram] +=1
                num_of_bigams +=1
        #print(f"num of every added bigams with repetitions {num_of_bigams})")
        #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
    return bigram_counts

def calc_bigram_logprobs(bigram_counts):
    total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
    total_paranormal = sum(bigram_counts['paranormal'].values()) + len(bigram_counts['paranormal'].keys())
    bigram_logprobs = {'paranormal' : {}, 'sceptic' : {}}
    for class_ in bigram_counts.keys():
        for bigram, value in bigram_counts[class_].items():
            if class_ == "sceptic":
                bigram_prob = (value + 1) / total_sceptic
            elif class_ == "paranormal":
                bigram_prob = (value + 1) / total_paranormal
            bigram_logprobs[class_][bigram] = math.log(bigram_prob)
    return bigram_logprobs

def main():
    if len(sys.argv) != 4:
        print("syntax is ./train.py expected.tsv in.tsv model.pkl")
        return
    expected_file = str(sys.argv[1])
    in_file = str(sys.argv[2])
    model = str(sys.argv[3])
    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
    bigrams_count = calc_bigram_count(in_file, expected_file)
    bigram_logprobs = calc_bigram_logprobs(bigrams_count)
    with open(model, 'wb') as f:
        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs],f)

main()

train/in.tsv

File diff suppressed because one or more lines are too long