Updated with stopwords
This commit is contained in:
parent a3a146a87c
commit d1ca0a2ea8

.gitignore (vendored, new file, +3)
@@ -0,0 +1,3 @@
+dev-0/in.tsv
+train/in.tsv
+test/in.tsv

dev-0/in.tsv (+5272)
File diff suppressed because one or more lines are too long

dev-0/in.tsv.xz (binary, new file)
Binary file not shown.

dev-0/out.tsv (+4935)
File diff suppressed because it is too large

dev-0/together (+5272)
File diff suppressed because one or more lines are too long

(deleted file, name not shown in this view, -1)
@@ -1 +0,0 @@
-e412b617206095df98ac606360b222d0 naive_base_model.pkl

naive_bigram.pkl (binary)
Binary file not shown.

predict.py (23 changed lines)
@@ -4,11 +4,13 @@ import pickle
 import math
 import re
 import sys
+import nltk
+from nltk.corpus import stopwords
 
-def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs):
+def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs, words_logprobs):
     text, timestap = post.rstrip('\n').split('\t')
-    text = clear_post(text)
-    tokens = text.lower().split(' ')
+    tokens = clear_post(text)
+    #tokens = text.lower().split(' ')
     probs = {}
     for class_ in bigrams_logprobs.keys():
         product = 0
@@ -20,16 +22,23 @@ def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigra
                 product += bigrams_logprobs[class_][bigram]
             except KeyError:
                 product += 0
+        for token in tokens:
+            try:
+                product += words_logprobs[class_][token]
+            except KeyError:
+                product += 0
         if class_ == 'sceptic':
             product += sceptic_class_logprob
         elif class_ == 'paranormal':
             product += paranormal_class_logprob
         probs[abs(product)] = class_
 
     #print(probs)
     return probs[max(probs.keys())]
 
 def clear_post(post):
     post = post.replace('\\n', ' ')
+    post = post.lower()
     post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+\%]+(\)|)', ' internetlink ', post)
     post = re.sub(r'[\.\,\/\~]+', ' ', post)
     post = re.sub(r'(<|>|\@[a-zA-Z0-9]+)','',post)
@@ -37,7 +46,10 @@ def clear_post(post):
     post = re.sub(r'( \- |\-\-+)', ' ', post)
     post = re.sub(r' +', ' ', post)
     post = post.rstrip(' ')
-    return post
+    post = post.split(' ')
+    stop_words = set(stopwords.words('english'))
+    post_no_stop = [w for w in post if not w in stop_words]
+    return post_no_stop
 
 def main():
     if len(sys.argv) != 4:
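
Note: the rewritten clear_post above now returns a lowercased token list with NLTK's English stopwords removed, instead of a cleaned string. A minimal sketch of that behaviour (the sample post is made up); the stopword list has to be fetched once with nltk.download('stopwords'):

    import nltk
    nltk.download('stopwords')                      # one-time corpus download
    from nltk.corpus import stopwords

    stop_words = set(stopwords.words('english'))
    post = "the ghost was seen near the old house"  # hypothetical input post
    tokens = [w for w in post.lower().split(' ') if w not in stop_words]
    print(tokens)                                   # expected: ['ghost', 'seen', 'near', 'old', 'house']
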
@@ -52,10 +64,11 @@ def main():
     paranormal_class_logprob = pickle_list[0]
     sceptic_class_logprob = pickle_list[1]
     bigrams_logprobs = pickle_list[2]
+    words_logprobs = pickle_list[3]
 
     with open(in_file) as in_f, open(out_file, 'w') as out_f:
         for line in in_f:
-            hyp = calc_post_class(line, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs)
+            hyp = calc_post_class(line, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs, words_logprobs)
             if hyp == 'sceptic':
                 out_f.write(' S\n')
             elif hyp == 'paranormal':
@@ -69,3 +69,4 @@ def main():
             elif hyp == 'paranormal':
                 out_f.write(' P\n')
 main()
+c
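
Note: with the new words_logprobs table, calc_post_class scores each class as its log-prior plus the summed bigram and unigram log-probabilities of the post, skipping unseen n-grams. The diff keeps probs[abs(product)] with max(), which selects the largest-magnitude (most negative) total; a conventional naive Bayes decision takes the maximum of the raw log scores, as in this standalone sketch (the toy log-probability tables are invented for illustration):

    import math

    # hypothetical toy model
    class_logprob = {'sceptic': math.log(0.6), 'paranormal': math.log(0.4)}
    words_logprobs = {'sceptic': {'ghost': math.log(0.01)},
                      'paranormal': {'ghost': math.log(0.05)}}

    def score(tokens, class_):
        s = class_logprob[class_]
        for token in tokens:
            s += words_logprobs[class_].get(token, 0)   # unseen tokens contribute 0, as in predict.py
        return s

    tokens = ['ghost', 'story']
    print(max(['sceptic', 'paranormal'], key=lambda c: score(tokens, c)))   # 'paranormal'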

test-A/in.tsv (+5152)
File diff suppressed because one or more lines are too long

test-A/in.tsv.xz (binary, new file)
Binary file not shown.

test-A/out.tsv (+1326)
File diff suppressed because it is too large

train.py (116 changed lines)
@@ -4,6 +4,8 @@ import math
 import pickle
 import re
 import sys
+import nltk
+from nltk.corpus import stopwords
 
 def calc_class_logprob(expected_path):
     paranormal_classcount = 0
@@ -24,6 +26,7 @@ def calc_class_logprob(expected_path):
 
 def clear_post(post):
     post = post.replace('\\n', ' ')
+    post = post.lower()
     # delete links
     post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
     post = re.sub(r'[\.\,\/\~]+', ' ', post)
@@ -32,30 +35,33 @@ def clear_post(post):
     post = re.sub(r'( \- |\-\-+)', ' ', post)
     post = re.sub(r' +', ' ', post)
     post = post.rstrip(' ')
-    return post
+    post = post.split(' ')
+    stop_words = set(stopwords.words('english'))
+    post_no_stop = [w for w in post if not w in stop_words]
+    return post_no_stop
 
-def calc_bigram_count(in_path, expected_path):
-    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
-    with open(in_path) as infile, open(expected_path) as expected_file:
-        num_of_bigams = 0
-        for line, exp in zip(infile, expected_file):
-            class_ = exp.rstrip('\n').replace(' ', '')
-            text, timestap = line.rstrip('\n').split('\t')
-            text = clear_post(text)
-            tokens = text.lower().split(' ')
-            for index in range(len(tokens)-1):
-                # if there is next token we append current and next
-                bigram = tokens[index] + " " + tokens[index + 1]
-                #print(bigram)
-                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
-                if class_ == 'P':
-                    bigram_counts['paranormal'][bigram] +=1
-                elif class_ == 'S':
-                    bigram_counts['sceptic'][bigram] +=1
-                num_of_bigams +=1
-        #print(f"num of every added bigams with repetitions {num_of_bigams})")
-        #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
-    return bigram_counts
+#def calc_bigram_count(in_path, expected_path):
+#    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
+#    with open(in_path) as infile, open(expected_path) as expected_file:
+#        num_of_bigams = 0
+#        for line, exp in zip(infile, expected_file):
+#            class_ = exp.rstrip('\n').replace(' ', '')
+#            text, timestap = line.rstrip('\n').split('\t')
+#            tokens = clear_post(text)
+#            #tokens = text.lower().split(' ')
+#            for index in range(len(tokens)-1):
+#                # if there is next token we append current and next
+#                bigram = tokens[index] + " " + tokens[index + 1]
+#                #print(bigram)
+#                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
+#                if class_ == 'P':
+#                    bigram_counts['paranormal'][bigram] +=1
+#                elif class_ == 'S':
+#                    bigram_counts['sceptic'][bigram] +=1
+#                num_of_bigams +=1
+#        #print(f"num of every added bigams with repetitions {num_of_bigams})")
+#        #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
+#    return bigram_counts
 
 def calc_bigram_logprobs(bigram_counts):
     total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
@@ -72,6 +78,63 @@ def calc_bigram_logprobs(bigram_counts):
 
     return bigram_logprobs
 
+#def calc_word_count(in_path, expected_path):
+#    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)} # dict holding, per class, a dictionary of words and how many times they occur
+#    with open(in_path) as infile, open(expected_path) as expectedfile:
+#        for line, exp in zip(infile, expectedfile):
+#            class_ = exp.rstrip('\n').replace(' ','')
+#            text, timestap =line.rstrip('\n').split('\t')
+#            #print(f"text {type(text)}")
+#            text = clear_tokens(text, True)
+#            tokens = text.lower().split(' ')
+#            #print(f"tokens {type(tokens)}")
+#            for token in tokens:
+#                clear_tokens(token,False)
+#                if class_ == 'P':
+#                    word_counts['paranormal'][token] += 1
+#                elif class_ == 'S':
+#                    word_counts['sceptic'][token]+=1
+#
+#    return word_counts
+
+def calc_word_logprobs(word_counts):
+    total_skeptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys())
+    total_paranormal = sum(word_counts['paranormal'].values())+ len(word_counts['paranormal'].keys())
+    word_logprobs= {'paranormal': {}, 'sceptic': {}}
+    for class_ in word_counts.keys(): # sceptic, paranormal
+        for token, value in word_counts[class_].items():
+            if class_ == 'sceptic':
+                word_prob = (value +1)/ total_skeptic
+            elif class_ == 'paranormal':
+                word_prob = (value+1)/ total_paranormal
+
+            #print (token)
+            word_logprobs[class_][token] = math.log(word_prob)
+
+    return word_logprobs
+
+def launch_bigrams_and_words(in_path, expected_path):
+    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)}
+    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
+    with open(in_path) as infile, open(expected_path) as expected_file:
+        for line, exp in zip(infile, expected_file):
+            class_ = exp.rstrip('\n').replace(' ', '')
+            text, timestap = line.rstrip('\n').split('\t')
+            tokens = clear_post(text)
+            for index in range(len(tokens)-1):
+                # if there is next token we append current and next
+                bigram = tokens[index] + " " + tokens[index + 1]
+                #print(bigram)
+                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
+                if class_ == 'P':
+                    bigram_counts['paranormal'][bigram] +=1
+                    word_counts['paranormal'][tokens[index]] +=1
+                elif class_ == 'S':
+                    bigram_counts['sceptic'][bigram] +=1
+                    word_counts['sceptic'][tokens[index]] +=1
+
+    return bigram_counts, word_counts
+
 def main():
     if len(sys.argv) != 4:
         print("syntax is ./train.py expected.tsv in.tsv model.pkl")
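
Note: calc_word_logprobs above applies add-one (Laplace) smoothing: each word count is incremented by one and divided by the class's total count plus its vocabulary size, then logged. A small worked example with made-up counts; also note that launch_bigrams_and_words counts words inside the bigram loop, so the last token of each post does not reach word_counts.

    import math
    from collections import defaultdict

    # hypothetical counts for one class
    word_counts = {'sceptic': defaultdict(int, {'ghost': 3, 'hoax': 1}),
                   'paranormal': defaultdict(int)}

    total_sceptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys())  # 4 + 2 = 6
    word_prob = (word_counts['sceptic']['ghost'] + 1) / total_sceptic                          # 4/6
    print(math.log(word_prob))                                                                 # ≈ -0.405
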
@@ -80,8 +143,11 @@ def main():
     in_file = str(sys.argv[2])
     model = str(sys.argv[3])
     paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
-    bigrams_count = calc_bigram_count(in_file, expected_file)
+    #bigrams_count = calc_bigram_count(in_file, expected_file)
+    bigrams_count, words_count = launch_bigrams_and_words(in_file, expected_file)
     bigram_logprobs = calc_bigram_logprobs(bigrams_count)
+    word_logprobs = calc_word_logprobs(words_count)
     with open(model, 'wb') as f:
-        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs],f)
+        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs, word_logprobs],f)
 main()
+
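
Note: the model pickled by the updated train.py is now a four-element list, which is why predict.py unpacks pickle_list[3] as words_logprobs. A minimal loading sketch (using naive_bigram.pkl, the model file committed here, as the example path):

    import pickle

    with open('naive_bigram.pkl', 'rb') as f:
        (paranormal_class_logprob, sceptic_class_logprob,
         bigrams_logprobs, words_logprobs) = pickle.load(f)

Models written by the older bigram-only trainer contain only the first three entries.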

train.py_only_bi (new executable file, +94)
@@ -0,0 +1,94 @@
+#!/usr/bin/python3
+from collections import defaultdict
+import math
+import pickle
+import re
+import sys
+import nltk
+from nltk.corpus import stopwords
+
+def calc_class_logprob(expected_path):
+    paranormal_classcount = 0
+    sceptic_classcount = 0
+
+    with open(expected_path) as f:
+        for line in f:
+            line = line.rstrip('\n').replace(' ','')
+            if 'P' in line:
+                paranormal_classcount +=1
+            elif 'S' in line:
+                sceptic_classcount +=1
+
+    paranol_prob = paranormal_classcount / (paranormal_classcount + sceptic_classcount)
+    sceptic_prob = sceptic_classcount / (paranormal_classcount + sceptic_classcount)
+
+    return math.log(paranol_prob), math.log(sceptic_prob)
+
+def clear_post(post):
+    post = post.replace('\\n', ' ')
+    post = post.lower()
+    # delete links
+    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
+    post = re.sub(r'[\.\,\/\~]+', ' ', post)
+    post = re.sub(r'(<|>|\@[a-zA-Z0-9]+)','',post)
+    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\–\”\!\=\^]+', '', post)
+    post = re.sub(r'( \- |\-\-+)', ' ', post)
+    post = re.sub(r' +', ' ', post)
+    post = post.rstrip(' ')
+    post = post.split(' ')
+    stop_words = set(stopwords.words('english'))
+    post_no_stop = [w for w in post if not w in stop_words]
+    return post_no_stop
+
+def calc_bigram_count(in_path, expected_path):
+    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
+    with open(in_path) as infile, open(expected_path) as expected_file:
+        num_of_bigams = 0
+        for line, exp in zip(infile, expected_file):
+            class_ = exp.rstrip('\n').replace(' ', '')
+            text, timestap = line.rstrip('\n').split('\t')
+            tokens = clear_post(text)
+            #tokens = text.lower().split(' ')
+            for index in range(len(tokens)-1):
+                # if there is next token we append current and next
+                bigram = tokens[index] + " " + tokens[index + 1]
+                #print(bigram)
+                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
+                if class_ == 'P':
+                    bigram_counts['paranormal'][bigram] +=1
+                elif class_ == 'S':
+                    bigram_counts['sceptic'][bigram] +=1
+                num_of_bigams +=1
+        #print(f"num of every added bigams with repetitions {num_of_bigams})")
+        #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
+    return bigram_counts
+
+def calc_bigram_logprobs(bigram_counts):
+    total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
+    total_paranormal = sum(bigram_counts['paranormal'].values()) + len(bigram_counts['paranormal'].keys())
+    bigram_logprobs = {'paranormal' : {}, 'sceptic' : {}}
+    for class_ in bigram_counts.keys():
+        for bigram, value in bigram_counts[class_].items():
+            if class_ == "sceptic":
+                bigram_prob = (value + 1) / total_sceptic
+            elif class_ == "paranormal":
+                bigram_prob = (value + 1) / total_paranormal
+
+            bigram_logprobs[class_][bigram] = math.log(bigram_prob)
+
+    return bigram_logprobs
+
+def main():
+    if len(sys.argv) != 4:
+        print("syntax is ./train.py expected.tsv in.tsv model.pkl")
+        return
+    expected_file = str(sys.argv[1])
+    in_file = str(sys.argv[2])
+    model = str(sys.argv[3])
+    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
+    bigrams_count = calc_bigram_count(in_file, expected_file)
+    bigram_logprobs = calc_bigram_logprobs(bigrams_count)
+    with open(model, 'wb') as f:
+        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs],f)
+main()
+
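
Note: train.py_only_bi keeps the previous bigram-only pipeline as a separate executable; its calc_class_logprob converts the P/S label counts from expected.tsv into log-priors. A worked example with invented counts:

    import math

    # hypothetical label counts read from expected.tsv
    paranormal_classcount, sceptic_classcount = 300, 700

    total = paranormal_classcount + sceptic_classcount
    paranormal_logprob = math.log(paranormal_classcount / total)   # log(0.3) ≈ -1.204
    sceptic_logprob = math.log(sceptic_classcount / total)         # log(0.7) ≈ -0.357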

train/in.tsv (+289579)
File diff suppressed because one or more lines are too long