Begin lin reg

s426135 2020-04-04 22:07:48 +02:00
parent 4b926f648f
commit 0839c5ca41
3 changed files with 228 additions and 132 deletions

train.py (203 lines changed)

@@ -1,33 +1,10 @@
 #!/usr/bin/python3
-from collections import defaultdict
-import math
-import pickle
-import re
-import sys
-import nltk
+import re, sys, pickle, nltk, math, random
 from nltk.corpus import stopwords
 
-def calc_class_logprob(expected_path):
-    paranormal_classcount = 0
-    sceptic_classcount = 0
-    with open(expected_path) as f:
-        for line in f:
-            line = line.rstrip('\n').replace(' ','')
-            if 'P' in line:
-                paranormal_classcount += 1
-            elif 'S' in line:
-                sceptic_classcount += 1
-    paranol_prob = paranormal_classcount / (paranormal_classcount + sceptic_classcount)
-    sceptic_prob = sceptic_classcount / (paranormal_classcount + sceptic_classcount)
-    return math.log(paranol_prob), math.log(sceptic_prob)
-
 def clear_post(post):
     post = post.replace('\\n', ' ')
     post = post.lower()
-    # delete links
     post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
     post = re.sub(r'[\.\,\/\~]+', ' ', post)
     post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
@@ -40,118 +17,80 @@ def clear_post(post):
     post_no_stop = [w for w in post if not w in stop_words]
     return post_no_stop
 
-#def calc_bigram_count(in_path, expected_path):
-#    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
-#    with open(in_path) as infile, open(expected_path) as expected_file:
-#        num_of_bigams = 0
-#        for line, exp in zip(infile, expected_file):
-#            class_ = exp.rstrip('\n').replace(' ', '')
-#            text, timestap = line.rstrip('\n').split('\t')
-#            tokens = clear_post(text)
-#            #tokens = text.lower().split(' ')
-#            for index in range(len(tokens)-1):
-#                # if there is a next token we append current and next
-#                bigram = tokens[index] + " " + tokens[index + 1]
-#                #print(bigram)
-#                #print(f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
-#                if class_ == 'P':
-#                    bigram_counts['paranormal'][bigram] += 1
-#                elif class_ == 'S':
-#                    bigram_counts['sceptic'][bigram] += 1
-#                num_of_bigams += 1
-#    #print(f"num of every added bigrams with repetitions {num_of_bigams}")
-#    #print(f"num of bigrams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
-#    return bigram_counts
-
-def calc_bigram_logprobs(bigram_counts):
-    total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
-    total_paranormal = sum(bigram_counts['paranormal'].values()) + len(bigram_counts['paranormal'].keys())
-    bigram_logprobs = {'paranormal' : {}, 'sceptic' : {}}
-    for class_ in bigram_counts.keys():
-        for bigram, value in bigram_counts[class_].items():
-            if class_ == "sceptic":
-                bigram_prob = (value + 1) / total_sceptic
-            elif class_ == "paranormal":
-                bigram_prob = (value + 1) / total_paranormal
-            bigram_logprobs[class_][bigram] = math.log(bigram_prob)
-    return bigram_logprobs
-
-#def calc_word_count(in_path, expected_path):
-#    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}  # dict of per-class dicts holding each word and how many times it occurs
-#    with open(in_path) as infile, open(expected_path) as expectedfile:
-#        for line, exp in zip(infile, expectedfile):
-#            class_ = exp.rstrip('\n').replace(' ','')
-#            text, timestap = line.rstrip('\n').split('\t')
-#            #print(f"text {type(text)}")
-#            text = clear_tokens(text, True)
-#            tokens = text.lower().split(' ')
-#            #print(f"tokens {type(tokens)}")
-#            for token in tokens:
-#                clear_tokens(token, False)
-#                if class_ == 'P':
-#                    word_counts['paranormal'][token] += 1
-#                elif class_ == 'S':
-#                    word_counts['sceptic'][token] += 1
-#
-#    return word_counts
-
-def calc_word_logprobs(word_counts):
-    total_skeptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys())
-    total_paranormal = sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys())
-    word_logprobs = {'paranormal': {}, 'sceptic': {}}
-    for class_ in word_counts.keys():  # sceptic, paranormal
-        for token, value in word_counts[class_].items():
-            if class_ == 'sceptic':
-                word_prob = (value + 1) / total_skeptic
-            elif class_ == 'paranormal':
-                word_prob = (value + 1) / total_paranormal
-            #print(token)
-            word_logprobs[class_][token] = math.log(word_prob)
-    return word_logprobs
-
-def launch_bigrams_and_words(in_path, expected_path):
-    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
-    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
-    with open(in_path) as infile, open(expected_path) as expected_file:
-        for line, exp in zip(infile, expected_file):
-            class_ = exp.rstrip('\n').replace(' ', '')
+# do the words have to be a set?
+def create_vocabulary_and_documents(in_file, expected_file):
+    vocabulary = set()
+    posts = {}
+    with open(in_file) as in_f, open(expected_file) as exp_f:
+        for line, exp in zip(in_f, exp_f):
             text, timestap = line.rstrip('\n').split('\t')
-            tokens = clear_post(text)
-            for index in range(len(tokens)-1):
-                # if there is a next token we append current and next
-                bigram = tokens[index] + " " + tokens[index + 1]
-                #print(bigram)
-                #print(f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
-                if class_ == 'P':
-                    bigram_counts['paranormal'][bigram] += 1
-                    word_counts['paranormal'][tokens[index]] += 1
-                elif class_ == 'S':
-                    bigram_counts['sceptic'][bigram] += 1
-                    word_counts['sceptic'][tokens[index]] += 1
-    return bigram_counts, word_counts
+            post = clear_post(text)
+            posts[" ".join(post)] = int(exp)
+            for word in post:
+                vocabulary.add(word)
+    return vocabulary, posts
+
+def create_mappings(vocabulary):
+    word_to_index_mapping = {}
+    index_to_word_mapping = {}
+    xi = 1
+    for word in vocabulary:
+        word_to_index_mapping[word] = xi
+        index_to_word_mapping[xi] = word
+        xi += 1
+    return word_to_index_mapping, index_to_word_mapping
 
 def main():
     if len(sys.argv) != 4:
-        print("syntax is ./train.py expected.tsv in.tsv model.pkl")
+        print("syntax ./train.py model expected_file in_file")
         return
-    expected_file = str(sys.argv[1])
-    in_file = str(sys.argv[2])
-    model = str(sys.argv[3])
-    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
-    #bigrams_count = calc_bigram_count(in_file, expected_file)
-    bigrams_count, words_count = launch_bigrams_and_words(in_file, expected_file)
-    bigram_logprobs = calc_bigram_logprobs(bigrams_count)
-    word_logprobs = calc_word_logprobs(words_count)
-    total_sceptic_bigram = sum(bigrams_count['sceptic'].values()) + len(bigrams_count['sceptic'].keys())
-    total_paranormal_bigram = sum(bigrams_count['paranormal'].values()) + len(bigrams_count['paranormal'].keys())
-    total_sceptic_word = sum(words_count['sceptic'].values()) + len(words_count['sceptic'].keys())
-    total_paranormal_word = sum(words_count['paranormal'].values()) + len(words_count['paranormal'].keys())
-    with open(model, 'wb') as f:
-        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs, word_logprobs, total_sceptic_bigram, total_paranormal_bigram, total_sceptic_word, total_paranormal_word], f)
+    model = str(sys.argv[1])
+    expected_file = str(sys.argv[2])
+    in_file = str(sys.argv[3])
+    vocabulary, posts = create_vocabulary_and_documents(in_file, expected_file)
+    word_to_index_mapping, index_to_word_mapping = create_mappings(vocabulary)
+    # one weight per vocabulary word, plus the bias at index 0
+    weights = []
+    for xi in range(0, len(vocabulary) + 1):
+        weights.append(random.uniform(-0.01, 0.01))
+    learning_rate = 0.000001
+    loss_sum = 0.0
+    loss_sum_counter = 0
+    lowest_loss_sum_weights = []
+    lowest_loss_sum = 10000.0
+    print(f"len of vocabulary {len(vocabulary)}")
+    # the bound could instead be set to a very, very large iteration count
+    while True:  # loss_sum_counter != 10:
+        try:
+            # stochastic gradient descent: one randomly drawn document per step
+            d, y = random.choice(list(posts.items()))
+            y_hat = weights[0]
+            tokens = d.split(' ')
+            for word in tokens:
+                # could also do something smarter with the counts here
+                #print(f"{d.count(word)} : {word}")
+                y_hat += weights[word_to_index_mapping[word]] * tokens.count(word)
+            loss = (y_hat - y)**2
+            loss_sum += loss
+            delta = (y_hat - y) * learning_rate  # factor 2 from the squared-error derivative is absorbed into the learning rate
+            if loss_sum_counter % 100 == 0:
+                print(f"{loss_sum / 1000} : {loss_sum_counter} : {y_hat} : {delta}")
+                loss_sum_counter = 0
+                loss_sum = 0
+            weights[0] -= delta
+            for word in tokens:
+                weights[word_to_index_mapping[word]] -= tokens.count(word) * delta
+            if lowest_loss_sum > loss_sum and loss_sum != 0:
+                print("it happened")
+                lowest_loss_sum = loss_sum
+                lowest_loss_sum_weights = weights[:]  # copy, not alias, so later updates don't overwrite the best weights
+            loss_sum_counter += 1
+        except KeyboardInterrupt:
+            break
+    print(lowest_loss_sum_weights)
 
 main()
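For reference, the loop added above is ordinary linear regression on bag-of-words counts: the prediction is y_hat = w[0] + sum over words of w[word] * count(word), and each step moves the bias and every active word weight against the error (the factor 2 from d/dw (y_hat - y)^2 is folded into the learning rate). Below is a minimal, self-contained sketch of the same update rule; the toy documents and 0/1 labels are hypothetical, chosen only to make the script runnable:

#!/usr/bin/python3
import random

# hypothetical toy corpus: document text -> numeric label
docs = {"ghost haunted house": 1, "science debunks myth": 0}
vocab = sorted({w for d in docs for w in d.split()})
w2i = {w: i + 1 for i, w in enumerate(vocab)}  # index 0 is reserved for the bias
weights = [random.uniform(-0.01, 0.01) for _ in range(len(vocab) + 1)]
learning_rate = 0.01

for _ in range(1000):
    d, y = random.choice(list(docs.items()))   # one random document per SGD step
    tokens = d.split()
    y_hat = weights[0] + sum(weights[w2i[t]] * tokens.count(t) for t in tokens)
    delta = (y_hat - y) * learning_rate        # error times step size
    weights[0] -= delta
    for t in tokens:
        weights[w2i[t]] -= tokens.count(t) * delta

print([round(w, 3) for w in weights])

Note that, as in the commit's loop, a token occurring k times in a document is visited k times and each visit subtracts k * delta, so its effective update is k^2 * delta; the sketch preserves that quirk rather than correcting it.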

train_bigram.py (new executable file, 157 lines)

@@ -0,0 +1,157 @@
#!/usr/bin/python3
from collections import defaultdict
import math
import pickle
import re
import sys
import nltk
from nltk.corpus import stopwords

def calc_class_logprob(expected_path):
    # class priors estimated from the labels file
    paranormal_classcount = 0
    sceptic_classcount = 0
    with open(expected_path) as f:
        for line in f:
            line = line.rstrip('\n').replace(' ','')
            if 'P' in line:
                paranormal_classcount += 1
            elif 'S' in line:
                sceptic_classcount += 1
    paranol_prob = paranormal_classcount / (paranormal_classcount + sceptic_classcount)
    sceptic_prob = sceptic_classcount / (paranormal_classcount + sceptic_classcount)
    return math.log(paranol_prob), math.log(sceptic_prob)

def clear_post(post):
    post = post.replace('\\n', ' ')
    post = post.lower()
    # delete links
    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\\\!\=\^]+', '', post)
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    post = re.sub(r' +', ' ', post)
    post = post.rstrip(' ')
    post = post.split(' ')
    stop_words = set(stopwords.words('english'))
    post_no_stop = [w for w in post if not w in stop_words]
    return post_no_stop

#def calc_bigram_count(in_path, expected_path):
#    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
#    with open(in_path) as infile, open(expected_path) as expected_file:
#        num_of_bigams = 0
#        for line, exp in zip(infile, expected_file):
#            class_ = exp.rstrip('\n').replace(' ', '')
#            text, timestap = line.rstrip('\n').split('\t')
#            tokens = clear_post(text)
#            #tokens = text.lower().split(' ')
#            for index in range(len(tokens)-1):
#                # if there is a next token we append current and next
#                bigram = tokens[index] + " " + tokens[index + 1]
#                #print(bigram)
#                #print(f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
#                if class_ == 'P':
#                    bigram_counts['paranormal'][bigram] += 1
#                elif class_ == 'S':
#                    bigram_counts['sceptic'][bigram] += 1
#                num_of_bigams += 1
#    #print(f"num of every added bigrams with repetitions {num_of_bigams}")
#    #print(f"num of bigrams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
#    return bigram_counts

def calc_bigram_logprobs(bigram_counts):
    # add-one smoothing: +1 per bigram, number of distinct bigrams added to the denominator
    total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
    total_paranormal = sum(bigram_counts['paranormal'].values()) + len(bigram_counts['paranormal'].keys())
    bigram_logprobs = {'paranormal' : {}, 'sceptic' : {}}
    for class_ in bigram_counts.keys():
        for bigram, value in bigram_counts[class_].items():
            if class_ == "sceptic":
                bigram_prob = (value + 1) / total_sceptic
            elif class_ == "paranormal":
                bigram_prob = (value + 1) / total_paranormal
            bigram_logprobs[class_][bigram] = math.log(bigram_prob)
    return bigram_logprobs

#def calc_word_count(in_path, expected_path):
#    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}  # dict of per-class dicts holding each word and how many times it occurs
#    with open(in_path) as infile, open(expected_path) as expectedfile:
#        for line, exp in zip(infile, expectedfile):
#            class_ = exp.rstrip('\n').replace(' ','')
#            text, timestap = line.rstrip('\n').split('\t')
#            #print(f"text {type(text)}")
#            text = clear_tokens(text, True)
#            tokens = text.lower().split(' ')
#            #print(f"tokens {type(tokens)}")
#            for token in tokens:
#                clear_tokens(token, False)
#                if class_ == 'P':
#                    word_counts['paranormal'][token] += 1
#                elif class_ == 'S':
#                    word_counts['sceptic'][token] += 1
#
#    return word_counts

def calc_word_logprobs(word_counts):
    total_skeptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys())
    total_paranormal = sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys())
    word_logprobs = {'paranormal': {}, 'sceptic': {}}
    for class_ in word_counts.keys():  # sceptic, paranormal
        for token, value in word_counts[class_].items():
            if class_ == 'sceptic':
                word_prob = (value + 1) / total_skeptic
            elif class_ == 'paranormal':
                word_prob = (value + 1) / total_paranormal
            #print(token)
            word_logprobs[class_][token] = math.log(word_prob)
    return word_logprobs

def launch_bigrams_and_words(in_path, expected_path):
    # single pass over the corpus collecting unigram and bigram counts per class
    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
    with open(in_path) as infile, open(expected_path) as expected_file:
        for line, exp in zip(infile, expected_file):
            class_ = exp.rstrip('\n').replace(' ', '')
            text, timestap = line.rstrip('\n').split('\t')
            tokens = clear_post(text)
            for index in range(len(tokens)-1):
                # if there is a next token we append current and next
                bigram = tokens[index] + " " + tokens[index + 1]
                #print(bigram)
                #print(f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
                if class_ == 'P':
                    bigram_counts['paranormal'][bigram] += 1
                    word_counts['paranormal'][tokens[index]] += 1
                elif class_ == 'S':
                    bigram_counts['sceptic'][bigram] += 1
                    word_counts['sceptic'][tokens[index]] += 1
    return bigram_counts, word_counts

def main():
    if len(sys.argv) != 4:
        print("syntax is ./train_bigram.py expected.tsv in.tsv model.pkl")
        return
    expected_file = str(sys.argv[1])
    in_file = str(sys.argv[2])
    model = str(sys.argv[3])
    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
    #bigrams_count = calc_bigram_count(in_file, expected_file)
    bigrams_count, words_count = launch_bigrams_and_words(in_file, expected_file)
    bigram_logprobs = calc_bigram_logprobs(bigrams_count)
    word_logprobs = calc_word_logprobs(words_count)
    total_sceptic_bigram = sum(bigrams_count['sceptic'].values()) + len(bigrams_count['sceptic'].keys())
    total_paranormal_bigram = sum(bigrams_count['paranormal'].values()) + len(bigrams_count['paranormal'].keys())
    total_sceptic_word = sum(words_count['sceptic'].values()) + len(words_count['sceptic'].keys())
    total_paranormal_word = sum(words_count['paranormal'].values()) + len(words_count['paranormal'].keys())
    with open(model, 'wb') as f:
        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs, word_logprobs, total_sceptic_bigram, total_paranormal_bigram, total_sceptic_word, total_paranormal_word], f)

main()
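train_bigram.py preserves the earlier naive Bayes pipeline that train.py abandons: class log-priors from calc_class_logprob plus per-class token and bigram log-likelihoods with add-one (Laplace) smoothing, i.e. P(w | c) = (count(w, c) + 1) / (total count in c + number of distinct items in c). A quick standalone check of that formula; the counts below are hypothetical, shaped like the dicts calc_word_logprobs receives:

#!/usr/bin/python3
import math

# hypothetical per-class counts
word_counts = {'sceptic': {'hoax': 3, 'ghost': 1}, 'paranormal': {'ghost': 4}}
for class_, counts in word_counts.items():
    total = sum(counts.values()) + len(counts)  # add-one smoothing denominator
    for token, value in counts.items():
        print(class_, token, math.log((value + 1) / total))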