Begin lin reg
This commit is contained in:
parent
4b926f648f
commit
0839c5ca41
203
train.py
203
train.py
@ -1,33 +1,10 @@
|
||||
#!/usr/bin/python3
|
||||
from collections import defaultdict
|
||||
import math
|
||||
import pickle
|
||||
import re
|
||||
import sys
|
||||
import nltk
|
||||
import re, sys, pickle, nltk, math, random
|
||||
from nltk.corpus import stopwords
|
||||
|
||||
def calc_class_logprob(expected_path):
    """Return the natural-log prior probability of each class.

    Reads ``expected_path`` line by line. After the trailing newline and all
    spaces are removed, a line containing ``P`` counts as paranormal;
    otherwise a line containing ``S`` counts as sceptic; any other line is
    ignored.

    Args:
        expected_path: path to the file holding one class label per line.

    Returns:
        A ``(paranormal_logprob, sceptic_logprob)`` tuple. A class that never
        occurs gets ``float('-inf')`` (the log of a zero prior) instead of
        triggering a math domain error.

    Raises:
        ValueError: if the file contains no ``P`` or ``S`` label at all.
    """
    paranormal_count = 0
    sceptic_count = 0

    with open(expected_path) as f:
        for line in f:
            label = line.rstrip('\n').replace(' ', '')
            # 'P' is tested first, so a line holding both letters is paranormal.
            if 'P' in label:
                paranormal_count += 1
            elif 'S' in label:
                sceptic_count += 1

    total = paranormal_count + sceptic_count
    if total == 0:
        # Without any label the priors are undefined; fail with a clear message
        # rather than an opaque ZeroDivisionError.
        raise ValueError(
            "no 'P' or 'S' class labels found in {!r}".format(expected_path)
        )

    def log_prior(count):
        # math.log(0) raises, so map an absent class to -inf explicitly.
        return math.log(count / total) if count else float('-inf')

    return log_prior(paranormal_count), log_prior(sceptic_count)
|
||||
|
||||
def clear_post(post):
|
||||
post = post.replace('\\n', ' ')
|
||||
post = post.lower()
|
||||
# delete links
|
||||
post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
|
||||
post = re.sub(r'[\.\,\/\~]+', ' ', post)
|
||||
post = re.sub(r'(<|>|\@[a-zA-Z0-9]+)','',post)
|
||||
@ -40,118 +17,80 @@ def clear_post(post):
|
||||
post_no_stop = [w for w in post if not w in stop_words]
|
||||
return post_no_stop
|
||||
|
||||
#def calc_bigram_count(in_path, expected_path):
|
||||
# bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
|
||||
# with open(in_path) as infile, open(expected_path) as expected_file:
|
||||
# num_of_bigams = 0
|
||||
# for line, exp in zip(infile, expected_file):
|
||||
# class_ = exp.rstrip('\n').replace(' ', '')
|
||||
# text, timestap = line.rstrip('\n').split('\t')
|
||||
# tokens = clear_post(text)
|
||||
# #tokens = text.lower().split(' ')
|
||||
# for index in range(len(tokens)-1):
|
||||
# # if there is next token we append current and next
|
||||
# bigram = tokens[index] + " " + tokens[index + 1]
|
||||
# #print(bigram)
|
||||
# #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
|
||||
# if class_ == 'P':
|
||||
# bigram_counts['paranormal'][bigram] +=1
|
||||
# elif class_ == 'S':
|
||||
# bigram_counts['sceptic'][bigram] +=1
|
||||
# num_of_bigams +=1
|
||||
# #print(f"num of every added bigams with repetitions {num_of_bigams})")
|
||||
# #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
|
||||
# return bigram_counts
|
||||
|
||||
def calc_bigram_logprobs(bigram_counts):
    """Turn per-class bigram counts into smoothed log-probabilities.

    Works for any set of class keys rather than only ``'sceptic'`` and
    ``'paranormal'``, so an unexpected class can no longer hit a missing
    result bucket or reuse a probability computed for another class.

    Args:
        bigram_counts: mapping of class name -> {bigram: count}.

    Returns:
        Mapping of class name -> {bigram: log((count + 1) / total)}, where
        ``total`` is the sum of that class's counts plus its number of
        distinct bigrams.
    """
    bigram_logprobs = {}
    for class_, counts in bigram_counts.items():
        # Denominator: every occurrence plus one extra per distinct bigram.
        total = sum(counts.values()) + len(counts)
        bigram_logprobs[class_] = {
            bigram: math.log((count + 1) / total)
            for bigram, count in counts.items()
        }
    return bigram_logprobs
|
||||
|
||||
#def calc_word_count(in_path, expected_path):
|
||||
# word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)} # per-class dictionary mapping each word to the number of times it occurs
|
||||
# with open(in_path) as infile, open(expected_path) as expectedfile:
|
||||
# for line, exp in zip(infile, expectedfile):
|
||||
# class_ = exp.rstrip('\n').replace(' ','')
|
||||
# text, timestap =line.rstrip('\n').split('\t')
|
||||
# #print(f"text {type(text)}")
|
||||
# text = clear_tokens(text, True)
|
||||
# tokens = text.lower().split(' ')
|
||||
# #print(f"tokens {type(tokens)}")
|
||||
# for token in tokens:
|
||||
# clear_tokens(token,False)
|
||||
# if class_ == 'P':
|
||||
# word_counts['paranormal'][token] += 1
|
||||
# elif class_ == 'S':
|
||||
# word_counts['sceptic'][token]+=1
|
||||
#
|
||||
# return word_counts
|
||||
|
||||
def calc_word_logprobs(word_counts):
    """Turn per-class word counts into smoothed log-probabilities.

    Works for any set of class keys rather than only ``'sceptic'`` and
    ``'paranormal'``, so an unexpected class can no longer hit a missing
    result bucket or reuse a probability computed for another class.

    Args:
        word_counts: mapping of class name -> {word: count}.

    Returns:
        Mapping of class name -> {word: log((count + 1) / total)}, where
        ``total`` is the sum of that class's counts plus its number of
        distinct words.
    """
    word_logprobs = {}
    for class_, counts in word_counts.items():
        # Denominator: every occurrence plus one extra per distinct word.
        total = sum(counts.values()) + len(counts)
        word_logprobs[class_] = {
            token: math.log((count + 1) / total)
            for token, count in counts.items()
        }
    return word_logprobs
|
||||
|
||||
def launch_bigrams_and_words(in_path, expected_path):
|
||||
word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)}
|
||||
bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
|
||||
with open(in_path) as infile, open(expected_path) as expected_file:
|
||||
for line, exp in zip(infile, expected_file):
|
||||
class_ = exp.rstrip('\n').replace(' ', '')
|
||||
# do the words have to be a set?
|
||||
def create_vocabulary_and_documents(in_file, expected_file):
|
||||
vocabulary = set()
|
||||
posts = {}
|
||||
with open(in_file) as in_f, open(expected_file) as exp_f:
|
||||
for line, exp in zip(in_f, exp_f):
|
||||
text, timestap = line.rstrip('\n').split('\t')
|
||||
tokens = clear_post(text)
|
||||
for index in range(len(tokens)-1):
|
||||
# if there is next token we append current and next
|
||||
bigram = tokens[index] + " " + tokens[index + 1]
|
||||
#print(bigram)
|
||||
#print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
|
||||
if class_ == 'P':
|
||||
bigram_counts['paranormal'][bigram] +=1
|
||||
word_counts['paranormal'][tokens[index]] +=1
|
||||
elif class_ == 'S':
|
||||
bigram_counts['sceptic'][bigram] +=1
|
||||
word_counts['sceptic'][tokens[index]] +=1
|
||||
post = clear_post(text)
|
||||
posts[" ".join(post)] = int(exp)
|
||||
for word in post:
|
||||
vocabulary.add(word)
|
||||
return vocabulary, posts
|
||||
|
||||
return bigram_counts, word_counts
|
||||
def create_mappings(vocabulary):
    """Build word <-> index lookup tables for a vocabulary.

    Indices start at 1 and are assigned in sorted word order. Iterating the
    vocabulary directly (it is built as a set by the caller in this file)
    would give an ordering that can differ between interpreter runs, so the
    same word could land on a different index each time.

    Args:
        vocabulary: iterable of words (mutually orderable, e.g. all ``str``).

    Returns:
        A ``(word_to_index_mapping, index_to_word_mapping)`` tuple of dicts.
    """
    word_to_index_mapping = {}
    index_to_word_mapping = {}
    for xi, word in enumerate(sorted(vocabulary), start=1):
        word_to_index_mapping[word] = xi
        index_to_word_mapping[xi] = word
    return word_to_index_mapping, index_to_word_mapping
|
||||
|
||||
def main():
|
||||
if len(sys.argv) != 4:
|
||||
print("syntax is ./train.py expected.tsv in.tsv model.pkl")
|
||||
print("syntax ./train.py model expected_file in_file")
|
||||
return
|
||||
expected_file = str(sys.argv[1])
|
||||
in_file = str(sys.argv[2])
|
||||
model = str(sys.argv[3])
|
||||
paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
|
||||
#bigrams_count = calc_bigram_count(in_file, expected_file)
|
||||
bigrams_count, words_count = launch_bigrams_and_words(in_file, expected_file)
|
||||
bigram_logprobs = calc_bigram_logprobs(bigrams_count)
|
||||
word_logprobs = calc_word_logprobs(words_count)
|
||||
total_sceptic_bigram = sum(bigrams_count['sceptic'].values()) + len(bigrams_count['sceptic'].keys())
|
||||
total_paranormal_bigram = sum(bigrams_count['paranormal'].values()) + len(bigrams_count['paranormal'].keys())
|
||||
total_sceptic_word = sum(words_count['sceptic'].values()) + len(words_count['sceptic'].keys())
|
||||
total_paranormal_word = sum(words_count['paranormal'].values())+ len(words_count['paranormal'].keys())
|
||||
with open(model, 'wb') as f:
|
||||
pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs, word_logprobs, total_sceptic_bigram, total_paranormal_bigram, total_sceptic_word, total_paranormal_word],f)
|
||||
main()
|
||||
model = str(sys.argv[1])
|
||||
expected_file = str(sys.argv[2])
|
||||
in_file = str(sys.argv[3])
|
||||
vocabulary, posts = create_vocabulary_and_documents(in_file, expected_file)
|
||||
word_to_index_mapping, index_to_word_mapping = create_mappings(vocabulary)
|
||||
|
||||
weights = []
|
||||
for xi in range(0, len(vocabulary) + 1):
|
||||
weights.append(random.uniform(-0.01,0.01))
|
||||
|
||||
learning_rate = 0.000001
|
||||
loss_sum = 0.0
|
||||
loss_sum_counter = 0
|
||||
lowest_loss_sum_weights = []
|
||||
lowest_loss_sum = 10000.0
|
||||
|
||||
print(f"len of vocabulary {len(vocabulary)}")
|
||||
# this can be set to a very, very large number
|
||||
while True: #loss_sum_counter != 10:
|
||||
try:
|
||||
d, y = random.choice(list(posts.items()))
|
||||
y_hat = weights[0]
|
||||
tokens = d.split(' ')
|
||||
for word in tokens:
|
||||
# could also rethink how the count is used so that this works better
|
||||
#print(f"{d.count(word)} : {word}")
|
||||
y_hat += weights[word_to_index_mapping[word]] * tokens.count(word)
|
||||
|
||||
loss = (y_hat - y)**2
|
||||
loss_sum += loss
|
||||
delta = (y_hat - y) * learning_rate
|
||||
if loss_sum_counter % 100 == 0:
|
||||
print(f"{loss_sum /1000} : {loss_sum_counter} : {y_hat} : {delta}")
|
||||
loss_sum_counter = 0
|
||||
loss_sum = 0
|
||||
|
||||
weights[0] -= delta
|
||||
for word in tokens:
|
||||
weights[word_to_index_mapping[word]] -= tokens.count(word) * delta
|
||||
|
||||
if lowest_loss_sum > loss_sum and loss_sum != 0:
|
||||
print("it happened")
|
||||
lowest_loss_sum = loss_sum
|
||||
lowest_loss_sum_weights = weights
|
||||
|
||||
loss_sum_counter +=1
|
||||
except KeyboardInterrupt:
|
||||
break
|
||||
print(lowest_loss_sum_weights)
|
||||
main()
|
||||
|
157
train_bigram.py
Executable file
157
train_bigram.py
Executable file
@ -0,0 +1,157 @@
|
||||
#!/usr/bin/python3
|
||||
from collections import defaultdict
|
||||
import math
|
||||
import pickle
|
||||
import re
|
||||
import sys
|
||||
import nltk
|
||||
from nltk.corpus import stopwords
|
||||
|
||||
def calc_class_logprob(expected_path):
    """Return the natural-log prior probability of each class.

    Reads ``expected_path`` line by line. After the trailing newline and all
    spaces are removed, a line containing ``P`` counts as paranormal;
    otherwise a line containing ``S`` counts as sceptic; any other line is
    ignored.

    Args:
        expected_path: path to the file holding one class label per line.

    Returns:
        A ``(paranormal_logprob, sceptic_logprob)`` tuple. A class that never
        occurs gets ``float('-inf')`` (the log of a zero prior) instead of
        triggering a math domain error.

    Raises:
        ValueError: if the file contains no ``P`` or ``S`` label at all.
    """
    paranormal_count = 0
    sceptic_count = 0

    with open(expected_path) as f:
        for line in f:
            label = line.rstrip('\n').replace(' ', '')
            # 'P' is tested first, so a line holding both letters is paranormal.
            if 'P' in label:
                paranormal_count += 1
            elif 'S' in label:
                sceptic_count += 1

    total = paranormal_count + sceptic_count
    if total == 0:
        # Without any label the priors are undefined; fail with a clear message
        # rather than an opaque ZeroDivisionError.
        raise ValueError(
            "no 'P' or 'S' class labels found in {!r}".format(expected_path)
        )

    def log_prior(count):
        # math.log(0) raises, so map an absent class to -inf explicitly.
        return math.log(count / total) if count else float('-inf')

    return log_prior(paranormal_count), log_prior(sceptic_count)
|
||||
|
||||
_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clear_post(post):
    """Normalise a raw post and return its tokens without stop words.

    Steps, in order: replace literal backslash-n sequences with spaces,
    lower-case, replace links with the token ``internetlink``, turn runs of
    ``. , / ~`` into a space, drop ``<``, ``>`` and ``@name`` mentions, strip
    digits and assorted punctuation, turn free-standing or repeated dashes
    into a space, collapse repeated spaces, split on single spaces and remove
    English stop words.

    Empty tokens are discarded, so a post that is empty or that starts with
    whitespace (for example one beginning with a link) no longer produces an
    ``''`` token that would leak into the word and bigram counts.

    Args:
        post: the raw post text.

    Returns:
        List of cleaned, lower-case tokens.
    """
    post = post.replace('\\n', ' ')
    post = post.lower()
    # Replace links with a placeholder token.
    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    post = re.sub(r'(<|>|\@[a-zA-Z0-9]+)', '', post)
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\–\”\!\=\^]+', '', post)
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    post = re.sub(r' +', ' ', post)
    # Strip both ends: a leading space would otherwise yield an empty token.
    words = post.strip(' ').split(' ')
    stop_words = _english_stop_words()
    return [w for w in words if w and w not in stop_words]
|
||||
|
||||
#def calc_bigram_count(in_path, expected_path):
|
||||
# bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
|
||||
# with open(in_path) as infile, open(expected_path) as expected_file:
|
||||
# num_of_bigams = 0
|
||||
# for line, exp in zip(infile, expected_file):
|
||||
# class_ = exp.rstrip('\n').replace(' ', '')
|
||||
# text, timestap = line.rstrip('\n').split('\t')
|
||||
# tokens = clear_post(text)
|
||||
# #tokens = text.lower().split(' ')
|
||||
# for index in range(len(tokens)-1):
|
||||
# # if there is next token we append current and next
|
||||
# bigram = tokens[index] + " " + tokens[index + 1]
|
||||
# #print(bigram)
|
||||
# #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
|
||||
# if class_ == 'P':
|
||||
# bigram_counts['paranormal'][bigram] +=1
|
||||
# elif class_ == 'S':
|
||||
# bigram_counts['sceptic'][bigram] +=1
|
||||
# num_of_bigams +=1
|
||||
# #print(f"num of every added bigams with repetitions {num_of_bigams})")
|
||||
# #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
|
||||
# return bigram_counts
|
||||
|
||||
def calc_bigram_logprobs(bigram_counts):
    """Turn per-class bigram counts into smoothed log-probabilities.

    Works for any set of class keys rather than only ``'sceptic'`` and
    ``'paranormal'``, so an unexpected class can no longer hit a missing
    result bucket or reuse a probability computed for another class.

    Args:
        bigram_counts: mapping of class name -> {bigram: count}.

    Returns:
        Mapping of class name -> {bigram: log((count + 1) / total)}, where
        ``total`` is the sum of that class's counts plus its number of
        distinct bigrams.
    """
    bigram_logprobs = {}
    for class_, counts in bigram_counts.items():
        # Denominator: every occurrence plus one extra per distinct bigram.
        total = sum(counts.values()) + len(counts)
        bigram_logprobs[class_] = {
            bigram: math.log((count + 1) / total)
            for bigram, count in counts.items()
        }
    return bigram_logprobs
|
||||
|
||||
#def calc_word_count(in_path, expected_path):
|
||||
# word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)} # per-class dictionary mapping each word to the number of times it occurs
|
||||
# with open(in_path) as infile, open(expected_path) as expectedfile:
|
||||
# for line, exp in zip(infile, expectedfile):
|
||||
# class_ = exp.rstrip('\n').replace(' ','')
|
||||
# text, timestap =line.rstrip('\n').split('\t')
|
||||
# #print(f"text {type(text)}")
|
||||
# text = clear_tokens(text, True)
|
||||
# tokens = text.lower().split(' ')
|
||||
# #print(f"tokens {type(tokens)}")
|
||||
# for token in tokens:
|
||||
# clear_tokens(token,False)
|
||||
# if class_ == 'P':
|
||||
# word_counts['paranormal'][token] += 1
|
||||
# elif class_ == 'S':
|
||||
# word_counts['sceptic'][token]+=1
|
||||
#
|
||||
# return word_counts
|
||||
|
||||
def calc_word_logprobs(word_counts):
    """Turn per-class word counts into smoothed log-probabilities.

    Works for any set of class keys rather than only ``'sceptic'`` and
    ``'paranormal'``, so an unexpected class can no longer hit a missing
    result bucket or reuse a probability computed for another class.

    Args:
        word_counts: mapping of class name -> {word: count}.

    Returns:
        Mapping of class name -> {word: log((count + 1) / total)}, where
        ``total`` is the sum of that class's counts plus its number of
        distinct words.
    """
    word_logprobs = {}
    for class_, counts in word_counts.items():
        # Denominator: every occurrence plus one extra per distinct word.
        total = sum(counts.values()) + len(counts)
        word_logprobs[class_] = {
            token: math.log((count + 1) / total)
            for token, count in counts.items()
        }
    return word_logprobs
|
||||
|
||||
def launch_bigrams_and_words(in_path, expected_path):
    """Count words and bigrams per class over a labelled corpus.

    The two files are read in lockstep: each line of ``in_path`` is
    ``text<TAB>timestamp`` and the matching line of ``expected_path`` holds
    its label (``P`` -> paranormal, ``S`` -> sceptic, spaces ignored). Lines
    with any other label are skipped. The text is tokenised with
    ``clear_post``.

    Every token is counted as a word. Previously words were only counted
    inside the bigram loop, so the last token of each post, and every
    single-token post, was missing from the word counts.

    Args:
        in_path: path to the tab-separated input file.
        expected_path: path to the file with one class label per line.

    Returns:
        A ``(bigram_counts, word_counts)`` tuple; each maps ``'paranormal'``
        and ``'sceptic'`` to a ``defaultdict(int)`` of counts. Bigrams are
        two consecutive tokens joined by a single space.

    Raises:
        ValueError: if an input line does not contain exactly one tab.
    """
    class_names = {'P': 'paranormal', 'S': 'sceptic'}
    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
    bigram_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}

    with open(in_path) as infile, open(expected_path) as expected_file:
        for line, exp in zip(infile, expected_file):
            label = exp.rstrip('\n').replace(' ', '')
            text, _timestamp = line.rstrip('\n').split('\t')
            class_name = class_names.get(label)
            if class_name is None:
                # Unknown label: contributes to neither class.
                continue

            tokens = clear_post(text)
            class_words = word_counts[class_name]
            class_bigrams = bigram_counts[class_name]
            for token in tokens:
                class_words[token] += 1
            # Pair each token with its successor to form the bigrams.
            for first, second in zip(tokens, tokens[1:]):
                class_bigrams[first + " " + second] += 1

    return bigram_counts, word_counts
|
||||
|
||||
def _smoothed_total(counts):
    """Return the summed counts plus the number of distinct keys."""
    return sum(counts.values()) + len(counts)


def main():
    """Train the model from the command line and pickle it to disk.

    Expects exactly three arguments: the expected-labels file, the input
    file and the output model path. With any other argument count a usage
    message naming the invoked script is written to stderr and the function
    returns without doing anything.

    The pickled model is a list, in this order: paranormal class log-prob,
    sceptic class log-prob, bigram log-probs, word log-probs, sceptic bigram
    total, paranormal bigram total, sceptic word total, paranormal word
    total.
    """
    if len(sys.argv) != 4:
        # Use argv[0] so the message is right whatever the script is called.
        print(
            "syntax is {} expected.tsv in.tsv model.pkl".format(sys.argv[0]),
            file=sys.stderr,
        )
        return
    expected_file, in_file, model = sys.argv[1:4]

    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
    bigrams_count, words_count = launch_bigrams_and_words(in_file, expected_file)
    bigram_logprobs = calc_bigram_logprobs(bigrams_count)
    word_logprobs = calc_word_logprobs(words_count)

    # Stored with the model alongside the log-probabilities.
    total_sceptic_bigram = _smoothed_total(bigrams_count['sceptic'])
    total_paranormal_bigram = _smoothed_total(bigrams_count['paranormal'])
    total_sceptic_word = _smoothed_total(words_count['sceptic'])
    total_paranormal_word = _smoothed_total(words_count['paranormal'])

    with open(model, 'wb') as f:
        pickle.dump(
            [
                paranormal_class_logprob,
                sceptic_class_logprob,
                bigram_logprobs,
                word_logprobs,
                total_sceptic_bigram,
                total_paranormal_bigram,
                total_sceptic_word,
                total_paranormal_word,
            ],
            f,
        )
|
||||
# Script entry point: called unconditionally, so it also runs when this
# module is imported, not only when it is executed directly.
main()
|
||||
|
Loading…
Reference in New Issue
Block a user