Added linear regression

s426135 2020-04-06 10:41:14 +02:00
parent 0839c5ca41
commit 9fb516216a
5 changed files with 10501 additions and 10432 deletions
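In short (inferred from the diffs below): this adds predict.py, a pickled model binary, and reworks the training script to cache the vocabulary in a 'data' file, run a fixed number of SGD iterations, and pickle the trained weights. The model is a plain linear regression over token counts, y_hat = w0 + sum_j w_j * count(token_j), thresholded at 0.5 to produce a 0/1 label.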

File diff suppressed because it is too large.

model · Normal file (BIN)

Binary file not shown.

predict.py · Executable file · 55 lines

@@ -0,0 +1,55 @@
#!/usr/bin/python3
import pickle, re, sys
from nltk.corpus import stopwords


def clear_post(post):
    # normalise a raw post: strip links, punctuation, digits and stopwords
    post = post.replace('\\n', ' ')
    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)', '', post)
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\\\!\=\^]+', '', post)
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    post = re.sub(r' +', ' ', post)
    post = post.rstrip(' ')
    post = post.split(' ')
    stop_words = set(stopwords.words('english'))
    post_no_stop = [w for w in post if w not in stop_words]
    return post_no_stop


def calc_prob(posts, weights, word_to_index_mapping):
    # score each post with the linear model and print a 0/1 label
    for post in posts:
        d = post.split(' ')
        y_hat = weights[0]
        for token in d:
            try:
                # count token occurrences in the tokenised post
                y_hat += weights[word_to_index_mapping[token]] * d.count(token)
            except KeyError:
                # token unseen during training: contributes nothing
                y_hat += 0
        if y_hat > 0.5:
            print("1")
        else:
            print("0")


def main():
    if len(sys.argv) != 2:
        print("Expected model")
        return
    model = str(sys.argv[1])
    posts = []
    for line in sys.stdin:
        text, timestamp = line.rstrip('\n').split('\t')
        post = clear_post(text)
        posts.append(" ".join(post))
    with open(model, 'rb') as f:
        pickle_list = pickle.load(f)
        weights = pickle_list[0]
        lowest_loss_weights = pickle_list[1]
        word_to_index_mapping = pickle_list[2]
    calc_prob(posts, weights, word_to_index_mapping)


main()
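Usage sketch for predict.py (input/output file names are illustrative, not from the repo): pass the pickled model path as the only argument and pipe tab-separated text<TAB>timestamp lines on stdin; one 0/1 label is printed per input line, e.g. ./predict.py model < dev_in.tsv > dev_out.tsv.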

File diff suppressed because it is too large.


@@ -1,5 +1,5 @@
 #!/usr/bin/python3
-import re, sys, pickle, nltk, math, random
+import re, sys, pickle, random
 from nltk.corpus import stopwords
 
 def clear_post(post):
@@ -28,6 +28,9 @@ def create_vocabulary_and_documents(in_file, expected_file):
         posts[" ".join(post)] = int(exp)
         for word in post:
             vocabulary.add(word)
+    with open('data', 'wb') as f:
+        pickle.dump([vocabulary, posts], f)
+    print("data created")
     return vocabulary, posts
 
 def create_mappings(vocabulary):
@@ -47,14 +50,22 @@ def main():
     model = str(sys.argv[1])
     expected_file = str(sys.argv[2])
     in_file = str(sys.argv[3])
-    vocabulary, posts = create_vocabulary_and_documents(in_file, expected_file)
+    try:
+        with open("data", 'rb') as pos:
+            pickle_list = pickle.load(pos)
+            print("data loaded")
+            vocabulary = pickle_list[0]
+            posts = pickle_list[1]
+    except FileNotFoundError:
+        vocabulary, posts = create_vocabulary_and_documents(in_file, expected_file)
     word_to_index_mapping, index_to_word_mapping = create_mappings(vocabulary)
     weights = []
     for xi in range(0, len(vocabulary) + 1):
         weights.append(random.uniform(-0.01,0.01))
-    learning_rate = 0.000001
+    learning_rate = 0.000000001
     loss_sum = 0.0
     loss_sum_counter = 0
     lowest_loss_sum_weights = []
@@ -62,7 +73,7 @@ def main():
     print(f"len of vocabulary {len(vocabulary)}")
     # can be set to a very, very large number
-    while True: #loss_sum_counter != 10:
+    while loss_sum_counter != 10000:
         try:
             d, y = random.choice(list(posts.items()))
             y_hat = weights[0]
@@ -71,13 +82,14 @@ def main():
                 # could also do something with count so it works better
                 #print(f"{d.count(word)} : {word}")
                 y_hat += weights[word_to_index_mapping[word]] * tokens.count(word)
+                #print(f"{weights[word_to_index_mapping[word]]} : {word}")
             loss = (y_hat - y)**2
             loss_sum += loss
             delta = (y_hat - y) * learning_rate
             if loss_sum_counter % 100 == 0:
-                print(f"{loss_sum /1000} : {loss_sum_counter} : {y_hat} : {delta}")
-                loss_sum_counter = 0
+                print(f"{loss_sum_counter} : {loss_sum /1000} : {y_hat} : {delta} : {lowest_loss_sum}")
+                #loss_sum_counter = 0
                 loss_sum = 0
             weights[0] -= delta
@@ -85,12 +97,14 @@ def main():
                 weights[word_to_index_mapping[word]] -= tokens.count(word) * delta
 
             if lowest_loss_sum > loss_sum and loss_sum != 0:
-                print("it happened")
+                print(f"it happened, new lowest_sum {loss_sum}")
                 lowest_loss_sum = loss_sum
                 lowest_loss_sum_weights = weights
             loss_sum_counter +=1
         except KeyboardInterrupt:
             break
-    print(lowest_loss_sum_weights)
+    #print(lowest_loss_sum_weights)
+    with open(model, 'wb') as f:
+        pickle.dump([weights, lowest_loss_sum_weights, word_to_index_mapping], f)
 
 main()