Added linear regression
This commit is contained in:
parent 0839c5ca41
commit 9fb516216a
dev-0/out.tsv (10544 lines)
File diff suppressed because it is too large
predict.py (new executable file, 55 lines)
@@ -0,0 +1,55 @@
+#!/usr/bin/python3
+import pickle, re, sys
+from nltk.corpus import stopwords
+
+
+def clear_post(post):
+    post = post.replace('\\n', ' ')
+    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
+    post = re.sub(r'[\.\,\/\~]+', ' ', post)
+    post = re.sub(r'(<|>|\@[a-zA-Z0-9]+)', '', post)
+    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\–\”\!\=\^]+', '', post)
+    post = re.sub(r'( \- |\-\-+)', ' ', post)
+    post = re.sub(r' +', ' ', post)
+    post = post.rstrip(' ')
+    post = post.split(' ')
+    stop_words = set(stopwords.words('english'))
+    post_no_stop = [w for w in post if not w in stop_words]
+    return post_no_stop
+
+
+def calc_prob(posts, weights, word_to_index_mapping):
+    for post in posts:
+        d = post.split(' ')
+        y_hat = weights[0]
+        for token in d:
+            try:
+                y_hat += weights[word_to_index_mapping[token]] * post.count(token)
+            except KeyError:
+                y_hat += 0
+        if y_hat > 0.5:
+            print("1")
+        else:
+            print("0")
+
+
+def main():
+    if len(sys.argv) != 2:
+        print("Expected model")
+        return
+    model = str(sys.argv[1])
+
+    posts = []
+    for line in sys.stdin:
+        text, timestap = line.rstrip('\n').split('\t')
+        post = clear_post(text)
+        posts.append(" ".join(post))
+
+    with open(model, 'rb') as f:
+        pickle_list = pickle.load(f)
+
+    weights = pickle_list[0]
+    lowest_loss_weights = pickle_list[1]
+    word_to_index_mapping = pickle_list[2]
+    calc_prob(posts, weights, word_to_index_mapping)
+main()
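For context (an aside, not part of the commit): predict.py reads tab-separated "text<TAB>timestamp" lines from stdin, normalizes each post with clear_post, scores it against the pickled weights, and prints 1 when y_hat exceeds 0.5 and 0 otherwise. A quick sketch of what clear_post does to a made-up post, assuming the function is loaded in a REPL and the NLTK stopword corpus has been downloaded (nltk.download('stopwords')):

>>> clear_post("Check https://example.com, it is GREAT")
['Check', 'internetlink', 'GREAT']

The URL collapses to the placeholder token internetlink, the comma is stripped, and 'it' and 'is' are dropped as NLTK English stopwords.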
test-A/out.tsv (10304 lines)
File diff suppressed because it is too large
train.py (30 lines changed)
@@ -1,5 +1,5 @@
 #!/usr/bin/python3
-import re, sys, pickle, nltk, math, random
+import re, sys, pickle, random
 from nltk.corpus import stopwords

 def clear_post(post):
@@ -28,6 +28,9 @@ def create_vocabulary_and_documents(in_file, expected_file):
         posts[" ".join(post)] = int(exp)
         for word in post:
             vocabulary.add(word)
+    with open('data', 'wb') as f:
+        pickle.dump([vocabulary, posts], f)
+    print("data created")
     return vocabulary, posts

 def create_mappings(vocabulary):
@@ -47,14 +50,22 @@ def main():
     model = str(sys.argv[1])
     expected_file = str(sys.argv[2])
     in_file = str(sys.argv[3])
-    vocabulary, posts = create_vocabulary_and_documents(in_file, expected_file)
+    try:
+        with open("data", 'rb') as pos:
+            pickle_list = pickle.load(pos)
+            print("data loaded")
+            vocabulary = pickle_list[0]
+            posts = pickle_list[1]
+    except FileNotFoundError:
+        vocabulary, posts = create_vocabulary_and_documents(in_file, expected_file)

     word_to_index_mapping, index_to_word_mapping = create_mappings(vocabulary)

     weights = []
     for xi in range(0, len(vocabulary) + 1):
         weights.append(random.uniform(-0.01,0.01))

-    learning_rate = 0.000001
+    learning_rate = 0.000000001
     loss_sum = 0.0
     loss_sum_counter = 0
     lowest_loss_sum_weights = []
@@ -62,7 +73,7 @@ def main():

     print(f"len of vocabulary {len(vocabulary)}")
     # can be set to a very, very large number
-    while True: #loss_sum_counter != 10:
+    while loss_sum_counter != 10000:
         try:
             d, y = random.choice(list(posts.items()))
             y_hat = weights[0]
@@ -71,13 +82,14 @@ def main():
                 # could also do something with count to make it work better
                 #print(f"{d.count(word)} : {word}")
                 y_hat += weights[word_to_index_mapping[word]] * tokens.count(word)
+                #print(f"{weights[word_to_index_mapping[word]]} : {word}")

             loss = (y_hat - y)**2
             loss_sum += loss
             delta = (y_hat - y) * learning_rate
             if loss_sum_counter % 100 == 0:
-                print(f"{loss_sum /1000} : {loss_sum_counter} : {y_hat} : {delta}")
+                print(f"{loss_sum_counter} : {loss_sum /1000} : {y_hat} : {delta} : {lowest_loss_sum}")
-                loss_sum_counter = 0
+                #loss_sum_counter = 0
                 loss_sum = 0

             weights[0] -= delta
@@ -85,12 +97,14 @@ def main():
                 weights[word_to_index_mapping[word]] -= tokens.count(word) * delta

             if lowest_loss_sum > loss_sum and loss_sum != 0:
-                print("it happened")
+                print(f"it happened, new lowest_sum {loss_sum}")
                 lowest_loss_sum = loss_sum
                 lowest_loss_sum_weights = weights

             loss_sum_counter +=1
         except KeyboardInterrupt:
             break
-    print(lowest_loss_sum_weights)
+    #print(lowest_loss_sum_weights)
+    with open(model, 'wb') as f:
+        pickle.dump([weights, lowest_loss_sum_weights, word_to_index_mapping], f)
 main()
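For context (an aside, not part of the commit): the loop above is stochastic gradient descent on a bag-of-words linear regression with squared error, y_hat = w0 + sum_j w_j * x_j and loss = (y_hat - y)**2, whose gradient in w_j is 2 * (y_hat - y) * x_j. A minimal restatement of one update step, assuming a hypothetical counts dict mapping each word of the sampled post to its occurrence count (sgd_step and counts are illustrative names, not from the commit):

def sgd_step(weights, counts, word_to_index_mapping, y, learning_rate):
    # Forward pass: bias plus weighted word counts.
    y_hat = weights[0]
    for word, n in counts.items():
        y_hat += weights[word_to_index_mapping[word]] * n
    # Gradient step; the constant factor 2 is folded into the learning rate,
    # exactly as the loop above does with delta.
    delta = (y_hat - y) * learning_rate
    weights[0] -= delta                 # bias update
    for word, n in counts.items():
        weights[word_to_index_mapping[word]] -= n * delta
    return (y_hat - y) ** 2             # squared-error loss for this sample

The commit also lowers the learning rate from 0.000001 to 0.000000001 and replaces the unbounded while True loop with a 10000-iteration cap. One caveat worth noting: lowest_loss_sum_weights = weights binds a reference to the same list, so later in-place updates also change the saved "best" weights; list(weights) would snapshot a copy.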