diff --git a/code_regression.py b/code_regression.py index 2825cf6..f71a183 100644 --- a/code_regression.py +++ b/code_regression.py @@ -1,142 +1,266 @@ -from collections import defaultdict -import math -import pickle -import re - -from pip._vendor.msgpack.fallback import xrange import random - -vocabulary = [] - -file_to_save = open("test.tsv", "w", encoding='utf-8') - +import re +from _collections import defaultdict def define_vocabulary(file_to_learn_new_words): - word_counts = {'count': defaultdict(int)} - with open(file_to_learn_new_words, encoding='utf-8') as in_file: - for line in in_file: - text, timestamp = line.rstrip('\n').split('\t') - tokens = text.lower().split(' ') + word_counts = {'count': defaultdict(int)} + with open(file_to_learn_new_words, encoding='utf-8') as in_file: + for line in in_file: + text, timestamp = line.rstrip('\n').split('\t') + tokens = text.lower().split(' ') + for token in tokens: + word_counts['count'][token] += 1 + in_file.close() + return word_counts + +def tokenize_list(string_input): + words=[] + string=string_input.replace('\\n',' ') + text=re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', string) + string='' + for word in text: + string+=word + words=re.split(';+|,+|\*+|\n+| +|\_+|\%+|\t+|\[+|\]+|\.+|\(+|\)+|\++|\\+|\/+|[0-9]+|\#+|\'+|\"+|\-+|\=+|\&+|\:+|\?+|\!+|\^+|\ยท+',string) + regex=re.compile(r'http|^[a-zA-Z]$|org') + filtered_values=[ + word + for word in words if not regex.match(word) + ] + filtered_values[:] = ( + value.lower() + for value in filtered_values if len(value)!=0 + ) + return filtered_values + +def read_words(input_path): + vocabulary = {'count':defaultdict(int)} + index=0 + with open(input_path,encoding='utf-8') as infile: + for line in infile: + index+=1 + tokens = tokenize_list(line) for token in tokens: - word_counts['count'][token] += 1 - return word_counts + if token not in vocabulary: + vocabulary['vocabulary'][token]+=1 + infile.close() + return vocabulary - -def read_input(file_path): - read_word_counts = {'count': defaultdict(int)} - with open(file_path, encoding='utf-8') as in_file: - for line in in_file: - text, timestamp = line.rstrip('\n').split('\t') - tokens = text.lower().split(' ') - for token in tokens: - read_word_counts['count'][token] += 1 - return read_word_counts - - -def training(vocabulary, read_input, expected): - file_to_write = open(expected, 'w+', encoding='utf8') - file_to_write2 = open('out_y_hat.tsv', 'w+', encoding='utf8') - learning_rate = 0.00001 - learning_precision = 0.0001 - weights = [] - iteration = 0 - loss_sum = 0.0 - ix = 1 - readed_words_values = [] - for word in read_input['count']: - if word not in vocabulary['count']: - read_input['count'][word] = 0 - readed_words_values.append(read_input['count'][word]) - for ix in range(0, len(vocabulary['count']) + 1): - weights.append(random.uniform(-0.001, 0.001)) - # max_iteration=len(vocabulary['count'])+1 - max_iteration = 10000 - delta = 1 +def train(vocabulary,input_train,expected_train): + learning_rate=0.0001 + learning_precision=0.00000001 + words_vocabulary={} + with open(input_train,encoding='utf-8') as input_file, open(expected_train,encoding='utf-8') as expected_file: + for line, exp in zip(input_file,expected_file): + words_vocabulary[line]=int(exp) + weights={} + weight={} + delta=1 + iteration=0 + loss_sum=0.0 + error=10.0 + max_iteration=len(vocabulary) + for i in vocabulary['count'].keys(): + weights[i]=random.uniform(-0.01,0.01) while delta>learning_precision and iteration 0.5: - file_to_write.write('1\n') - else: - file_to_write.write('0\n') - i = 0 - delta = (y_hat - y) * learning_rate - weights[0] = weights[0] - delta - for word_w in d: - if word_w in vocabulary['count'].keys(): - weights[vocabulary['count'][word_w]] -= readed_words_values[i] * delta - i += 1 - # print(weights) - #print(f'Y: {y}') - loss = (y_hat - y) ** 2.0 - # loss=(y_hat-y)*(y_hat-y) + d,y = random.choice(list(words_vocabulary.items())) + y_hat=0 + tokens=tokenize_list(d) + for token in tokens: + if token in vocabulary['count'].keys(): + y_hat += weights[token] * tokens.count(token) + delta=(y_hat-y) * learning_rate + for word in tokens: + if word in words_vocabulary: + weights[word] -= (tokens.count(word)) * delta + loss = (y_hat - y)**2.0 loss_sum += loss - if (iteration % 1000 == 0): - #print(loss_sum / 1000) - iteration = 0 - loss_sum = 0.0 + if iteration%1000 == 0: + if (error>(loss_sum/1000)): + weight=weights + error=loss_sum/1000 + loss_sum=0.0 iteration += 1 - file_to_write.close + input_file.close() + expected_file.close() + return weight, vocabulary + +def prediction(input,output,weights,vocabulary): + with open(input,encoding='utf-8') as input_file, open(output,'w+',encoding='utf-8') as output: + for line in input_file: + y_hat=0 + tokens=tokenize_list(line) + for token in tokens: + if token in vocabulary['count'].keys(): + y_hat += weights[token] * (token.count(token)) + if y_hat>0.0: + output.write('1\n') + else: + output.write('0\n') + output.close() + input_file.close() + def main(): - vocabulary = define_vocabulary('train/in.tsv') - readed_words = read_input('dev-0/in.tsv') - readed_words_test_a = read_input('test-A/in.tsv/in.tsv') - training(vocabulary, readed_words, 'dev-0/out.tsv') - training(vocabulary, readed_words_test_a, 'test-A/out.tsv') - - -# def cost_function(y_hat,y): -# loss=(y_hat-y)**2.0 -# loss_sum+=loss -# if loss_counter%1000==0: -# print(loss_sum/1000) -# loss_counter=0 -# loss_sum=0.0 - - -# def main(): -# --------------- initialization --------------------------------- -# vocabulary = define_vocabulary('train/in.tsv') -# readed_words=read_input('dev-0/in.tsv') -# i=1; -# weights=[] -# readed_words_values=[] -# rangeVocabulary=len(vocabulary['count'])+1 -# for i in range(rangeVocabulary): -# weights.append(random.randrange(0,len(vocabulary['count'])+1)) -# for word in readed_words['count']: -# if word not in vocabulary['count']: -# readed_words['count'][word]=0 -# readed_words_values.append(readed_words['count'][word]) -# precision=0.00001 -# learning_rate=0.00001 -# delta=1 -# max_iterations=len(vocabulary['count'])+1 -# current_iteration=0 -# rangeReadedValues=len(readed_words['count'])+1 -# --------------- prediction ------------------------------------- -# while (delta>precision and current_iterationlearning_precision and iteration 0.5: +# file_to_write.write('1\n') +# else: +# file_to_write.write('0\n') +# i = 0 +# delta = (y_hat - y) * learning_rate +# weights[0] = weights[0] - delta +# for word_w in d: +# if word_w in vocabulary['count'].keys(): +# weights[vocabulary['count'][word_w]] -= readed_words_values[i] * delta +# i += 1 +# # print(weights) +# #print(f'Y: {y}') +# loss = (y_hat - y) ** 2.0 +# # loss=(y_hat-y)*(y_hat-y) +# loss_sum += loss +# if (iteration % 1000 == 0): +# #print(loss_sum / 1000) +# iteration = 0 +# loss_sum = 0.0 +# iteration += 1 +# file_to_write.close +# +# def main(): +# vocabulary = define_vocabulary('train/in.tsv') +# readed_words = read_input('dev-0/in.tsv') +# readed_words_test_a = read_input('test-A/in.tsv/in.tsv') +# training(vocabulary, readed_words, 'dev-0/out.tsv') +# training(vocabulary, readed_words_test_a, 'test-A/out.tsv') +# +# +# # def cost_function(y_hat,y): +# # loss=(y_hat-y)**2.0 +# # loss_sum+=loss +# # if loss_counter%1000==0: +# # print(loss_sum/1000) +# # loss_counter=0 +# # loss_sum=0.0 +# +# +# # def main(): +# # --------------- initialization --------------------------------- +# # vocabulary = define_vocabulary('train/in.tsv') +# # readed_words=read_input('dev-0/in.tsv') +# # i=1; +# # weights=[] +# # readed_words_values=[] +# # rangeVocabulary=len(vocabulary['count'])+1 +# # for i in range(rangeVocabulary): +# # weights.append(random.randrange(0,len(vocabulary['count'])+1)) +# # for word in readed_words['count']: +# # if word not in vocabulary['count']: +# # readed_words['count'][word]=0 +# # readed_words_values.append(readed_words['count'][word]) +# # precision=0.00001 +# # learning_rate=0.00001 +# # delta=1 +# # max_iterations=len(vocabulary['count'])+1 +# # current_iteration=0 +# # rangeReadedValues=len(readed_words['count'])+1 +# # --------------- prediction ------------------------------------- +# # while (delta>precision and current_iteration