from collections import defaultdict
import math
import pickle
import random
import re

try:
    from pip._vendor.msgpack.fallback import xrange
except ImportError:
    # That import reaches into pip's private vendored copy of msgpack, which
    # is not guaranteed to be installed or to export ``xrange``. Fall back to
    # the builtin so this module stays importable and the name stays defined.
    xrange = range

vocabulary = []
# NOTE(review): opening this handle truncates test.tsv at import time, and no
# code visible here writes to or closes it. It is kept because other parts of
# the file may still reference ``file_to_save``.
file_to_save = open("test.tsv", "w", encoding='utf-8')


def _count_tokens(path):
    """Count lower-cased, single-space-separated tokens in a two-column TSV.

    Each non-blank line must hold exactly two tab-separated fields; only the
    first one (the text) is tokenized. Blank lines are skipped.

    Args:
        path: Path of the UTF-8 encoded TSV file to read.

    Returns:
        ``{'count': defaultdict(int)}`` mapping each token to its number of
        occurrences in the file.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    counts = {'count': defaultdict(int)}
    with open(path, encoding='utf-8') as in_file:
        for line in in_file:
            row = line.rstrip('\n')
            if not row:
                # A stray blank line (e.g. at end of file) carries no data.
                continue
            # The second column is required by the format but never used.
            text, _timestamp = row.split('\t')
            # Splitting on a single space is deliberate: consecutive spaces
            # yield empty-string tokens, which are counted like any other.
            for token in text.lower().split(' '):
                counts['count'][token] += 1
    return counts


def define_vocabulary(file_to_learn_new_words):
    """Build the token-count vocabulary from a two-column TSV file.

    Args:
        file_to_learn_new_words: Path of the UTF-8 TSV file to read.

    Returns:
        ``{'count': defaultdict(int)}`` of token -> occurrence count.
    """
    return _count_tokens(file_to_learn_new_words)


def read_input(file_path):
    """Count the tokens of an input two-column TSV file.

    Args:
        file_path: Path of the UTF-8 TSV file to read.

    Returns:
        ``{'count': defaultdict(int)}`` of token -> occurrence count.
    """
    return _count_tokens(file_path)


def training(vocabulary, read_input, expected):
    """Fit linear weights by per-sample gradient steps and log 0/1 outputs.

    Every step draws one random ``(key, count)`` pair from
    ``read_input['count']``, predicts ``y_hat`` as a bias plus weighted
    features, writes ``'1'`` (``y_hat > 0``) or ``'0'`` as one line of
    *expected*, and then moves the weights against the error. The loop stops
    once the magnitude of the step drops to ``learning_precision`` or after
    ``max_iteration`` steps.

    Args:
        vocabulary: ``{'count': mapping}`` of known tokens. Only its keys and
            their iteration order are used (to assign weight positions).
        read_input: ``{'count': mapping}`` of token -> count. Mutated in
            place: counts of tokens missing from *vocabulary* are set to 0.
        expected: Path of the output file; it is created or truncated.

    Returns:
        ``(weights, vocabulary)`` where ``weights[0]`` is the bias and
        ``weights[k]`` belongs to the k-th vocabulary token (1-based).
    """
    learning_rate = 0.00001
    learning_precision = 0.0000001
    # max_iteration=len(vocabulary['count'])+1
    max_iteration = 1000
    report_every = 1000

    vocab_counts = vocabulary['count']
    input_counts = read_input['count']

    # Zero the count of every out-of-vocabulary token (overwriting existing
    # keys is safe while iterating) and keep the counts in iteration order.
    readed_words_values = []
    for word in input_counts:
        if word not in vocab_counts:
            input_counts[word] = 0
        readed_words_values.append(input_counts[word])

    # One weight slot per vocabulary token, slot 0 being the bias. The slot
    # is the token's position, not its count: counts can exceed the length of
    # ``weights`` and are shared by unrelated tokens.
    token_index = {token: pos for pos, token in enumerate(vocab_counts, start=1)}
    weights = [random.uniform(-0.001, 0.001)
               for _ in range(len(vocab_counts) + 1)]

    # The sample pool does not change inside the loop, so build it once.
    samples = list(input_counts.items())

    iteration = 0
    loss_sum = 0.0
    delta = 1
    with open(expected, 'w+', encoding='utf8') as file_to_write:
        # ``abs``: the step is signed, and a negative step (y_hat < y) must
        # not be mistaken for convergence. An empty pool skips training.
        while (samples and abs(delta) > learning_precision
               and iteration < max_iteration):
            d, y = random.choice(samples)  # d-word, y-value of

            # NOTE(review): ``d`` is a single token, so this loop walks its
            # characters, and the feature value is looked up by match
            # position rather than by token (it can also run past the end of
            # ``readed_words_values``). Left as in the original because the
            # intended feature definition is unclear - confirm with author.
            features = []
            for word_d in d:
                if word_d in token_index:
                    value = readed_words_values[len(features)]
                    features.append((token_index[word_d], value))

            y_hat = weights[0]
            for slot, value in features:
                y_hat += weights[slot] * value

            file_to_write.write('1\n' if y_hat > 0.0 else '0\n')

            delta = (y_hat - y) * learning_rate
            weights[0] -= delta
            for slot, value in features:
                weights[slot] -= value * delta

            print(y_hat)
            print(y)

            loss_sum += (y_hat - y) ** 2.0
            iteration += 1
            # Report the mean loss of each completed window. The step counter
            # is never reset, so ``max_iteration`` stays a real upper bound.
            if iteration % report_every == 0:
                print(loss_sum / report_every)
                loss_sum = 0.0

    return weights, vocabulary


def main():
    """Build the vocabulary from the train set and run training on two inputs."""
    vocabulary = define_vocabulary('train/in.tsv')
    readed_words = read_input('dev-0/in.tsv')
    # NOTE(review): the doubled 'in.tsv' path component looks unintended -
    # verify against the actual directory layout.
    readed_words_test_a = read_input('test-A/in.tsv/in.tsv')
    training(vocabulary, readed_words, 'test.tsv')
    training(vocabulary, readed_words_test_a, 'test_a.tsv')


# NOTE(review): earlier draft below is dead (commented-out) code; consider
# deleting it once it is no longer needed for reference.
# def cost_function(y_hat,y):
#     loss=(y_hat-y)**2.0
#     loss_sum+=loss
#     if loss_counter%1000==0:
#         print(loss_sum/1000)
#         loss_counter=0
#         loss_sum=0.0
# def main():
#     --------------- initialization ---------------------------------
#     vocabulary = define_vocabulary('train/in.tsv')
#     readed_words=read_input('dev-0/in.tsv')
#     i=1;
#     weights=[]
#     readed_words_values=[]
#     rangeVocabulary=len(vocabulary['count'])+1
#     for i in range(rangeVocabulary):
#         weights.append(random.randrange(0,len(vocabulary['count'])+1))
#     for word in readed_words['count']:
#         if word not in vocabulary['count']:
#             readed_words['count'][word]=0
#         readed_words_values.append(readed_words['count'][word])
#     precision=0.00001
#     learning_rate=0.00001
#     delta=1
#     max_iterations=len(vocabulary['count'])+1
#     current_iteration=0
#     rangeReadedValues=len(readed_words['count'])+1
#     --------------- prediction -------------------------------------
#     while (delta>precision and current_iteration