"""Bag-of-words linear classifier trained with stochastic gradient descent.

Run as a script, it learns one weight per vocabulary word from
``train/in.tsv`` / ``train/expected.tsv`` and writes 0/1 predictions for
``dev-0/in.tsv`` and ``test-A/in.tsv`` next to those inputs.
"""
import random
import re
from collections import defaultdict

# The running mean loss is evaluated once per this many SGD steps.
_CHECKPOINT_EVERY = 1000
# Tokens shorter than this are dropped by tokenize_list.
_MIN_TOKEN_LENGTH = 4
# Tokens starting with these prefixes are dropped by tokenize_list.
# NOTE(review): prefix matching also removes words such as "organic";
# kept as-is because changing it would alter the feature set.
_NOISE_PREFIXES = ('http', 'org')


def define_vocabulary(file_to_learn_new_words):
    """Count whitespace-separated, lower-cased words of a TSV file.

    Only the first tab-separated field of each line is used as text; any
    further fields are ignored, and lines without a tab are used whole
    instead of raising ``ValueError``.

    Args:
        file_to_learn_new_words: path to a UTF-8 text file.

    Returns:
        ``{'count': defaultdict(int)}`` mapping each word to its number
        of occurrences.
    """
    word_counts = {'count': defaultdict(int)}
    with open(file_to_learn_new_words, encoding='utf-8') as in_file:
        for line in in_file:
            text = line.rstrip('\n').partition('\t')[0]
            for token in text.lower().split(' '):
                word_counts['count'][token] += 1
    return word_counts


def tokenize_list(string_input):
    """Split raw text into cleaned, lower-case word tokens.

    The text is lower-cased first (so upper-case letters are kept rather
    than erased by the letter filter), literal ``\\n`` escapes, URLs and
    ``/x/`` style prefixes are blanked out, every non-letter becomes a
    separator, and tokens that are shorter than four characters or start
    with ``http`` / ``org`` are discarded.

    Args:
        string_input: arbitrary text, typically one line of a file.

    Returns:
        List of tokens in their order of appearance.
    """
    text = string_input.lower().replace('\\n', ' ')
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'/[a-z]/', ' ', text)
    text = re.sub(r'[^a-z]', ' ', text)
    return [
        word
        for word in text.split()
        if len(word) >= _MIN_TOKEN_LENGTH
        and not word.startswith(_NOISE_PREFIXES)
    ]


def read_words(input_path):
    """Count the tokens produced by ``tokenize_list`` for every line of a file.

    Args:
        input_path: path to a UTF-8 text file.

    Returns:
        ``{'count': defaultdict(int)}`` mapping each token to its number
        of occurrences.
    """
    vocabulary = {'count': defaultdict(int)}
    with open(input_path, encoding='utf-8') as infile:
        for line in infile:
            for token in tokenize_list(line):
                vocabulary['count'][token] += 1
    return vocabulary


def train(vocabulary, input_train, expected_train, *,
          learning_rate=0.0000001, learning_precision=0.0000001,
          max_iteration=None):
    """Fit one weight per vocabulary word with stochastic gradient descent.

    Each step picks a random training line, predicts the sum of the
    weights of its known tokens, and moves those weights against the
    squared-error gradient. Every 1000 steps the mean loss of the window
    is compared with the best one seen so far and, if lower, a copy of
    the weights is kept as the result.

    NOTE(review): the per-sample update was reconstructed, because the
    loop body in the reviewed copy of this file was truncated; confirm it
    against the intended algorithm.
    NOTE(review): with the default rate and precision the stopping test
    on ``delta`` usually ends training after very few steps.

    Args:
        vocabulary: ``{'count': mapping}`` whose keys are the known words.
        input_train: path to the training texts, one example per line.
        expected_train: path to the integer labels, one per line.
        learning_rate: step size of each update.
        learning_precision: training stops once the absolute size of an
            update falls to this value or below.
        max_iteration: upper bound on the number of steps; defaults to
            the number of vocabulary words.

    Returns:
        Tuple ``(weights, vocabulary)``: the best snapshot of the weights
        (or the current weights if no snapshot was taken) and the
        vocabulary that was passed in.
    """
    known_words = vocabulary['count']

    labelled_lines = {}
    with open(input_train, encoding='utf-8') as input_file, \
            open(expected_train, encoding='utf-8') as expected_file:
        for line, expected in zip(input_file, expected_file):
            labelled_lines[line] = int(expected)

    # Tokenise once up front instead of on every SGD step.
    examples = [
        ([token for token in tokenize_list(line) if token in known_words],
         label)
        for line, label in labelled_lines.items()
    ]

    weights = {word: random.uniform(-0.01, 0.01) for word in known_words}
    if max_iteration is None:
        max_iteration = len(known_words)
    if not examples:
        return weights, vocabulary

    best_weights = None
    best_error = 10.0
    loss_sum = 0.0
    delta = 1.0
    iteration = 0
    while abs(delta) > learning_precision and iteration < max_iteration:
        tokens, label = random.choice(examples)
        y_hat = sum(weights[token] for token in tokens)
        delta = (y_hat - label) * learning_rate
        # A token repeated k times is updated k times, i.e. by k * delta.
        for token in tokens:
            weights[token] -= delta
        loss_sum += (y_hat - label) ** 2
        iteration += 1
        if iteration % _CHECKPOINT_EVERY == 0:
            mean_loss = loss_sum / _CHECKPOINT_EVERY
            if best_error > mean_loss:
                # Copy, so later updates cannot modify the snapshot.
                best_weights = dict(weights)
                best_error = mean_loss
            loss_sum = 0.0

    if best_weights is None:
        best_weights = weights
    return best_weights, vocabulary


def prediction(input, output, weights, vocabulary, threshold=0.0):
    """Write a 0/1 prediction for every line of a file.

    A line scores the sum of the weights of its tokens that are in the
    vocabulary (a repeated token counts once per occurrence); ``1`` is
    written when the score is above ``threshold``, otherwise ``0``.

    NOTE(review): if the training labels are 0/1, a threshold of 0.5 may
    be what is intended; the default keeps the existing cut-off.

    Args:
        input: path to the UTF-8 file to classify.
        output: path of the file to create or overwrite.
        weights: mapping from word to weight; missing words count as 0.
        vocabulary: ``{'count': mapping}`` whose keys are the known words.
        threshold: score above which a line is labelled ``1``.
    """
    known_words = vocabulary['count']
    with open(input, encoding='utf-8') as input_file, \
            open(output, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            y_hat = sum(
                weights.get(token, 0.0)
                for token in tokenize_list(line)
                if token in known_words
            )
            output_file.write('1\n' if y_hat > threshold else '0\n')


def main():
    """Train on the training set and write predictions for dev-0 and test-A."""
    vocabulary = define_vocabulary('train/in.tsv')
    weights, words = train(vocabulary, 'train/in.tsv', 'train/expected.tsv')
    prediction('dev-0/in.tsv', 'dev-0/out.tsv', weights, words)
    prediction('test-A/in.tsv', 'test-A/out.tsv', weights, words)


if __name__ == '__main__':
    main()