import atexit
import math
import pickle
import random
import re
from collections import defaultdict

# `xrange` comes from a private module vendored inside pip; it is not part of
# any public API and may be missing (or pip itself may be absent), so fall
# back to the built-in `range` instead of crashing at import time.
try:
    from pip._vendor.msgpack.fallback import xrange
except ImportError:
    xrange = range

vocabulary = []
# word_to_index_mapping=[]
# index_to_word_mapping=[]
# file_to_save=open("test.tsv","w",encoding='utf-8')

# Earlier variant that counted tokens separately per expected class label:
# def define_vocabulary(file_to_learn_new_words,expected_path):
#     word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
#     with open(file_to_learn_new_words, encoding='utf-8') as in_file, open(expected_path, encoding='utf-8') as expected_file:
#         for line, exp in zip(in_file, expected_file):
#             class_ = exp.rstrip('\n').replace(' ', '')
#             text, timestamp = line.rstrip('\n').split('\t')
#             tokens = text.lower().split(' ')
#             for token in tokens:
#                 if class_ == 'P':
#                     word_counts['paranormal'][token] += 1
#                 elif class_ == 'S':
#                     word_counts['skeptic'][token] += 1
#     return word_counts

# Opened at import time and kept as a module-level handle (presumably written
# to by code further down the file). Registering close() guarantees buffered
# output is flushed at interpreter exit; closing an already-closed file is a
# no-op, so an explicit close elsewhere stays safe.
file_to_save = open("test.tsv", "w", encoding='utf-8')
atexit.register(file_to_save.close)


def _count_tokens(path):
    """Count lower-cased, space-separated tokens in the text column of *path*.

    Each line is expected to look like ``text<TAB>timestamp``. Only the part
    before the first tab is tokenised, so lines with a missing or extra
    column are still counted instead of raising ``ValueError``.

    Returns:
        ``{'count': defaultdict(int)}`` mapping every token to the number of
        times it occurs in the file.
    """
    counts = defaultdict(int)
    with open(path, encoding='utf-8') as in_file:
        for line in in_file:
            text, _, _ = line.rstrip('\n').partition('\t')
            # split(' ') (not split()) on purpose: consecutive spaces yield
            # empty-string tokens, exactly as the original counting did.
            for token in text.lower().split(' '):
                counts[token] += 1
    return {'count': counts}


def define_vocabulary(file_to_learn_new_words):
    """Build the vocabulary: token frequencies of a ``text<TAB>timestamp`` file.

    Args:
        file_to_learn_new_words: path to the UTF-8 encoded TSV file.

    Returns:
        ``{'count': defaultdict(int)}`` with one entry per distinct token.
    """
    return _count_tokens(file_to_learn_new_words)


def read_input(file_path):
    """Read a ``text<TAB>timestamp`` file and return its token frequencies.

    Args:
        file_path: path to the UTF-8 encoded TSV file.

    Returns:
        ``{'count': defaultdict(int)}`` with one entry per distinct token.
    """
    return _count_tokens(file_path)


def main():
    # --------------- initialization ---------------------------------
    vocabulary = define_vocabulary('train/in.tsv')
    i = 1
    weights = []
    # One weight per distinct token plus one extra slot (presumably a bias
    # term - TODO confirm), each a random integer in [0, vocabulary size].
    testFuckingPython = len(vocabulary['count']) + 1
    for i in range(testFuckingPython):
        weights.append(random.randrange(0, len(vocabulary['count']) + 1))
    precision = 0.00001
    learning_rate = 0.001
    prev_step_size = 1
    max_iterations = len(vocabulary['count'])
    current_iteration = 0
    readed_words = read_input("train/in.tsv")
    # --------------- prediction
------------------------------------- #while (prev_step_size>precision and current_iteration