#!/usr/bin/python3
'''
Linear regression for the paranormal vs. sceptic challenge 2.0.0

To use train.py, pass two tab-separated columns on stdin: label and document.
Commands used to prepare the input: xzcat, paste
'''
import sys
import pickle
import random
import collections
from math import log

from tokenizer import tokenize


def train():
    # Prepare data structures
    vocabulary = set()
    word_to_index_mapping = {}
    index_to_word_mapping = {}
    word_count = collections.defaultdict(int)

    # x holds the tokenized documents, y the matching labels;
    # both are filled below and used during training
    x = []
    y = []
    learning_rate = 0.000001

    # Read labelled documents from stdin
    for line in sys.stdin:
        line = line.rstrip()
        fields = line.split('\t')
        label = fields[0]
        document = fields[1]
        terms = tokenize(document)

        # Store the tokens in x and the label in y,
        # recoding the label so that "P" is 1 and "S" is 0
        x.append(terms)
        if label == "P":
            y.append(1)
        else:
            y.append(0)

        # Update the vocabulary and count how often each word appears
        for t in terms:
            word_count[t] += 1
            vocabulary.add(t)

    # Give each word its own index; index 0 is reserved for the bias weight
    ix = 1
    for w in vocabulary:
        word_to_index_mapping[w] = ix
        index_to_word_mapping[ix] = w
        ix += 1

    # Initialize weights with random floats from -1.0 to 1.0
    # (one weight per word, plus the bias weight at index 0)
    weights = []
    for _ in range(len(vocabulary) + 1):
        weights.append(random.uniform(-1.0, 1.0))

    loss_sum = 0.0
    loss_sum_counter = 1
    while True:
        # Pick one random training example (stochastic gradient descent)
        chosen = random.randint(0, len(x) - 1)
        actual_x = x[chosen]  # list of words
        actual_y = y[chosen]  # label for this list of words

        # Predict the label, starting from the bias weight.
        # Using .get() with a default avoids KeyErrors for missing words;
        # the fallback weight index does not matter in that case, because
        # word_count.get(word, 0) makes the factor log(0/... + 1) equal 0.
        y_predicted = weights[0]
        for word in actual_x:
            y_predicted += weights[word_to_index_mapping.get(word, 0)] \
                * log(word_count.get(word, 0) / len(word_count) + 1)

        # Squared-error loss: check how good the prediction was
        loss = (y_predicted - actual_y) ** 2.0

        # Accumulate the loss and report a running average every 10000 steps;
        # the average is easier to follow than individual values
        loss_sum += loss
        if loss_sum_counter % 10000 == 0:
            print(str(loss_sum_counter) + " " + str(loss_sum / 10000))
            loss_sum = 0.0
        loss_sum_counter += 1

        # Update the weights: one gradient step on the squared error
        delta = (y_predicted - actual_y) * learning_rate
        weights[0] -= delta
        for word in actual_x:
            if word in word_to_index_mapping:
                weights[word_to_index_mapping[word]] -= \
                    log(word_count[word] / len(word_count) + 1) * delta

        # Stop after a fixed number of iterations
        if loss_sum_counter > 7000000:
            break

    # Save only the things needed for prediction
    model = (weights, word_to_index_mapping, word_count)
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)


train()
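

# --- Illustrative sketch only, not part of training ---
# A minimal example of how the pickled model could be consumed at prediction
# time. It assumes the same tokenize() imported above and simply mirrors the
# y_predicted computation from train(). The name score_document is made up
# for illustration and the function is never called in this script.
def score_document(document, model_path="model.pkl"):
    with open(model_path, "rb") as model_file:
        weights, word_to_index_mapping, word_count = pickle.load(model_file)
    y_predicted = weights[0]
    for word in tokenize(document):
        # Unseen words contribute nothing: log(0/len(word_count) + 1) == 0
        y_predicted += weights[word_to_index_mapping.get(word, 0)] \
            * log(word_count.get(word, 0) / len(word_count) + 1)
    # Values nearer 1 suggest label "P", values nearer 0 suggest "S"
    return y_predicted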