#!/usr/bin/env python3
# Train a simple linear regression model with SGD on tab-separated "<label>\t<document>" lines read from stdin.
import sys
import random
import pickle

lines = sys.stdin.readlines()


def train():
    vocabulary = set()
    word_to_index = {}
    words_count = {}
    weights = []
    learning_rate = 0.0001
    loss_sum = 0.0
    loss_counter = 0

    # First pass: build the vocabulary and count how often each word occurs.
    for line in lines:
        line = line.rstrip()
        fields = line.split('\t')      # split the line into tab-separated fields
        label = fields[0].strip()      # the label
        document = fields[1]           # the post text
        terms = document.split(' ')    # split the post into words
        for t in terms:                # for every word in the post
            vocabulary.add(t)          # add the word to the vocabulary
            words_count[t] = words_count.get(t, 0) + 1  # count how many times the word occurs

    # Assign every vocabulary word an index; index 0 is reserved for the bias weight.
    xi = 1
    for t in vocabulary:
        word_to_index[t] = xi
        xi += 1

    # Initialise the bias and one weight per vocabulary word with small random values.
    for _ in range(len(vocabulary) + 1):
        weights.append(random.uniform(-0.1, 0.1))

    # SGD: 500 passes over the training data.
    for epoch in range(500):
        for line2 in lines:
            line2 = line2.rstrip()
            fields2 = line2.split('\t')
            label2 = fields2[0].strip()
            document2 = fields2[1]
            terms2 = document2.split(' ')

            # Prediction: bias plus the frequency-scaled weight of every word in the post.
            expected = weights[0]
            for t in terms2:
                expected += (words_count[t] / len(words_count)) * weights[word_to_index[t]]

            # Gradient step on the word weights and the bias.
            error = expected - float(label2)
            for t in terms2:
                weights[word_to_index[t]] -= (words_count[t] / len(words_count)) * error * learning_rate
            weights[0] -= error * learning_rate

            # Track the squared loss and report progress every 10000 examples.
            loss_sum += error ** 2
            if loss_counter % 10000 == 0:
                # print(loss_sum / 10000)
                loss_counter = 0
                loss_sum = 0.0
                print(expected)
                print(label2)
            loss_counter += 1

    # Persist everything needed to score new documents later.
    model = (word_to_index, vocabulary, weights, words_count)
    with open("model.pkl", "wb") as f:
        pickle.dump(model, f)


train()
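
# A minimal sketch (an assumption, not part of the original pipeline) of how the
# pickled model could be loaded and used to score a new document. The file name
# "model.pkl" and the tuple layout (word_to_index, vocabulary, weights,
# words_count) come from the pickle.dump call above; the predict() helper name is
# hypothetical and simply mirrors the prediction step inside the training loop.
def predict(document, word_to_index, weights, words_count):
    # Bias plus the frequency-scaled weight of every known word in the document;
    # words unseen during training are skipped.
    expected = weights[0]
    for t in document.split(' '):
        if t in word_to_index:
            expected += (words_count[t] / len(words_count)) * weights[word_to_index[t]]
    return expected

# Example usage:
# with open("model.pkl", "rb") as f:
#     word_to_index, vocabulary, weights, words_count = pickle.load(f)
# print(predict("some example post", word_to_index, weights, words_count))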