72 lines
1.9 KiB
Python
72 lines
1.9 KiB
Python
|
#!/usr/bin/env python3
|
||
|
import sys
|
||
|
import re
|
||
|
import random
|
||
|
import pickle
|
||
|
import math
|
||
|
# Read the entire training corpus from standard input at import time.
# Each line is expected to be "<label>\t<document>" (consumed by train()).
lines = sys.stdin.readlines()

# NOTE(review): these module-level counters are never read anywhere visible;
# train() assigns same-named *local* variables and never touches these.
# Presumably leftovers from an accuracy counter — confirm before removing.
popr=0
fals=0
|
def train(data_lines=None, epochs=500, learning_rate=0.0001, model_path="model.pkl"):
    """Train a bag-of-words linear regressor with SGD and pickle the model.

    Each input line is ``"<label>\\t<document>"``; the document is split on
    single spaces.  A document's prediction is::

        bias + sum_t (count(t) / |vocabulary_counts|) * weight[t]

    and the weights are nudged toward the squared-error gradient once per
    document, for ``epochs`` full passes over the data.

    Parameters
    ----------
    data_lines : iterable of str, optional
        Training lines.  Defaults to the module-level ``lines`` read from
        stdin (preserves the original script behavior).
    epochs : int
        Number of passes over the corpus (original hard-coded 500).
    learning_rate : float
        SGD step size (original hard-coded 0.0001).
    model_path : str
        Destination for the pickled model tuple
        ``(word_to_index, vocabulary, weights, words_count)``.

    Side effects: prints periodic diagnostics and writes ``model_path``.
    """
    if data_lines is None:
        data_lines = lines  # fall back to the module-level stdin corpus

    vocabulary = set()
    word_to_index = {}
    words_count = {}

    # Pass 1: build the vocabulary and global term frequencies.
    for raw in data_lines:
        fields = raw.rstrip().split('\t')
        if len(fields) < 2:
            continue  # skip malformed lines instead of raising IndexError
        terms = fields[1].split(' ')
        vocabulary.update(terms)  # set.update: no need for a membership pre-check
        for term in terms:
            words_count[term] = words_count.get(term, 0) + 1

    # Map each word to a weight slot; slot 0 is reserved for the bias term.
    for index, word in enumerate(vocabulary, start=1):
        word_to_index[word] = index

    # One randomly-initialised weight per word, plus the bias at index 0.
    weights = [random.uniform(-0.1, 0.1) for _ in range(len(vocabulary) + 1)]

    loss_sum = 0.0
    loss_counter = 0
    norm = len(words_count)  # hoisted: invariant for the whole training run

    for _ in range(epochs):
        for raw in data_lines:
            fields = raw.rstrip().split('\t')
            if len(fields) < 2:
                continue
            label = float(fields[0].strip())
            terms = fields[1].split(' ')

            # Forward pass: bias + frequency-scaled sum of word weights.
            expected = weights[0]
            for term in terms:
                expected += (words_count[term] / norm) * weights[word_to_index[term]]

            # Gradient step on each word weight, then on the bias.
            error = expected - label
            for term in terms:
                weights[word_to_index[term]] -= (words_count[term] / norm) * error * learning_rate
            weights[0] -= error * learning_rate

            loss_sum += error ** 2
            loss_counter += 1
            # Periodic progress report; incrementing *before* the modulo test
            # avoids the original's spurious report on the very first sample.
            if loss_counter % 10000 == 0:
                print(expected)
                print(label)
                loss_sum = 0.0

    model = (word_to_index, vocabulary, weights, words_count)
    # Context manager closes the file even if pickling fails
    # (the original leaked the handle from a bare open()).
    with open(model_path, "wb") as fh:
        pickle.dump(model, fh)
|
||
|
|
||
|
|
||
|
|
||
|
# Guard the entry point so importing this module for reuse/testing does not
# immediately consume stdin and start a 500-epoch training run.
if __name__ == "__main__":
    train()