my brilliant solution4

This commit is contained in:
Dominika Grajewska 2020-04-04 19:25:57 +02:00
parent abba594b01
commit 24b02bc754
4 changed files with 10523 additions and 0 deletions

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

28
linearpred.py Normal file
View File

@@ -0,0 +1,28 @@
#!/usr/bin/env python3
import math
import pickle
import sys
from tokenize import tokenize
# Load the trained model tuple produced by train.py:
# (word_to_index, vocabulary, weights, words_count), where weights[0] is the
# bias and weights[word_to_index[t]] is the per-word weight.
# NOTE(review): pickle.load on an untrusted file is unsafe — only load a
# model.pkl you trained yourself.
with open("model.pkl", "rb") as model_file:  # 'with' closes the handle (original leaked it)
    model = pickle.load(model_file)
word_to_index, vocabulary, weights, words_count = model

lines = sys.stdin.readlines()
for line2 in lines:
    line2 = line2.rstrip()
    fields2 = line2.split('\t')      # split the line into tab-separated fields
    label2 = fields2[0].strip()      # label column (unused at prediction time)
    document2 = fields2[1]           # the post text
    terms2 = document2.split(' ')    # split the post into words
    # Update per-word occurrence counts with this document's words.
    # NOTE(review): this mutates the counts loaded from the trained model, so a
    # prediction depends on how many input lines were processed before it —
    # kept as-is to preserve the original behavior, but worth revisiting.
    for term2 in terms2:
        if term2 in words_count:
            words_count[term2] += 1
        else:
            words_count[term2] = 1
    # Linear score: bias plus weighted relative frequency of each known word.
    expected = weights[0]
    for t in terms2:
        if t in vocabulary:
            expected = expected + (words_count[t] / len(words_count) * (weights[word_to_index[t]]))
    # Threshold the linear score into a binary decision.
    if expected > 0.65:
        print(1)
    else:
        print(0)

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

71
train.py Normal file
View File

@@ -0,0 +1,71 @@
#!/usr/bin/env python3
import sys
import re
import random
import pickle
import math
# Read the entire training set from stdin once; train() iterates it repeatedly.
lines = sys.stdin.readlines()
popr=0  # NOTE(review): unused at module level (train() rebinds its own local) — candidate for removal
fals=0  # NOTE(review): unused at module level (train() rebinds its own local) — candidate for removal
def train():
    """Train a linear model with SGD on tab-separated "label<TAB>document"
    lines (the module-level ``lines`` read from stdin) and pickle it to
    ``model.pkl``.

    The saved model is the tuple
    ``(word_to_index, vocabulary, weights, words_count)`` where
    ``weights[0]`` is the bias and ``weights[word_to_index[t]]`` is the
    weight of word ``t``.
    """
    vocabulary = set()
    word_to_index = {}
    words_count = {}
    weights = []
    learning_rate = 0.0001
    loss_sum = 0
    loss_counter = 0

    # First pass: collect the vocabulary and per-word occurrence counts.
    for line in lines:
        line = line.rstrip()
        fields = line.split('\t')        # tab-separated fields: label, document
        document = fields[1]             # the post text
        terms = document.split(' ')      # split the post into words
        for t in terms:
            vocabulary.add(t)            # set.add is idempotent; no membership check needed
        for term2 in terms:
            if term2 in words_count:
                words_count[term2] += 1  # count how often each word occurs
            else:
                words_count[term2] = 1

    # Assign every word a weight index, starting at 1 (index 0 is the bias).
    xi = 1
    for t in vocabulary:
        word_to_index[t] = xi
        xi = xi + 1

    # Random initial weights: one bias plus one weight per vocabulary word.
    for i in range(0, len(vocabulary) + 1):
        weights.append(random.uniform(-0.1, 0.1))

    # SGD: 500 epochs over the training lines.
    for cnt in range(0, 500):
        for line2 in lines:
            line2 = line2.rstrip()
            fields2 = line2.split('\t')
            label2 = fields2[0].strip()      # gold label, e.g. "0"/"1"
            document2 = fields2[1]
            terms2 = document2.split(' ')

            # Forward pass: bias + weighted relative word frequencies.
            expected = weights[0]
            for t in terms2:
                expected = expected + (words_count[t] / len(words_count) * (weights[word_to_index[t]]))

            # Backward pass: move each word weight against the error gradient.
            for t in terms2:
                weights[word_to_index[t]] -= (words_count[t] / len(words_count)) * (expected - float(label2)) * learning_rate

            loss = (expected - float(label2)) ** 2   # squared error, tracked for monitoring
            loss_sum += loss
            if loss_counter % 10000 == 0:
                # Periodic progress report: current prediction vs. gold label.
                loss_counter = 0
                loss_sum = 0.0
                print(expected)
                print(label2)
            weights[0] -= (expected - float(label2)) * learning_rate  # bias update
            loss_counter += 1

    model = (word_to_index, vocabulary, weights, words_count)
    with open("model.pkl", "wb") as model_file:  # 'with' closes the handle (original leaked it)
        pickle.dump(model, model_file)
# Run training only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    train()