my brilliant solution4
This commit is contained in:
parent
abba594b01
commit
24b02bc754
5272
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
28
linearpred.py
Normal file
@@ -0,0 +1,28 @@
#!/usr/bin/env python3

import pickle
import sys

model = pickle.load(open("model.pkl", "rb"))
word_to_index, vocabulary, weights, words_count = model

lines = sys.stdin.readlines()
for line2 in lines:
    line2 = line2.rstrip()
    fields2 = line2.split('\t')    ## split the line into tab-separated fields
    label2 = fields2[0].strip()    ## the labels
    document2 = fields2[1]         ## the posts
    terms2 = document2.split(' ')  ## split the post into words
    for term2 in terms2:           ## for every word in the post
        if term2 in words_count:
            words_count[term2] += 1  ## keep a count of how many times each word occurs
        else:
            words_count[term2] = 1
    expected = weights[0]          ## start from the bias weight
    for t in terms2:
        if t in vocabulary:
            expected += (words_count[t] / len(words_count)) * weights[word_to_index[t]]
    if expected > 0.65:            ## threshold the linear score into a binary label
        print(1)
    else:
        print(0)
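For context, linearpred.py scores each post as a linear function of its relative word frequencies and turns that score into a binary label with a fixed 0.65 cutoff. A minimal sketch of that scoring step, with a made-up two-word model (all names and numbers here are illustrative assumptions, not values from the trained model.pkl):

# Hypothetical toy model: index 0 of weights is the bias term.
word_to_index = {"good": 1, "bad": 2}
weights = [0.5, 0.4, -0.3]           # [bias, w_good, w_bad]
words_count = {"good": 3, "bad": 1}  # running word counts

terms = "good good bad".split(' ')
expected = weights[0]                # start from the bias
for t in terms:
    if t in word_to_index:
        # relative frequency of the word times its learned weight
        expected += (words_count[t] / len(words_count)) * weights[word_to_index[t]]
print(1 if expected > 0.65 else 0)   # prints 1, since the score here is 1.55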
5152
test-A/out.tsv
Normal file
File diff suppressed because it is too large
71
train.py
Normal file
@@ -0,0 +1,71 @@
#!/usr/bin/env python3

import sys
import random
import pickle

lines = sys.stdin.readlines()

def train():
    vocabulary = set()
    word_to_index = {}
    xi = 1
    learning_rate = 0.0001
    loss_sum = 0.0
    loss_counter = 0
    words_count = {}
    weights = []
    for line in lines:
        line = line.rstrip()
        fields = line.split('\t')    ## split the line into tab-separated fields
        label = fields[0].strip()    ## the labels
        document = fields[1]         ## the posts
        terms = document.split(' ')  ## split the post into words
        for t in terms:              ## for every word in the post
            if t not in vocabulary:
                vocabulary.add(t)    ## add the word to the vocabulary
        for term2 in terms:
            if term2 in words_count:
                words_count[term2] += 1  ## keep a count of how many times each word occurs
            else:
                words_count[term2] = 1
    for t in vocabulary:             ## assign each word an index (0 is reserved for the bias)
        word_to_index[t] = xi
        xi += 1
    for i in range(0, len(vocabulary) + 1):
        weights.append(random.uniform(-0.1, 0.1))  ## small random initial weights
    for cnt in range(0, 500):        ## 500 passes of stochastic gradient descent
        for line2 in lines:
            line2 = line2.rstrip()
            fields2 = line2.split('\t')
            label2 = fields2[0].strip()
            document2 = fields2[1]
            terms2 = document2.split(' ')
            expected = weights[0]    ## prediction: bias plus weighted word frequencies
            for t in terms2:
                expected += (words_count[t] / len(words_count)) * weights[word_to_index[t]]
            for t in terms2:         ## gradient step on each word's weight
                weights[word_to_index[t]] -= (words_count[t] / len(words_count)) * (expected - float(label2)) * learning_rate
            loss = (expected - float(label2)) ** 2  ## squared error
            loss_sum += loss
            if loss_counter % 10000 == 0:           ## periodic progress printout
                # print(loss_sum / 10000)
                loss_counter = 0
                loss_sum = 0.0
                print(expected)
                print(label2)
            weights[0] -= (expected - float(label2)) * learning_rate  ## gradient step on the bias
            loss_counter += 1
    model = (word_to_index, vocabulary, weights, words_count)
    pickle.dump(model, open("model.pkl", "wb"))

train()
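The inner loop of train() is plain stochastic gradient descent on squared error: each word weight is nudged against the prediction error, scaled by the learning rate and that word's frequency feature, and the bias weight gets the unscaled step. A single update written out on a toy one-feature example (all values are illustrative assumptions):

learning_rate = 0.0001
weights = [0.5, 0.4]                     # [bias, weight of one word]
x = 1.5                                  # the word's frequency feature
label = 1.0

expected = weights[0] + x * weights[1]   # forward pass: 0.5 + 0.6 = 1.1
error = expected - label                 # 0.1 (prediction is too high)
weights[1] -= x * error * learning_rate  # step the word weight down the gradient
weights[0] -= error * learning_rate      # step the bias
print(weights)                           # both weights move slightly toward the label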