my brilliant solution4

This commit is contained in:
Dominika Grajewska 2020-04-04 19:25:57 +02:00
parent abba594b01
commit 24b02bc754
4 changed files with 10523 additions and 0 deletions

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

28
linearpred.py Normal file
View File

@@ -0,0 +1,28 @@
#!/usr/bin/env python3
import math
import pickle
import sys
from tokenize import tokenize
# Load the trained model tuple produced by train.py:
# (word_to_index, vocabulary, weights, words_count), where weights[0] is the
# bias and weights[word_to_index[t]] is the per-word weight.
# NOTE(review): pickle.load on an untrusted file is unsafe — only load a
# model.pkl you trained yourself.
with open("model.pkl", "rb") as model_file:  # 'with' closes the handle (original leaked it)
    model = pickle.load(model_file)
word_to_index, vocabulary, weights, words_count = model

lines = sys.stdin.readlines()
for line2 in lines:
    line2 = line2.rstrip()
    fields2 = line2.split('\t')      # split the line into tab-separated fields
    label2 = fields2[0].strip()      # label column (unused at prediction time)
    document2 = fields2[1]           # the post text
    terms2 = document2.split(' ')    # split the post into words
    # Update per-word occurrence counts with this document's words.
    # NOTE(review): this mutates the counts loaded from the trained model, so a
    # prediction depends on how many input lines were processed before it —
    # kept as-is to preserve the original behavior, but worth revisiting.
    for term2 in terms2:
        if term2 in words_count:
            words_count[term2] += 1
        else:
            words_count[term2] = 1
    # Linear score: bias plus weighted relative frequency of each known word.
    expected = weights[0]
    for t in terms2:
        if t in vocabulary:
            expected = expected + (words_count[t] / len(words_count) * (weights[word_to_index[t]]))
    # Threshold the linear score into a binary decision.
    if expected > 0.65:
        print(1)
    else:
        print(0)

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

71
train.py Normal file
View File

@@ -0,0 +1,71 @@
#!/usr/bin/env python3
import sys
import re
import random
import pickle
import math
# Read the entire training set from stdin once; train() iterates it repeatedly.
lines = sys.stdin.readlines()
popr=0  # NOTE(review): unused at module level (train() rebinds its own local) — candidate for removal
fals=0  # NOTE(review): unused at module level (train() rebinds its own local) — candidate for removal
def train():
    """Train a linear model with SGD on tab-separated "label<TAB>document"
    lines (the module-level ``lines`` read from stdin) and pickle it to
    ``model.pkl``.

    The saved model is the tuple
    ``(word_to_index, vocabulary, weights, words_count)`` where
    ``weights[0]`` is the bias and ``weights[word_to_index[t]]`` is the
    weight of word ``t``.
    """
    vocabulary = set()
    word_to_index = {}
    words_count = {}
    weights = []
    learning_rate = 0.0001
    loss_sum = 0
    loss_counter = 0

    # First pass: collect the vocabulary and per-word occurrence counts.
    for line in lines:
        line = line.rstrip()
        fields = line.split('\t')        # tab-separated fields: label, document
        document = fields[1]             # the post text
        terms = document.split(' ')      # split the post into words
        for t in terms:
            vocabulary.add(t)            # set.add is idempotent; no membership check needed
        for term2 in terms:
            if term2 in words_count:
                words_count[term2] += 1  # count how often each word occurs
            else:
                words_count[term2] = 1

    # Assign every word a weight index, starting at 1 (index 0 is the bias).
    xi = 1
    for t in vocabulary:
        word_to_index[t] = xi
        xi = xi + 1

    # Random initial weights: one bias plus one weight per vocabulary word.
    for i in range(0, len(vocabulary) + 1):
        weights.append(random.uniform(-0.1, 0.1))

    # SGD: 500 epochs over the training lines.
    for cnt in range(0, 500):
        for line2 in lines:
            line2 = line2.rstrip()
            fields2 = line2.split('\t')
            label2 = fields2[0].strip()      # gold label, e.g. "0"/"1"
            document2 = fields2[1]
            terms2 = document2.split(' ')

            # Forward pass: bias + weighted relative word frequencies.
            expected = weights[0]
            for t in terms2:
                expected = expected + (words_count[t] / len(words_count) * (weights[word_to_index[t]]))

            # Backward pass: move each word weight against the error gradient.
            for t in terms2:
                weights[word_to_index[t]] -= (words_count[t] / len(words_count)) * (expected - float(label2)) * learning_rate

            loss = (expected - float(label2)) ** 2   # squared error, tracked for monitoring
            loss_sum += loss
            if loss_counter % 10000 == 0:
                # Periodic progress report: current prediction vs. gold label.
                loss_counter = 0
                loss_sum = 0.0
                print(expected)
                print(label2)
            weights[0] -= (expected - float(label2)) * learning_rate  # bias update
            loss_counter += 1

    model = (word_to_index, vocabulary, weights, words_count)
    with open("model.pkl", "wb") as model_file:  # 'with' closes the handle (original leaked it)
        pickle.dump(model, model_file)
# Run training only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    train()