Linear try
This commit is contained in:
parent 772b516776
commit 14432fab2d
5272 dev-0/out.tsv Normal file
File diff suppressed because it is too large
26 predict.py Normal file
@@ -0,0 +1,26 @@
#!/usr/bin/python3

import sys
import pickle
from math import log
from tokenizer import tokenize

# Load the trained model: weights, word-to-index mapping and word counts
model = pickle.load(open("model.pkl", "rb"))
weights, word_to_index_mapping, word_count = model

for line in sys.stdin:
    document = line.rstrip()
    fields = document.split('\t')
    document = fields[0]
    terms = tokenize(document)

    # Linear score: bias plus count-normalized, weighted term features
    y_predicted = weights[0]
    for word in terms:
        y_predicted += weights[word_to_index_mapping.get(word, 0)] * (word_count.get(word, 0) / len(word_count))

    # Threshold the score at 0.5 to get the binary label
    if y_predicted <= 0.5:
        print(0)
    else:
        print(1)
11 tokenizer.py Normal file
@@ -0,0 +1,11 @@
#!/usr/bin/python3

from nltk.tokenize import word_tokenize
import nltk
import re
import string

def tokenize(d):
    # Replace literal "\n" sequences (escaped newlines) with spaces, then tokenize
    d = re.sub(r'\\n', ' ', d)
    words = word_tokenize(d)
    return words
101 train.py Normal file
@@ -0,0 +1,101 @@
#!/usr/bin/python3

import sys
import pickle
import random
import collections
from tokenizer import tokenize


def train():
    # Prepare
    vocabulary = set()
    word_to_index_mapping = {}
    index_to_word_mapping = {}
    word_count = collections.defaultdict(int)

    # Arrays x, y to use later in the training process
    x = []
    y = []

    learning_rate = 0.000001

    # Read labelled examples from stdin
    for line in sys.stdin:
        line = line.rstrip()
        fields = line.split('\t')
        label = fields[0]
        document = fields[1]
        terms = tokenize(document)

        # Add the document's words to x and its label to y (label "P" maps to 1, "S" to 0)
        x.append(terms)
        if label == "P":
            y.append(1)
        else:
            y.append(0)

        # Update the vocabulary and count how often each word appears
        for t in terms:
            word_count[t] += 1
            vocabulary.add(t)

    # Index the vocabulary: each word gets its own number (index 0 is reserved for the bias)
    ix = 1
    for w in vocabulary:
        word_to_index_mapping[w] = ix
        index_to_word_mapping[ix] = w
        ix += 1

    # Initialize weights with random floats from -1.0 to 1.0
    weights = []
    for ix in range(0, len(vocabulary) + 1):
        weights.append(random.uniform(-1.00, 1.00))

    Loss_sum = 0.0
    Loss_sum_counter = 1

    # Stochastic gradient descent: pick a random example, predict, and update the weights
    while True:
        choose_random_example = random.randint(0, len(x) - 1)
        actual_x = x[choose_random_example]  # list of words
        actual_y = y[choose_random_example]  # label for this set of words

        # Predict: bias plus count-normalized, weighted term features
        y_predicted = weights[0]

        # Iterate over all words in the randomly chosen example.
        # get() avoids KeyErrors for unseen words; the fallback weight index does not
        # matter, because word_count.get(word, 0) yields 0 for a missing word.
        for word in actual_x:
            y_predicted += weights[word_to_index_mapping.get(word, 0)] * (word_count.get(word, 0) / len(word_count))

        # Loss: check how good the prediction was
        Loss = (y_predicted - actual_y) ** 2.0
        # Sum the loss so we can report a running average, which is easier to follow
        Loss_sum += Loss

        if Loss_sum_counter % 1000 == 0:
            print(Loss_sum / 1000)
            Loss_sum = 0.0
        Loss_sum_counter += 1

        # Update weights
        delta = (y_predicted - actual_y) * learning_rate
        weights[0] = weights[0] - delta
        for word in actual_x:
            if word in word_to_index_mapping:
                weights[word_to_index_mapping[word]] -= ((word_count[word] / len(word_count)) * delta)

        # Stop after a fixed number of iterations
        if Loss_sum_counter > 1000000:
            break

    # Save only the things we need for prediction
    model = (weights, word_to_index_mapping, word_count)
    pickle.dump(model, open("model.pkl", "wb"))


train()