Linear try
This commit is contained in:
parent 772b516776
commit 14432fab2d
5272 dev-0/out.tsv Normal file
File diff suppressed because it is too large
26 predict.py Normal file
@@ -0,0 +1,26 @@
#!/usr/bin/python3

import sys
import pickle
from math import log
from tokenizer import tokenize

# Load the trained model: weights, word-to-index mapping and word counts
model = pickle.load(open("model.pkl", "rb"))
weights, word_to_index_mapping, word_count = model

for line in sys.stdin:
    document = line.rstrip()
    fields = document.split('\t')
    document = fields[0]
    terms = tokenize(document)

    # Linear score: bias plus count-normalized, weighted term features
    y_predicted = weights[0]
    for word in terms:
        y_predicted += weights[word_to_index_mapping.get(word, 0)] * (word_count.get(word, 0) / len(word_count))

    # Threshold the score at 0.5 to get the binary label
    if y_predicted <= 0.5:
        print(0)
    else:
        print(1)
11 tokenizer.py Normal file
@@ -0,0 +1,11 @@
#!/usr/bin/python3

from nltk.tokenize import word_tokenize
import nltk
import re
import string

def tokenize(d):
    # Replace literal "\n" sequences (escaped newlines) with spaces, then tokenize
    d = re.sub(r'\\n', ' ', d)
    words = word_tokenize(d)
    return words
101 train.py Normal file
@@ -0,0 +1,101 @@
#!/usr/bin/python3

import sys
import pickle
import random
import collections
from tokenizer import tokenize


def train():
    # Prepare
    vocabulary = set()
    word_to_index_mapping = {}
    index_to_word_mapping = {}
    word_count = collections.defaultdict(int)

    # Arrays x, y to use later in the training process
    x = []
    y = []

    learning_rate = 0.000001

    # Read labelled examples from stdin
    for line in sys.stdin:
        line = line.rstrip()
        fields = line.split('\t')
        label = fields[0]
        document = fields[1]
        terms = tokenize(document)

        # Add the document's words to x and its label to y (label "P" maps to 1, "S" to 0)
        x.append(terms)
        if label == "P":
            y.append(1)
        else:
            y.append(0)

        # Update the vocabulary and count how often each word appears
        for t in terms:
            word_count[t] += 1
            vocabulary.add(t)

    # Index the vocabulary: each word gets its own number (index 0 is reserved for the bias)
    ix = 1
    for w in vocabulary:
        word_to_index_mapping[w] = ix
        index_to_word_mapping[ix] = w
        ix += 1

    # Initialize weights with random floats from -1.0 to 1.0
    weights = []
    for ix in range(0, len(vocabulary) + 1):
        weights.append(random.uniform(-1.00, 1.00))

    Loss_sum = 0.0
    Loss_sum_counter = 1

    # Stochastic gradient descent: pick a random example, predict, and update the weights
    while True:
        choose_random_example = random.randint(0, len(x) - 1)
        actual_x = x[choose_random_example]  # list of words
        actual_y = y[choose_random_example]  # label for this set of words

        # Predict: bias plus count-normalized, weighted term features
        y_predicted = weights[0]

        # Iterate over all words in the randomly chosen example.
        # get() avoids KeyErrors for unseen words; the fallback weight index does not
        # matter, because word_count.get(word, 0) yields 0 for a missing word.
        for word in actual_x:
            y_predicted += weights[word_to_index_mapping.get(word, 0)] * (word_count.get(word, 0) / len(word_count))

        # Loss: check how good the prediction was
        Loss = (y_predicted - actual_y) ** 2.0
        # Sum the loss so we can report a running average, which is easier to follow
        Loss_sum += Loss

        if Loss_sum_counter % 1000 == 0:
            print(Loss_sum / 1000)
            Loss_sum = 0.0
        Loss_sum_counter += 1

        # Update weights
        delta = (y_predicted - actual_y) * learning_rate
        weights[0] = weights[0] - delta
        for word in actual_x:
            if word in word_to_index_mapping:
                weights[word_to_index_mapping[word]] -= ((word_count[word] / len(word_count)) * delta)

        # Stop after a fixed number of iterations
        if Loss_sum_counter > 1000000:
            break

    # Save only the things we need for prediction
    model = (weights, word_to_index_mapping, word_count)
    pickle.dump(model, open("model.pkl", "wb"))


train()