solution

2020-12-15 16:40:10 +01:00 · 2020-12-15 16:40:10 +01:00 · 9f31d8cc24
commit 9f31d8cc24
parent ecfafbf86c
9 changed files with 10562 additions and 0 deletions
--- a/create_dictionary.py
+++ b/create_dictionary.py
@ -0,0 +1,46 @@
+#!/usr/bin/python3
+
+
+import sys
+import pickle
+from tokenizator import tokenize
+
+
+def main():
+    """Build the vocabulary and training set from stdin and pickle them.
+
+    Each stdin line is expected to hold ``label<TAB>document``; lines with
+    fewer than two tab-separated fields are skipped.  Reading stops once
+    more than 15000 lines have been accepted.  The result is written to
+    ``dictionary.pkl`` as the tuple
+    ``(dictionary, documents, document_terms, word_to_index_mapping, train_y)``.
+
+    Raises:
+        ValueError: if a label cannot be parsed as an integer.
+    """
+    dictionary = set()
+    train_y = []
+    documents = []
+    document_terms = {}
+
+    counter = 0
+    for line in sys.stdin:
+        if counter > 15000:
+            break
+        fields = line.strip().split('\t')
+        if len(fields) < 2:
+            continue
+        # Parse the label first so a malformed line cannot leave `documents`
+        # one element longer than `train_y`.
+        label = int(fields[0].strip())
+        document = fields[1]
+        documents.append(document)
+        train_y.append(label)
+        terms = tokenize(document)
+        # Keyed by the running count of accepted lines, i.e. the position of
+        # this document inside `documents`.
+        document_terms[counter] = terms
+        dictionary.update(terms)
+        counter += 1
+
+    # Indices run 0 .. len(dictionary) - 1.  train.py and predict.py keep the
+    # constant 1.0 feature at position len(dictionary), so numbering from 1
+    # made the last word share that slot and left slot 0 unused.  Sorting
+    # makes the numbering reproducible between runs (set order is not).
+    word_to_index_mapping = {
+        word: index for index, word in enumerate(sorted(dictionary))
+    }
+
+    model = (dictionary, documents, document_terms, word_to_index_mapping, train_y)
+    with open('dictionary.pkl', 'wb') as dictionary_file:
+        pickle.dump(model, dictionary_file)
+
+
+if __name__ == '__main__':
+    main()
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/dictionary.pkl
+++ b/dictionary.pkl
--- a/BIN
+++ b/BIN
--- a/model.pkl
+++ b/model.pkl
--- a/predict.py
+++ b/predict.py
@ -0,0 +1,31 @@
+import pickle
+import sys
+import torch
+from tokenizator import tokenize
+
+
+def get_x(line, weights, mapping):
+    """Turn one text line into a term-count feature tensor.
+
+    Args:
+        line: raw text to tokenize.
+        weights: sequence whose length fixes the size of the feature vector.
+        mapping: dict from token to its position in the feature vector.
+
+    Returns:
+        A float tensor of length ``len(weights)`` holding the count of every
+        known token, with the last position preset to 1 (the constant
+        feature that train.py labels as the bias).
+    """
+    size = len(weights)
+    features = [0.] * size
+    features[size - 1] = 1
+    for token in tokenize(line):
+        slot = mapping.get(token)
+        # Tokens missing from the mapping are ignored.
+        if slot is not None:
+            features[slot] += 1
+    return torch.tensor(features, dtype=torch.float)
+
+
+def main():
+    """Score every stdin line with the pickled model and print the result.
+
+    Loads ``(weights, word_to_index_mapping)`` from ``model.pkl``, computes
+    ``sigmoid(x @ w)`` for each input line and prints it, limited to the
+    range [0.15, 0.85], one value per line.
+    """
+    # NOTE: pickle.load can execute arbitrary code from the file; only load
+    # a model.pkl that was produced by a trusted training run.
+    with open('model.pkl', 'rb') as model_file:
+        w, word_to_index_mapping = pickle.load(model_file)
+
+    # Inference only, so there is no need to record an autograd graph.
+    with torch.no_grad():
+        for line in sys.stdin:
+            x = get_x(line.strip(), w, word_to_index_mapping)
+            y = torch.sigmoid(x @ w)
+            # Keep the printed probability away from 0 and 1.
+            print(torch.clamp(y, min=0.15, max=0.85).item())
+
+
+if __name__ == '__main__':
+    main()
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/tokenizator.py
+++ b/tokenizator.py
@ -0,0 +1,18 @@
+import nltk
+#nltk.download()
+from nltk.corpus import stopwords
+
+
+def tokenize(d):
+    """Split a document into lower-cased tokens without punctuation or stopwords.
+
+    Args:
+        d: raw document text.
+
+    Returns:
+        The tokens produced by ``nltk.word_tokenize`` on the cleaned text,
+        lower-cased, with the punctuation tokens listed in ``chars`` and
+        NLTK's English stopwords removed.
+    """
+    chars = ['.', '?', ';', ':', ',', '(', ')', '"', "'", '`', '``', "''", '/', '//', '!', '+', '-', '*',
+        '{', '}', '[', ']', '>', '<',"&", '~']
+    # Slashes, literal backslash-n sequences, dots, underscores and hyphens
+    # become spaces; apostrophes and asterisks are dropped.
+    # NOTE(review): replace('gt', '') also strips "gt" inside ordinary words;
+    # presumably it targets "&gt;" residue - confirm before changing.
+    d = (
+        d.replace('/', ' ')
+        .replace('\'', '')
+        .replace('*', '')
+        .replace("\\n", ' ')
+        .replace('gt', '')
+        .replace('.', ' ')
+        .replace('_', ' ')
+        .replace('-', ' ')
+    )
+    tokens = nltk.word_tokenize(d)
+
+    # A set gives constant-time membership tests instead of scanning a list
+    # for every token.
+    deletethis = set(chars)
+    deletethis.update(stopwords.words('english'))
+
+    # Lower-case BEFORE filtering: the stopword list is lower-case, so
+    # testing the original casing let "The", "And", ... slip through.
+    lowered = (x.lower() for x in tokens)
+    return [x for x in lowered if x not in deletethis]
--- a/train.py
+++ b/train.py
@ -0,0 +1,43 @@
+#!/usr/bin/python3 -W ignore::UserWarning
+
+import pickle
+import torch
+
+
+def parse_dataset(documents, document_terms, mapping, voc_len):
+    """Build the term-count feature matrix for the training documents.
+
+    Args:
+        documents: sequence of documents; only its length is used.
+        document_terms: mapping from document position to its token list.
+        mapping: dict from token to its column in the feature row.
+        voc_len: number of vocabulary columns in each row.
+
+    Returns:
+        A float tensor with one row per document: ``voc_len`` count columns
+        followed by a constant 1.0 bias column.
+    """
+    print('voc len = ', voc_len)
+    rows = []
+    for doc_id, _ in enumerate(documents):
+        # voc_len zeroed counters plus the trailing bias entry.
+        row = [0.] * voc_len + [1.0]
+        for term in document_terms[doc_id]:
+            row[mapping[term]] += 1
+        rows.append(row)
+    return torch.tensor(rows, dtype=torch.float)
+
+
+def main():
+    """Fit logistic-regression weights by gradient descent and pickle them.
+
+    Reads ``(vocabulary, documents, document_terms, word_to_index_mapping,
+    train_y)`` from ``dictionary.pkl``, runs 2500 full-batch updates of the
+    cross-entropy cost with learning rate 0.001, prints progress every 10
+    steps, and writes ``(w, word_to_index_mapping)`` to ``model.pkl``.
+    """
+    # NOTE: pickle.load can execute arbitrary code from the file; only load
+    # a dictionary.pkl that was produced by a trusted run.
+    with open('dictionary.pkl', 'rb') as dictionary_file:
+        model = pickle.load(dictionary_file)
+    vocabulary, documents, document_terms, word_to_index_mapping, train_y = model
+    x = parse_dataset(documents, document_terms, word_to_index_mapping, len(vocabulary))
+    y = torch.tensor(train_y, dtype=torch.float)
+    # One weight per vocabulary column plus one for the bias column.
+    w = torch.randn(len(vocabulary) + 1, requires_grad=True)
+    learning_rate = torch.tensor(0.001)
+    for step in range(2500):
+        if step % 10 == 0:
+            print(step)
+        y_predicted = torch.sigmoid(x @ w)
+        # The 1e-10 terms keep log() away from zero.
+        cost = (-1 / y.size()[0]) * torch.sum(y * torch.log(y_predicted + 1e-10) + (1 - y) * torch.log(1 - y_predicted + 1e-10))
+        cost.backward()
+        with torch.no_grad():
+            w -= learning_rate * w.grad
+        # backward() adds into w.grad; without this reset every step would
+        # apply the sum of all gradients computed so far.
+        w.grad.zero_()
+    print(w)
+    model = (w, word_to_index_mapping)
+    with open('model.pkl', 'wb') as model_file:
+        pickle.dump(model, model_file)
+
+
+if __name__ == '__main__':
+    main()