Add solution for full dataset

2021-05-14 00:04:00 +02:00 · 2021-05-14 00:04:00 +02:00 · 48d4451077
commit 48d4451077
parent 142f0ca72e
3 changed files with 5745 additions and 301 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/solution.py
+++ b/solution.py
@@ -20,22 +20,23 @@ model = NeuralNetworkModel()
 criterion = torch.nn.BCELoss()
 optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

+
 def train():
    with open('train/train.tsv') as f:
        docs = [line.rstrip() for line in f]
    docs_preprocessed = []
    y = []
-    for doc in docs[:10000]:
+    for doc in docs:
        y_with_doc = doc.split('\t')
        y.append(y_with_doc[0])
-        doc = y_with_doc[1]
-        doc = preprocess(doc)
-        docs_preprocessed.append(doc)
+        docs_preprocessed.append(preprocess(y_with_doc[1]))
    y = [int(value) for value in y]
    y = np.reshape(y, (len(y), 1))
    tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]
+    print("Fitting vectorizer...")
    global vectorizer
    vectorizer = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, vector_size=n_features)
+    print("DONE!")
    x = vectorizer.dv.vectors
    for epoch in range(5):
        loss_score = 0
@@ -58,25 +59,21 @@ def train():
        print(epoch)
        print(get_loss_acc(model, x, y, criterion, optimizer))

+
 def classify(path):
+    print("Predicting for", path)
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]
    docs = [preprocess(doc) for doc in docs]
    test_x = [vectorizer.infer_vector(doc) for doc in docs]
    test_x = np.array(test_x, np.float32)
-    test_x = torch.tensor(test_x)
-    predictions = model(test_x).detach().numpy()[:, 0]
-    p = []
-    for prediction in predictions:
-        if prediction >= 0.5:
-            p.append(1)
-        else:
-            p.append(0)
+    predictions = model(torch.tensor(test_x)).detach().numpy()[:, 0]
+    predictions = [1 if value >= 0.5 else 0 for value in predictions]
    with open(path + 'out.tsv', 'w') as file:
-        for prediction in p:
+        for prediction in predictions:
            file.write("%i\n" % prediction)


 train()
 classify('dev-0/')
-# classify('test-A/')
+classify('test-A/')
--- a/test-A/out.tsv
+++ b/test-A/out.tsv