From 9728e579d4235c769674b92ae5757482dc71eda8 Mon Sep 17 00:00:00 2001 From: Aleksy Wroblewski Date: Sat, 17 Apr 2021 14:23:23 +0200 Subject: [PATCH] Improve the results to 0.8ish --- dev-0/out.tsv | 148 +++--- solution-dev0.py | 3 +- solution-testA.py | 3 +- test-A/out.tsv | 1282 ++++++++++++++++++++++----------------------- 4 files changed, 717 insertions(+), 719 deletions(-) diff --git a/dev-0/out.tsv b/dev-0/out.tsv index 7881845..311a453 100644 --- a/dev-0/out.tsv +++ b/dev-0/out.tsv @@ -1,87 +1,87 @@ -5 -29 -35 -32 -18 -20 -3 -42 -4 -1 -17 -20 -16 -44 -13 -13 -41 -36 -10 +14 0 -27 -28 -8 -14 -25 -19 -2 -8 -38 -40 -34 -34 -15 -24 -13 -9 -39 -5 -2 -32 4 -33 -15 -21 -30 -37 -9 -11 -26 -15 -25 -35 -34 -19 -6 -14 -16 -16 -34 7 -15 +2 +5 +22 12 -34 -31 -20 -7 -34 -30 -35 +5 +4 +15 +5 +0 +2 +8 +8 +8 +2 9 -18 -10 -30 -3 -43 -42 -25 +23 +24 6 +13 +10 +15 +6 +11 +13 +23 +0 +22 +22 +3 +20 +8 +3 +20 +14 +11 +7 +5 +18 +1 +7 +16 21 22 -1 +0 +22 +3 +15 +4 +22 +6 17 +10 +0 +0 +22 +20 +3 +0 +1 +21 +5 +6 +22 +16 +19 +22 +2 +9 +16 1 0 +12 15 +17 +7 +2 +4 +15 +4 +2 +3 +6 7 -23 diff --git a/solution-dev0.py b/solution-dev0.py index d611c26..6bad508 100644 --- a/solution-dev0.py +++ b/solution-dev0.py @@ -17,12 +17,11 @@ def main(): documents = [preprocess(document, stopwords) for document in in_file.readlines()] - vectorizer = TfidfVectorizer(ngram_range=(1, 3), use_idf=False) vectorizer = TfidfVectorizer() document_vectors = vectorizer.fit_transform(documents) predictions = KMeans( - n_clusters=45, max_iter=1000).fit_predict(document_vectors) + n_clusters=25, max_iter=1000).fit_predict(document_vectors) with open("dev-0/out.tsv", "w") as out_file: for prediction in predictions: diff --git a/solution-testA.py b/solution-testA.py index 05ac567..3346661 100644 --- a/solution-testA.py +++ b/solution-testA.py @@ -17,12 +17,11 @@ def main(): documents = [preprocess(document, stopwords) for document in in_file.readlines()] - vectorizer = TfidfVectorizer(ngram_range=(1, 3), use_idf=False) vectorizer = TfidfVectorizer() document_vectors = vectorizer.fit_transform(documents) predictions = KMeans( - n_clusters=45, max_iter=1000).fit_predict(document_vectors) + n_clusters=25, max_iter=1000).fit_predict(document_vectors) with open("test-A/out.tsv", "w") as out_file: for prediction in predictions: diff --git a/test-A/out.tsv b/test-A/out.tsv index 15a02f7..fe8b5ad 100644 --- a/test-A/out.tsv +++ b/test-A/out.tsv @@ -1,691 +1,691 @@ -39 -17 -11 -27 -41 -2 -39 -12 -37 -31 -38 -34 -12 -42 -23 -30 -14 -36 -14 -23 +18 3 -43 -2 -19 -22 -21 -38 -5 -12 -16 -25 -12 -2 -11 -32 -4 -34 -13 -11 -9 -37 -44 -27 -12 -22 -42 -40 -7 -11 -23 -11 -44 -16 -19 -44 -13 -16 -11 -16 -6 1 -27 -10 -41 +5 +3 +11 +4 6 +18 +3 +14 +17 +6 +20 +22 +3 +9 +5 +9 +22 +13 +0 +11 +20 +20 +18 +5 +4 +2 +18 +10 +6 +11 +1 +19 24 -19 -2 -27 -18 -25 -32 -14 -44 -26 -39 -40 -6 -37 -6 -22 -8 -33 -16 -41 -34 -13 -1 -38 -37 -0 -25 -32 -41 -27 -27 -39 -22 -38 -42 -14 -43 -14 -44 -31 -40 -4 -5 -36 -38 -12 -11 -6 -11 -26 -41 -41 -38 -25 -40 -3 -12 -44 -40 -39 -11 -14 -13 -12 -38 -25 -32 -1 -13 -5 -32 -20 -4 -33 -20 -44 -42 -3 -20 -21 -42 -38 -20 -1 -13 -14 17 -27 -23 -19 -10 +4 +1 +6 14 -12 -31 -43 -19 +4 +9 +6 +23 +23 15 +21 +1 +22 +1 +4 +15 +20 +13 +10 +3 +1 +18 +18 +15 +9 +15 +9 +13 +1 +20 +11 +10 +3 +10 +19 +9 +4 +18 +4 +10 +18 +20 +13 13 7 -25 -41 -33 -0 -32 -32 -38 -4 -26 -44 -14 -33 -2 -42 -13 -15 8 -3 -3 -35 -21 -21 -12 -28 -12 -4 -18 -21 -10 13 -12 -32 -40 -11 -25 -39 -37 +3 17 -39 -40 -14 -27 -11 -7 -34 -39 -12 -34 -23 -38 -7 -34 -12 -2 -23 -6 -23 -13 -25 -35 -28 -32 -0 -36 -33 -10 -16 -5 -13 -7 -2 -25 -32 -19 -34 -40 18 -6 +14 +3 +18 +20 +10 +19 +3 +9 +16 +18 +18 +18 +12 +12 +0 +9 +4 +2 +18 +13 +2 +13 24 -5 -38 -41 -4 -4 -42 -7 -40 -2 -8 -12 -17 -28 -9 -42 -0 -3 -13 -12 -14 -1 -20 -25 -40 -38 -17 -12 -42 -23 -25 -3 -44 -31 -42 -12 -3 -31 -43 -25 -26 -3 -12 -9 -3 -11 -14 -2 -36 -40 -36 -38 -34 -4 -27 -9 -27 -5 -7 -36 -44 -4 -17 -42 -1 -17 -14 -38 -31 -4 -31 -9 -33 -16 -25 -44 -18 -21 -17 -33 -43 -16 -17 -37 -38 -26 -25 -42 -16 -33 -1 -13 -17 -11 -38 -31 -23 -38 -38 -1 -43 -23 -19 -3 -14 -34 -13 -27 -19 -7 -25 -12 -14 -36 -4 -14 -44 -30 -23 -32 -10 -23 -4 -38 -11 -9 -7 -0 -38 -37 -10 -5 -12 -12 -13 -31 -19 -36 -21 -22 6 +1 22 -39 -12 -22 -10 -29 -2 -21 +1 +17 +3 +3 24 -1 -11 -23 -31 -39 -41 -14 -4 -16 -32 -13 -12 -19 -23 -29 -12 -1 -12 -37 -9 -12 -40 -13 -25 -7 -4 -1 +18 +17 21 -33 -36 6 -2 -0 -17 -11 -41 -34 -23 -32 -19 -7 -44 -8 -1 -44 -12 -19 -2 -16 -11 -37 -13 -14 -15 -30 -8 -12 -42 -13 -44 -7 -12 -29 -13 +4 +10 3 -7 -13 -4 -25 -40 -44 -15 -9 -14 -11 -40 -44 -37 -37 -39 -37 -39 -38 -42 -17 -6 -27 -23 -0 -28 -17 -38 -22 -35 -40 -4 -17 -38 -44 -11 -7 -40 -12 -16 -12 -20 -41 -3 -16 -11 -38 -25 -2 -30 -43 -14 -12 -22 -2 -19 -2 -38 -17 -9 -15 -14 -38 -7 -12 -44 -0 -36 -16 -44 1 -33 -12 -12 -8 -36 -37 -11 -21 -43 -44 -23 -4 -40 -11 -20 -41 -29 -12 -2 -22 -12 +9 +10 6 -12 -30 -37 -17 +18 +10 +19 +15 +10 +18 +19 5 -32 +14 8 -39 -18 -32 -24 -2 -11 -26 -12 -3 -30 -13 -44 -12 -32 -11 -11 -25 -15 -37 -34 -12 -1 -34 -16 -13 -3 -36 -2 -39 -36 -8 -13 -3 +5 4 -15 -26 -38 +17 21 -43 -38 -18 -1 -1 -44 +5 11 +5 +15 +5 +15 +10 +9 +9 +3 +22 +20 +17 +9 +6 +10 +0 +12 +5 +15 +21 +10 +15 +8 +20 +19 +19 +24 23 +16 4 -23 -13 -26 -2 +9 +8 +10 +15 +10 +24 7 -29 -42 +21 +18 +17 +19 +18 +6 +8 +6 +13 +18 +11 +17 +3 +18 +19 +5 +1 +18 +18 +0 +16 +18 +23 +9 +9 +1 +21 +3 +18 +2 +17 +18 +15 +14 +17 +6 +11 +2 +20 +22 +15 10 12 -39 -44 -34 -21 -3 -7 -16 -38 -35 -43 -4 -11 -1 -28 -35 -35 -14 -35 -34 -7 -32 -16 -34 -39 -33 +8 19 +12 +13 +8 +17 +5 +18 +10 +13 +11 +0 +19 +20 +16 +5 +18 +22 +1 +14 +18 +18 +13 +13 +5 +21 +18 +24 +7 +6 +16 +8 +6 +20 +12 +21 +10 +6 +9 +15 +5 +2 +5 +24 +16 +18 +5 +18 +10 +18 +4 +4 +10 +6 +21 +9 +4 +10 23 -43 +24 +6 +5 +5 +1 +9 +11 +18 +18 +20 +24 +22 +13 +9 +5 +9 23 +17 +18 +4 +13 +13 +15 +15 +5 +3 +18 +3 18 3 -13 -12 -0 -36 -19 -5 -4 -4 -42 -8 -34 6 +8 +24 13 -27 -17 -34 -44 -26 -2 -36 -38 -16 -12 -12 -44 +18 +18 +18 +20 +8 +0 +3 +5 +15 +3 +23 +18 +20 +18 14 +15 +10 +10 +1 +24 +3 +14 +18 +2 +15 +6 +22 +20 +21 +18 +17 +10 +22 +18 +11 +10 +2 +9 +18 +13 +9 +18 +9 +23 +4 +17 +3 +13 +18 +1 +6 21 12 -14 -39 +3 +4 +17 +17 +6 +2 +10 +9 +20 +3 19 -39 -33 13 4 -25 -2 -38 -32 -2 +18 +18 +6 0 -12 +17 +20 11 +18 +1 +15 +1 +18 +2 +18 +3 +9 +2 +18 +18 +10 +6 +20 +22 +18 +6 +15 +6 +18 +5 +6 +18 +10 +13 +21 +13 +5 +15 +18 +18 +18 +11 +20 +16 +1 +3 +16 +22 +14 +20 +21 +18 +7 +15 +4 +6 +20 +11 +18 +1 +20 +2 +9 +24 +3 +7 +2 +17 +16 +4 +18 +6 +18 +10 +17 +21 +15 +13 +16 +17 +4 +24 +6 +9 +1 +18 +4 +14 +15 +14 +18 +18 +9 +14 +16 +10 +19 +2 +12 +8 +4 +24 +4 +22 +15 +13 +16 +3 +18 +1 +21 +20 +6 +18 +6 +5 +3 +21 +18 +1 +24 +18 +14 +3 +0 +18 +6 +4 +11 +20 +18 +24 +16 +14 +24 +9 +17 +21 +6 +3 +12 +18 +3 +4 +17 +8 +6 +6 +7 +18 +14 +1 +11 +0 +23 +22 +13 +14 +1 +5 +9 +18 +2 +11 +3 +6 +13 +6 +3 +20 +16 +18 +19 +7 +18 +18 +19 +1 +11 +1 23 6 -39 -44 +18 +18 +10 +18 +6 +19 +1 +1 +13 +24 +16 +17 +6 +18 +17 +18 +10 +18 +18 +11 +16 +18 +7 +10 +18 +13 +24 +18 +20 +4 0 +2 +18 +15 +15 +4 +1 +22 +13 +22 +10 +23 +18 +18 +18 +5 +17 +6 +18 +4 +17 +4 +18 +21 +4 +3 +10 +0 +13 +1 +15 +8 +23 +13 +9 +18 +17 +21 +14 +3 +18 +18 +8 +20 +2 +18 +22 +18 +5 +10 +6 +20 +13 +20 +20 +24 +10 +4 +7 +17 +18 +10 +9 +16 +17 +20 +23 +11 +14 +24 +3 +6 +6 +4 +11 +15 +6 +9 +18 +12 +14 +8 +3 +4 +18 +11 +18 +19 +11 +12 +6 +1 +22 +0 +18 +4 +12