Merge two scripts into one

2021-04-17 14:29:51 +02:00 · 2021-04-17 14:29:51 +02:00 · 4dffa4ee0a
commit 4dffa4ee0a
parent 9728e579d4
3 changed files with 717 additions and 680 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
@ -1,87 +1,87 @@
-14
-0
-4
-7
-2
-5
-22
-12
-5
-4
-15
-5
-0
-2
-8
-8
-8
-2
-9
-23
-24
-6
-13
-10
-15
-6
-11
-13
-23
-0
-22
-22
-3
-20
-8
-3
-20
-14
-11
-7
-5
 18
+11
+10
+12
+22
+5
+19
+8
+23
 1
+8
+5
+11
+16
+9
+9
+13
+22
 7
 16
-21
-22
-0
-22
-3
-15
-4
-22
-6
-17
 10
 0
-0
-22
-20
-3
-0
-1
-21
-5
 6
-22
-16
+15
+8
+0
+14
+6
+24
+11
 19
-22
 2
+2
+21
 9
-16
-1
-0
+2
+11
+18
+14
 12
-15
+23
+3
+2
 17
-7
+3
+4
+2
+11
+14
+2
+8
+10
+2
+0
+3
+15
+11
+11
+19
+21
+2
+11
 2
 4
-15
-4
+5
+21
 2
 3
-6
+10
+2
+22
 7
+3
+19
+11
+23
+8
+3
+17
+20
+1
+8
+1
+16
+2
+21
+22
--- a/solution.py
+++ b/solution.py
@ -0,0 +1,37 @@
+from sklearn.feature_extraction.text import TfidfVectorizer
+import numpy as np
+import sklearn.metrics
+from sklearn.cluster import KMeans
+
+
+def preprocess(document, stopwords):
+    """Return ``document`` with every stopword removed.
+
+    The document is split on whitespace and the tokens that are not in
+    ``stopwords`` are re-joined with single spaces. Matching is exact and
+    case-sensitive.
+
+    Args:
+        document: Text of a single document.
+        stopwords: Collection of tokens to drop.
+
+    Returns:
+        The filtered text as one space-separated string.
+    """
+    # Membership on a list/tuple is a linear scan per token; hashing the
+    # collection once makes each lookup O(1) without changing the result.
+    if isinstance(stopwords, (list, tuple)):
+        stopwords = frozenset(stopwords)
+    return " ".join(word for word in document.split() if word not in stopwords)
+
+def predict(in_file, out_file, stopwords, n_clusters=25, max_iter=1000,
+            random_state=None):
+    """Cluster the documents in ``in_file`` and write one label per line.
+
+    Each line of ``in_file`` is treated as one document. Stopwords are
+    removed with ``preprocess``, the documents are turned into TF-IDF
+    vectors and clustered with KMeans. The predicted cluster index of every
+    document is written to ``out_file``, one per line, in input order.
+
+    Args:
+        in_file: Path of the input file (one document per line).
+        out_file: Path of the file the cluster labels are written to.
+        stopwords: Collection of tokens removed before vectorising.
+        n_clusters: Number of clusters passed to KMeans.
+        max_iter: Maximum number of KMeans iterations.
+        random_state: Seed passed to KMeans; the default ``None`` leaves
+            the clustering unseeded, as before.
+    """
+    # Hash the stopwords once so filtering every document stays linear.
+    if isinstance(stopwords, (list, tuple)):
+        stopwords = frozenset(stopwords)
+
+    # Distinct handle names keep the path parameters intact, and the input
+    # file is closed as soon as it has been read instead of staying open
+    # while the model is fitted.
+    with open(in_file, encoding="utf-8") as source:
+        documents = [preprocess(document, stopwords) for document in source]
+
+    document_vectors = TfidfVectorizer().fit_transform(documents)
+    predictions = KMeans(
+        n_clusters=n_clusters, max_iter=max_iter, random_state=random_state
+    ).fit_predict(document_vectors)
+
+    with open(out_file, "w", encoding="utf-8") as sink:
+        sink.writelines(str(prediction) + '\n' for prediction in predictions)
+
+
+def main():
+    """Load the stopword list and cluster the dev-0 and test-A inputs."""
+    with open('stopwords.txt', encoding='utf-8') as stopwords_file:
+        # A set gives constant-time membership tests while filtering and
+        # drops duplicate entries.
+        stopwords = {stopword.strip() for stopword in stopwords_file}
+
+    predict("dev-0/in.tsv", "dev-0/out.tsv", stopwords)
+    predict("test-A/in.tsv", "test-A/out.tsv", stopwords)
+
+
+# Run the clustering only when executed as a script, not on import.
+if __name__ == '__main__':
+    main()
--- a/test-A/out.tsv
+++ b/test-A/out.tsv