Merge two scripts into one

2021-04-17 14:29:51 +02:00 · 2021-04-17 14:29:51 +02:00 · 4dffa4ee0a
commit 4dffa4ee0a
parent 9728e579d4
3 changed files with 717 additions and 680 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
@ -1,87 +1,87 @@
 14
 0
 4
 7
 2
 5
 22
 12
 5
 4
 15
 5
 0
 2
 8
 8
 8
 2
 9
 23
 24
 6
 13
 10
 15
 6
 11
 13
 23
 0
 22
 22
 3
 20
 8
 3
 20
 14
 11
 7
 5
 18
 11
 10
 12
 22
 5
 19
 8
 23
 1
 8
 5
 11
 16
 9
 9
 13
 22
 7
 16
 21
 22
 0
 22
 3
 15
 4
 22
 6
 17
 10
 0
 0
 22
 20
 3
 0
 1
 21
 5
 6
-22
+15
-16
+8
 0
 14
 6
 24
 11
 19
 22
 2
 2
 21
 9
-16
+2
-1
+11
-0
+18
 14
 12
-15
+23
 3
 2
 17
-7
+3
 4
 2
 11
 14
 2
 8
 10
 2
 0
 3
 15
 11
 11
 19
 21
 2
 11
 2
 4
-15
+5
-4
+21
 2
 3
-6
+10
 2
 22
 7
 3
 19
 11
 23
 8
 3
 17
 20
 1
 8
 1
 16
 2
 21
 22
--- a/solution.py
+++ b/solution.py
@ -0,0 +1,37 @@
 from sklearn.feature_extraction.text import TfidfVectorizer
 import numpy as np
 import sklearn.metrics
 from sklearn.cluster import KMeans
 def preprocess(document, stopwords):
    """Return ``document`` with every stopword token removed.

    The text is split on runs of whitespace, tokens found in ``stopwords``
    are dropped, and the remaining tokens are joined with single spaces.
    Matching is exact: no case folding or punctuation stripping is applied.

    Args:
        document: The text to filter.
        stopwords: Collection of tokens to remove. A ``set``/``frozenset``
            is used as-is; any other iterable is hashed once per call.

    Returns:
        The filtered text as a single space-separated string.
    """
    # A list makes every ``in`` test a linear scan; hashing the stopwords
    # once turns each per-token lookup into O(1).
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = frozenset(stopwords)
    return " ".join(word for word in document.split() if word not in stopwords)
 def predict(in_file, out_file, stopwords, *, n_clusters=25, max_iter=1000,
             random_state=None):
    """Cluster the documents in ``in_file`` and write one label per line.

    Every line of ``in_file`` is treated as one document. Stopwords are
    removed with ``preprocess``, the documents are turned into TF-IDF
    vectors, and KMeans assigns each document a cluster index. The indices
    are written to ``out_file`` in input order, one per line.

    Args:
        in_file: Path of the input file, one document per line.
        out_file: Path of the output file; it is overwritten.
        stopwords: Iterable of tokens to drop from every document.
        n_clusters: Number of KMeans clusters.
        max_iter: Maximum number of KMeans iterations.
        random_state: Seed forwarded to KMeans. The default ``None`` keeps
            the previous unseeded behaviour; pass an int for output that
            is reproducible between runs.
    """
    # Hash the stopwords once here instead of rescanning a list for every
    # token of every document.
    stopwords = frozenset(stopwords)

    # Read everything up front so the input handle is closed before the
    # (potentially long) clustering step, and the path arguments are not
    # shadowed by file objects.
    with open(in_file) as source:
        documents = [preprocess(document, stopwords) for document in source]

    document_vectors = TfidfVectorizer().fit_transform(documents)
    model = KMeans(
        n_clusters=n_clusters, max_iter=max_iter, random_state=random_state)
    predictions = model.fit_predict(document_vectors)

    with open(out_file, "w") as sink:
        sink.writelines(str(prediction) + '\n' for prediction in predictions)
 def main():
    """Load the stopword list and cluster both the dev-0 and test-A inputs."""
    with open('stopwords.txt') as handle:
        # One stopword per line; strip the newline and surrounding blanks.
        stopwords = [line.strip() for line in handle]

    datasets = (
        ("dev-0/in.tsv", "dev-0/out.tsv"),
        ("test-A/in.tsv", "test-A/out.tsv"),
    )
    for source_path, target_path in datasets:
        predict(source_path, target_path, stopwords)
 # Run the pipeline only when this file is executed as a script, not on import.
 if __name__ == '__main__':
    main()
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
`@ -1,87 +1,87 @@`
	`14`
	`0`
	`4`
	`7`
	`2`
	`5`
	`22`
	`12`
	`5`
	`4`
	`15`
	`5`
	`0`
	`2`
	`8`
	`8`
	`8`
	`2`
	`9`
	`23`
	`24`
	`6`
	`13`
	`10`
	`15`
	`6`
	`11`
	`13`
	`23`
	`0`
	`22`
	`22`
	`3`
	`20`
	`8`
	`3`
	`20`
	`14`
	`11`
	`7`
	`5`
	`18`	`18`
		`11`
		`10`
		`12`
		`22`
		`5`
		`19`
		`8`
		`23`
	`1`	`1`
		`8`
		`5`
		`11`
		`16`
		`9`
		`9`
		`13`
		`22`
	`7`	`7`
	`16`	`16`
	`21`
	`22`
	`0`
	`22`
	`3`
	`15`
	`4`
	`22`
	`6`
	`17`
	`10`	`10`
	`0`	`0`
	`0`
	`22`
	`20`
	`3`
	`0`
	`1`
	`21`
	`5`
	`6`	`6`
	`22`	`15`
	`16`	`8`
		`0`
		`14`
		`6`
		`24`
		`11`
	`19`	`19`
	`22`
	`2`	`2`
		`2`
		`21`
	`9`	`9`
	`16`	`2`
	`1`	`11`
	`0`	`18`
		`14`
	`12`	`12`
	`15`	`23`
		`3`
		`2`
	`17`	`17`
	`7`	`3`
		`4`
		`2`
		`11`
		`14`
		`2`
		`8`
		`10`
		`2`
		`0`
		`3`
		`15`
		`11`
		`11`
		`19`
		`21`
		`2`
		`11`
	`2`	`2`
	`4`	`4`
	`15`	`5`
	`4`	`21`
	`2`	`2`
	`3`	`3`
	`6`	`10`
		`2`
		`22`
	`7`	`7`
		`3`
		`19`
		`11`
		`23`
		`8`
		`3`
		`17`
		`20`
		`1`
		`8`
		`1`
		`16`
		`2`
		`21`
		`22`