Merge two scripts into one

This commit is contained in:
Aleksy Wroblewski 2021-04-17 14:29:51 +02:00
parent 9728e579d4
commit 4dffa4ee0a
3 changed files with 717 additions and 680 deletions

View File

@ -1,87 +1,87 @@
14
0
4
7
2
5
22
12
5
4
15
5
0
2
8
8
8
2
9
23
24
6
13
10
15
6
11
13
23
0
22
22
3
20
8
3
20
14
11
7
5
18
11
10
12
22
5
19
8
23
1
8
5
11
16
9
9
13
22
7
16
21
22
0
22
3
15
4
22
6
17
10
0
0
22
20
3
0
1
21
5
6
22
16
15
8
0
14
6
24
11
19
22
2
2
21
9
16
1
0
2
11
18
14
12
15
23
3
2
17
7
3
4
2
11
14
2
8
10
2
0
3
15
11
11
19
21
2
11
2
4
15
4
5
21
2
3
6
10
2
22
7
3
19
11
23
8
3
17
20
1
8
1
16
2
21
22

1 14 18
14
0
4
7
2
5
22
12
5
4
15
5
0
2
8
8
8
2
9
23
24
6
13
10
15
6
11
13
23
0
22
22
3
20
8
3
20
14
11
7
5
1 18 18
2 11
3 10
4 12
5 22
6 5
7 19
8 8
9 23
10 1 1
11 8
12 5
13 11
14 16
15 9
16 9
17 13
18 22
19 7 7
20 16 16
21
22
0
22
3
15
4
22
6
17
21 10 10
22 0 0
0
22
20
3
0
1
21
5
23 6 6
24 22 15
25 16 8
26 0
27 14
28 6
29 24
30 11
31 19 19
22
32 2 2
33 2
34 21
35 9 9
36 16 2
37 1 11
38 0 18
39 14
40 12 12
41 15 23
42 3
43 2
44 17 17
45 7 3
46 4
47 2
48 11
49 14
50 2
51 8
52 10
53 2
54 0
55 3
56 15
57 11
58 11
59 19
60 21
61 2
62 11
63 2 2
64 4 4
65 15 5
66 4 21
67 2 2
68 3 3
69 6 10
70 2
71 22
72 7 7
73 3
74 19
75 11
76 23
77 8
78 3
79 17
80 20
81 1
82 8
83 1
84 16
85 2
86 21
87 22

37
solution.py Normal file
View File

@ -0,0 +1,37 @@
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans
def preprocess(document, stopwords):
    """Return *document* with every stopword removed.

    The document is split on whitespace, words found in *stopwords* are
    dropped (the comparison is exact and case-sensitive), and the remaining
    words are joined back together with single spaces.

    Args:
        document: Text to filter.
        stopwords: Container of words to drop. Any object supporting ``in``
            is accepted.

    Returns:
        The filtered text; an empty string when no word survives.
    """
    # A list/tuple makes every ``in`` test a linear scan. Build a hash set
    # once per call instead. Other containers (sets, dicts, strings, ...) are
    # used as given so their own membership semantics are untouched.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    return " ".join(word for word in document.split() if word not in stopwords)
def predict(in_file, out_file, stopwords, n_clusters=25, max_iter=1000,
            random_state=None):
    """Cluster the documents in *in_file* and write one label per line.

    Every line of *in_file* is treated as one document. Stopwords are removed
    with ``preprocess``, the documents are converted to TF-IDF vectors, and
    the vectors are clustered with k-means. The cluster index predicted for
    each document is written to *out_file*, one per line.

    Args:
        in_file: Path of the text file to read, one document per line.
        out_file: Path of the file to write; existing content is overwritten.
        stopwords: Container of words removed from every document.
        n_clusters: Number of clusters passed to ``KMeans`` (default 25).
        max_iter: Iteration limit passed to ``KMeans`` (default 1000).
        random_state: Seed passed to ``KMeans``. The default ``None`` leaves
            the run unseeded, so results may differ between runs; pass an
            int for reproducible output.
    """
    # NOTE(review): both files are opened with the platform default
    # encoding -- confirm whether the data requires an explicit one.
    # Separate handle names keep the path parameters from being rebound.
    with open(in_file) as source:
        documents = [preprocess(line, stopwords) for line in source]
    document_vectors = TfidfVectorizer().fit_transform(documents)
    predictions = KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        random_state=random_state,
    ).fit_predict(document_vectors)
    with open(out_file, "w") as sink:
        sink.writelines(str(label) + '\n' for label in predictions)
def main():
    """Load the stopword list and cluster the dev-0 and test-A inputs.

    Reads ``stopwords.txt`` (one stopword per line) from the current working
    directory, then runs ``predict`` on ``dev-0/in.tsv`` and ``test-A/in.tsv``,
    writing the results next to each input as ``out.tsv``.
    """
    with open('stopwords.txt') as stopwords_file:
        # A set makes each stopword lookup O(1); the words are only ever
        # used for membership tests, so order and duplicates do not matter.
        stopwords = {line.strip() for line in stopwords_file}
    predict("dev-0/in.tsv", "dev-0/out.tsv", stopwords)
    predict("test-A/in.tsv", "test-A/out.tsv", stopwords)


if __name__ == '__main__':
    main()

File diff suppressed because it is too large Load Diff