Improve the results to 0.8ish

This commit is contained in:
Aleksy Wroblewski 2021-04-17 14:23:23 +02:00
parent 83e6d37f53
commit 9728e579d4
4 changed files with 717 additions and 719 deletions

View File

@ -1,87 +1,87 @@
5
29
35
32
18
20
3
42
4
1
17
20
16
44
13
13
41
36
10
14
0
27
28
8
14
25
19
2
8
38
40
34
34
15
24
13
9
39
5
2
32
4
33
15
21
30
37
9
11
26
15
25
35
34
19
6
14
16
16
34
7
15
2
5
22
12
34
31
20
7
34
30
35
5
4
15
5
0
2
8
8
8
2
9
18
10
30
3
43
42
25
23
24
6
13
10
15
6
11
13
23
0
22
22
3
20
8
3
20
14
11
7
5
18
1
7
16
21
22
1
0
22
3
15
4
22
6
17
10
0
0
22
20
3
0
1
21
5
6
22
16
19
22
2
9
16
1
0
12
15
17
7
2
4
15
4
2
3
6
7
23

1 5 14
29
35
32
18
20
3
42
4
1
17
20
16
44
13
13
41
36
10
2 0 0
27
28
8
14
25
19
2
8
38
40
34
34
15
24
13
9
39
5
2
32
3 4 4
33
15
21
30
37
9
11
26
15
25
35
34
19
6
14
16
16
34
4 7 7
5 15 2
6 5
7 22
8 12 12
9 34 5
10 31 4
11 20 15
12 7 5
13 34 0
14 30 2
15 35 8
16 8
17 8
18 2
19 9 9
20 18 23
21 10 24
30
3
43
42
25
22 6 6
23 13
24 10
25 15
26 6
27 11
28 13
29 23
30 0
31 22
32 22
33 3
34 20
35 8
36 3
37 20
38 14
39 11
40 7
41 5
42 18
43 1
44 7
45 16
46 21 21
47 22 22
48 1 0
49 22
50 3
51 15
52 4
53 22
54 6
55 17 17
56 10
57 0
58 0
59 22
60 20
61 3
62 0
63 1
64 21
65 5
66 6
67 22
68 16
69 19
70 22
71 2
72 9
73 16
74 1 1
75 0 0
76 12
77 15 15
78 17
79 7
80 2
81 4
82 15
83 4
84 2
85 3
86 6
87 7 7
23

View File

@ -17,12 +17,11 @@ def main():
documents = [preprocess(document, stopwords)
for document in in_file.readlines()]
vectorizer = TfidfVectorizer(ngram_range=(1, 3), use_idf=False)
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(documents)
predictions = KMeans(
n_clusters=45, max_iter=1000).fit_predict(document_vectors)
n_clusters=25, max_iter=1000).fit_predict(document_vectors)
with open("dev-0/out.tsv", "w") as out_file:
for prediction in predictions:

View File

@ -17,12 +17,11 @@ def main():
documents = [preprocess(document, stopwords)
for document in in_file.readlines()]
vectorizer = TfidfVectorizer(ngram_range=(1, 3), use_idf=False)
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(documents)
predictions = KMeans(
n_clusters=45, max_iter=1000).fit_predict(document_vectors)
n_clusters=25, max_iter=1000).fit_predict(document_vectors)
with open("test-A/out.tsv", "w") as out_file:
for prediction in predictions:

File diff suppressed because it is too large Load Diff