Improve the result by around 0.1 geval score by removing Polish stopwords

2021-04-15 18:29:53 +02:00 · 2021-04-15 18:29:53 +02:00 · 83e6d37f53
commit 83e6d37f53
parent 31a45be3f8
5 changed files with 1078 additions and 708 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
@ -1,87 +1,87 @@
 31
 33
 30
 12
 32
 19
 41
 15
 1
 11
 15
 19
 7
 9
 5
 42
 20
 36
 4
 24
 21
 40
 2
 0
 25
 23
 10
 2
 24
 44
 6
 43
 26
 9
 5
 3
 35
 22
 10
 12
 1
 27
 41
 29
-8
+35
-28
+32
 3
 7
 15
 6
 13
 30
 6
 23
 18
-0
+20
 7
 7
 6
 17
 3
-34
+42
-15
+4
-37
+1
-19
+17
 20
 16
 44
 13
 13
 41
 36
 10
 0
 27
 28
 8
 14
 25
 19
 2
 8
 38
-3
+40
 34
 34
 15
 24
 13
 9
 39
 5
 2
 32
 4
-8
+33
-14
+15
-7
+21
 30
 37
 9
 11
 26
 15
 25
-18
+35
-29
+34
-39
+19
-11
+6
 15
 11
 24
 14
 16
 16
 34
 7
 15
 12
 34
 31
 20
 7
 34
 30
 35
 9
 18
 10
 30
 3
 43
 42
 25
 6
 21
 22
-24
+1
 17
 1
 0
 15
 7
 23
--- a/solution-dev0.py
+++ b/solution-dev0.py
@ -4,15 +4,25 @@ import sklearn.metrics
 from sklearn.cluster import KMeans
 def preprocess(document, stopwords):
     """Return *document* with every stopword token removed.

     The document is split on whitespace, tokens that exactly match an entry
     of *stopwords* are dropped, and the remaining tokens are re-joined with
     single spaces. As a consequence, runs of whitespace and a trailing
     newline are collapsed. Matching is exact and case-sensitive.

     Args:
         document: Text to filter.
         stopwords: Collection of tokens to drop.

     Returns:
         The filtered text as one space-separated string.
     """
     # Membership tests on a list/tuple scan the whole sequence for every
     # token; build a hash-based lookup once so each test is O(1).
     if isinstance(stopwords, (list, tuple)):
         stopwords = frozenset(stopwords)
     return " ".join(word for word in document.split() if word not in stopwords)
 def main():
    with open('stopwords.txt') as stopwords_file:
        stopwords = [stopword.strip()
                     for stopword in stopwords_file.readlines()]
    with open("dev-0/in.tsv") as in_file:
-        documents = in_file.readlines()
+        documents = [preprocess(document, stopwords)
                     for document in in_file.readlines()]
        vectorizer = TfidfVectorizer(ngram_range=(1, 3), use_idf=False)
        vectorizer = TfidfVectorizer()
        document_vectors = vectorizer.fit_transform(documents)
-        predictions = KMeans(n_clusters=45).fit_predict(document_vectors)        
+        predictions = KMeans(
            n_clusters=45, max_iter=1000).fit_predict(document_vectors)
        with open("dev-0/out.tsv", "w") as out_file:
            for prediction in predictions:
--- a/solution-testA.py
+++ b/solution-testA.py
@ -4,15 +4,25 @@ import sklearn.metrics
 from sklearn.cluster import KMeans
 def preprocess(document, stopwords):
     """Return *document* with every stopword token removed.

     The document is split on whitespace, tokens that exactly match an entry
     of *stopwords* are dropped, and the remaining tokens are re-joined with
     single spaces. As a consequence, runs of whitespace and a trailing
     newline are collapsed. Matching is exact and case-sensitive.

     Args:
         document: Text to filter.
         stopwords: Collection of tokens to drop.

     Returns:
         The filtered text as one space-separated string.
     """
     # Membership tests on a list/tuple scan the whole sequence for every
     # token; build a hash-based lookup once so each test is O(1).
     if isinstance(stopwords, (list, tuple)):
         stopwords = frozenset(stopwords)
     return " ".join(word for word in document.split() if word not in stopwords)
 def main():
    with open('stopwords.txt') as stopwords_file:
        stopwords = [stopword.strip()
                     for stopword in stopwords_file.readlines()]
    with open("test-A/in.tsv") as in_file:
-        documents = in_file.readlines()
+        documents = [preprocess(document, stopwords)
                     for document in in_file.readlines()]
        vectorizer = TfidfVectorizer(ngram_range=(1, 3), use_idf=False)
        vectorizer = TfidfVectorizer()
        document_vectors = vectorizer.fit_transform(documents)
-        predictions = KMeans(n_clusters=45).fit_predict(document_vectors)        
+        predictions = KMeans(
            n_clusters=45, max_iter=1000).fit_predict(document_vectors)
        with open("test-A/out.tsv", "w") as out_file:
            for prediction in predictions:
--- a/stopwords.txt
+++ b/stopwords.txt
@ -0,0 +1,350 @@
 a
 aby
 ach
 acz
 aczkolwiek
 aj
 albo
 ale
 alez
 ależ
 ani
 az
 aż
 bardziej
 bardzo
 beda
 bedzie
 bez
 deda
 będą
 bede
 będę
 będzie
 bo
 bowiem
 by
 byc
 być
 byl
 byla
 byli
 bylo
 byly
 był
 była
 było
 były
 bynajmniej
 cala
 cali
 caly
 cała
 cały
 ci
 cie
 ciebie
 cię
 co
 cokolwiek
 cos
 coś
 czasami
 czasem
 czemu
 czy
 czyli
 daleko
 dla
 dlaczego
 dlatego
 do
 dobrze
 dokad
 dokąd
 dosc
 dość
 duzo
 dużo
 dwa
 dwaj
 dwie
 dwoje
 dzis
 dzisiaj
 dziś
 gdy
 gdyby
 gdyz
 gdyż
 gdzie
 gdziekolwiek
 gdzies
 gdzieś
 go
 i
 ich
 ile
 im
 inna
 inne
 inny
 innych
 iz
 iż
 ja
 jak
 jakas
 jakaś
 jakby
 jaki
 jakichs
 jakichś
 jakie
 jakis
 jakiś
 jakiz
 jakiż
 jakkolwiek
 jako
 jakos
 jakoś
 ją
 je
 jeden
 jedna
 jednak
 jednakze
 jednakże
 jedno
 jego
 jej
 jemu
 jesli
 jest
 jestem
 jeszcze
 jeśli
 jezeli
 jeżeli
 juz
 już
 kazdy
 każdy
 kiedy
 kilka
 kims
 kimś
 kto
 ktokolwiek
 ktora
 ktore
 ktorego
 ktorej
 ktory
 ktorych
 ktorym
 ktorzy
 ktos
 ktoś
 która
 które
 którego
 której
 który
 których
 którym
 którzy
 ku
 lat
 lecz
 lub
 ma
 mają
 mało
 mam
 mi
 miedzy
 między
 mimo
 mna
 mną
 mnie
 moga
 mogą
 moi
 moim
 moj
 moja
 moje
 moze
 mozliwe
 mozna
 może
 możliwe
 można
 mój
 mu
 musi
 my
 na
 nad
 nam
 nami
 nas
 nasi
 nasz
 nasza
 nasze
 naszego
 naszych
 natomiast
 natychmiast
 nawet
 nia
 nią
 nic
 nich
 nie
 niech
 niego
 niej
 niemu
 nigdy
 nim
 nimi
 niz
 niż
 no
 o
 obok
 od
 około
 on
 ona
 one
 oni
 ono
 oraz
 oto
 owszem
 pan
 pana
 pani
 po
 pod
 podczas
 pomimo
 ponad
 poniewaz
 ponieważ
 powinien
 powinna
 powinni
 powinno
 poza
 prawie
 przeciez
 przecież
 przed
 przede
 przedtem
 przez
 przy
 roku
 rowniez
 również
 sam
 sama
 są
 sie
 się
 skad
 skąd
 soba
 sobą
 sobie
 sposob
 sposób
 swoje
 ta
 tak
 taka
 taki
 takie
 takze
 także
 tam
 te
 tego
 tej
 ten
 teraz
 też
 to
 toba
 tobą
 tobie
 totez
 toteż
 totobą
 trzeba
 tu
 tutaj
 twoi
 twoim
 twoj
 twoja
 twoje
 twój
 twym
 ty
 tych
 tylko
 tym
 u
 w
 wam
 wami
 was
 wasz
 wasza
 wasze
 we
 według
 wiele
 wielu
 więc
 więcej
 wlasnie
 właśnie
 wszyscy
 wszystkich
 wszystkie
 wszystkim
 wszystko
 wtedy
 wy
 z
 za
 zaden
 zadna
 zadne
 zadnych
 zapewne
 zawsze
 ze
 zeby
 zeznowu
 zł
 znow
 znowu
 znów
 zostal
 został
 żaden
 żadna
 żadne
 żadnych
 że
 żeby
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
`@ -1,87 +1,87 @@`
	`31`
	`33`
	`30`
	`12`
	`32`
	`19`
	`41`
	`15`
	`1`
	`11`
	`15`
	`19`
	`7`
	`9`
	`5`	`5`
	`42`
	`20`
	`36`
	`4`
	`24`
	`21`
	`40`
	`2`
	`0`
	`25`
	`23`
	`10`
	`2`
	`24`
	`44`
	`6`
	`43`
	`26`
	`9`
	`5`
	`3`
	`35`
	`22`
	`10`
	`12`
	`1`
	`27`
	`41`
	`29`	`29`
	`8`	`35`
	`28`	`32`
	`3`
	`7`
	`15`
	`6`
	`13`
	`30`
	`6`
	`23`
	`18`	`18`
	`0`	`20`
	`7`
	`7`
	`6`
	`17`
	`3`	`3`
	`34`	`42`
	`15`	`4`
	`37`	`1`
	`19`	`17`
		`20`
	`16`	`16`
		`44`
		`13`
		`13`
		`41`
		`36`
		`10`
		`0`
		`27`
		`28`
		`8`
	`14`	`14`
		`25`
		`19`
		`2`
	`8`	`8`
	`38`	`38`
	`3`	`40`
		`34`
		`34`
		`15`
		`24`
		`13`
		`9`
		`39`
		`5`
		`2`
	`32`	`32`
	`4`	`4`
	`8`	`33`
	`14`	`15`
	`7`	`21`
		`30`
		`37`
		`9`
		`11`
		`26`
	`15`	`15`
	`25`	`25`
	`18`	`35`
	`29`	`34`
	`39`	`19`
	`11`	`6`
	`15`
	`11`
	`24`
	`14`	`14`
		`16`
		`16`
		`34`
		`7`
		`15`
		`12`
		`34`
		`31`
		`20`
		`7`
		`34`
		`30`
		`35`
		`9`
		`18`
		`10`
		`30`
		`3`
		`43`
		`42`
		`25`
		`6`
		`21`
	`22`	`22`
	`24`	`1`
		`17`
		`1`
		`0`
		`15`
		`7`
		`23`