42 lines
1.6 KiB
Python
42 lines
1.6 KiB
Python
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|||
|
import numpy as np
|
|||
|
import sklearn.metrics
|
|||
|
from sklearn.cluster import KMeans
|
|||
|
|
|||
|
# Stop-word list from https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
|
|||
|
def remove_stop_words(document, stopwords):
    """Return *document* with every whitespace-separated token found in *stopwords* removed.

    Tokens are produced by ``str.split()`` and re-joined with single spaces,
    so runs of whitespace (including trailing newlines) are normalized.

    Args:
        document: the text to filter.
        stopwords: iterable of words to drop (e.g. the list from load_stop_words).

    Returns:
        str: the filtered document.
    """
    # Build a set once so each membership test is O(1) instead of scanning
    # the stop-word list for every token of every document.
    stopword_set = set(stopwords)
    return " ".join(word for word in document.split() if word not in stopword_set)
|
|||
|
|
|||
|
def write_predictions_to_file(predictions, file):
    """Write *predictions* to *file*, one per line, overwriting any existing content.

    Each prediction is converted with ``str()`` before being written.
    """
    lines = (str(prediction) + '\n' for prediction in predictions)
    with open(file, "w") as out_file:
        out_file.writelines(lines)
|
|||
|
|
|||
|
def load_stop_words(path='stopwords.txt'):
    """Load stop words, one per line, from *path* (default: ``stopwords.txt``).

    Surrounding whitespace is stripped from each line so that trailing
    newlines or spaces in the file do not prevent later matches.

    Args:
        path: location of the stop-word file; defaults to the original
            hard-coded filename, so existing callers are unaffected.

    Returns:
        list[str]: the stop words in file order (blank lines yield "").
    """
    with open(path) as stopwords_file:
        return [line.strip() for line in stopwords_file]
|
|||
|
|
|||
|
def load_tsv_file(file):
    """Read *file* line by line and return each line with stop words removed.

    The stop-word list is loaded once via load_stop_words() and applied to
    every line with remove_stop_words().
    """
    stopwords = load_stop_words()
    with open(file) as in_file:
        return [remove_stop_words(line, stopwords) for line in in_file]
|
|||
|
|
|||
|
def prepare_prediction(catalog, n_clusters=20, max_iter=1000):
    """Cluster the documents in ``<catalog>/in.tsv`` and write each document's
    cluster label, one per line, to ``<catalog>/out.tsv``.

    Args:
        catalog: directory containing ``in.tsv`` and receiving ``out.tsv``.
        n_clusters: number of KMeans clusters; default 20 preserves the
            original hard-coded value (see tuning note below).
        max_iter: maximum KMeans iterations; default 1000 as before.
    """
    documents = load_tsv_file(catalog + "/in.tsv")
    # TF-IDF turns each document into a sparse weighted term vector.
    document_vectors = TfidfVectorizer().fit_transform(documents)

    # With n_clusters around 20 I obtained scores of about 0.85 (per Geval).
    # However, with n_clusters = 50 the results were similar to each other on
    # every run; with n = 20 they fluctuate between 0.7 and 0.85.
    predictions = KMeans(n_clusters=n_clusters, max_iter=max_iter).fit_predict(document_vectors)
    write_predictions_to_file(predictions, catalog + "/out.tsv")
|
|||
|
|
|||
|
# Script entry point: produce clusterings for both evaluation splits.
# Guarded so importing this module does not trigger the (slow) clustering.
if __name__ == "__main__":
    prepare_prediction("dev-0")
    prepare_prediction("test-A")
|