from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans

# Stop-word list taken from
# https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt


def remove_stop_words(document, stopwords):
    """Return *document* with every whitespace-separated stop word removed.

    The document is split on arbitrary whitespace and the surviving tokens
    are re-joined with single spaces, so tabs and the trailing newline of
    the input line are not preserved.

    Args:
        document: Text of a single document.
        stopwords: Container of words to drop; only ``in`` is used on it,
            so a ``set``/``frozenset`` gives the fastest lookups.

    Returns:
        The filtered document as a single space-separated string.
    """
    return " ".join(word for word in document.split() if word not in stopwords)


def write_predictions_to_file(predictions, file):
    """Write one prediction per line to the file at path *file*.

    Args:
        predictions: Iterable of predictions; each is formatted as ``str``.
        file: Path of the output file (overwritten if it exists).
    """
    with open(file, "w") as out_file:
        out_file.writelines(str(prediction) + '\n' for prediction in predictions)


def load_stop_words():
    """Load the stop words from ``stopwords.txt`` in the working directory.

    Returns:
        A list with one entry per line of the file, each stripped of
        surrounding whitespace.
    """
    # Explicit UTF-8 so Polish diacritics are decoded the same way on every
    # platform instead of depending on the locale's default encoding.
    # NOTE(review): assumes the list is stored as UTF-8 -- confirm.
    with open('stopwords.txt', encoding="utf-8") as stopwords_file:
        # Strip whitespace (including the newline) so that the words compare
        # equal to the tokens produced by ``str.split``.
        return [stopword.strip() for stopword in stopwords_file]


def load_tsv_file(file):
    """Read *file* line by line and strip stop words from every line.

    Args:
        file: Path to the input file; every line is treated as one document.

    Returns:
        A list of documents (strings) with stop words removed.
    """
    # A frozenset makes each membership test O(1); the lookup happens once
    # for every token of every document.
    stopwords = frozenset(load_stop_words())
    # NOTE(review): assumes the input is UTF-8, like the stop-word list.
    with open(file, encoding="utf-8") as in_file:
        return [remove_stop_words(document, stopwords) for document in in_file]


def prepare_prediction(catalog, n_clusters=20, max_iter=1000):
    """Cluster the documents in ``<catalog>/in.tsv`` into ``<catalog>/out.tsv``.

    Documents are vectorised with TF-IDF and clustered with k-means; the
    cluster index of each document is written on its own line.

    Args:
        catalog: Directory containing ``in.tsv``; ``out.tsv`` is written there.
        n_clusters: Number of k-means clusters.
        max_iter: Maximum number of k-means iterations.
    """
    documents = load_tsv_file(catalog + "/in.tsv")
    document_vectors = TfidfVectorizer().fit_transform(documents)

    # With n_clusters around 20 I was getting scores of about 0.85 (according
    # to GEval). However, with n_clusters = 50 the results were close to each
    # other on every run, whereas with n = 20 they fluctuate between 0.7 and
    # 0.85.
    predictions = KMeans(n_clusters=n_clusters, max_iter=max_iter).fit_predict(document_vectors)
    write_predictions_to_file(predictions, catalog + "/out.tsv")


# Runs at import time as well as when executed as a script.
prepare_prediction("dev-0")
prepare_prediction("test-A")