"""Cluster TSV documents into 20 groups using TF-IDF features and KMeans.

Reads newline-separated stopwords from ./stopwords.txt, removes them from
each input document, vectorizes with TF-IDF, clusters with KMeans (k=20),
and writes one cluster label per line. Runs the same pipeline on both the
./dev-0 and ./test-A splits.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans


def load_stopwords(path):
    """Return the set of stopwords, one per line in *path*.

    A set (not a list) so per-word membership tests below are O(1).
    """
    with open(path) as file:
        return {line.strip() for line in file}


def remove_stopwords(documents, stopwords):
    """Return *documents* with every word found in *stopwords* removed."""
    return [
        " ".join(word for word in doc.split() if word not in stopwords)
        for doc in documents
    ]


def cluster_split(in_path, out_path, stopwords, n_clusters=20, max_iter=3000):
    """Run the stopword-strip -> TF-IDF -> KMeans pipeline on one split.

    Reads documents (one per line) from *in_path* and writes one predicted
    cluster label per line to *out_path*. A fresh vectorizer and KMeans
    model are fitted per split, matching the original behavior.
    """
    with open(in_path) as source:
        documents = remove_stopwords(source.readlines(), stopwords)
    doc_vectors = TfidfVectorizer().fit_transform(documents)
    pred = KMeans(n_clusters=n_clusters, max_iter=max_iter).fit_predict(doc_vectors)
    with open(out_path, "w") as result:
        for prediction in pred:
            result.write(str(prediction) + '\n')


if __name__ == "__main__":
    stopwords = load_stopwords('./stopwords.txt')
    # NOTE: with these settings the score approached 0.8; the average over
    # 10 runs was 0.76 and the result never dropped below 0.74.
    cluster_split("./dev-0/in.tsv", "./dev-0/out.tsv", stopwords)
    cluster_split("./test-A/in.tsv", "./test-A/out.tsv", stopwords)