import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


def main(in_path="test-A/in.tsv", out_path="test-A/out.tsv", n_clusters=45):
    """Cluster the documents in ``in_path`` and write one label per line.

    Every line of the input file is treated as one document. The documents
    are vectorized with a default-configured ``TfidfVectorizer`` and grouped
    with ``KMeans``; the resulting cluster index of each document is written
    to ``out_path`` in the same order as the input lines.

    Args:
        in_path: Path of the text file to read, one document per line.
        out_path: Path of the file that receives the cluster labels.
        n_clusters: Number of clusters passed to ``KMeans``.

    Note:
        ``KMeans`` is created without a fixed ``random_state``, so labels may
        differ between runs.
    """
    # Explicit encoding keeps decoding independent of the platform locale.
    with open(in_path, encoding="utf-8") as in_file:
        documents = in_file.readlines()

    # Only the default vectorizer was ever used; the earlier n-gram
    # configuration was overwritten before use and has been dropped.
    document_vectors = TfidfVectorizer().fit_transform(documents)
    predictions = KMeans(n_clusters=n_clusters).fit_predict(document_vectors)

    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{prediction}\n" for prediction in predictions)


if __name__ == '__main__':
    main()