"""Cluster the lines of ``dev-0/in.tsv`` with K-means over TF-IDF vectors.

Each line of the input file is treated as one document.  The documents are
vectorized with a default ``TfidfVectorizer``, clustered into ``N_CLUSTERS``
groups, and the cluster label of every document is written to
``dev-0/out.tsv`` -- one integer per line, in input order.
"""
import sklearn.metrics
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

INPUT_PATH = "dev-0/in.tsv"
OUTPUT_PATH = "dev-0/out.tsv"
N_CLUSTERS = 77
RANDOM_STATE = 0  # fixed seed so repeated runs yield the same clustering

# One document per input line.
with open(INPUT_PATH) as in_file:
    cnt = in_file.readlines()

# Default TF-IDF settings.  An earlier configuration
# (ngram_range=(1, 2), use_idf=False) was assigned and then immediately
# overwritten, so it never took effect and has been dropped.
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(cnt)

# fit_predict fits the model and returns the labels in a single pass;
# calling fit() first and fit_predict() afterwards only fitted twice.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE)
pred_myFile = kmeans.fit_predict(document_vectors)

# Write exactly one integer label per line.  np.array2string must not be used
# here: it emits the array's display form (enclosing brackets, padding, and a
# "..." summary for arrays longer than the print threshold), which corrupts
# the output.  savetxt also overwrites the file instead of appending, so
# re-running the script does not accumulate stale results, and it closes the
# file handle when done.
np.savetxt(OUTPUT_PATH, pred_myFile, fmt="%d")