"""Cluster the documents of ``test-A/in.tsv`` with TF-IDF features and KMeans.

Stop words listed in ``stopwords.txt`` are removed from every document before
vectorization.  One cluster label per input document is written to
``test-A/out.tsv``.
"""

import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

STOPWORDS_PATH = "stopwords.txt"
INPUT_PATH = "test-A/in.tsv"
OUTPUT_PATH = "test-A/out.tsv"

N_CLUSTERS = 25
MAX_ITER = 1000


def preprocess(document, stopwords):
    """Return *document* with every stop word removed.

    The document is split on runs of whitespace, so leading/trailing
    whitespace (including a trailing newline) is dropped and the surviving
    tokens are re-joined with single spaces.  Matching is exact and
    case-sensitive.

    Args:
        document: The raw text of one document.
        stopwords: Any container supporting ``in``.  Pass a ``set`` or
            ``frozenset`` for constant-time lookups.

    Returns:
        The remaining tokens joined by a single space.
    """
    return " ".join(
        word for word in document.split() if word not in stopwords
    )


def main():
    """Read the inputs, cluster the documents and write the labels.

    Reads one stop word per line from ``stopwords.txt`` and one document per
    line from ``test-A/in.tsv``, then writes one KMeans cluster label per
    line to ``test-A/out.tsv`` in input order.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # locale encoding.
    with open(STOPWORDS_PATH, encoding="utf-8") as stopwords_file:
        # A set makes the per-token membership test in preprocess() O(1)
        # instead of a linear scan over the whole stop-word list.
        stopwords = {line.strip() for line in stopwords_file}

    with open(INPUT_PATH, encoding="utf-8") as in_file:
        documents = [preprocess(line, stopwords) for line in in_file]

    vectorizer = TfidfVectorizer()
    document_vectors = vectorizer.fit_transform(documents)

    # NOTE: no random_state is passed, so the centroid initialisation (and
    # therefore the label assignment) may differ from one run to the next.
    predictions = KMeans(
        n_clusters=N_CLUSTERS,
        max_iter=MAX_ITER,
    ).fit_predict(document_vectors)

    with open(OUTPUT_PATH, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{prediction}\n" for prediction in predictions)


if __name__ == "__main__":
    main()