"""Cluster the documents of a line-oriented text file with TF-IDF + KMeans."""

from typing import Container, Iterable, Optional

# NOTE: numpy and sklearn.metrics are not referenced below; they are kept so
# the module's import surface stays exactly as it was.
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


def preprocess(document: str, stopwords: Container[str]) -> str:
    """Return *document* with every stopword removed.

    The document is split on runs of whitespace, tokens found in
    *stopwords* are dropped, and the remaining tokens are re-joined with a
    single space. Matching is exact (case-sensitive, no punctuation
    stripping).

    Args:
        document: Raw text of one document.
        stopwords: Container of tokens to drop; a ``set`` gives O(1) lookups.

    Returns:
        The filtered text, or an empty string when nothing is left.
    """
    return " ".join(word for word in document.split() if word not in stopwords)


def predict(
    in_file: str,
    out_file: str,
    stopwords: Iterable[str],
    *,
    n_clusters: int = 25,
    max_iter: int = 1000,
    random_state: Optional[int] = None,
    encoding: str = "utf-8",
) -> None:
    """Cluster the documents in *in_file* and write one label per line.

    Every line of *in_file* is treated as one document. Documents are
    stripped of stopwords, vectorised with ``TfidfVectorizer`` and clustered
    with ``KMeans``. The cluster index of each document is written to
    *out_file* in input order, one per line.

    Args:
        in_file: Path of the input file (one document per line).
        out_file: Path of the output file; overwritten if it exists.
        stopwords: Tokens to remove from every document before vectorising.
        n_clusters: Number of clusters passed to ``KMeans``.
        max_iter: Iteration cap passed to ``KMeans``.
        random_state: Seed passed to ``KMeans``; ``None`` (the default)
            leaves the estimator's own default seeding in place.
        encoding: Text encoding used for both files.
    """
    # Build the lookup structure once so each membership test is O(1)
    # regardless of the container type the caller handed in.
    stopword_set = frozenset(stopwords)

    # Distinct handle names: the path parameters are no longer rebound.
    with open(in_file, encoding=encoding) as source:
        documents = [preprocess(line, stopword_set) for line in source]

    document_vectors = TfidfVectorizer().fit_transform(documents)
    predictions = KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        random_state=random_state,
    ).fit_predict(document_vectors)

    with open(out_file, "w", encoding=encoding) as sink:
        sink.writelines(str(prediction) + "\n" for prediction in predictions)


def main() -> None:
    """Load ``stopwords.txt`` and cluster the dev-0 and test-A inputs."""
    with open("stopwords.txt", encoding="utf-8") as stopwords_file:
        # A set removes duplicates and gives constant-time lookups.
        stopwords = {line.strip() for line in stopwords_file}

    predict("dev-0/in.tsv", "dev-0/out.tsv", stopwords)
    predict("test-A/in.tsv", "test-A/out.tsv", stopwords)


if __name__ == '__main__':
    main()