38 lines
1.1 KiB
Python
38 lines
1.1 KiB
Python
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
import numpy as np
|
|
import sklearn.metrics
|
|
from sklearn.cluster import KMeans
|
|
|
|
|
|
def preprocess(document, stopwords):
    """Return *document* with every stopword token removed.

    The text is split on runs of whitespace, tokens contained in
    ``stopwords`` are discarded, and the surviving tokens are joined back
    together with single spaces. Original spacing and line endings are
    therefore not preserved. Matching is exact (case-sensitive, no
    punctuation stripping).
    """
    kept_tokens = []
    for token in document.split():
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)
|
|
|
|
def predict(in_file, out_file, stopwords, *, n_clusters=25, max_iter=1000,
            random_state=None):
    """Cluster the documents in *in_file* and write one label per line.

    Each line of the input file is treated as one document. Stopwords are
    removed with ``preprocess``, the documents are turned into TF-IDF
    vectors, and KMeans assigns every document a cluster index. The
    indices are written to *out_file* in input order, one per line.

    Args:
        in_file: Path of the text file to read (one document per line).
        out_file: Path of the file to write; it is overwritten.
        stopwords: Collection of tokens to drop from every document.
        n_clusters: Number of KMeans clusters (default 25).
        max_iter: Maximum number of KMeans iterations (default 1000).
        random_state: Seed forwarded to KMeans. The default ``None`` keeps
            the original unseeded behaviour, so labels may differ between
            runs; pass an int for reproducible output.

    Both files are opened with the platform's default text encoding.
    """
    # Use separate names for the handles so the path arguments are not
    # rebound to file objects halfway through the function.
    with open(in_file) as source:
        documents = [preprocess(line, stopwords) for line in source]

    document_vectors = TfidfVectorizer().fit_transform(documents)

    model = KMeans(
        n_clusters=n_clusters, max_iter=max_iter, random_state=random_state)
    predictions = model.fit_predict(document_vectors)

    with open(out_file, "w") as sink:
        sink.writelines(str(label) + '\n' for label in predictions)
|
|
|
|
|
|
def main():
    """Load the stopword list and cluster the dev-0 and test-A inputs.

    Reads ``stopwords.txt`` (one stopword per line) and runs ``predict``
    on ``dev-0/in.tsv`` and ``test-A/in.tsv``, writing ``out.tsv`` next to
    each input. All paths are relative to the current working directory.
    """
    with open('stopwords.txt') as stopwords_file:
        # A set instead of a list: every token of every document is tested
        # for membership, so constant-time lookup avoids a linear scan of
        # the whole stopword list per token. Filtering results are the same.
        stopwords = {stopword.strip() for stopword in stopwords_file}

    predict("dev-0/in.tsv", "dev-0/out.tsv", stopwords)
    predict("test-A/in.tsv", "test-A/out.tsv", stopwords)
|
|
|
|
|
|
# Run the clustering pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|