24 lines
685 B
Python
24 lines
685 B
Python
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
|
import numpy as np
|
||
|
import sklearn.metrics
|
||
|
from sklearn.cluster import KMeans
|
||
|
|
||
|
|
||
|
def main(in_path="test-A/in.tsv", out_path="test-A/out.tsv", n_clusters=45):
    """Cluster the lines of a text file and write one cluster label per line.

    Every line of ``in_path`` is treated as a separate document. The
    documents are converted to TF-IDF vectors with scikit-learn's default
    ``TfidfVectorizer`` settings, grouped with ``KMeans`` and the resulting
    integer labels are written to ``out_path`` in input order, one per line.

    Args:
        in_path: Path of the input file; one document per line.
        out_path: Path of the output file; it is overwritten.
        n_clusters: Number of clusters requested from ``KMeans``.
    """
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(in_path, encoding="utf-8") as in_file:
        documents = in_file.readlines()

    # A single vectorizer with default settings. An earlier
    # TfidfVectorizer(ngram_range=(1, 3), use_idf=False) was constructed and
    # immediately overwritten, so it never influenced the result.
    vectorizer = TfidfVectorizer()
    document_vectors = vectorizer.fit_transform(documents)

    # NOTE(review): no random_state is passed, so the cluster labels may
    # differ between runs. Pass one here if reproducible output is required.
    predictions = KMeans(n_clusters=n_clusters).fit_predict(document_vectors)

    with open(out_path, "w", encoding="utf-8") as out_file:
        # One batched write instead of one write() call per label.
        out_file.writelines(f"{prediction}\n" for prediction in predictions)
|
||
|
|
||
|
|
||
|
# Run the clustering pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|