2021-04-13 00:35:55 +02:00
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
import numpy as np
|
|
|
|
import sklearn.metrics
|
|
|
|
from sklearn.cluster import KMeans
|
|
|
|
|
2021-04-13 11:19:34 +02:00
|
|
|
# Cluster the documents in test-A/in.tsv into 77 groups using k-means over
# TF-IDF features, then write one cluster label per input line to
# test-A/out.tsv.

# Every line of the input file is treated as one document.
# NOTE(review): the encoding is pinned so the result does not depend on the
# platform locale; UTF-8 is assumed here -- confirm against the dataset.
with open("test-A/in.tsv", encoding="utf-8") as f:
    content = f.readlines()

# Default TF-IDF settings. A previous bigram / no-IDF configuration used to
# be assigned to this name and was overwritten on the very next statement,
# so it never had any effect and has been removed.
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(content)

# Fit exactly once: fit_predict fits the model and returns the cluster label
# of every document (the same values are available as kmeans.labels_).
# random_state is fixed so that repeated runs give the same clustering.
kmeans = KMeans(n_clusters=77, random_state=0)
pred_y = kmeans.fit_predict(document_vectors)

# Write one integer label per line, in input order. The labels are formatted
# explicitly instead of going through np.array2string, which wraps the output
# in brackets, pads elements for alignment and abbreviates long arrays with
# "...". The file is opened in "w" mode inside a context manager so that it
# is flushed and closed, and so that re-running the script replaces the
# previous predictions rather than appending a second copy after them.
with open("test-A/out.tsv", "w", encoding="utf-8") as f:
    f.writelines(f"{label}\n" for label in pred_y)
|