18 lines
669 B
Python
18 lines
669 B
Python
|
import sklearn.metrics
|
||
|
from sklearn.cluster import KMeans
|
||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
def labels_to_text(labels):
    """Render cluster labels as text, one integer label per line.

    ``np.array2string`` is intentionally not used here: it wraps the output
    in square brackets, pads entries with spaces, and abbreviates arrays
    longer than NumPy's print threshold with ``...``, which would silently
    drop predictions for larger inputs.

    Args:
        labels: Iterable of integer-like cluster ids (e.g. ``KMeans.labels_``).

    Returns:
        A string with each label followed by a newline; ``""`` for no labels.
    """
    return "".join(f"{int(label)}\n" for label in labels)


def main(in_path="dev-0/in.tsv", out_path="dev-0/out.tsv", n_clusters=77):
    """Cluster the lines of ``in_path`` and write one cluster id per line.

    Each input line is treated as one document, vectorized with a
    default-configured ``TfidfVectorizer`` and clustered with k-means.

    Args:
        in_path: Text file with one document per line.
        out_path: Destination file; it is overwritten on every run.
        n_clusters: Number of k-means clusters.
    """
    with open(in_path) as in_file:
        documents = in_file.readlines()

    # The script previously also built a TfidfVectorizer(ngram_range=(1, 2),
    # use_idf=False) and immediately overwrote it; only the default
    # configuration was ever in effect, so that is the one kept here.
    vectorizer = TfidfVectorizer()
    document_vectors = vectorizer.fit_transform(documents)

    # A single fit_predict replaces the earlier fit() followed by
    # fit_predict(): with a fixed random_state the second fit only repeated
    # the first one.
    labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(
        document_vectors
    )

    # "w" instead of "a": appending stacked the predictions of every run in
    # the same file. The context manager guarantees the data is flushed and
    # the handle is closed.
    with open(out_path, "w") as out_file:
        out_file.write(labels_to_text(labels))


if __name__ == "__main__":
    main()