20 lines
624 B
Python
20 lines
624 B
Python
|
from sklearn.cluster import KMeans
|
||
|
import numpy as np
|
||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
|
import matplotlib.pyplot as plt
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
# Cluster the documents in dev-0/in.tsv (one document per line) with k-means
# over TF-IDF features and write one cluster id per line to dev-0/out.tsv.
with open("dev-0/in.tsv") as file:
    corpus = file.readlines()

# Unigram and bigram TF-IDF features for every input line.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), use_idf=True)
vectors = vectorizer.fit_transform(corpus)

# fit_predict() fits the model and returns the assignments in one pass; the
# previous code called fit() first and then fit_predict(), training twice.
kmeans = KMeans(n_clusters=35, random_state=0)
prediction = kmeans.fit_predict(vectors)
labels = kmeans.labels_

# Serialise the labels by hand. np.array2string is a display helper: above
# the default print threshold it abbreviates the array with "...", and it
# pads elements to a common width, which corrupted the output file.
output_text = "\n".join(str(label) for label in labels.tolist())

# The context manager guarantees the output is flushed and closed.
with open("dev-0/out.tsv", "w") as file:
    file.write(output_text)
|