# Python script (32 lines, 831 B)
import matplotlib.pyplot as plt
|
|
from sklearn.cluster import KMeans
|
|
import numpy as np
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.preprocessing import MaxAbsScaler
|
|
|
|
|
|
|
|
# Elbow-method script: fits KMeans on the scaled TF-IDF vectors of a corpus for
# every cluster count in W and plots each model's inertia against k.

# Candidate cluster-count ranges. K is not referenced anywhere below; it is
# kept only so the module-level name stays defined.
K = range(1, 87)
W = range(1, 683)

mms = MaxAbsScaler()

# Every line of the input file is treated as one document.
# NOTE(review): UTF-8 is assumed for the TSV input - confirm against the data.
with open("test-A/in.tsv", encoding="utf-8") as file:
    corpus = file.readlines()

# A TfidfVectorizer(ngram_range=(1,2), use_idf=True) used to be constructed here
# and was immediately overwritten by a default instance, so only the default
# configuration ever took effect. The dead assignment is dropped; re-enable the
# bigram configuration explicitly if that was the intent.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(corpus)

# Fit the scaler on the TF-IDF matrix and scale that same matrix in one step.
transformed = mms.fit_transform(vectors)

# One inertia value per cluster count, in the same order as W.
Sum_of_squered_distances = []

for k in W:
    km = KMeans(n_clusters=k).fit(transformed)
    Sum_of_squered_distances.append(km.inertia_)

# Inertia versus k; the "elbow" of this curve suggests a cluster count.
plt.plot(W, Sum_of_squered_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squered_distances')
plt.show()