polish-urban-legends-public/clusters.py
2021-04-24 21:08:04 +02:00

32 lines
831 B
Python

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
# Candidate cluster counts. Only W is consumed by the elbow search further
# down; K is not referenced in the visible part of this script and is kept
# as-is in case something else relies on it.
K = range(1, 87)
W = range(1, 683)
mms = MaxAbsScaler()

# Every line of the TSV is treated as one document. The encoding is pinned
# so the result does not depend on the platform's locale default
# (assumes the corpus is stored as UTF-8 -- TODO confirm).
with open("test-A/in.tsv", encoding="utf-8") as file:
    corpus = file.readlines()

# Default TF-IDF settings. A bigram configuration
# (ngram_range=(1, 2), use_idf=True) used to be assigned here and was
# immediately overwritten by this one, so it never had any effect.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(corpus)

# Scale each feature by its maximum absolute value; fit_transform leaves
# `mms` fitted, exactly like the separate fit() + transform() calls.
transformed = mms.fit_transform(vectors)
# Elbow method: fit KMeans once per candidate k and record the model's
# inertia_, then plot inertia against k.
#
# KMeans raises ValueError when n_clusters is larger than the number of
# samples, so candidates from W are capped at the corpus size instead of
# crashing part-way through a long run.
n_samples = transformed.shape[0]
candidate_ks = [k for k in W if k <= n_samples]

sum_of_squared_distances = []
for k in candidate_ks:
    # Fixed seed so repeated runs produce the same curve.
    km = KMeans(n_clusters=k, random_state=0)
    km.fit(transformed)
    sum_of_squared_distances.append(km.inertia_)

plt.plot(candidate_ks, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squered_distances')
plt.show()