11 KiB
11 KiB
pip install numpy seaborn scikit-learn matplotlib fasttext
Requirement already satisfied: numpy in c:\users\jedpc\anaconda3\lib\site-packages (1.19.2) Requirement already satisfied: seaborn in c:\users\jedpc\anaconda3\lib\site-packages (0.11.0) Requirement already satisfied: scikit-learn in c:\users\jedpc\anaconda3\lib\site-packages (0.23.2) Requirement already satisfied: matplotlib in c:\users\jedpc\anaconda3\lib\site-packages (3.3.2) Requirement already satisfied: fasttext in c:\users\jedpc\anaconda3\lib\site-packages (0.9.2) Requirement already satisfied: pandas>=0.23 in c:\users\jedpc\anaconda3\lib\site-packages (from seaborn) (1.1.3) Requirement already satisfied: scipy>=1.0 in c:\users\jedpc\anaconda3\lib\site-packages (from seaborn) (1.5.2) Requirement already satisfied: joblib>=0.11 in c:\users\jedpc\anaconda3\lib\site-packages (from scikit-learn) (0.17.0) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\jedpc\anaconda3\lib\site-packages (from scikit-learn) (2.1.0) Requirement already satisfied: certifi>=2020.06.20 in c:\users\jedpc\anaconda3\lib\site-packages (from matplotlib) (2020.6.20) Requirement already satisfied: pillow>=6.2.0 in c:\users\jedpc\anaconda3\lib\site-packages (from matplotlib) (8.0.1) Requirement already satisfied: cycler>=0.10 in c:\users\jedpc\anaconda3\lib\site-packages (from matplotlib) (0.10.0) Requirement already satisfied: python-dateutil>=2.1 in c:\users\jedpc\anaconda3\lib\site-packages (from matplotlib) (2.8.1) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\jedpc\anaconda3\lib\site-packages (from matplotlib) (1.3.0) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\users\jedpc\anaconda3\lib\site-packages (from matplotlib) (2.4.7) Requirement already satisfied: setuptools>=0.7.0 in c:\users\jedpc\anaconda3\lib\site-packages (from fasttext) (50.3.1.post20201107) Requirement already satisfied: pybind11>=2.2 in c:\users\jedpc\anaconda3\lib\site-packages (from fasttext) (2.6.2) Requirement already satisfied: pytz>=2017.2 in 
c:\users\jedpc\anaconda3\lib\site-packages (from pandas>=0.23->seaborn) (2020.1) Requirement already satisfied: six in c:\users\jedpc\anaconda3\lib\site-packages (from cycler>=0.10->matplotlib) (1.15.0) Note: you may need to restart the kernel to use updated packages.
import numpy as np
import seaborn as sns
import copy
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
from scipy.spatial import distance_matrix
import fasttext
import fasttext.util
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the corpus: every line of the TSV file is treated as one document.
# The context manager guarantees the handle is closed even if reading fails.
with open("C:/Users/JedPC/Desktop/ISI/polish-urban-legends-public/dev-0/in.tsv", encoding="utf-8") as file:
    # Lines keep their trailing newline, exactly as when iterating the file.
    all_doc = list(file)
# Fit a TF-IDF model on the corpus and turn every document into a weight vector.
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(all_doc)
# Notebook display: the largest TF-IDF weight found in the whole matrix.
np.max(document_vectors)
0.5937322507759797
ILOSC K
# Number of centroids (clusters) used by the k-means code below.
K = 40
OBLICZANIE ILOSCI ZMIENNYCH
# Number of rows of the TF-IDF matrix (one per input document).
ELEMENTS = document_vectors.shape[0]
# Number of columns of the TF-IDF matrix (length of every document vector).
SIZE = document_vectors.shape[1]
ALGORYTM K SREDNICH
def get_random_centroids(k=None, size=None):
    """Draw a random initial centroid matrix for k-means.

    Every entry is sampled from ``np.random.uniform(0, 2)``; samples greater
    than 1 are replaced by 0, so the result contains values in ``[0, 1]``
    with roughly half of the entries being exactly zero.

    Args:
        k: Number of centroids (rows). Defaults to the module-level ``K``.
        size: Number of features per centroid (columns). Defaults to the
            module-level ``SIZE``.

    Returns:
        A dense float array of shape ``(k, size)``.
    """
    if k is None:
        k = K
    if size is None:
        size = SIZE
    # One vectorised draw replaces the element-by-element Python double loop.
    centroids = np.random.uniform(0, 2, size=(k, size))
    # Zero out the upper half of the sampling range.
    centroids[centroids > 1] = 0
    return centroids
# Random starting centroids for the k-means iterations.
CENTROIDS = get_random_centroids()
def assign_data_to_labels(document_vectors, CENTROIDS):
    """Assign every document to its nearest centroid (Euclidean distance).

    Args:
        document_vectors: 2-D data iterated row by row; either a dense array
            or a SciPy sparse matrix (rows exposing ``toarray()``).
        CENTROIDS: Dense array of shape ``(n_centroids, n_features)``.

    Returns:
        1-D NumPy array holding, for each row of ``document_vectors``, the
        index of the closest centroid. Ties resolve to the lowest index, as
        ``np.argmin`` does.
    """
    centroids = np.asarray(CENTROIDS)
    LABELS = []
    for POINT in document_vectors:
        # Densify the row once instead of once per centroid.
        if hasattr(POINT, "toarray"):
            row = POINT.toarray().ravel()
        else:
            row = np.asarray(POINT).ravel()
        # Distances to all centroids in a single vectorised call.
        distances = np.linalg.norm(centroids - row, axis=1)
        LABELS.append(np.argmin(distances))
    return np.array(LABELS)
# Initial assignment of every document to its closest random centroid.
LABELS = assign_data_to_labels(document_vectors, CENTROIDS)
def get_new_centroids(document_vectors, LABELS, CENTROIDS):
    """Recompute each centroid as the mean vector of its assigned documents.

    Args:
        document_vectors: 2-D data (dense array or SciPy sparse matrix) that
            supports boolean row selection.
        LABELS: 1-D sequence with the centroid index of every row.
        CENTROIDS: Current centroids, shape ``(n_centroids, n_features)``.
            Not modified.

    Returns:
        A new float array shaped like ``CENTROIDS``. A centroid with no
        assigned rows keeps its previous position.
    """
    LABELS = np.asarray(LABELS)
    # Start from a copy so centroids of empty clusters are carried over.
    NEW_CENTROIDS = np.array(CENTROIDS, dtype=float)
    for centroid_label in range(NEW_CENTROIDS.shape[0]):
        CENT_DATA = document_vectors[LABELS == centroid_label]
        if CENT_DATA.shape[0] == 0:
            # Averaging zero rows would divide by zero; keep the old centroid.
            continue
        # axis=0 gives the per-feature mean; without it the whole cluster
        # collapses to one scalar that is broadcast over every feature.
        # Sparse input returns a 1 x n matrix, hence the flattening.
        NEW_CENTROIDS[centroid_label] = np.asarray(CENT_DATA.mean(axis=0)).ravel()
    return NEW_CENTROIDS
# First centroid update, computed from the initial assignment.
NEW_CENTROIDS = get_new_centroids(document_vectors, LABELS, CENTROIDS)
C:\Users\JedPC\anaconda3\lib\site-packages\scipy\sparse\base.py:581: RuntimeWarning: divide by zero encountered in true_divide return self.astype(np.float_)._mul_scalar(1./other)
# Adopt the recomputed centroids as the current ones.
CENTROIDS = NEW_CENTROIDS
# Notebook display: inspect the first centroid.
NEW_CENTROIDS[0]
array([0., 0., 0., ..., 0., 0., 0.])
ITERACJE
# Alternate the assignment and centroid-update steps of k-means.
NUMBER = 1000  # upper bound on the number of passes
previous_labels = None
for i in range(NUMBER):
    LABELS = assign_data_to_labels(document_vectors, CENTROIDS)
    # An unchanged assignment yields unchanged centroids, so every remaining
    # pass would repeat the same state; stop as soon as that happens.
    if previous_labels is not None and np.array_equal(LABELS, previous_labels):
        break
    CENTROIDS = get_new_centroids(document_vectors, LABELS, CENTROIDS)
    previous_labels = LABELS
# Notebook display: number of labelled documents.
LABELS.shape[0]
# Emit one cluster label per line, in document order.
for label in LABELS:
    print(label)
30 0 1 1 30 1 0 30 30 1 0 1 30 1 1 1 1 1 0 0 1 1 0 1 1 30 30 0 0 1 30 0 0 1 1 0 1 1 30 1 0 1 1 0 0 0 1 0 30 1 0 1 0 1 0 0 0 30 0 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 30 30 30 0 0 30 1 30 1 0 30 1 30
???
LABELS
array([30, 0, 1, 1, 30, 1, 0, 30, 30, 1, 0, 1, 30, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 30, 30, 0, 0, 1, 30, 0, 0, 1, 1, 0, 1, 1, 30, 1, 0, 1, 1, 0, 0, 0, 1, 0, 30, 1, 0, 1, 0, 1, 0, 0, 0, 30, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 30, 30, 30, 0, 0, 30, 1, 30, 1, 0, 30, 1, 30], dtype=int64)