polish-urban-legends-public.../porba1.ipynb
2021-04-25 17:52:29 +02:00

11 KiB

pip install numpy seaborn scikit-learn matplotlib fasttext
Requirement already satisfied: numpy in c:\users\jedpc\anaconda3\lib\site-packages (1.19.2)
Requirement already satisfied: seaborn in c:\users\jedpc\anaconda3\lib\site-packages (0.11.0)
Requirement already satisfied: scikit-learn in c:\users\jedpc\anaconda3\lib\site-packages (0.23.2)
Requirement already satisfied: matplotlib in c:\users\jedpc\anaconda3\lib\site-packages (3.3.2)
Requirement already satisfied: fasttext in c:\users\jedpc\anaconda3\lib\site-packages (0.9.2)
Requirement already satisfied: pandas>=0.23 in c:\users\jedpc\anaconda3\lib\site-packages (from seaborn) (1.1.3)
Requirement already satisfied: scipy>=1.0 in c:\users\jedpc\anaconda3\lib\site-packages (from seaborn) (1.5.2)
Requirement already satisfied: joblib>=0.11 in c:\users\jedpc\anaconda3\lib\site-packages (from scikit-learn) (0.17.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\jedpc\anaconda3\lib\site-packages (from scikit-learn) (2.1.0)
Requirement already satisfied: certifi>=2020.06.20 in c:\users\jedpc\anaconda3\lib\site-packages (from matplotlib) (2020.6.20)
Requirement already satisfied: pillow>=6.2.0 in c:\users\jedpc\anaconda3\lib\site-packages (from matplotlib) (8.0.1)
Requirement already satisfied: cycler>=0.10 in c:\users\jedpc\anaconda3\lib\site-packages (from matplotlib) (0.10.0)
Requirement already satisfied: python-dateutil>=2.1 in c:\users\jedpc\anaconda3\lib\site-packages (from matplotlib) (2.8.1)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\jedpc\anaconda3\lib\site-packages (from matplotlib) (1.3.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\users\jedpc\anaconda3\lib\site-packages (from matplotlib) (2.4.7)
Requirement already satisfied: setuptools>=0.7.0 in c:\users\jedpc\anaconda3\lib\site-packages (from fasttext) (50.3.1.post20201107)
Requirement already satisfied: pybind11>=2.2 in c:\users\jedpc\anaconda3\lib\site-packages (from fasttext) (2.6.2)
Requirement already satisfied: pytz>=2017.2 in c:\users\jedpc\anaconda3\lib\site-packages (from pandas>=0.23->seaborn) (2020.1)
Requirement already satisfied: six in c:\users\jedpc\anaconda3\lib\site-packages (from cycler>=0.10->matplotlib) (1.15.0)
Note: you may need to restart the kernel to use updated packages.
import numpy as np
import seaborn as sns
import copy
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
from scipy.spatial import distance_matrix
import fasttext
import fasttext.util
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the corpus: every line of the TSV file is treated as one document.
# The context manager guarantees the handle is closed after reading.
with open("C:/Users/JedPC/Desktop/ISI/polish-urban-legends-public/dev-0/in.tsv", encoding="utf-8") as file:
    all_doc = list(file)

# Turn each document into a TF-IDF weighted row of a sparse matrix.
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(all_doc)
# Largest TF-IDF weight present in the matrix (displayed as the cell result).
np.max(document_vectors)
0.5937322507759797

LICZBA KLASTRÓW K

# Number of clusters (centroids) used by the k-means functions in this notebook.
K = 40

OBLICZANIE ILOŚCI ZMIENNYCH

# Row count of the TF-IDF matrix (one row per input line).
ELEMENTS = document_vectors.shape[0]
# Column count of the TF-IDF matrix, i.e. the dimensionality of each vector.
SIZE = document_vectors.shape[1]

ALGORYTM K-ŚREDNICH

def get_random_centroids(k=None, size=None):
    """Return randomly initialised starting centroids.

    Every coordinate is drawn uniformly from [0, 2); any draw greater than 1
    is reset to 0, so the result only contains values in [0, 1] and roughly
    half of them are exactly zero.

    The whole matrix is drawn in a single vectorised call instead of one
    scalar RNG call per coordinate.

    Args:
        k: Number of centroids (rows). Defaults to the module-level ``K``.
        size: Number of coordinates per centroid (columns). Defaults to the
            module-level ``SIZE``.

    Returns:
        A float ``numpy.ndarray`` of shape ``(k, size)``.
    """
    if k is None:
        k = K
    if size is None:
        size = SIZE
    centroids = np.random.uniform(0, 2, size=(k, size))
    # Zero out the upper half of the draws, as the element-wise loop did.
    centroids[centroids > 1] = 0
    return centroids
# Draw the initial centroids used by the first assignment step.
CENTROIDS = get_random_centroids()
def assign_data_to_labels(document_vectors, CENTROIDS):
    """Label every row of ``document_vectors`` with its closest centroid.

    Closeness is ``np.linalg.norm`` of the difference between the row and a
    centroid; ties go to the centroid with the lowest index.

    Args:
        document_vectors: Iterable of row vectors.
        CENTROIDS: Iterable of centroid vectors, indexed by cluster label.

    Returns:
        A ``numpy.ndarray`` holding, for each row in order, the index of the
        nearest centroid.
    """

    def nearest_index(row):
        # Distance from this row to every centroid, in centroid order.
        gaps = [np.linalg.norm(row - center) for center in CENTROIDS]
        return np.argmin(gaps)

    return np.array([nearest_index(row) for row in document_vectors])
# First assignment step: label every document with its nearest initial centroid.
LABELS = assign_data_to_labels(document_vectors, CENTROIDS)
def get_new_centroids(document_vectors, LABELS, CENTROIDS):
    """Recompute each centroid as the per-feature mean of its members.

    A cluster with no assigned rows keeps its previous centroid, which avoids
    taking the mean of an empty selection (division by zero).

    Args:
        document_vectors: 2-D array or SciPy sparse matrix of row vectors; it
            must support boolean row indexing and ``.mean(axis=0)``.
        LABELS: 1-D sequence with the cluster index of every row.
        CENTROIDS: Current centroids, one row per cluster. Not modified.

    Returns:
        A new float ``numpy.ndarray`` with the same shape as ``CENTROIDS``.
    """
    labels = np.asarray(LABELS)
    # Start from a float copy so empty clusters retain their old position.
    NEW_CENTROIDS = np.array(CENTROIDS, dtype=float)
    for centroid_label in range(NEW_CENTROIDS.shape[0]):
        CENT_DATA = document_vectors[labels == centroid_label]
        if CENT_DATA.shape[0] == 0:
            continue
        # axis=0 averages over the member rows, giving one value per feature;
        # asarray/ravel flattens the (1, n) matrix a sparse mean returns.
        NEW_CENTROIDS[centroid_label] = np.asarray(CENT_DATA.mean(axis=0)).ravel()
    return NEW_CENTROIDS
# First update step: recompute the centroids from the initial assignment.
NEW_CENTROIDS = get_new_centroids(document_vectors, LABELS, CENTROIDS)
C:\Users\JedPC\anaconda3\lib\site-packages\scipy\sparse\base.py:581: RuntimeWarning: divide by zero encountered in true_divide
  return self.astype(np.float_)._mul_scalar(1./other)
# Adopt the freshly computed centroids as the current ones.
CENTROIDS = NEW_CENTROIDS
# Inspect the first centroid (displayed as the cell result).
NEW_CENTROIDS[0]
array([0., 0., 0., ..., 0., 0., 0.])

ITERACJE

# Upper bound on the number of assignment/update rounds.
NUMBER = 1000
previous_labels = None
for i in range(NUMBER):
    LABELS = assign_data_to_labels(document_vectors, CENTROIDS)
    # The update depends only on the data and the labels, so an unchanged
    # assignment would reproduce the same centroids: the run has converged
    # and the remaining rounds would be no-ops.
    if previous_labels is not None and np.array_equal(LABELS, previous_labels):
        break
    CENTROIDS = get_new_centroids(document_vectors, LABELS, CENTROIDS)
    previous_labels = LABELS
    
LABELS.shape[0]
# Print one cluster label per line, in document order.
for label in LABELS:
    print(label)
30
0
1
1
30
1
0
30
30
1
0
1
30
1
1
1
1
1
0
0
1
1
0
1
1
30
30
0
0
1
30
0
0
1
1
0
1
1
30
1
0
1
1
0
0
0
1
0
30
1
0
1
0
1
0
0
0
30
0
1
0
0
0
0
0
1
0
1
1
1
0
0
0
0
30
30
30
0
0
30
1
30
1
0
30
1
30

???

# Display the final cluster label of every document as a single array.
LABELS
array([30,  0,  1,  1, 30,  1,  0, 30, 30,  1,  0,  1, 30,  1,  1,  1,  1,
        1,  0,  0,  1,  1,  0,  1,  1, 30, 30,  0,  0,  1, 30,  0,  0,  1,
        1,  0,  1,  1, 30,  1,  0,  1,  1,  0,  0,  0,  1,  0, 30,  1,  0,
        1,  0,  1,  0,  0,  0, 30,  0,  1,  0,  0,  0,  0,  0,  1,  0,  1,
        1,  1,  0,  0,  0,  0, 30, 30, 30,  0,  0, 30,  1, 30,  1,  0, 30,
        1, 30], dtype=int64)