{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: numpy in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (1.19.2)\n", "Requirement already satisfied: seaborn in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (0.11.0)\n", "Requirement already satisfied: scikit-learn in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (0.23.2)\n", "Requirement already satisfied: matplotlib in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (3.3.2)\n", "Requirement already satisfied: fasttext in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (0.9.2)\n", "Requirement already satisfied: pandas>=0.23 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from seaborn) (1.1.3)\n", "Requirement already satisfied: scipy>=1.0 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from seaborn) (1.5.2)\n", "Requirement already satisfied: joblib>=0.11 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from scikit-learn) (0.17.0)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from scikit-learn) (2.1.0)\n", "Requirement already satisfied: certifi>=2020.06.20 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (2020.6.20)\n", "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (8.0.1)\n", "Requirement already satisfied: cycler>=0.10 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (0.10.0)\n", "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (2.8.1)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (1.3.0)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (2.4.7)\n",
"Requirement already satisfied: setuptools>=0.7.0 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from fasttext) (50.3.1.post20201107)\n", "Requirement already satisfied: pybind11>=2.2 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from fasttext) (2.6.2)\n", "Requirement already satisfied: pytz>=2017.2 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from pandas>=0.23->seaborn) (2020.1)\n", "Requirement already satisfied: six in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from cycler>=0.10->matplotlib) (1.15.0)\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [
"%pip install numpy seaborn scikit-learn matplotlib fasttext" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
"import numpy as np\n",
"import seaborn as sns\n",
"import copy\n",
"from scipy.cluster.hierarchy import dendrogram\n",
"from scipy.cluster import hierarchy\n",
"import matplotlib.pyplot as plt\n",
"from scipy.spatial import distance_matrix\n",
"import fasttext\n",
"import fasttext.util\n",
"from sklearn.feature_extraction.text import TfidfVectorizer" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
"# Corpus location; every line of this file is treated as one document.\n",
"# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.\n",
"DATA_PATH = \"C:/Users/JedPC/Desktop/ISI/polish-urban-legends-public/dev-0/in.tsv\"" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [
"# The context manager closes the file handle once all lines have been read.\n",
"with open(DATA_PATH, encoding=\"utf-8\") as file:\n",
"    all_doc = list(file)" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
"vectorizer = TfidfVectorizer()" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
"# TF-IDF matrix with one row per entry of all_doc.\n",
"document_vectors = vectorizer.fit_transform(all_doc)" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5937322507759797" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [
"np.max(document_vectors)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ILOSC K" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "K = 40" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# OBLICZANIE ILOSCI ZMIENNYCH" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "ELEMENTS = document_vectors.shape[0]\n", "SIZE = document_vectors.shape[1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ALGORYTM K SREDNICH" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def get_random_centroids():\n", " CENTROIDS = np.zeros((K, SIZE))\n", " for i in range(K):\n", " for j in range(SIZE):\n", " CENTROIDS[i,j] = np.random.uniform(0,2)\n", " if CENTROIDS[i,j] > 1:\n", " CENTROIDS[i,j] = 0\n", " return CENTROIDS" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "CENTROIDS = get_random_centroids()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "def assign_data_to_labels(document_vectors, CENTROIDS):\n", " LABELS = []\n", " for POINT in document_vectors:\n", " DISTANCES = [np.linalg.norm(POINT - CEN) for CEN in CENTROIDS]\n", " \n", " LABELS.append(np.argmin(DISTANCES))\n", " return np.array(LABELS)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "LABELS = assign_data_to_labels(document_vectors, CENTROIDS)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def get_new_centroids(document_vectors, LABELS, CENTROIDS):\n", " NEW_CENTROIDS = np.zeros_like(CENTROIDS)\n", " for centroid_label in range(K):\n", " CENT_DATA = document_vectors[LABELS == centroid_label]\n", " NEW_CENTROIDS[centroid_label] = np.mean(CENT_DATA) \n", " return NEW_CENTROIDS" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { 
"name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\JedPC\\anaconda3\\lib\\site-packages\\scipy\\sparse\\base.py:581: RuntimeWarning: divide by zero encountered in true_divide\n", " return self.astype(np.float_)._mul_scalar(1./other)\n" ] } ], "source": [ "NEW_CENTROIDS = get_new_centroids(document_vectors, LABELS, CENTROIDS)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "CENTROIDS = NEW_CENTROIDS" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0., 0., 0., ..., 0., 0., 0.])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "NEW_CENTROIDS[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# LITERACJE" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "NUMBER = 1000" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "for i in range(NUMBER):\n", " LABELS = assign_data_to_labels(document_vectors, CENTROIDS)\n", " CENTROIDS = get_new_centroids(document_vectors, LABELS, CENTROIDS)\n", " " ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "30\n", "0\n", "1\n", "1\n", "30\n", "1\n", "0\n", "30\n", "30\n", "1\n", "0\n", "1\n", "30\n", "1\n", "1\n", "1\n", "1\n", "1\n", "0\n", "0\n", "1\n", "1\n", "0\n", "1\n", "1\n", "30\n", "30\n", "0\n", "0\n", "1\n", "30\n", "0\n", "0\n", "1\n", "1\n", "0\n", "1\n", "1\n", "30\n", "1\n", "0\n", "1\n", "1\n", "0\n", "0\n", "0\n", "1\n", "0\n", "30\n", "1\n", "0\n", "1\n", "0\n", "1\n", "0\n", "0\n", "0\n", "30\n", "0\n", "1\n", "0\n", "0\n", "0\n", "0\n", "0\n", "1\n", "0\n", "1\n", "1\n", "1\n", "0\n", "0\n", "0\n", "0\n", "30\n", "30\n", "30\n", "0\n", "0\n", "30\n", "1\n", "30\n", "1\n", "0\n", "30\n", "1\n", "30\n" ] } ], "source": [ "LABELS.shape[0]\n", "for i in 
range(LABELS.shape[0]):\n", " print(LABELS[i])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ???" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([30, 0, 1, 1, 30, 1, 0, 30, 30, 1, 0, 1, 30, 1, 1, 1, 1,\n", " 1, 0, 0, 1, 1, 0, 1, 1, 30, 30, 0, 0, 1, 30, 0, 0, 1,\n", " 1, 0, 1, 1, 30, 1, 0, 1, 1, 0, 0, 0, 1, 0, 30, 1, 0,\n", " 1, 0, 1, 0, 0, 0, 30, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,\n", " 1, 1, 0, 0, 0, 0, 30, 30, 30, 0, 0, 30, 1, 30, 1, 0, 30,\n", " 1, 30], dtype=int64)" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "LABELS" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }