polish-urban-legends-public.../porba1.ipynb

460 lines
11 KiB
Plaintext
Raw Permalink Normal View History

2021-04-25 17:52:29 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: numpy in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (1.19.2)\n",
"Requirement already satisfied: seaborn in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (0.11.0)\n",
"Requirement already satisfied: scikit-learn in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (0.23.2)\n",
"Requirement already satisfied: matplotlib in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (3.3.2)\n",
"Requirement already satisfied: fasttext in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (0.9.2)\n",
"Requirement already satisfied: pandas>=0.23 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from seaborn) (1.1.3)\n",
"Requirement already satisfied: scipy>=1.0 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from seaborn) (1.5.2)\n",
"Requirement already satisfied: joblib>=0.11 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from scikit-learn) (0.17.0)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from scikit-learn) (2.1.0)\n",
"Requirement already satisfied: certifi>=2020.06.20 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (2020.6.20)\n",
"Requirement already satisfied: pillow>=6.2.0 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (8.0.1)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (0.10.0)\n",
"Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (2.8.1)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (1.3.0)\n",
"Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (2.4.7)\n",
"Requirement already satisfied: setuptools>=0.7.0 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from fasttext) (50.3.1.post20201107)\n",
"Requirement already satisfied: pybind11>=2.2 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from fasttext) (2.6.2)\n",
"Requirement already satisfied: pytz>=2017.2 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from pandas>=0.23->seaborn) (2020.1)\n",
"Requirement already satisfied: six in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from cycler>=0.10->matplotlib) (1.15.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install numpy seaborn scikit-learn matplotlib fasttext"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import seaborn as sns\n",
"import copy\n",
"from scipy.cluster.hierarchy import dendrogram\n",
"from scipy.cluster import hierarchy\n",
"import matplotlib.pyplot as plt\n",
"from scipy.spatial import distance_matrix\n",
"import fasttext\n",
"import fasttext.util\n",
"from sklearn.feature_extraction.text import TfidfVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"file = open(\"C:/Users/JedPC/Desktop/ISI/polish-urban-legends-public/dev-0/in.tsv\", encoding=\"utf-8\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"all_doc = []\n",
"for line in file:\n",
" all_doc.append(line)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"vectorizer = TfidfVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"document_vectors = vectorizer.fit_transform(all_doc)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5937322507759797"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.max(document_vectors)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ILOSC K"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"K = 40"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# OBLICZANIE ILOSCI ZMIENNYCH"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"ELEMENTS = document_vectors.shape[0]\n",
"SIZE = document_vectors.shape[1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ALGORYTM K SREDNICH"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def get_random_centroids():\n",
" CENTROIDS = np.zeros((K, SIZE))\n",
" for i in range(K):\n",
" for j in range(SIZE):\n",
" CENTROIDS[i,j] = np.random.uniform(0,2)\n",
" if CENTROIDS[i,j] > 1:\n",
" CENTROIDS[i,j] = 0\n",
" return CENTROIDS"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"CENTROIDS = get_random_centroids()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def assign_data_to_labels(document_vectors, CENTROIDS):\n",
" LABELS = []\n",
" for POINT in document_vectors:\n",
" DISTANCES = [np.linalg.norm(POINT - CEN) for CEN in CENTROIDS]\n",
" \n",
" LABELS.append(np.argmin(DISTANCES))\n",
" return np.array(LABELS)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"LABELS = assign_data_to_labels(document_vectors, CENTROIDS)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def get_new_centroids(document_vectors, LABELS, CENTROIDS):\n",
" NEW_CENTROIDS = np.zeros_like(CENTROIDS)\n",
" for centroid_label in range(K):\n",
" CENT_DATA = document_vectors[LABELS == centroid_label]\n",
" NEW_CENTROIDS[centroid_label] = np.mean(CENT_DATA) \n",
" return NEW_CENTROIDS"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\JedPC\\anaconda3\\lib\\site-packages\\scipy\\sparse\\base.py:581: RuntimeWarning: divide by zero encountered in true_divide\n",
" return self.astype(np.float_)._mul_scalar(1./other)\n"
]
}
],
"source": [
"NEW_CENTROIDS = get_new_centroids(document_vectors, LABELS, CENTROIDS)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"CENTROIDS = NEW_CENTROIDS"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0., 0., 0., ..., 0., 0., 0.])"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"NEW_CENTROIDS[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# LITERACJE"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"NUMBER = 1000"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"for i in range(NUMBER):\n",
" LABELS = assign_data_to_labels(document_vectors, CENTROIDS)\n",
" CENTROIDS = get_new_centroids(document_vectors, LABELS, CENTROIDS)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"30\n",
"0\n",
"1\n",
"1\n",
"30\n",
"1\n",
"0\n",
"30\n",
"30\n",
"1\n",
"0\n",
"1\n",
"30\n",
"1\n",
"1\n",
"1\n",
"1\n",
"1\n",
"0\n",
"0\n",
"1\n",
"1\n",
"0\n",
"1\n",
"1\n",
"30\n",
"30\n",
"0\n",
"0\n",
"1\n",
"30\n",
"0\n",
"0\n",
"1\n",
"1\n",
"0\n",
"1\n",
"1\n",
"30\n",
"1\n",
"0\n",
"1\n",
"1\n",
"0\n",
"0\n",
"0\n",
"1\n",
"0\n",
"30\n",
"1\n",
"0\n",
"1\n",
"0\n",
"1\n",
"0\n",
"0\n",
"0\n",
"30\n",
"0\n",
"1\n",
"0\n",
"0\n",
"0\n",
"0\n",
"0\n",
"1\n",
"0\n",
"1\n",
"1\n",
"1\n",
"0\n",
"0\n",
"0\n",
"0\n",
"30\n",
"30\n",
"30\n",
"0\n",
"0\n",
"30\n",
"1\n",
"30\n",
"1\n",
"0\n",
"30\n",
"1\n",
"30\n"
]
}
],
"source": [
"LABELS.shape[0]\n",
"for i in range(LABELS.shape[0]):\n",
" print(LABELS[i])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ???"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([30, 0, 1, 1, 30, 1, 0, 30, 30, 1, 0, 1, 30, 1, 1, 1, 1,\n",
" 1, 0, 0, 1, 1, 0, 1, 1, 30, 30, 0, 0, 1, 30, 0, 0, 1,\n",
" 1, 0, 1, 1, 30, 1, 0, 1, 1, 0, 0, 0, 1, 0, 30, 1, 0,\n",
" 1, 0, 1, 0, 0, 0, 30, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,\n",
" 1, 1, 0, 0, 0, 0, 30, 30, 30, 0, 0, 30, 1, 30, 1, 0, 30,\n",
" 1, 30], dtype=int64)"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"LABELS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}