From 52b5c0da457faaeba85b147f5585dc067317e1ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Pokrywka?= Date: Mon, 21 Mar 2022 10:06:54 +0100 Subject: [PATCH] refactoring --- download_data.ipynb | 193 +++++++++++++++++++++++--------------------- 1 file changed, 102 insertions(+), 91 deletions(-) diff --git a/download_data.ipynb b/download_data.ipynb index 10d9433..b3764bc 100644 --- a/download_data.ipynb +++ b/download_data.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 12, "id": "5e2107a5", "metadata": {}, "outputs": [], @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 13, "id": "bcc889e5", "metadata": {}, "outputs": [ @@ -20,31 +20,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Collecting kaggle\n", - " Using cached kaggle-1.5.12.tar.gz (58 kB)\n", + "Requirement already satisfied: kaggle in /home/students/s444463/.local/lib/python3.8/site-packages (1.5.12)\n", + "Requirement already satisfied: python-slugify in /home/students/s444463/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n", "Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2019.11.28)\n", - "Requirement already satisfied: python-dateutil in /usr/lib/python3/dist-packages (from kaggle) (2.7.3)\n", - "Collecting python-slugify\n", - " Using cached python_slugify-6.1.1-py2.py3-none-any.whl (9.1 kB)\n", + "Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.25.8)\n", "Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from kaggle) (2.22.0)\n", "Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.14.0)\n", - "Collecting tqdm\n", - " Downloading tqdm-4.63.0-py2.py3-none-any.whl (76 kB)\n", - "\u001b[K |████████████████████████████████| 76 kB 1.7 MB/s eta 0:00:011\n", - "\u001b[?25hRequirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.25.8)\n", - "Collecting text-unidecode>=1.3\n", - " Using cached text_unidecode-1.3-py2.py3-none-any.whl (78 kB)\n", - "Building wheels for collected packages: kaggle\n", - " Building wheel for kaggle (setup.py) ... \u001b[?25ldone\n", - "\u001b[?25h Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73052 sha256=02172fba1c0ec42884ad8bcbb3c3b99749f529299444b00aaa946e78b9dfcb1f\n", - " Stored in directory: /home/students/s444463/.cache/pip/wheels/29/da/11/144cc25aebdaeb4931b231e25fd34b394e6a5725cbb2f50106\n", - "Successfully built kaggle\n", - "Installing collected packages: text-unidecode, tqdm, python-slugify, kaggle\n", - "\u001b[33m WARNING: The script tqdm is installed in '/home/students/s444463/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n", - "\u001b[33m WARNING: The script kaggle is installed in '/home/students/s444463/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n", - "Successfully installed kaggle-1.5.12 python-slugify-6.1.1 text-unidecode-1.3 tqdm-4.63.0\n", + "Requirement already satisfied: tqdm in /home/students/s444463/.local/lib/python3.8/site-packages (from kaggle) (4.63.0)\n", + "Requirement already satisfied: python-dateutil in /usr/lib/python3/dist-packages (from kaggle) (2.7.3)\n", + "Requirement already satisfied: text-unidecode>=1.3 in /home/students/s444463/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n", "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n", "Requirement already satisfied: pandas in /usr/lib/python3/dist-packages (0.25.3)\n", @@ -64,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 14, "id": "02a4034f", "metadata": {}, "outputs": [ @@ -84,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 15, "id": "5035aef0", "metadata": {}, "outputs": [ @@ -102,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 16, "id": "14344d2f", "metadata": {}, "outputs": [ @@ -110,21 +94,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Collecting seaborn\n", - " Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)\n", - "\u001b[K |████████████████████████████████| 292 kB 2.8 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: scipy>=1.0 in /usr/lib/python3/dist-packages (from seaborn) (1.3.3)\n", - "Requirement already satisfied: pandas>=0.23 in /usr/lib/python3/dist-packages (from seaborn) (0.25.3)\n", + "Requirement already satisfied: seaborn in /home/students/s444463/.local/lib/python3.8/site-packages (0.11.2)\n", "Requirement already satisfied: numpy>=1.15 in /usr/lib/python3/dist-packages (from seaborn) (1.17.4)\n", + "Requirement already satisfied: scipy>=1.0 in /usr/lib/python3/dist-packages (from seaborn) (1.3.3)\n", + "Requirement already satisfied: pandas>=0.23 in /usr/lib/python3/dist-packages (from seaborn) (0.25.3)\n", "Requirement already satisfied: matplotlib>=2.2 in /home/students/s444463/.local/lib/python3.8/site-packages (from seaborn) (3.4.3)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.2)\n", "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=2.2->seaborn) (8.3.2)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/lib/python3/dist-packages (from matplotlib>=2.2->seaborn) (2.7.3)\n", "Requirement already satisfied: cycler>=0.10 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n", "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n", - "Requirement already satisfied: python-dateutil>=2.7 in /usr/lib/python3/dist-packages (from matplotlib>=2.2->seaborn) (2.7.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.2)\n", "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.14.0)\n", - "Installing collected packages: seaborn\n", - "Successfully installed seaborn-0.11.2\n", "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n" ] @@ -136,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 17, "id": "0f5ebfab", "metadata": {}, "outputs": [ @@ -539,7 +519,7 @@ "[17880 rows x 18 columns]" ] }, - "execution_count": 11, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -552,7 +532,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "id": "edbf49da", "metadata": {}, "outputs": [ @@ -571,40 +551,51 @@ }, { "cell_type": "code", - "execution_count": 13, - "id": "bc594582", + "execution_count": 19, + "id": "e60b3f32", "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'np' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_8616/866736318.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mregex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'np' is not defined" - ] + "data": { + "text/plain": [ + "Sales 551\n", + "Engineering 487\n", + "Marketing 401\n", + "Operations 270\n", + "IT 225\n", + " ... \n", + "Capoo 1\n", + "Engineering - Hardware 1\n", + "Utilities 1\n", + "i 1\n", + "TECH 1\n", + "Name: department, Length: 1337, dtype: int64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" } ], - "source": [ - "data = data.replace(np.nan, '', regex=True)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e60b3f32", - "metadata": {}, - "outputs": [], "source": [ "data[\"department\"].value_counts()" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, + "id": "2612d68b", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "data = data.replace(np.nan, '', regex=True)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 26, "id": "c5ac75f5", "metadata": {}, "outputs": [ @@ -965,20 +956,18 @@ "max NaN 1.000000 " ] }, - "execution_count": 14, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import numpy as np\n", - "data = data.replace(np.nan, '', regex=True)\n", "data.describe(include='all')" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 21, "id": "4b0e77a4", "metadata": {}, "outputs": [ @@ -993,7 +982,7 @@ "dtype: float64" ] }, - "execution_count": 15, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1004,17 +993,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "5a1d8ec7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: scikit-learn in /usr/lib/python3/dist-packages (0.22.2.post1)\n", + "Collecting scikit-learn\n", + " Downloading scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)\n", + "\u001b[K |████████████████████████████████| 26.7 MB 2.9 MB/s eta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: scipy>=1.1.0 in /usr/lib/python3/dist-packages (from scikit-learn) (1.3.3)\n", + "Requirement already satisfied: numpy>=1.14.6 in /usr/lib/python3/dist-packages (from scikit-learn) (1.17.4)\n", + "Requirement already satisfied: joblib>=0.11 in /usr/lib/python3/dist-packages (from scikit-learn) (0.14.0)\n", + "Collecting threadpoolctl>=2.0.0\n", + " Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)\n", + "Installing collected packages: threadpoolctl, scikit-learn\n", + "Successfully installed scikit-learn-1.0.2 threadpoolctl-3.1.0\n", + "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", + "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "pip install -U scikit-learn" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 23, "id": "50813795", "metadata": {}, "outputs": [ @@ -1027,15 +1038,15 @@ "English Teacher Abroad 71\n", "Software Engineer 67\n", " ... \n", - "Physician - MD, CMO 1\n", - "Financial News Editor 1\n", - "Senior Client Services Engineer 1\n", - "Online Marketing Manager Italy 1\n", - "Infrastructure Project Manager 1\n", + "Jr. Flash & HTML Developer 1\n", + " RNs Needed ASAP - ER & ICU - APPLY NOW!! 1\n", + "Summer interns - 2014/15 1\n", + "Full-Stack Engineer 1\n", + "Contract Product Designer 1\n", "Name: title, Length: 8461, dtype: int64" ] }, - "execution_count": 25, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1050,7 +1061,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 24, "id": "ea3c9f2e", "metadata": {}, "outputs": [ @@ -1072,28 +1083,28 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 25, "id": "b20cc27a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "English Teacher Abroad 235\n", - "Customer Service Associate 110\n", - "Graduates: English Teacher Abroad (Conversational) 104\n", - "English Teacher Abroad 72\n", - "Software Engineer 68\n", - " ... \n", - "Manager-Plastics Mfg Engineering - Full Time Permanent Job 1\n", - "Ruby on Rails Developer/Programmer 1\n", - "Appliance Technician 1\n", - "Need Oracle Fusion HCM Resource 1\n", - "Recruitment specialist 1\n", - "Name: title, Length: 8761, dtype: int64" + "English Teacher Abroad 230\n", + "Customer Service Associate 106\n", + "Graduates: English Teacher Abroad (Conversational) 96\n", + "English Teacher Abroad 71\n", + "Software Engineer 67\n", + " ... \n", + "Jr. Flash & HTML Developer 1\n", + " RNs Needed ASAP - ER & ICU - APPLY NOW!! 1\n", + "Summer interns - 2014/15 1\n", + "Full-Stack Engineer 1\n", + "Contract Product Designer 1\n", + "Name: title, Length: 8461, dtype: int64" ] }, - "execution_count": 32, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" }