refactoring

This commit is contained in:
Mikołaj Pokrywka 2022-03-21 10:06:54 +01:00
parent 358520a402
commit 52b5c0da45

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 12,
"id": "5e2107a5", "id": "5e2107a5",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -12,7 +12,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 13,
"id": "bcc889e5", "id": "bcc889e5",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -20,31 +20,15 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Collecting kaggle\n", "Requirement already satisfied: kaggle in /home/students/s444463/.local/lib/python3.8/site-packages (1.5.12)\n",
" Using cached kaggle-1.5.12.tar.gz (58 kB)\n", "Requirement already satisfied: python-slugify in /home/students/s444463/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n",
"Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2019.11.28)\n", "Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2019.11.28)\n",
"Requirement already satisfied: python-dateutil in /usr/lib/python3/dist-packages (from kaggle) (2.7.3)\n", "Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.25.8)\n",
"Collecting python-slugify\n",
" Using cached python_slugify-6.1.1-py2.py3-none-any.whl (9.1 kB)\n",
"Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from kaggle) (2.22.0)\n", "Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from kaggle) (2.22.0)\n",
"Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.14.0)\n", "Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.14.0)\n",
"Collecting tqdm\n", "Requirement already satisfied: tqdm in /home/students/s444463/.local/lib/python3.8/site-packages (from kaggle) (4.63.0)\n",
" Downloading tqdm-4.63.0-py2.py3-none-any.whl (76 kB)\n", "Requirement already satisfied: python-dateutil in /usr/lib/python3/dist-packages (from kaggle) (2.7.3)\n",
"\u001b[K |████████████████████████████████| 76 kB 1.7 MB/s eta 0:00:011\n", "Requirement already satisfied: text-unidecode>=1.3 in /home/students/s444463/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n",
"\u001b[?25hRequirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.25.8)\n",
"Collecting text-unidecode>=1.3\n",
" Using cached text_unidecode-1.3-py2.py3-none-any.whl (78 kB)\n",
"Building wheels for collected packages: kaggle\n",
" Building wheel for kaggle (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73052 sha256=02172fba1c0ec42884ad8bcbb3c3b99749f529299444b00aaa946e78b9dfcb1f\n",
" Stored in directory: /home/students/s444463/.cache/pip/wheels/29/da/11/144cc25aebdaeb4931b231e25fd34b394e6a5725cbb2f50106\n",
"Successfully built kaggle\n",
"Installing collected packages: text-unidecode, tqdm, python-slugify, kaggle\n",
"\u001b[33m WARNING: The script tqdm is installed in '/home/students/s444463/.local/bin' which is not on PATH.\n",
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n",
"\u001b[33m WARNING: The script kaggle is installed in '/home/students/s444463/.local/bin' which is not on PATH.\n",
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n",
"Successfully installed kaggle-1.5.12 python-slugify-6.1.1 text-unidecode-1.3 tqdm-4.63.0\n",
"\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n", "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
"Requirement already satisfied: pandas in /usr/lib/python3/dist-packages (0.25.3)\n", "Requirement already satisfied: pandas in /usr/lib/python3/dist-packages (0.25.3)\n",
@ -64,7 +48,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 14,
"id": "02a4034f", "id": "02a4034f",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -84,7 +68,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 15,
"id": "5035aef0", "id": "5035aef0",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -102,7 +86,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 16,
"id": "14344d2f", "id": "14344d2f",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -110,21 +94,17 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Collecting seaborn\n", "Requirement already satisfied: seaborn in /home/students/s444463/.local/lib/python3.8/site-packages (0.11.2)\n",
" Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)\n",
"\u001b[K |████████████████████████████████| 292 kB 2.8 MB/s eta 0:00:01\n",
"\u001b[?25hRequirement already satisfied: scipy>=1.0 in /usr/lib/python3/dist-packages (from seaborn) (1.3.3)\n",
"Requirement already satisfied: pandas>=0.23 in /usr/lib/python3/dist-packages (from seaborn) (0.25.3)\n",
"Requirement already satisfied: numpy>=1.15 in /usr/lib/python3/dist-packages (from seaborn) (1.17.4)\n", "Requirement already satisfied: numpy>=1.15 in /usr/lib/python3/dist-packages (from seaborn) (1.17.4)\n",
"Requirement already satisfied: scipy>=1.0 in /usr/lib/python3/dist-packages (from seaborn) (1.3.3)\n",
"Requirement already satisfied: pandas>=0.23 in /usr/lib/python3/dist-packages (from seaborn) (0.25.3)\n",
"Requirement already satisfied: matplotlib>=2.2 in /home/students/s444463/.local/lib/python3.8/site-packages (from seaborn) (3.4.3)\n", "Requirement already satisfied: matplotlib>=2.2 in /home/students/s444463/.local/lib/python3.8/site-packages (from seaborn) (3.4.3)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.2)\n",
"Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=2.2->seaborn) (8.3.2)\n", "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=2.2->seaborn) (8.3.2)\n",
"Requirement already satisfied: python-dateutil>=2.7 in /usr/lib/python3/dist-packages (from matplotlib>=2.2->seaborn) (2.7.3)\n",
"Requirement already satisfied: cycler>=0.10 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n", "Requirement already satisfied: cycler>=0.10 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n",
"Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n", "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n",
"Requirement already satisfied: python-dateutil>=2.7 in /usr/lib/python3/dist-packages (from matplotlib>=2.2->seaborn) (2.7.3)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.2)\n",
"Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.14.0)\n", "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.14.0)\n",
"Installing collected packages: seaborn\n",
"Successfully installed seaborn-0.11.2\n",
"\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n" "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
] ]
@ -136,7 +116,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 17,
"id": "0f5ebfab", "id": "0f5ebfab",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -539,7 +519,7 @@
"[17880 rows x 18 columns]" "[17880 rows x 18 columns]"
] ]
}, },
"execution_count": 11, "execution_count": 17,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -552,7 +532,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 18,
"id": "edbf49da", "id": "edbf49da",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -571,40 +551,51 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 19,
"id": "bc594582", "id": "e60b3f32",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"ename": "NameError", "data": {
"evalue": "name 'np' is not defined", "text/plain": [
"output_type": "error", "Sales 551\n",
"traceback": [ "Engineering 487\n",
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "Marketing 401\n",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Operations 270\n",
"\u001b[0;32m/tmp/ipykernel_8616/866736318.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mregex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "IT 225\n",
"\u001b[0;31mNameError\u001b[0m: name 'np' is not defined" " ... \n",
] "Capoo 1\n",
"Engineering - Hardware 1\n",
"Utilities 1\n",
"i 1\n",
"TECH 1\n",
"Name: department, Length: 1337, dtype: int64"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
} }
], ],
"source": [
"data = data.replace(np.nan, '', regex=True)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e60b3f32",
"metadata": {},
"outputs": [],
"source": [ "source": [
"data[\"department\"].value_counts()" "data[\"department\"].value_counts()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": null,
"id": "2612d68b",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"data = data.replace(np.nan, '', regex=True)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "c5ac75f5", "id": "c5ac75f5",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -965,20 +956,18 @@
"max NaN 1.000000 " "max NaN 1.000000 "
] ]
}, },
"execution_count": 14, "execution_count": 26,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"import numpy as np\n",
"data = data.replace(np.nan, '', regex=True)\n",
"data.describe(include='all')" "data.describe(include='all')"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 21,
"id": "4b0e77a4", "id": "4b0e77a4",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -993,7 +982,7 @@
"dtype: float64" "dtype: float64"
] ]
}, },
"execution_count": 15, "execution_count": 21,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -1004,17 +993,39 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 22,
"id": "5a1d8ec7", "id": "5a1d8ec7",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Requirement already satisfied: scikit-learn in /usr/lib/python3/dist-packages (0.22.2.post1)\n",
"Collecting scikit-learn\n",
" Downloading scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)\n",
"\u001b[K |████████████████████████████████| 26.7 MB 2.9 MB/s eta 0:00:01\n",
"\u001b[?25hRequirement already satisfied: scipy>=1.1.0 in /usr/lib/python3/dist-packages (from scikit-learn) (1.3.3)\n",
"Requirement already satisfied: numpy>=1.14.6 in /usr/lib/python3/dist-packages (from scikit-learn) (1.17.4)\n",
"Requirement already satisfied: joblib>=0.11 in /usr/lib/python3/dist-packages (from scikit-learn) (0.14.0)\n",
"Collecting threadpoolctl>=2.0.0\n",
" Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)\n",
"Installing collected packages: threadpoolctl, scikit-learn\n",
"Successfully installed scikit-learn-1.0.2 threadpoolctl-3.1.0\n",
"\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [ "source": [
"pip install -U scikit-learn" "pip install -U scikit-learn"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 25, "execution_count": 23,
"id": "50813795", "id": "50813795",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -1027,15 +1038,15 @@
"English Teacher Abroad 71\n", "English Teacher Abroad 71\n",
"Software Engineer 67\n", "Software Engineer 67\n",
" ... \n", " ... \n",
"Physician - MD, CMO 1\n", "Jr. Flash & HTML Developer 1\n",
"Financial News Editor 1\n", " RNs Needed ASAP - ER & ICU - APPLY NOW!! 1\n",
"Senior Client Services Engineer 1\n", "Summer interns - 2014/15 1\n",
"Online Marketing Manager Italy 1\n", "Full-Stack Engineer 1\n",
"Infrastructure Project Manager 1\n", "Contract Product Designer 1\n",
"Name: title, Length: 8461, dtype: int64" "Name: title, Length: 8461, dtype: int64"
] ]
}, },
"execution_count": 25, "execution_count": 23,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -1050,7 +1061,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 24,
"id": "ea3c9f2e", "id": "ea3c9f2e",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -1072,28 +1083,28 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 32, "execution_count": 25,
"id": "b20cc27a", "id": "b20cc27a",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"English Teacher Abroad 235\n", "English Teacher Abroad 230\n",
"Customer Service Associate 110\n", "Customer Service Associate 106\n",
"Graduates: English Teacher Abroad (Conversational) 104\n", "Graduates: English Teacher Abroad (Conversational) 96\n",
"English Teacher Abroad 72\n", "English Teacher Abroad 71\n",
"Software Engineer 68\n", "Software Engineer 67\n",
" ... \n", " ... \n",
"Manager-Plastics Mfg Engineering - Full Time Permanent Job 1\n", "Jr. Flash & HTML Developer 1\n",
"Ruby on Rails Developer/Programmer 1\n", " RNs Needed ASAP - ER & ICU - APPLY NOW!! 1\n",
"Appliance Technician 1\n", "Summer interns - 2014/15 1\n",
"Need Oracle Fusion HCM Resource 1\n", "Full-Stack Engineer 1\n",
"Recruitment specialist 1\n", "Contract Product Designer 1\n",
"Name: title, Length: 8761, dtype: int64" "Name: title, Length: 8461, dtype: int64"
] ]
}, },
"execution_count": 32, "execution_count": 25,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }