changed Nan to empty strings

This commit is contained in:
Mikołaj Pokrywka 2022-03-20 22:09:51 +01:00
parent 75d242d5c5
commit 8a936743aa

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 1,
"id": "5e2107a5", "id": "5e2107a5",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -12,7 +12,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 2,
"id": "bcc889e5", "id": "bcc889e5",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -21,21 +21,21 @@
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Requirement already satisfied: kaggle in /home/mikolaj/.local/lib/python3.8/site-packages (1.5.12)\n", "Requirement already satisfied: kaggle in /home/mikolaj/.local/lib/python3.8/site-packages (1.5.12)\n",
"Requirement already satisfied: tqdm in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (4.59.0)\n",
"Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.22)\n", "Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.22)\n",
"Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.11.0)\n", "Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.11.0)\n",
"Requirement already satisfied: requests in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (2.25.1)\n", "Requirement already satisfied: requests in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (2.25.1)\n",
"Requirement already satisfied: python-dateutil in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (2.8.1)\n",
"Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2018.1.18)\n", "Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2018.1.18)\n",
"Requirement already satisfied: python-slugify in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n", "Requirement already satisfied: python-slugify in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n",
"Requirement already satisfied: python-dateutil in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (2.8.1)\n",
"Requirement already satisfied: tqdm in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (4.59.0)\n",
"Requirement already satisfied: text-unidecode>=1.3 in /home/mikolaj/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n", "Requirement already satisfied: text-unidecode>=1.3 in /home/mikolaj/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n",
"Requirement already satisfied: idna<3,>=2.5 in /home/mikolaj/.local/lib/python3.8/site-packages (from requests->kaggle) (2.10)\n",
"Requirement already satisfied: chardet<5,>=3.0.2 in /usr/lib/python3/dist-packages (from requests->kaggle) (3.0.4)\n", "Requirement already satisfied: chardet<5,>=3.0.2 in /usr/lib/python3/dist-packages (from requests->kaggle) (3.0.4)\n",
"Requirement already satisfied: idna<3,>=2.5 in /home/mikolaj/.local/lib/python3.8/site-packages (from requests->kaggle) (2.10)\n",
"\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n", "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n", "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
"Requirement already satisfied: pandas in /home/mikolaj/.local/lib/python3.8/site-packages (1.1.5)\n", "Requirement already satisfied: pandas in /home/mikolaj/.local/lib/python3.8/site-packages (1.1.5)\n",
"Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3/dist-packages (from pandas) (2018.3)\n",
"Requirement already satisfied: numpy>=1.15.4 in /home/mikolaj/.local/lib/python3.8/site-packages (from pandas) (1.19.5)\n", "Requirement already satisfied: numpy>=1.15.4 in /home/mikolaj/.local/lib/python3.8/site-packages (from pandas) (1.19.5)\n",
"Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3/dist-packages (from pandas) (2018.3)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in /home/mikolaj/.local/lib/python3.8/site-packages (from pandas) (2.8.1)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in /home/mikolaj/.local/lib/python3.8/site-packages (from pandas) (2.8.1)\n",
"Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.7.3->pandas) (1.11.0)\n", "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.7.3->pandas) (1.11.0)\n",
"\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n", "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
@ -45,12 +45,13 @@
], ],
"source": [ "source": [
"!pip install --user kaggle #API Kaggle, do pobrania zbioru\n", "!pip install --user kaggle #API Kaggle, do pobrania zbioru\n",
"!pip install --user pandas" "!pip install --user pandas\n",
"!pip install --user numpy"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 3,
"id": "02a4034f", "id": "02a4034f",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -59,9 +60,7 @@
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/mikolaj/.kaggle/kaggle.json'\n", "Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/mikolaj/.kaggle/kaggle.json'\n",
"Downloading real-or-fake-fake-jobposting-prediction.zip to /home/mikolaj/ai_tech/inzynieria\n", "real-or-fake-fake-jobposting-prediction.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
" 99%|█████████████████████████████████████▊| 16.0M/16.1M [00:01<00:00, 10.2MB/s]\n",
"100%|██████████████████████████████████████| 16.1M/16.1M [00:01<00:00, 9.54MB/s]\n"
] ]
} }
], ],
@ -73,7 +72,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 4,
"id": "5035aef0", "id": "5035aef0",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -92,7 +91,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 5,
"id": "14344d2f", "id": "14344d2f",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -100,22 +99,18 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Collecting seaborn\n", "Requirement already satisfied: seaborn in /home/mikolaj/.local/lib/python3.8/site-packages (0.11.2)\n",
" Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)\n",
" |████████████████████████████████| 292 kB 1.8 MB/s \n",
"\u001b[?25hRequirement already satisfied: numpy>=1.15 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.19.5)\n",
"Requirement already satisfied: scipy>=1.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.7.2)\n", "Requirement already satisfied: scipy>=1.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.7.2)\n",
"Requirement already satisfied: pandas>=0.23 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.1.5)\n",
"Requirement already satisfied: matplotlib>=2.2 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (3.4.2)\n", "Requirement already satisfied: matplotlib>=2.2 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (3.4.2)\n",
"Requirement already satisfied: pyparsing>=2.2.1 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n", "Requirement already satisfied: numpy>=1.15 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.19.5)\n",
"Requirement already satisfied: pillow>=6.2.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (8.2.0)\n", "Requirement already satisfied: pandas>=0.23 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.1.5)\n",
"Requirement already satisfied: python-dateutil>=2.7 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.8.1)\n", "Requirement already satisfied: python-dateutil>=2.7 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.8.1)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.1)\n", "Requirement already satisfied: pillow>=6.2.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (8.2.0)\n",
"Requirement already satisfied: cycler>=0.10 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n", "Requirement already satisfied: cycler>=0.10 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.1)\n",
"Requirement already satisfied: pyparsing>=2.2.1 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n",
"Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3/dist-packages (from pandas>=0.23->seaborn) (2018.3)\n", "Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3/dist-packages (from pandas>=0.23->seaborn) (2018.3)\n",
"Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.11.0)\n", "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.11.0)\n",
"Installing collected packages: seaborn\n",
"Successfully installed seaborn-0.11.2\n",
"\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n", "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n" "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
] ]
@ -127,7 +122,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 43, "execution_count": 22,
"id": "0f5ebfab", "id": "0f5ebfab",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -530,7 +525,7 @@
"[17880 rows x 18 columns]" "[17880 rows x 18 columns]"
] ]
}, },
"execution_count": 43, "execution_count": 22,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -543,7 +538,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 44, "execution_count": 23,
"id": "edbf49da", "id": "edbf49da",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -562,60 +557,443 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 35, "execution_count": 29,
"id": "bc594582", "id": "bc594582",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>job_id</th>\n",
" <th>title</th>\n",
" <th>location</th>\n",
" <th>department</th>\n",
" <th>salary_range</th>\n",
" <th>company_profile</th>\n",
" <th>description</th>\n",
" <th>requirements</th>\n",
" <th>benefits</th>\n",
" <th>telecommuting</th>\n",
" <th>has_company_logo</th>\n",
" <th>has_questions</th>\n",
" <th>employment_type</th>\n",
" <th>required_experience</th>\n",
" <th>required_education</th>\n",
" <th>industry</th>\n",
" <th>function</th>\n",
" <th>fraudulent</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Marketing Intern</td>\n",
" <td>US, NY, New York</td>\n",
" <td>Marketing</td>\n",
" <td></td>\n",
" <td>We're Food52, and we've created a groundbreaki...</td>\n",
" <td>Food52, a fast-growing, James Beard Award-winn...</td>\n",
" <td>Experience with content management systems a m...</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Other</td>\n",
" <td>Internship</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>Marketing</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Customer Service - Cloud Video Production</td>\n",
" <td>NZ, , Auckland</td>\n",
" <td>Success</td>\n",
" <td></td>\n",
" <td>90 Seconds, the worlds Cloud Video Production ...</td>\n",
" <td>Organised - Focused - Vibrant - Awesome!Do you...</td>\n",
" <td>What we expect from you:Your key responsibilit...</td>\n",
" <td>What you will get from usThrough being part of...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Full-time</td>\n",
" <td>Not Applicable</td>\n",
" <td></td>\n",
" <td>Marketing and Advertising</td>\n",
" <td>Customer Service</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Commissioning Machinery Assistant (CMA)</td>\n",
" <td>US, IA, Wever</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>Valor Services provides Workforce Solutions th...</td>\n",
" <td>Our client, located in Houston, is actively se...</td>\n",
" <td>Implement pre-commissioning and commissioning ...</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Account Executive - Washington DC</td>\n",
" <td>US, DC, Washington</td>\n",
" <td>Sales</td>\n",
" <td></td>\n",
" <td>Our passion for improving quality of life thro...</td>\n",
" <td>THE COMPANY: ESRI Environmental Systems Rese...</td>\n",
" <td>EDUCATION: Bachelors or Masters in GIS, busi...</td>\n",
" <td>Our culture is anything but corporate—we have ...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Full-time</td>\n",
" <td>Mid-Senior level</td>\n",
" <td>Bachelor's Degree</td>\n",
" <td>Computer Software</td>\n",
" <td>Sales</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Bill Review Manager</td>\n",
" <td>US, FL, Fort Worth</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>SpotSource Solutions LLC is a Global Human Cap...</td>\n",
" <td>JOB TITLE: Itemization Review ManagerLOCATION:...</td>\n",
" <td>QUALIFICATIONS:RN license in the State of Texa...</td>\n",
" <td>Full Benefits Offered</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Full-time</td>\n",
" <td>Mid-Senior level</td>\n",
" <td>Bachelor's Degree</td>\n",
" <td>Hospital &amp; Health Care</td>\n",
" <td>Health Care Provider</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17875</th>\n",
" <td>17876</td>\n",
" <td>Account Director - Distribution</td>\n",
" <td>CA, ON, Toronto</td>\n",
" <td>Sales</td>\n",
" <td></td>\n",
" <td>Vend is looking for some awesome new talent to...</td>\n",
" <td>Just in case this is the first time youve vis...</td>\n",
" <td>To ace this role you:Will eat comprehensive St...</td>\n",
" <td>What can you expect from us?We have an open cu...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Full-time</td>\n",
" <td>Mid-Senior level</td>\n",
" <td></td>\n",
" <td>Computer Software</td>\n",
" <td>Sales</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17876</th>\n",
" <td>17877</td>\n",
" <td>Payroll Accountant</td>\n",
" <td>US, PA, Philadelphia</td>\n",
" <td>Accounting</td>\n",
" <td></td>\n",
" <td>WebLinc is the e-commerce platform and service...</td>\n",
" <td>The Payroll Accountant will focus primarily on...</td>\n",
" <td>- B.A. or B.S. in Accounting- Desire to have f...</td>\n",
" <td>Health &amp;amp; WellnessMedical planPrescription ...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Full-time</td>\n",
" <td>Mid-Senior level</td>\n",
" <td>Bachelor's Degree</td>\n",
" <td>Internet</td>\n",
" <td>Accounting/Auditing</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17877</th>\n",
" <td>17878</td>\n",
" <td>Project Cost Control Staff Engineer - Cost Con...</td>\n",
" <td>US, TX, Houston</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>We Provide Full Time Permanent Positions for m...</td>\n",
" <td>Experienced Project Cost Control Staff Enginee...</td>\n",
" <td>At least 12 years professional experience.Abil...</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Full-time</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17878</th>\n",
" <td>17879</td>\n",
" <td>Graphic Designer</td>\n",
" <td>NG, LA, Lagos</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>Nemsia Studios is looking for an experienced v...</td>\n",
" <td>1. Must be fluent in the latest versions of Co...</td>\n",
" <td>Competitive salary (compensation will be based...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>Contract</td>\n",
" <td>Not Applicable</td>\n",
" <td>Professional</td>\n",
" <td>Graphic Design</td>\n",
" <td>Design</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17879</th>\n",
" <td>17880</td>\n",
" <td>Web Application Developers</td>\n",
" <td>NZ, N, Wellington</td>\n",
" <td>Engineering</td>\n",
" <td></td>\n",
" <td>Vend is looking for some awesome new talent to...</td>\n",
" <td>Who are we?Vend is an award winning web based ...</td>\n",
" <td>We want to hear from you if:You have an in-dep...</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Full-time</td>\n",
" <td>Mid-Senior level</td>\n",
" <td></td>\n",
" <td>Computer Software</td>\n",
" <td>Engineering</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>17880 rows × 18 columns</p>\n",
"</div>"
],
"text/plain": [ "text/plain": [
"English Teacher Abroad 311\n", " job_id title \\\n",
"Customer Service Associate 146\n", "0 1 Marketing Intern \n",
"Graduates: English Teacher Abroad (Conversational) 144\n", "1 2 Customer Service - Cloud Video Production \n",
"English Teacher Abroad 95\n", "2 3 Commissioning Machinery Assistant (CMA) \n",
"Software Engineer 86\n", "3 4 Account Executive - Washington DC \n",
" ... \n", "4 5 Bill Review Manager \n",
"Welcome desk Administrator 1\n", "... ... ... \n",
"Game Studio Manager 1\n", "17875 17876 Account Director - Distribution \n",
"Head of Content (m/f) 1\n", "17876 17877 Payroll Accountant \n",
"International Broadcaster, Bambara and Songhai Language 1\n", "17877 17878 Project Cost Control Staff Engineer - Cost Con... \n",
"Community / Marketing Coordinator 1\n", "17878 17879 Graphic Designer \n",
"Name: title, Length: 11231, dtype: int64" "17879 17880 Web Application Developers \n",
"\n",
" location department salary_range \\\n",
"0 US, NY, New York Marketing \n",
"1 NZ, , Auckland Success \n",
"2 US, IA, Wever \n",
"3 US, DC, Washington Sales \n",
"4 US, FL, Fort Worth \n",
"... ... ... ... \n",
"17875 CA, ON, Toronto Sales \n",
"17876 US, PA, Philadelphia Accounting \n",
"17877 US, TX, Houston \n",
"17878 NG, LA, Lagos \n",
"17879 NZ, N, Wellington Engineering \n",
"\n",
" company_profile \\\n",
"0 We're Food52, and we've created a groundbreaki... \n",
"1 90 Seconds, the worlds Cloud Video Production ... \n",
"2 Valor Services provides Workforce Solutions th... \n",
"3 Our passion for improving quality of life thro... \n",
"4 SpotSource Solutions LLC is a Global Human Cap... \n",
"... ... \n",
"17875 Vend is looking for some awesome new talent to... \n",
"17876 WebLinc is the e-commerce platform and service... \n",
"17877 We Provide Full Time Permanent Positions for m... \n",
"17878 \n",
"17879 Vend is looking for some awesome new talent to... \n",
"\n",
" description \\\n",
"0 Food52, a fast-growing, James Beard Award-winn... \n",
"1 Organised - Focused - Vibrant - Awesome!Do you... \n",
"2 Our client, located in Houston, is actively se... \n",
"3 THE COMPANY: ESRI Environmental Systems Rese... \n",
"4 JOB TITLE: Itemization Review ManagerLOCATION:... \n",
"... ... \n",
"17875 Just in case this is the first time youve vis... \n",
"17876 The Payroll Accountant will focus primarily on... \n",
"17877 Experienced Project Cost Control Staff Enginee... \n",
"17878 Nemsia Studios is looking for an experienced v... \n",
"17879 Who are we?Vend is an award winning web based ... \n",
"\n",
" requirements \\\n",
"0 Experience with content management systems a m... \n",
"1 What we expect from you:Your key responsibilit... \n",
"2 Implement pre-commissioning and commissioning ... \n",
"3 EDUCATION: Bachelors or Masters in GIS, busi... \n",
"4 QUALIFICATIONS:RN license in the State of Texa... \n",
"... ... \n",
"17875 To ace this role you:Will eat comprehensive St... \n",
"17876 - B.A. or B.S. in Accounting- Desire to have f... \n",
"17877 At least 12 years professional experience.Abil... \n",
"17878 1. Must be fluent in the latest versions of Co... \n",
"17879 We want to hear from you if:You have an in-dep... \n",
"\n",
" benefits telecommuting \\\n",
"0 0 \n",
"1 What you will get from usThrough being part of... 0 \n",
"2 0 \n",
"3 Our culture is anything but corporate—we have ... 0 \n",
"4 Full Benefits Offered 0 \n",
"... ... ... \n",
"17875 What can you expect from us?We have an open cu... 0 \n",
"17876 Health &amp; WellnessMedical planPrescription ... 0 \n",
"17877 0 \n",
"17878 Competitive salary (compensation will be based... 0 \n",
"17879 0 \n",
"\n",
" has_company_logo has_questions employment_type required_experience \\\n",
"0 1 0 Other Internship \n",
"1 1 0 Full-time Not Applicable \n",
"2 1 0 \n",
"3 1 0 Full-time Mid-Senior level \n",
"4 1 1 Full-time Mid-Senior level \n",
"... ... ... ... ... \n",
"17875 1 1 Full-time Mid-Senior level \n",
"17876 1 1 Full-time Mid-Senior level \n",
"17877 0 0 Full-time \n",
"17878 0 1 Contract Not Applicable \n",
"17879 1 1 Full-time Mid-Senior level \n",
"\n",
" required_education industry function \\\n",
"0 Marketing \n",
"1 Marketing and Advertising Customer Service \n",
"2 \n",
"3 Bachelor's Degree Computer Software Sales \n",
"4 Bachelor's Degree Hospital & Health Care Health Care Provider \n",
"... ... ... ... \n",
"17875 Computer Software Sales \n",
"17876 Bachelor's Degree Internet Accounting/Auditing \n",
"17877 \n",
"17878 Professional Graphic Design Design \n",
"17879 Computer Software Engineering \n",
"\n",
" fraudulent \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"... ... \n",
"17875 0 \n",
"17876 0 \n",
"17877 0 \n",
"17878 0 \n",
"17879 0 \n",
"\n",
"[17880 rows x 18 columns]"
] ]
}, },
"execution_count": 35, "execution_count": 29,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"data[\"title\"].value_counts()" "data = data.replace(np.nan, '', regex=True)\n",
"data"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 36, "execution_count": 30,
"id": "e60b3f32", "id": "e60b3f32",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"Sales 551\n", " 11547\n",
"Engineering 487\n", "Sales 551\n",
"Marketing 401\n", "Engineering 487\n",
"Operations 270\n", "Marketing 401\n",
"IT 225\n", "Operations 270\n",
" ... \n", " ... \n",
"Audiology 1\n", "Pricing 1\n",
"Professional Association 1\n", "Mobility 1\n",
"Multiwork 1\n", "Housekeeping 1\n",
"Dev2 to 5 years experience in a client-facing technical role. Ability to diplomatically address customer concerns and provide feedback. A firm understanding of the technology stacks common to the Web ecosystem. A demonstrated history of creating non-trivi 1\n", "An Impact Engine Company 1\n",
"Business Strategy 1\n", "Trainee 1\n",
"Name: department, Length: 1337, dtype: int64" "Name: department, Length: 1338, dtype: int64"
] ]
}, },
"execution_count": 36, "execution_count": 30,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -626,7 +1004,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 37, "execution_count": 32,
"id": "c5ac75f5", "id": "c5ac75f5",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -676,42 +1054,42 @@
" <th>count</th>\n", " <th>count</th>\n",
" <td>17880.000000</td>\n", " <td>17880.000000</td>\n",
" <td>17880</td>\n", " <td>17880</td>\n",
" <td>17534</td>\n", " <td>17880</td>\n",
" <td>6333</td>\n", " <td>17880</td>\n",
" <td>2868</td>\n", " <td>17880</td>\n",
" <td>14572</td>\n", " <td>17880</td>\n",
" <td>17879</td>\n", " <td>17880</td>\n",
" <td>15185</td>\n", " <td>17880</td>\n",
" <td>10670</td>\n", " <td>17880</td>\n",
" <td>17880.000000</td>\n", " <td>17880.000000</td>\n",
" <td>17880.000000</td>\n", " <td>17880.000000</td>\n",
" <td>17880.000000</td>\n", " <td>17880.000000</td>\n",
" <td>14409</td>\n", " <td>17880</td>\n",
" <td>10830</td>\n", " <td>17880</td>\n",
" <td>9775</td>\n", " <td>17880</td>\n",
" <td>12977</td>\n", " <td>17880</td>\n",
" <td>11425</td>\n", " <td>17880</td>\n",
" <td>17880.000000</td>\n", " <td>17880.000000</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>unique</th>\n", " <th>unique</th>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>11231</td>\n", " <td>11231</td>\n",
" <td>3105</td>\n", " <td>3106</td>\n",
" <td>1337</td>\n", " <td>1338</td>\n",
" <td>874</td>\n", " <td>875</td>\n",
" <td>1709</td>\n", " <td>1710</td>\n",
" <td>14801</td>\n", " <td>14802</td>\n",
" <td>11968</td>\n", " <td>11969</td>\n",
" <td>6205</td>\n", " <td>6206</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>5</td>\n", " <td>6</td>\n",
" <td>7</td>\n", " <td>8</td>\n",
" <td>13</td>\n", " <td>14</td>\n",
" <td>131</td>\n", " <td>132</td>\n",
" <td>37</td>\n", " <td>38</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
@ -719,20 +1097,20 @@
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>English Teacher Abroad</td>\n", " <td>English Teacher Abroad</td>\n",
" <td>GB, LND, London</td>\n", " <td>GB, LND, London</td>\n",
" <td>Sales</td>\n", " <td></td>\n",
" <td>0-0</td>\n", " <td></td>\n",
" <td>We help teachers get safe &amp;amp; secure jobs ab...</td>\n", " <td></td>\n",
" <td>Play with kids, get paid for it Love travel? J...</td>\n", " <td>Play with kids, get paid for it Love travel? J...</td>\n",
" <td>University degree required. TEFL / TESOL / CEL...</td>\n", " <td></td>\n",
" <td>See job description</td>\n", " <td></td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>Full-time</td>\n", " <td>Full-time</td>\n",
" <td>Mid-Senior level</td>\n", " <td></td>\n",
" <td>Bachelor's Degree</td>\n", " <td></td>\n",
" <td>Information Technology and Services</td>\n", " <td></td>\n",
" <td>Information Technology</td>\n", " <td></td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
@ -740,20 +1118,20 @@
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>311</td>\n", " <td>311</td>\n",
" <td>718</td>\n", " <td>718</td>\n",
" <td>551</td>\n", " <td>11547</td>\n",
" <td>142</td>\n", " <td>15012</td>\n",
" <td>726</td>\n", " <td>3308</td>\n",
" <td>379</td>\n", " <td>379</td>\n",
" <td>410</td>\n", " <td>2695</td>\n",
" <td>726</td>\n", " <td>7210</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>11620</td>\n", " <td>11620</td>\n",
" <td>3809</td>\n", " <td>7050</td>\n",
" <td>5145</td>\n", " <td>8105</td>\n",
" <td>1734</td>\n", " <td>4903</td>\n",
" <td>1749</td>\n", " <td>6455</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
@ -909,10 +1287,10 @@
], ],
"text/plain": [ "text/plain": [
" job_id title location department \\\n", " job_id title location department \\\n",
"count 17880.000000 17880 17534 6333 \n", "count 17880.000000 17880 17880 17880 \n",
"unique NaN 11231 3105 1337 \n", "unique NaN 11231 3106 1338 \n",
"top NaN English Teacher Abroad GB, LND, London Sales \n", "top NaN English Teacher Abroad GB, LND, London \n",
"freq NaN 311 718 551 \n", "freq NaN 311 718 11547 \n",
"mean 8940.500000 NaN NaN NaN \n", "mean 8940.500000 NaN NaN NaN \n",
"std 5161.655742 NaN NaN NaN \n", "std 5161.655742 NaN NaN NaN \n",
"min 1.000000 NaN NaN NaN \n", "min 1.000000 NaN NaN NaN \n",
@ -921,104 +1299,79 @@
"75% 13410.250000 NaN NaN NaN \n", "75% 13410.250000 NaN NaN NaN \n",
"max 17880.000000 NaN NaN NaN \n", "max 17880.000000 NaN NaN NaN \n",
"\n", "\n",
" salary_range company_profile \\\n", " salary_range company_profile \\\n",
"count 2868 14572 \n", "count 17880 17880 \n",
"unique 874 1709 \n", "unique 875 1710 \n",
"top 0-0 We help teachers get safe &amp; secure jobs ab... \n", "top \n",
"freq 142 726 \n", "freq 15012 3308 \n",
"mean NaN NaN \n", "mean NaN NaN \n",
"std NaN NaN \n", "std NaN NaN \n",
"min NaN NaN \n", "min NaN NaN \n",
"25% NaN NaN \n", "25% NaN NaN \n",
"50% NaN NaN \n", "50% NaN NaN \n",
"75% NaN NaN \n", "75% NaN NaN \n",
"max NaN NaN \n", "max NaN NaN \n",
"\n", "\n",
" description \\\n", " description requirements \\\n",
"count 17879 \n", "count 17880 17880 \n",
"unique 14801 \n", "unique 14802 11969 \n",
"top Play with kids, get paid for it Love travel? J... \n", "top Play with kids, get paid for it Love travel? J... \n",
"freq 379 \n", "freq 379 2695 \n",
"mean NaN \n", "mean NaN NaN \n",
"std NaN \n", "std NaN NaN \n",
"min NaN \n", "min NaN NaN \n",
"25% NaN \n", "25% NaN NaN \n",
"50% NaN \n", "50% NaN NaN \n",
"75% NaN \n", "75% NaN NaN \n",
"max NaN \n", "max NaN NaN \n",
"\n", "\n",
" requirements \\\n", " benefits telecommuting has_company_logo has_questions \\\n",
"count 15185 \n", "count 17880 17880.000000 17880.000000 17880.000000 \n",
"unique 11968 \n", "unique 6206 NaN NaN NaN \n",
"top University degree required. TEFL / TESOL / CEL... \n", "top NaN NaN NaN \n",
"freq 410 \n", "freq 7210 NaN NaN NaN \n",
"mean NaN \n", "mean NaN 0.042897 0.795302 0.491723 \n",
"std NaN \n", "std NaN 0.202631 0.403492 0.499945 \n",
"min NaN \n", "min NaN 0.000000 0.000000 0.000000 \n",
"25% NaN \n", "25% NaN 0.000000 1.000000 0.000000 \n",
"50% NaN \n", "50% NaN 0.000000 1.000000 0.000000 \n",
"75% NaN \n", "75% NaN 0.000000 1.000000 1.000000 \n",
"max NaN \n", "max NaN 1.000000 1.000000 1.000000 \n",
"\n", "\n",
" benefits telecommuting has_company_logo has_questions \\\n", " employment_type required_experience required_education industry \\\n",
"count 10670 17880.000000 17880.000000 17880.000000 \n", "count 17880 17880 17880 17880 \n",
"unique 6205 NaN NaN NaN \n", "unique 6 8 14 132 \n",
"top See job description NaN NaN NaN \n", "top Full-time \n",
"freq 726 NaN NaN NaN \n", "freq 11620 7050 8105 4903 \n",
"mean NaN 0.042897 0.795302 0.491723 \n", "mean NaN NaN NaN NaN \n",
"std NaN 0.202631 0.403492 0.499945 \n", "std NaN NaN NaN NaN \n",
"min NaN 0.000000 0.000000 0.000000 \n", "min NaN NaN NaN NaN \n",
"25% NaN 0.000000 1.000000 0.000000 \n", "25% NaN NaN NaN NaN \n",
"50% NaN 0.000000 1.000000 0.000000 \n", "50% NaN NaN NaN NaN \n",
"75% NaN 0.000000 1.000000 1.000000 \n", "75% NaN NaN NaN NaN \n",
"max NaN 1.000000 1.000000 1.000000 \n", "max NaN NaN NaN NaN \n",
"\n", "\n",
" employment_type required_experience required_education \\\n", " function fraudulent \n",
"count 14409 10830 9775 \n", "count 17880 17880.000000 \n",
"unique 5 7 13 \n", "unique 38 NaN \n",
"top Full-time Mid-Senior level Bachelor's Degree \n", "top NaN \n",
"freq 11620 3809 5145 \n", "freq 6455 NaN \n",
"mean NaN NaN NaN \n", "mean NaN 0.048434 \n",
"std NaN NaN NaN \n", "std NaN 0.214688 \n",
"min NaN NaN NaN \n", "min NaN 0.000000 \n",
"25% NaN NaN NaN \n", "25% NaN 0.000000 \n",
"50% NaN NaN NaN \n", "50% NaN 0.000000 \n",
"75% NaN NaN NaN \n", "75% NaN 0.000000 \n",
"max NaN NaN NaN \n", "max NaN 1.000000 "
"\n",
" industry function \\\n",
"count 12977 11425 \n",
"unique 131 37 \n",
"top Information Technology and Services Information Technology \n",
"freq 1734 1749 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" fraudulent \n",
"count 17880.000000 \n",
"unique NaN \n",
"top NaN \n",
"freq NaN \n",
"mean 0.048434 \n",
"std 0.214688 \n",
"min 0.000000 \n",
"25% 0.000000 \n",
"50% 0.000000 \n",
"75% 0.000000 \n",
"max 1.000000 "
] ]
}, },
"execution_count": 37, "execution_count": 32,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"data = data.replace(np.nan, '', regex=True)\n",
"data.describe(include='all')" "data.describe(include='all')"
] ]
}, },