1546 lines
64 KiB
Plaintext
1546 lines
64 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "5e2107a5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#Skrypt do ściągnięcia zbioru danych\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "bcc889e5",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Requirement already satisfied: kaggle in /home/mikolaj/.local/lib/python3.8/site-packages (1.5.12)\n",
|
||
"Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.22)\n",
|
||
"Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.11.0)\n",
|
||
"Requirement already satisfied: requests in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (2.25.1)\n",
|
||
"Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2018.1.18)\n",
|
||
"Requirement already satisfied: python-slugify in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n",
|
||
"Requirement already satisfied: python-dateutil in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (2.8.1)\n",
|
||
"Requirement already satisfied: tqdm in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (4.59.0)\n",
|
||
"Requirement already satisfied: text-unidecode>=1.3 in /home/mikolaj/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n",
|
||
"Requirement already satisfied: chardet<5,>=3.0.2 in /usr/lib/python3/dist-packages (from requests->kaggle) (3.0.4)\n",
|
||
"Requirement already satisfied: idna<3,>=2.5 in /home/mikolaj/.local/lib/python3.8/site-packages (from requests->kaggle) (2.10)\n",
|
||
"\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
|
||
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
|
||
"Requirement already satisfied: pandas in /home/mikolaj/.local/lib/python3.8/site-packages (1.1.5)\n",
|
||
"Requirement already satisfied: numpy>=1.15.4 in /home/mikolaj/.local/lib/python3.8/site-packages (from pandas) (1.19.5)\n",
|
||
"Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3/dist-packages (from pandas) (2018.3)\n",
|
||
"Requirement already satisfied: python-dateutil>=2.7.3 in /home/mikolaj/.local/lib/python3.8/site-packages (from pandas) (2.8.1)\n",
|
||
"Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.7.3->pandas) (1.11.0)\n",
|
||
"\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
|
||
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"!pip install --user kaggle #API Kaggle, do pobrania zbioru\n",
|
||
"!pip install --user pandas\n",
|
||
"!pip install --user numpy"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "02a4034f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/mikolaj/.kaggle/kaggle.json'\n",
|
||
"real-or-fake-fake-jobposting-prediction.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Żeby poniższa komenda zadziałała, musisz posiadać plik ~/.kaggle/kaggle.json, zawierający Kaggle API token.\n",
|
||
"# Instrukcje: https://www.kaggle.com/docs/api\n",
|
||
"!kaggle datasets download -d shivamb/real-or-fake-fake-jobposting-prediction"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "5035aef0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Archive: real-or-fake-fake-jobposting-prediction.zip\n",
|
||
" inflating: fake_job_postings.csv \n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"!unzip -o real-or-fake-fake-jobposting-prediction.zip"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "14344d2f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Requirement already satisfied: seaborn in /home/mikolaj/.local/lib/python3.8/site-packages (0.11.2)\n",
|
||
"Requirement already satisfied: scipy>=1.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.7.2)\n",
|
||
"Requirement already satisfied: matplotlib>=2.2 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (3.4.2)\n",
|
||
"Requirement already satisfied: numpy>=1.15 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.19.5)\n",
|
||
"Requirement already satisfied: pandas>=0.23 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.1.5)\n",
|
||
"Requirement already satisfied: python-dateutil>=2.7 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.8.1)\n",
|
||
"Requirement already satisfied: pillow>=6.2.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (8.2.0)\n",
|
||
"Requirement already satisfied: cycler>=0.10 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n",
|
||
"Requirement already satisfied: kiwisolver>=1.0.1 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.1)\n",
|
||
"Requirement already satisfied: pyparsing>=2.2.1 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n",
|
||
"Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3/dist-packages (from pandas>=0.23->seaborn) (2018.3)\n",
|
||
"Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.11.0)\n",
|
||
"\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
|
||
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"!pip install --user seaborn"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "0f5ebfab",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>job_id</th>\n",
|
||
" <th>title</th>\n",
|
||
" <th>location</th>\n",
|
||
" <th>department</th>\n",
|
||
" <th>salary_range</th>\n",
|
||
" <th>company_profile</th>\n",
|
||
" <th>description</th>\n",
|
||
" <th>requirements</th>\n",
|
||
" <th>benefits</th>\n",
|
||
" <th>telecommuting</th>\n",
|
||
" <th>has_company_logo</th>\n",
|
||
" <th>has_questions</th>\n",
|
||
" <th>employment_type</th>\n",
|
||
" <th>required_experience</th>\n",
|
||
" <th>required_education</th>\n",
|
||
" <th>industry</th>\n",
|
||
" <th>function</th>\n",
|
||
" <th>fraudulent</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Marketing Intern</td>\n",
|
||
" <td>US, NY, New York</td>\n",
|
||
" <td>Marketing</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>We're Food52, and we've created a groundbreaki...</td>\n",
|
||
" <td>Food52, a fast-growing, James Beard Award-winn...</td>\n",
|
||
" <td>Experience with content management systems a m...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>Other</td>\n",
|
||
" <td>Internship</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Marketing</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>Customer Service - Cloud Video Production</td>\n",
|
||
" <td>NZ, , Auckland</td>\n",
|
||
" <td>Success</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>90 Seconds, the worlds Cloud Video Production ...</td>\n",
|
||
" <td>Organised - Focused - Vibrant - Awesome!Do you...</td>\n",
|
||
" <td>What we expect from you:Your key responsibilit...</td>\n",
|
||
" <td>What you will get from usThrough being part of...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td>Not Applicable</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Marketing and Advertising</td>\n",
|
||
" <td>Customer Service</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Commissioning Machinery Assistant (CMA)</td>\n",
|
||
" <td>US, IA, Wever</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Valor Services provides Workforce Solutions th...</td>\n",
|
||
" <td>Our client, located in Houston, is actively se...</td>\n",
|
||
" <td>Implement pre-commissioning and commissioning ...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>Account Executive - Washington DC</td>\n",
|
||
" <td>US, DC, Washington</td>\n",
|
||
" <td>Sales</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Our passion for improving quality of life thro...</td>\n",
|
||
" <td>THE COMPANY: ESRI – Environmental Systems Rese...</td>\n",
|
||
" <td>EDUCATION: Bachelor’s or Master’s in GIS, busi...</td>\n",
|
||
" <td>Our culture is anything but corporate—we have ...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td>Mid-Senior level</td>\n",
|
||
" <td>Bachelor's Degree</td>\n",
|
||
" <td>Computer Software</td>\n",
|
||
" <td>Sales</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>Bill Review Manager</td>\n",
|
||
" <td>US, FL, Fort Worth</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>SpotSource Solutions LLC is a Global Human Cap...</td>\n",
|
||
" <td>JOB TITLE: Itemization Review ManagerLOCATION:...</td>\n",
|
||
" <td>QUALIFICATIONS:RN license in the State of Texa...</td>\n",
|
||
" <td>Full Benefits Offered</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td>Mid-Senior level</td>\n",
|
||
" <td>Bachelor's Degree</td>\n",
|
||
" <td>Hospital & Health Care</td>\n",
|
||
" <td>Health Care Provider</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17875</th>\n",
|
||
" <td>17876</td>\n",
|
||
" <td>Account Director - Distribution</td>\n",
|
||
" <td>CA, ON, Toronto</td>\n",
|
||
" <td>Sales</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Vend is looking for some awesome new talent to...</td>\n",
|
||
" <td>Just in case this is the first time you’ve vis...</td>\n",
|
||
" <td>To ace this role you:Will eat comprehensive St...</td>\n",
|
||
" <td>What can you expect from us?We have an open cu...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td>Mid-Senior level</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Computer Software</td>\n",
|
||
" <td>Sales</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17876</th>\n",
|
||
" <td>17877</td>\n",
|
||
" <td>Payroll Accountant</td>\n",
|
||
" <td>US, PA, Philadelphia</td>\n",
|
||
" <td>Accounting</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>WebLinc is the e-commerce platform and service...</td>\n",
|
||
" <td>The Payroll Accountant will focus primarily on...</td>\n",
|
||
" <td>- B.A. or B.S. in Accounting- Desire to have f...</td>\n",
|
||
" <td>Health &amp; WellnessMedical planPrescription ...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td>Mid-Senior level</td>\n",
|
||
" <td>Bachelor's Degree</td>\n",
|
||
" <td>Internet</td>\n",
|
||
" <td>Accounting/Auditing</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17877</th>\n",
|
||
" <td>17878</td>\n",
|
||
" <td>Project Cost Control Staff Engineer - Cost Con...</td>\n",
|
||
" <td>US, TX, Houston</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>We Provide Full Time Permanent Positions for m...</td>\n",
|
||
" <td>Experienced Project Cost Control Staff Enginee...</td>\n",
|
||
" <td>At least 12 years professional experience.Abil...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17878</th>\n",
|
||
" <td>17879</td>\n",
|
||
" <td>Graphic Designer</td>\n",
|
||
" <td>NG, LA, Lagos</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Nemsia Studios is looking for an experienced v...</td>\n",
|
||
" <td>1. Must be fluent in the latest versions of Co...</td>\n",
|
||
" <td>Competitive salary (compensation will be based...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Contract</td>\n",
|
||
" <td>Not Applicable</td>\n",
|
||
" <td>Professional</td>\n",
|
||
" <td>Graphic Design</td>\n",
|
||
" <td>Design</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17879</th>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>Web Application Developers</td>\n",
|
||
" <td>NZ, N, Wellington</td>\n",
|
||
" <td>Engineering</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Vend is looking for some awesome new talent to...</td>\n",
|
||
" <td>Who are we?Vend is an award winning web based ...</td>\n",
|
||
" <td>We want to hear from you if:You have an in-dep...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td>Mid-Senior level</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Computer Software</td>\n",
|
||
" <td>Engineering</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>17880 rows × 18 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" job_id title \\\n",
|
||
"0 1 Marketing Intern \n",
|
||
"1 2 Customer Service - Cloud Video Production \n",
|
||
"2 3 Commissioning Machinery Assistant (CMA) \n",
|
||
"3 4 Account Executive - Washington DC \n",
|
||
"4 5 Bill Review Manager \n",
|
||
"... ... ... \n",
|
||
"17875 17876 Account Director - Distribution \n",
|
||
"17876 17877 Payroll Accountant \n",
|
||
"17877 17878 Project Cost Control Staff Engineer - Cost Con... \n",
|
||
"17878 17879 Graphic Designer \n",
|
||
"17879 17880 Web Application Developers \n",
|
||
"\n",
|
||
" location department salary_range \\\n",
|
||
"0 US, NY, New York Marketing NaN \n",
|
||
"1 NZ, , Auckland Success NaN \n",
|
||
"2 US, IA, Wever NaN NaN \n",
|
||
"3 US, DC, Washington Sales NaN \n",
|
||
"4 US, FL, Fort Worth NaN NaN \n",
|
||
"... ... ... ... \n",
|
||
"17875 CA, ON, Toronto Sales NaN \n",
|
||
"17876 US, PA, Philadelphia Accounting NaN \n",
|
||
"17877 US, TX, Houston NaN NaN \n",
|
||
"17878 NG, LA, Lagos NaN NaN \n",
|
||
"17879 NZ, N, Wellington Engineering NaN \n",
|
||
"\n",
|
||
" company_profile \\\n",
|
||
"0 We're Food52, and we've created a groundbreaki... \n",
|
||
"1 90 Seconds, the worlds Cloud Video Production ... \n",
|
||
"2 Valor Services provides Workforce Solutions th... \n",
|
||
"3 Our passion for improving quality of life thro... \n",
|
||
"4 SpotSource Solutions LLC is a Global Human Cap... \n",
|
||
"... ... \n",
|
||
"17875 Vend is looking for some awesome new talent to... \n",
|
||
"17876 WebLinc is the e-commerce platform and service... \n",
|
||
"17877 We Provide Full Time Permanent Positions for m... \n",
|
||
"17878 NaN \n",
|
||
"17879 Vend is looking for some awesome new talent to... \n",
|
||
"\n",
|
||
" description \\\n",
|
||
"0 Food52, a fast-growing, James Beard Award-winn... \n",
|
||
"1 Organised - Focused - Vibrant - Awesome!Do you... \n",
|
||
"2 Our client, located in Houston, is actively se... \n",
|
||
"3 THE COMPANY: ESRI – Environmental Systems Rese... \n",
|
||
"4 JOB TITLE: Itemization Review ManagerLOCATION:... \n",
|
||
"... ... \n",
|
||
"17875 Just in case this is the first time you’ve vis... \n",
|
||
"17876 The Payroll Accountant will focus primarily on... \n",
|
||
"17877 Experienced Project Cost Control Staff Enginee... \n",
|
||
"17878 Nemsia Studios is looking for an experienced v... \n",
|
||
"17879 Who are we?Vend is an award winning web based ... \n",
|
||
"\n",
|
||
" requirements \\\n",
|
||
"0 Experience with content management systems a m... \n",
|
||
"1 What we expect from you:Your key responsibilit... \n",
|
||
"2 Implement pre-commissioning and commissioning ... \n",
|
||
"3 EDUCATION: Bachelor’s or Master’s in GIS, busi... \n",
|
||
"4 QUALIFICATIONS:RN license in the State of Texa... \n",
|
||
"... ... \n",
|
||
"17875 To ace this role you:Will eat comprehensive St... \n",
|
||
"17876 - B.A. or B.S. in Accounting- Desire to have f... \n",
|
||
"17877 At least 12 years professional experience.Abil... \n",
|
||
"17878 1. Must be fluent in the latest versions of Co... \n",
|
||
"17879 We want to hear from you if:You have an in-dep... \n",
|
||
"\n",
|
||
" benefits telecommuting \\\n",
|
||
"0 NaN 0 \n",
|
||
"1 What you will get from usThrough being part of... 0 \n",
|
||
"2 NaN 0 \n",
|
||
"3 Our culture is anything but corporate—we have ... 0 \n",
|
||
"4 Full Benefits Offered 0 \n",
|
||
"... ... ... \n",
|
||
"17875 What can you expect from us?We have an open cu... 0 \n",
|
||
"17876 Health & WellnessMedical planPrescription ... 0 \n",
|
||
"17877 NaN 0 \n",
|
||
"17878 Competitive salary (compensation will be based... 0 \n",
|
||
"17879 NaN 0 \n",
|
||
"\n",
|
||
" has_company_logo has_questions employment_type required_experience \\\n",
|
||
"0 1 0 Other Internship \n",
|
||
"1 1 0 Full-time Not Applicable \n",
|
||
"2 1 0 NaN NaN \n",
|
||
"3 1 0 Full-time Mid-Senior level \n",
|
||
"4 1 1 Full-time Mid-Senior level \n",
|
||
"... ... ... ... ... \n",
|
||
"17875 1 1 Full-time Mid-Senior level \n",
|
||
"17876 1 1 Full-time Mid-Senior level \n",
|
||
"17877 0 0 Full-time NaN \n",
|
||
"17878 0 1 Contract Not Applicable \n",
|
||
"17879 1 1 Full-time Mid-Senior level \n",
|
||
"\n",
|
||
" required_education industry function \\\n",
|
||
"0 NaN NaN Marketing \n",
|
||
"1 NaN Marketing and Advertising Customer Service \n",
|
||
"2 NaN NaN NaN \n",
|
||
"3 Bachelor's Degree Computer Software Sales \n",
|
||
"4 Bachelor's Degree Hospital & Health Care Health Care Provider \n",
|
||
"... ... ... ... \n",
|
||
"17875 NaN Computer Software Sales \n",
|
||
"17876 Bachelor's Degree Internet Accounting/Auditing \n",
|
||
"17877 NaN NaN NaN \n",
|
||
"17878 Professional Graphic Design Design \n",
|
||
"17879 NaN Computer Software Engineering \n",
|
||
"\n",
|
||
" fraudulent \n",
|
||
"0 0 \n",
|
||
"1 0 \n",
|
||
"2 0 \n",
|
||
"3 0 \n",
|
||
"4 0 \n",
|
||
"... ... \n",
|
||
"17875 0 \n",
|
||
"17876 0 \n",
|
||
"17877 0 \n",
|
||
"17878 0 \n",
|
||
"17879 0 \n",
|
||
"\n",
|
||
"[17880 rows x 18 columns]"
|
||
]
|
||
},
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"data=pd.read_csv('fake_job_postings.csv')\n",
|
||
"data\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "edbf49da",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"17880 fake_job_postings.csv\r\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"#Wielkość zbioru (liczba linii w pliku)\n",
|
||
"!wc -l fake_job_postings.csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "bc594582",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>job_id</th>\n",
|
||
" <th>title</th>\n",
|
||
" <th>location</th>\n",
|
||
" <th>department</th>\n",
|
||
" <th>salary_range</th>\n",
|
||
" <th>company_profile</th>\n",
|
||
" <th>description</th>\n",
|
||
" <th>requirements</th>\n",
|
||
" <th>benefits</th>\n",
|
||
" <th>telecommuting</th>\n",
|
||
" <th>has_company_logo</th>\n",
|
||
" <th>has_questions</th>\n",
|
||
" <th>employment_type</th>\n",
|
||
" <th>required_experience</th>\n",
|
||
" <th>required_education</th>\n",
|
||
" <th>industry</th>\n",
|
||
" <th>function</th>\n",
|
||
" <th>fraudulent</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Marketing Intern</td>\n",
|
||
" <td>US, NY, New York</td>\n",
|
||
" <td>Marketing</td>\n",
|
||
" <td></td>\n",
|
||
" <td>We're Food52, and we've created a groundbreaki...</td>\n",
|
||
" <td>Food52, a fast-growing, James Beard Award-winn...</td>\n",
|
||
" <td>Experience with content management systems a m...</td>\n",
|
||
" <td></td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>Other</td>\n",
|
||
" <td>Internship</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>Marketing</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>Customer Service - Cloud Video Production</td>\n",
|
||
" <td>NZ, , Auckland</td>\n",
|
||
" <td>Success</td>\n",
|
||
" <td></td>\n",
|
||
" <td>90 Seconds, the worlds Cloud Video Production ...</td>\n",
|
||
" <td>Organised - Focused - Vibrant - Awesome!Do you...</td>\n",
|
||
" <td>What we expect from you:Your key responsibilit...</td>\n",
|
||
" <td>What you will get from usThrough being part of...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td>Not Applicable</td>\n",
|
||
" <td></td>\n",
|
||
" <td>Marketing and Advertising</td>\n",
|
||
" <td>Customer Service</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Commissioning Machinery Assistant (CMA)</td>\n",
|
||
" <td>US, IA, Wever</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>Valor Services provides Workforce Solutions th...</td>\n",
|
||
" <td>Our client, located in Houston, is actively se...</td>\n",
|
||
" <td>Implement pre-commissioning and commissioning ...</td>\n",
|
||
" <td></td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>Account Executive - Washington DC</td>\n",
|
||
" <td>US, DC, Washington</td>\n",
|
||
" <td>Sales</td>\n",
|
||
" <td></td>\n",
|
||
" <td>Our passion for improving quality of life thro...</td>\n",
|
||
" <td>THE COMPANY: ESRI – Environmental Systems Rese...</td>\n",
|
||
" <td>EDUCATION: Bachelor’s or Master’s in GIS, busi...</td>\n",
|
||
" <td>Our culture is anything but corporate—we have ...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td>Mid-Senior level</td>\n",
|
||
" <td>Bachelor's Degree</td>\n",
|
||
" <td>Computer Software</td>\n",
|
||
" <td>Sales</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>Bill Review Manager</td>\n",
|
||
" <td>US, FL, Fort Worth</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>SpotSource Solutions LLC is a Global Human Cap...</td>\n",
|
||
" <td>JOB TITLE: Itemization Review ManagerLOCATION:...</td>\n",
|
||
" <td>QUALIFICATIONS:RN license in the State of Texa...</td>\n",
|
||
" <td>Full Benefits Offered</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td>Mid-Senior level</td>\n",
|
||
" <td>Bachelor's Degree</td>\n",
|
||
" <td>Hospital & Health Care</td>\n",
|
||
" <td>Health Care Provider</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17875</th>\n",
|
||
" <td>17876</td>\n",
|
||
" <td>Account Director - Distribution</td>\n",
|
||
" <td>CA, ON, Toronto</td>\n",
|
||
" <td>Sales</td>\n",
|
||
" <td></td>\n",
|
||
" <td>Vend is looking for some awesome new talent to...</td>\n",
|
||
" <td>Just in case this is the first time you’ve vis...</td>\n",
|
||
" <td>To ace this role you:Will eat comprehensive St...</td>\n",
|
||
" <td>What can you expect from us?We have an open cu...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td>Mid-Senior level</td>\n",
|
||
" <td></td>\n",
|
||
" <td>Computer Software</td>\n",
|
||
" <td>Sales</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17876</th>\n",
|
||
" <td>17877</td>\n",
|
||
" <td>Payroll Accountant</td>\n",
|
||
" <td>US, PA, Philadelphia</td>\n",
|
||
" <td>Accounting</td>\n",
|
||
" <td></td>\n",
|
||
" <td>WebLinc is the e-commerce platform and service...</td>\n",
|
||
" <td>The Payroll Accountant will focus primarily on...</td>\n",
|
||
" <td>- B.A. or B.S. in Accounting- Desire to have f...</td>\n",
|
||
" <td>Health &amp; WellnessMedical planPrescription ...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td>Mid-Senior level</td>\n",
|
||
" <td>Bachelor's Degree</td>\n",
|
||
" <td>Internet</td>\n",
|
||
" <td>Accounting/Auditing</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17877</th>\n",
|
||
" <td>17878</td>\n",
|
||
" <td>Project Cost Control Staff Engineer - Cost Con...</td>\n",
|
||
" <td>US, TX, Houston</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>We Provide Full Time Permanent Positions for m...</td>\n",
|
||
" <td>Experienced Project Cost Control Staff Enginee...</td>\n",
|
||
" <td>At least 12 years professional experience.Abil...</td>\n",
|
||
" <td></td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17878</th>\n",
|
||
" <td>17879</td>\n",
|
||
" <td>Graphic Designer</td>\n",
|
||
" <td>NG, LA, Lagos</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>Nemsia Studios is looking for an experienced v...</td>\n",
|
||
" <td>1. Must be fluent in the latest versions of Co...</td>\n",
|
||
" <td>Competitive salary (compensation will be based...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Contract</td>\n",
|
||
" <td>Not Applicable</td>\n",
|
||
" <td>Professional</td>\n",
|
||
" <td>Graphic Design</td>\n",
|
||
" <td>Design</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17879</th>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>Web Application Developers</td>\n",
|
||
" <td>NZ, N, Wellington</td>\n",
|
||
" <td>Engineering</td>\n",
|
||
" <td></td>\n",
|
||
" <td>Vend is looking for some awesome new talent to...</td>\n",
|
||
" <td>Who are we?Vend is an award winning web based ...</td>\n",
|
||
" <td>We want to hear from you if:You have an in-dep...</td>\n",
|
||
" <td></td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td>Mid-Senior level</td>\n",
|
||
" <td></td>\n",
|
||
" <td>Computer Software</td>\n",
|
||
" <td>Engineering</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>17880 rows × 18 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" job_id title \\\n",
|
||
"0 1 Marketing Intern \n",
|
||
"1 2 Customer Service - Cloud Video Production \n",
|
||
"2 3 Commissioning Machinery Assistant (CMA) \n",
|
||
"3 4 Account Executive - Washington DC \n",
|
||
"4 5 Bill Review Manager \n",
|
||
"... ... ... \n",
|
||
"17875 17876 Account Director - Distribution \n",
|
||
"17876 17877 Payroll Accountant \n",
|
||
"17877 17878 Project Cost Control Staff Engineer - Cost Con... \n",
|
||
"17878 17879 Graphic Designer \n",
|
||
"17879 17880 Web Application Developers \n",
|
||
"\n",
|
||
" location department salary_range \\\n",
|
||
"0 US, NY, New York Marketing \n",
|
||
"1 NZ, , Auckland Success \n",
|
||
"2 US, IA, Wever \n",
|
||
"3 US, DC, Washington Sales \n",
|
||
"4 US, FL, Fort Worth \n",
|
||
"... ... ... ... \n",
|
||
"17875 CA, ON, Toronto Sales \n",
|
||
"17876 US, PA, Philadelphia Accounting \n",
|
||
"17877 US, TX, Houston \n",
|
||
"17878 NG, LA, Lagos \n",
|
||
"17879 NZ, N, Wellington Engineering \n",
|
||
"\n",
|
||
" company_profile \\\n",
|
||
"0 We're Food52, and we've created a groundbreaki... \n",
|
||
"1 90 Seconds, the worlds Cloud Video Production ... \n",
|
||
"2 Valor Services provides Workforce Solutions th... \n",
|
||
"3 Our passion for improving quality of life thro... \n",
|
||
"4 SpotSource Solutions LLC is a Global Human Cap... \n",
|
||
"... ... \n",
|
||
"17875 Vend is looking for some awesome new talent to... \n",
|
||
"17876 WebLinc is the e-commerce platform and service... \n",
|
||
"17877 We Provide Full Time Permanent Positions for m... \n",
|
||
"17878 \n",
|
||
"17879 Vend is looking for some awesome new talent to... \n",
|
||
"\n",
|
||
" description \\\n",
|
||
"0 Food52, a fast-growing, James Beard Award-winn... \n",
|
||
"1 Organised - Focused - Vibrant - Awesome!Do you... \n",
|
||
"2 Our client, located in Houston, is actively se... \n",
|
||
"3 THE COMPANY: ESRI – Environmental Systems Rese... \n",
|
||
"4 JOB TITLE: Itemization Review ManagerLOCATION:... \n",
|
||
"... ... \n",
|
||
"17875 Just in case this is the first time you’ve vis... \n",
|
||
"17876 The Payroll Accountant will focus primarily on... \n",
|
||
"17877 Experienced Project Cost Control Staff Enginee... \n",
|
||
"17878 Nemsia Studios is looking for an experienced v... \n",
|
||
"17879 Who are we?Vend is an award winning web based ... \n",
|
||
"\n",
|
||
" requirements \\\n",
|
||
"0 Experience with content management systems a m... \n",
|
||
"1 What we expect from you:Your key responsibilit... \n",
|
||
"2 Implement pre-commissioning and commissioning ... \n",
|
||
"3 EDUCATION: Bachelor’s or Master’s in GIS, busi... \n",
|
||
"4 QUALIFICATIONS:RN license in the State of Texa... \n",
|
||
"... ... \n",
|
||
"17875 To ace this role you:Will eat comprehensive St... \n",
|
||
"17876 - B.A. or B.S. in Accounting- Desire to have f... \n",
|
||
"17877 At least 12 years professional experience.Abil... \n",
|
||
"17878 1. Must be fluent in the latest versions of Co... \n",
|
||
"17879 We want to hear from you if:You have an in-dep... \n",
|
||
"\n",
|
||
" benefits telecommuting \\\n",
|
||
"0 0 \n",
|
||
"1 What you will get from usThrough being part of... 0 \n",
|
||
"2 0 \n",
|
||
"3 Our culture is anything but corporate—we have ... 0 \n",
|
||
"4 Full Benefits Offered 0 \n",
|
||
"... ... ... \n",
|
||
"17875 What can you expect from us?We have an open cu... 0 \n",
|
||
"17876 Health & WellnessMedical planPrescription ... 0 \n",
|
||
"17877 0 \n",
|
||
"17878 Competitive salary (compensation will be based... 0 \n",
|
||
"17879 0 \n",
|
||
"\n",
|
||
" has_company_logo has_questions employment_type required_experience \\\n",
|
||
"0 1 0 Other Internship \n",
|
||
"1 1 0 Full-time Not Applicable \n",
|
||
"2 1 0 \n",
|
||
"3 1 0 Full-time Mid-Senior level \n",
|
||
"4 1 1 Full-time Mid-Senior level \n",
|
||
"... ... ... ... ... \n",
|
||
"17875 1 1 Full-time Mid-Senior level \n",
|
||
"17876 1 1 Full-time Mid-Senior level \n",
|
||
"17877 0 0 Full-time \n",
|
||
"17878 0 1 Contract Not Applicable \n",
|
||
"17879 1 1 Full-time Mid-Senior level \n",
|
||
"\n",
|
||
" required_education industry function \\\n",
|
||
"0 Marketing \n",
|
||
"1 Marketing and Advertising Customer Service \n",
|
||
"2 \n",
|
||
"3 Bachelor's Degree Computer Software Sales \n",
|
||
"4 Bachelor's Degree Hospital & Health Care Health Care Provider \n",
|
||
"... ... ... ... \n",
|
||
"17875 Computer Software Sales \n",
|
||
"17876 Bachelor's Degree Internet Accounting/Auditing \n",
|
||
"17877 \n",
|
||
"17878 Professional Graphic Design Design \n",
|
||
"17879 Computer Software Engineering \n",
|
||
"\n",
|
||
" fraudulent \n",
|
||
"0 0 \n",
|
||
"1 0 \n",
|
||
"2 0 \n",
|
||
"3 0 \n",
|
||
"4 0 \n",
|
||
"... ... \n",
|
||
"17875 0 \n",
|
||
"17876 0 \n",
|
||
"17877 0 \n",
|
||
"17878 0 \n",
|
||
"17879 0 \n",
|
||
"\n",
|
||
"[17880 rows x 18 columns]"
|
||
]
|
||
},
|
||
"execution_count": 29,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Replace missing values with empty strings; fillna is the idiomatic call (no regex involved).\n",
|
||
"data = data.fillna('')\n",
|
||
"data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"id": "e60b3f32",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
" 11547\n",
|
||
"Sales 551\n",
|
||
"Engineering 487\n",
|
||
"Marketing 401\n",
|
||
"Operations 270\n",
|
||
" ... \n",
|
||
"Pricing 1\n",
|
||
"Mobility 1\n",
|
||
"Housekeeping 1\n",
|
||
"An Impact Engine Company 1\n",
|
||
"Trainee 1\n",
|
||
"Name: department, Length: 1338, dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 30,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"data.loc[:, \"department\"].value_counts()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "c5ac75f5",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>job_id</th>\n",
|
||
" <th>title</th>\n",
|
||
" <th>location</th>\n",
|
||
" <th>department</th>\n",
|
||
" <th>salary_range</th>\n",
|
||
" <th>company_profile</th>\n",
|
||
" <th>description</th>\n",
|
||
" <th>requirements</th>\n",
|
||
" <th>benefits</th>\n",
|
||
" <th>telecommuting</th>\n",
|
||
" <th>has_company_logo</th>\n",
|
||
" <th>has_questions</th>\n",
|
||
" <th>employment_type</th>\n",
|
||
" <th>required_experience</th>\n",
|
||
" <th>required_education</th>\n",
|
||
" <th>industry</th>\n",
|
||
" <th>function</th>\n",
|
||
" <th>fraudulent</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>count</th>\n",
|
||
" <td>17880.000000</td>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>17880.000000</td>\n",
|
||
" <td>17880.000000</td>\n",
|
||
" <td>17880.000000</td>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>17880</td>\n",
|
||
" <td>17880.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>unique</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>11231</td>\n",
|
||
" <td>3106</td>\n",
|
||
" <td>1338</td>\n",
|
||
" <td>875</td>\n",
|
||
" <td>1710</td>\n",
|
||
" <td>14802</td>\n",
|
||
" <td>11969</td>\n",
|
||
" <td>6206</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>132</td>\n",
|
||
" <td>38</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>top</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>English Teacher Abroad</td>\n",
|
||
" <td>GB, LND, London</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>Play with kids, get paid for it Love travel? J...</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Full-time</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>freq</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>311</td>\n",
|
||
" <td>718</td>\n",
|
||
" <td>11547</td>\n",
|
||
" <td>15012</td>\n",
|
||
" <td>3308</td>\n",
|
||
" <td>379</td>\n",
|
||
" <td>2695</td>\n",
|
||
" <td>7210</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>11620</td>\n",
|
||
" <td>7050</td>\n",
|
||
" <td>8105</td>\n",
|
||
" <td>4903</td>\n",
|
||
" <td>6455</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>mean</th>\n",
|
||
" <td>8940.500000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.042897</td>\n",
|
||
" <td>0.795302</td>\n",
|
||
" <td>0.491723</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.048434</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>5161.655742</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.202631</td>\n",
|
||
" <td>0.403492</td>\n",
|
||
" <td>0.499945</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.214688</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25%</th>\n",
|
||
" <td>4470.750000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>50%</th>\n",
|
||
" <td>8940.500000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>75%</th>\n",
|
||
" <td>13410.250000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>max</th>\n",
|
||
" <td>17880.000000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" job_id title location department \\\n",
|
||
"count 17880.000000 17880 17880 17880 \n",
|
||
"unique NaN 11231 3106 1338 \n",
|
||
"top NaN English Teacher Abroad GB, LND, London \n",
|
||
"freq NaN 311 718 11547 \n",
|
||
"mean 8940.500000 NaN NaN NaN \n",
|
||
"std 5161.655742 NaN NaN NaN \n",
|
||
"min 1.000000 NaN NaN NaN \n",
|
||
"25% 4470.750000 NaN NaN NaN \n",
|
||
"50% 8940.500000 NaN NaN NaN \n",
|
||
"75% 13410.250000 NaN NaN NaN \n",
|
||
"max 17880.000000 NaN NaN NaN \n",
|
||
"\n",
|
||
" salary_range company_profile \\\n",
|
||
"count 17880 17880 \n",
|
||
"unique 875 1710 \n",
|
||
"top \n",
|
||
"freq 15012 3308 \n",
|
||
"mean NaN NaN \n",
|
||
"std NaN NaN \n",
|
||
"min NaN NaN \n",
|
||
"25% NaN NaN \n",
|
||
"50% NaN NaN \n",
|
||
"75% NaN NaN \n",
|
||
"max NaN NaN \n",
|
||
"\n",
|
||
" description requirements \\\n",
|
||
"count 17880 17880 \n",
|
||
"unique 14802 11969 \n",
|
||
"top Play with kids, get paid for it Love travel? J... \n",
|
||
"freq 379 2695 \n",
|
||
"mean NaN NaN \n",
|
||
"std NaN NaN \n",
|
||
"min NaN NaN \n",
|
||
"25% NaN NaN \n",
|
||
"50% NaN NaN \n",
|
||
"75% NaN NaN \n",
|
||
"max NaN NaN \n",
|
||
"\n",
|
||
" benefits telecommuting has_company_logo has_questions \\\n",
|
||
"count 17880 17880.000000 17880.000000 17880.000000 \n",
|
||
"unique 6206 NaN NaN NaN \n",
|
||
"top NaN NaN NaN \n",
|
||
"freq 7210 NaN NaN NaN \n",
|
||
"mean NaN 0.042897 0.795302 0.491723 \n",
|
||
"std NaN 0.202631 0.403492 0.499945 \n",
|
||
"min NaN 0.000000 0.000000 0.000000 \n",
|
||
"25% NaN 0.000000 1.000000 0.000000 \n",
|
||
"50% NaN 0.000000 1.000000 0.000000 \n",
|
||
"75% NaN 0.000000 1.000000 1.000000 \n",
|
||
"max NaN 1.000000 1.000000 1.000000 \n",
|
||
"\n",
|
||
" employment_type required_experience required_education industry \\\n",
|
||
"count 17880 17880 17880 17880 \n",
|
||
"unique 6 8 14 132 \n",
|
||
"top Full-time \n",
|
||
"freq 11620 7050 8105 4903 \n",
|
||
"mean NaN NaN NaN NaN \n",
|
||
"std NaN NaN NaN NaN \n",
|
||
"min NaN NaN NaN NaN \n",
|
||
"25% NaN NaN NaN NaN \n",
|
||
"50% NaN NaN NaN NaN \n",
|
||
"75% NaN NaN NaN NaN \n",
|
||
"max NaN NaN NaN NaN \n",
|
||
"\n",
|
||
" function fraudulent \n",
|
||
"count 17880 17880.000000 \n",
|
||
"unique 38 NaN \n",
|
||
"top NaN \n",
|
||
"freq 6455 NaN \n",
|
||
"mean NaN 0.048434 \n",
|
||
"std NaN 0.214688 \n",
|
||
"min NaN 0.000000 \n",
|
||
"25% NaN 0.000000 \n",
|
||
"50% NaN 0.000000 \n",
|
||
"75% NaN 0.000000 \n",
|
||
"max NaN 1.000000 "
|
||
]
|
||
},
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Missing values were already replaced with '' in the preceding cell, so the frame is summarised as-is.\n",
|
||
"data.describe(include='all')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 39,
|
||
"id": "4b0e77a4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"job_id 8940.5\n",
|
||
"telecommuting 0.0\n",
|
||
"has_company_logo 1.0\n",
|
||
"has_questions 0.0\n",
|
||
"fraudulent 0.0\n",
|
||
"dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 39,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Restrict to numeric columns explicitly: the text columns cannot be averaged and pandas >= 2.0 raises on them.\n",
|
||
"data.median(numeric_only=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "5a1d8ec7",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Defaulting to user installation because normal site-packages is not writeable\n",
|
||
"Collecting scikit-learn\n",
|
||
" Downloading scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)\n",
|
||
" |████████████████████████████████| 26.7 MB 8.8 MB/s \n",
|
||
"\u001b[?25hRequirement already satisfied: numpy>=1.14.6 in /home/mikolaj/.local/lib/python3.8/site-packages (from scikit-learn) (1.19.5)\n",
|
||
"Requirement already satisfied: scipy>=1.1.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from scikit-learn) (1.7.2)\n",
|
||
"Collecting threadpoolctl>=2.0.0\n",
|
||
" Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)\n",
|
||
"Requirement already satisfied: joblib>=0.11 in /home/mikolaj/.local/lib/python3.8/site-packages (from scikit-learn) (1.1.0)\n",
|
||
"Installing collected packages: threadpoolctl, scikit-learn\n",
|
||
"Successfully installed scikit-learn-1.0.2 threadpoolctl-3.1.0\n",
|
||
"\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
|
||
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
|
||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"%pip install -U scikit-learn"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"id": "50813795",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"English Teacher Abroad 235\n",
|
||
"Customer Service Associate 110\n",
|
||
"Graduates: English Teacher Abroad (Conversational) 104\n",
|
||
"English Teacher Abroad 72\n",
|
||
"Software Engineer 68\n",
|
||
" ... \n",
|
||
"Manager-Plastics Mfg Engineering - Full Time Permanent Job 1\n",
|
||
"Ruby on Rails Developer/Programmer 1\n",
|
||
"Appliance Technician 1\n",
|
||
"Need Oracle Fusion HCM Resource 1\n",
|
||
"Recruitment specialist 1\n",
|
||
"Name: title, Length: 8761, dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"\n",
|
||
"# 75/25 train/test split (scikit-learn's default ratio, stated explicitly), seeded for reproducibility.\n",
|
||
"data_train, data_test = train_test_split(data, test_size=0.25, random_state=1)\n",
|
||
"data_train[\"title\"].value_counts()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"id": "ea3c9f2e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"0.3333333333333333"
|
||
]
|
||
},
|
||
"execution_count": 31,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(data_test) / len(data_train)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "b20cc27a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"English Teacher Abroad 235\n",
|
||
"Customer Service Associate 110\n",
|
||
"Graduates: English Teacher Abroad (Conversational) 104\n",
|
||
"English Teacher Abroad 72\n",
|
||
"Software Engineer 68\n",
|
||
" ... \n",
|
||
"Manager-Plastics Mfg Engineering - Full Time Permanent Job 1\n",
|
||
"Ruby on Rails Developer/Programmer 1\n",
|
||
"Appliance Technician 1\n",
|
||
"Need Oracle Fusion HCM Resource 1\n",
|
||
"Recruitment specialist 1\n",
|
||
"Name: title, Length: 8761, dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"data_train.loc[:, \"title\"].value_counts()"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.8.10"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|