{ "cells": [ { "cell_type": "markdown", "id": "5e2107a5", "metadata": {}, "source": [ "# Dataset download script\n", "\n", "Installs the required packages and downloads the Kaggle dataset `shivamb/real-or-fake-fake-jobposting-prediction`." ] },
{ "cell_type": "code", "execution_count": null, "id": "bcc889e5", "metadata": {}, "outputs": [], "source": [ "# %pip installs into the environment of the running kernel; with !pip the target is\n", "# whichever pip comes first on the shell PATH.\n", "# kaggle is the Kaggle API client used below to download the dataset.\n", "%pip install --user kaggle pandas numpy" ] },
{ "cell_type": "code", "execution_count": null, "id": "02a4034f", "metadata": {}, "outputs": [], "source": [ "# Download and unpack the dataset with the Kaggle Python API.\n", "# The shell call `!kaggle datasets download` failed here with \"kaggle: command not found\":\n", "# pip reports the package as installed, but its console script is not on the shell PATH.\n", "# Requires an API token in ~/.kaggle/kaggle.json, see https://www.kaggle.com/docs/api\n", "from pathlib import Path\n", "\n", "import kaggle  # authenticates with the token file at import time\n", "\n", "DATASET = \"shivamb/real-or-fake-fake-jobposting-prediction\"\n", "CSV_PATH = Path(\"fake_job_postings.csv\")\n", "\n", "if not CSV_PATH.exists():  # skip the download when the file is already present\n", "    # unzip=True extracts the archive, so no separate unzip step is needed\n", "    kaggle.api.dataset_download_files(DATASET, path=\".\", unzip=True)\n", "assert CSV_PATH.exists(), f\"{CSV_PATH} is missing after the download\"" ] },
{ "cell_type": "code", "execution_count": null, "id": "14344d2f", "metadata": {}, "outputs": [], "source": [ "%pip install --user seaborn" ] },
{ "cell_type": "code", "execution_count": 33, "id": "0f5ebfab", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
job_idtitlelocationdepartmentsalary_rangecompany_profiledescriptionrequirementsbenefitstelecommutinghas_company_logohas_questionsemployment_typerequired_experiencerequired_educationindustryfunctionfraudulent
01Marketing InternUS, NY, New YorkMarketingNaNWe're Food52, and we've created a groundbreaki...Food52, a fast-growing, James Beard Award-winn...Experience with content management systems a m...NaN010OtherInternshipNaNNaNMarketing0
12Customer Service - Cloud Video ProductionNZ, , AucklandSuccessNaN90 Seconds, the worlds Cloud Video Production ...Organised - Focused - Vibrant - Awesome!Do you...What we expect from you:Your key responsibilit...What you will get from usThrough being part of...010Full-timeNot ApplicableNaNMarketing and AdvertisingCustomer Service0
23Commissioning Machinery Assistant (CMA)US, IA, WeverNaNNaNValor Services provides Workforce Solutions th...Our client, located in Houston, is actively se...Implement pre-commissioning and commissioning ...NaN010NaNNaNNaNNaNNaN0
34Account Executive - Washington DCUS, DC, WashingtonSalesNaNOur passion for improving quality of life thro...THE COMPANY: ESRI – Environmental Systems Rese...EDUCATION: Bachelor’s or Master’s in GIS, busi...Our culture is anything but corporate—we have ...010Full-timeMid-Senior levelBachelor's DegreeComputer SoftwareSales0
45Bill Review ManagerUS, FL, Fort WorthNaNNaNSpotSource Solutions LLC is a Global Human Cap...JOB TITLE: Itemization Review ManagerLOCATION:...QUALIFICATIONS:RN license in the State of Texa...Full Benefits Offered011Full-timeMid-Senior levelBachelor's DegreeHospital & Health CareHealth Care Provider0
.........................................................
1787517876Account Director - DistributionCA, ON, TorontoSalesNaNVend is looking for some awesome new talent to...Just in case this is the first time you’ve vis...To ace this role you:Will eat comprehensive St...What can you expect from us?We have an open cu...011Full-timeMid-Senior levelNaNComputer SoftwareSales0
1787617877Payroll AccountantUS, PA, PhiladelphiaAccountingNaNWebLinc is the e-commerce platform and service...The Payroll Accountant will focus primarily on...- B.A. or B.S. in Accounting- Desire to have f...Health & WellnessMedical planPrescription ...011Full-timeMid-Senior levelBachelor's DegreeInternetAccounting/Auditing0
1787717878Project Cost Control Staff Engineer - Cost Con...US, TX, HoustonNaNNaNWe Provide Full Time Permanent Positions for m...Experienced Project Cost Control Staff Enginee...At least 12 years professional experience.Abil...NaN000Full-timeNaNNaNNaNNaN0
1787817879Graphic DesignerNG, LA, LagosNaNNaNNaNNemsia Studios is looking for an experienced v...1. Must be fluent in the latest versions of Co...Competitive salary (compensation will be based...001ContractNot ApplicableProfessionalGraphic DesignDesign0
1787917880Web Application DevelopersNZ, N, WellingtonEngineeringNaNVend is looking for some awesome new talent to...Who are we?Vend is an award winning web based ...We want to hear from you if:You have an in-dep...NaN011Full-timeMid-Senior levelNaNComputer SoftwareEngineering0
\n", "

17880 rows × 18 columns

\n", "
" ], "text/plain": [ " job_id title \\\n", "0 1 Marketing Intern \n", "1 2 Customer Service - Cloud Video Production \n", "2 3 Commissioning Machinery Assistant (CMA) \n", "3 4 Account Executive - Washington DC \n", "4 5 Bill Review Manager \n", "... ... ... \n", "17875 17876 Account Director - Distribution \n", "17876 17877 Payroll Accountant \n", "17877 17878 Project Cost Control Staff Engineer - Cost Con... \n", "17878 17879 Graphic Designer \n", "17879 17880 Web Application Developers \n", "\n", " location department salary_range \\\n", "0 US, NY, New York Marketing NaN \n", "1 NZ, , Auckland Success NaN \n", "2 US, IA, Wever NaN NaN \n", "3 US, DC, Washington Sales NaN \n", "4 US, FL, Fort Worth NaN NaN \n", "... ... ... ... \n", "17875 CA, ON, Toronto Sales NaN \n", "17876 US, PA, Philadelphia Accounting NaN \n", "17877 US, TX, Houston NaN NaN \n", "17878 NG, LA, Lagos NaN NaN \n", "17879 NZ, N, Wellington Engineering NaN \n", "\n", " company_profile \\\n", "0 We're Food52, and we've created a groundbreaki... \n", "1 90 Seconds, the worlds Cloud Video Production ... \n", "2 Valor Services provides Workforce Solutions th... \n", "3 Our passion for improving quality of life thro... \n", "4 SpotSource Solutions LLC is a Global Human Cap... \n", "... ... \n", "17875 Vend is looking for some awesome new talent to... \n", "17876 WebLinc is the e-commerce platform and service... \n", "17877 We Provide Full Time Permanent Positions for m... \n", "17878 NaN \n", "17879 Vend is looking for some awesome new talent to... \n", "\n", " description \\\n", "0 Food52, a fast-growing, James Beard Award-winn... \n", "1 Organised - Focused - Vibrant - Awesome!Do you... \n", "2 Our client, located in Houston, is actively se... \n", "3 THE COMPANY: ESRI – Environmental Systems Rese... \n", "4 JOB TITLE: Itemization Review ManagerLOCATION:... \n", "... ... \n", "17875 Just in case this is the first time you’ve vis... \n", "17876 The Payroll Accountant will focus primarily on... 
\n", "17877 Experienced Project Cost Control Staff Enginee... \n", "17878 Nemsia Studios is looking for an experienced v... \n", "17879 Who are we?Vend is an award winning web based ... \n", "\n", " requirements \\\n", "0 Experience with content management systems a m... \n", "1 What we expect from you:Your key responsibilit... \n", "2 Implement pre-commissioning and commissioning ... \n", "3 EDUCATION: Bachelor’s or Master’s in GIS, busi... \n", "4 QUALIFICATIONS:RN license in the State of Texa... \n", "... ... \n", "17875 To ace this role you:Will eat comprehensive St... \n", "17876 - B.A. or B.S. in Accounting- Desire to have f... \n", "17877 At least 12 years professional experience.Abil... \n", "17878 1. Must be fluent in the latest versions of Co... \n", "17879 We want to hear from you if:You have an in-dep... \n", "\n", " benefits telecommuting \\\n", "0 NaN 0 \n", "1 What you will get from usThrough being part of... 0 \n", "2 NaN 0 \n", "3 Our culture is anything but corporate—we have ... 0 \n", "4 Full Benefits Offered 0 \n", "... ... ... \n", "17875 What can you expect from us?We have an open cu... 0 \n", "17876 Health & WellnessMedical planPrescription ... 0 \n", "17877 NaN 0 \n", "17878 Competitive salary (compensation will be based... 0 \n", "17879 NaN 0 \n", "\n", " has_company_logo has_questions employment_type required_experience \\\n", "0 1 0 Other Internship \n", "1 1 0 Full-time Not Applicable \n", "2 1 0 NaN NaN \n", "3 1 0 Full-time Mid-Senior level \n", "4 1 1 Full-time Mid-Senior level \n", "... ... ... ... ... 
\n", "17875 1 1 Full-time Mid-Senior level \n", "17876 1 1 Full-time Mid-Senior level \n", "17877 0 0 Full-time NaN \n", "17878 0 1 Contract Not Applicable \n", "17879 1 1 Full-time Mid-Senior level \n", "\n", " required_education industry function \\\n", "0 NaN NaN Marketing \n", "1 NaN Marketing and Advertising Customer Service \n", "2 NaN NaN NaN \n", "3 Bachelor's Degree Computer Software Sales \n", "4 Bachelor's Degree Hospital & Health Care Health Care Provider \n", "... ... ... ... \n", "17875 NaN Computer Software Sales \n", "17876 Bachelor's Degree Internet Accounting/Auditing \n", "17877 NaN NaN NaN \n", "17878 Professional Graphic Design Design \n", "17879 NaN Computer Software Engineering \n", "\n", " fraudulent \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "... ... \n", "17875 0 \n", "17876 0 \n", "17877 0 \n", "17878 0 \n", "17879 0 \n", "\n", "[17880 rows x 18 columns]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "# Load the unpacked Kaggle CSV; the bare name on the last line displays the frame.\n", "data = pd.read_csv('fake_job_postings.csv')\n", "data\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "edbf49da", "metadata": {}, "outputs": [], "source": [ "# Dataset size as (rows, columns).\n", "# `wc -l` counts physical lines of the file, which can differ from the record count\n", "# (header line, any newlines inside quoted text fields), so the parsed frame is measured.\n", "data.shape" ] },
{ "cell_type": "code", "execution_count": 35, "id": "e60b3f32", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sales 551\n", "Engineering 487\n", "Marketing 401\n", "Operations 270\n", "IT 225\n", " ... \n", "Capoo 1\n", "Engineering - Hardware 1\n", "Utilities 1\n", "i 1\n", "TECH 1\n", "Name: department, Length: 1337, dtype: int64" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[\"department\"].value_counts()" ] },
{ "cell_type": "code", "execution_count": 36, "id": "ddb2fc38", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
job_idtitlelocationdepartmentsalary_rangecompany_profiledescriptionrequirementsbenefitstelecommutinghas_company_logohas_questionsemployment_typerequired_experiencerequired_educationindustryfunctionfraudulent
01Marketing InternUS, NY, New YorkMarketingWe're Food52, and we've created a groundbreaki...Food52, a fast-growing, James Beard Award-winn...Experience with content management systems a m...010OtherInternshipMarketing0
12Customer Service - Cloud Video ProductionNZ, , AucklandSuccess90 Seconds, the worlds Cloud Video Production ...Organised - Focused - Vibrant - Awesome!Do you...What we expect from you:Your key responsibilit...What you will get from usThrough being part of...010Full-timeNot ApplicableMarketing and AdvertisingCustomer Service0
23Commissioning Machinery Assistant (CMA)US, IA, WeverValor Services provides Workforce Solutions th...Our client, located in Houston, is actively se...Implement pre-commissioning and commissioning ...0100
34Account Executive - Washington DCUS, DC, WashingtonSalesOur passion for improving quality of life thro...THE COMPANY: ESRI – Environmental Systems Rese...EDUCATION: Bachelor’s or Master’s in GIS, busi...Our culture is anything but corporate—we have ...010Full-timeMid-Senior levelBachelor's DegreeComputer SoftwareSales0
45Bill Review ManagerUS, FL, Fort WorthSpotSource Solutions LLC is a Global Human Cap...JOB TITLE: Itemization Review ManagerLOCATION:...QUALIFICATIONS:RN license in the State of Texa...Full Benefits Offered011Full-timeMid-Senior levelBachelor's DegreeHospital & Health CareHealth Care Provider0
.........................................................
1787517876Account Director - DistributionCA, ON, TorontoSalesVend is looking for some awesome new talent to...Just in case this is the first time you’ve vis...To ace this role you:Will eat comprehensive St...What can you expect from us?We have an open cu...011Full-timeMid-Senior levelComputer SoftwareSales0
1787617877Payroll AccountantUS, PA, PhiladelphiaAccountingWebLinc is the e-commerce platform and service...The Payroll Accountant will focus primarily on...- B.A. or B.S. in Accounting- Desire to have f...Health & WellnessMedical planPrescription ...011Full-timeMid-Senior levelBachelor's DegreeInternetAccounting/Auditing0
1787717878Project Cost Control Staff Engineer - Cost Con...US, TX, HoustonWe Provide Full Time Permanent Positions for m...Experienced Project Cost Control Staff Enginee...At least 12 years professional experience.Abil...000Full-time0
1787817879Graphic DesignerNG, LA, LagosNemsia Studios is looking for an experienced v...1. Must be fluent in the latest versions of Co...Competitive salary (compensation will be based...001ContractNot ApplicableProfessionalGraphic DesignDesign0
1787917880Web Application DevelopersNZ, N, WellingtonEngineeringVend is looking for some awesome new talent to...Who are we?Vend is an award winning web based ...We want to hear from you if:You have an in-dep...011Full-timeMid-Senior levelComputer SoftwareEngineering0
\n", "

17880 rows × 18 columns

\n", "
" ], "text/plain": [ " job_id title \\\n", "0 1 Marketing Intern \n", "1 2 Customer Service - Cloud Video Production \n", "2 3 Commissioning Machinery Assistant (CMA) \n", "3 4 Account Executive - Washington DC \n", "4 5 Bill Review Manager \n", "... ... ... \n", "17875 17876 Account Director - Distribution \n", "17876 17877 Payroll Accountant \n", "17877 17878 Project Cost Control Staff Engineer - Cost Con... \n", "17878 17879 Graphic Designer \n", "17879 17880 Web Application Developers \n", "\n", " location department salary_range \\\n", "0 US, NY, New York Marketing \n", "1 NZ, , Auckland Success \n", "2 US, IA, Wever \n", "3 US, DC, Washington Sales \n", "4 US, FL, Fort Worth \n", "... ... ... ... \n", "17875 CA, ON, Toronto Sales \n", "17876 US, PA, Philadelphia Accounting \n", "17877 US, TX, Houston \n", "17878 NG, LA, Lagos \n", "17879 NZ, N, Wellington Engineering \n", "\n", " company_profile \\\n", "0 We're Food52, and we've created a groundbreaki... \n", "1 90 Seconds, the worlds Cloud Video Production ... \n", "2 Valor Services provides Workforce Solutions th... \n", "3 Our passion for improving quality of life thro... \n", "4 SpotSource Solutions LLC is a Global Human Cap... \n", "... ... \n", "17875 Vend is looking for some awesome new talent to... \n", "17876 WebLinc is the e-commerce platform and service... \n", "17877 We Provide Full Time Permanent Positions for m... \n", "17878 \n", "17879 Vend is looking for some awesome new talent to... \n", "\n", " description \\\n", "0 Food52, a fast-growing, James Beard Award-winn... \n", "1 Organised - Focused - Vibrant - Awesome!Do you... \n", "2 Our client, located in Houston, is actively se... \n", "3 THE COMPANY: ESRI – Environmental Systems Rese... \n", "4 JOB TITLE: Itemization Review ManagerLOCATION:... \n", "... ... \n", "17875 Just in case this is the first time you’ve vis... \n", "17876 The Payroll Accountant will focus primarily on... \n", "17877 Experienced Project Cost Control Staff Enginee... 
\n", "17878 Nemsia Studios is looking for an experienced v... \n", "17879 Who are we?Vend is an award winning web based ... \n", "\n", " requirements \\\n", "0 Experience with content management systems a m... \n", "1 What we expect from you:Your key responsibilit... \n", "2 Implement pre-commissioning and commissioning ... \n", "3 EDUCATION: Bachelor’s or Master’s in GIS, busi... \n", "4 QUALIFICATIONS:RN license in the State of Texa... \n", "... ... \n", "17875 To ace this role you:Will eat comprehensive St... \n", "17876 - B.A. or B.S. in Accounting- Desire to have f... \n", "17877 At least 12 years professional experience.Abil... \n", "17878 1. Must be fluent in the latest versions of Co... \n", "17879 We want to hear from you if:You have an in-dep... \n", "\n", " benefits telecommuting \\\n", "0 0 \n", "1 What you will get from usThrough being part of... 0 \n", "2 0 \n", "3 Our culture is anything but corporate—we have ... 0 \n", "4 Full Benefits Offered 0 \n", "... ... ... \n", "17875 What can you expect from us?We have an open cu... 0 \n", "17876 Health & WellnessMedical planPrescription ... 0 \n", "17877 0 \n", "17878 Competitive salary (compensation will be based... 0 \n", "17879 0 \n", "\n", " has_company_logo has_questions employment_type required_experience \\\n", "0 1 0 Other Internship \n", "1 1 0 Full-time Not Applicable \n", "2 1 0 \n", "3 1 0 Full-time Mid-Senior level \n", "4 1 1 Full-time Mid-Senior level \n", "... ... ... ... ... \n", "17875 1 1 Full-time Mid-Senior level \n", "17876 1 1 Full-time Mid-Senior level \n", "17877 0 0 Full-time \n", "17878 0 1 Contract Not Applicable \n", "17879 1 1 Full-time Mid-Senior level \n", "\n", " required_education industry function \\\n", "0 Marketing \n", "1 Marketing and Advertising Customer Service \n", "2 \n", "3 Bachelor's Degree Computer Software Sales \n", "4 Bachelor's Degree Hospital & Health Care Health Care Provider \n", "... ... ... ... 
\n", "17875 Computer Software Sales \n", "17876 Bachelor's Degree Internet Accounting/Auditing \n", "17877 \n", "17878 Professional Graphic Design Design \n", "17879 Computer Software Engineering \n", "\n", " fraudulent \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "... ... \n", "17875 0 \n", "17876 0 \n", "17877 0 \n", "17878 0 \n", "17879 0 \n", "\n", "[17880 rows x 18 columns]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "# Replace every missing value with an empty string.\n", "# fillna is the direct call for this; the earlier replace(np.nan, '', regex=True)\n", "# did not need the regex flag, since the value to match is NaN rather than a pattern.\n", "data = data.fillna('')\n", "data" ] },
{ "cell_type": "code", "execution_count": 37, "id": "c5ac75f5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
job_idtitlelocationdepartmentsalary_rangecompany_profiledescriptionrequirementsbenefitstelecommutinghas_company_logohas_questionsemployment_typerequired_experiencerequired_educationindustryfunctionfraudulent
count17880.000000178801788017880178801788017880178801788017880.00000017880.00000017880.000000178801788017880178801788017880.000000
uniqueNaN1123131061338875171014802119696206NaNNaNNaN681413238NaN
topNaNEnglish Teacher AbroadGB, LND, LondonPlay with kids, get paid for it Love travel? J...NaNNaNNaNFull-timeNaN
freqNaN3117181154715012330837926957210NaNNaNNaN116207050810549036455NaN
mean8940.500000NaNNaNNaNNaNNaNNaNNaNNaN0.0428970.7953020.491723NaNNaNNaNNaNNaN0.048434
std5161.655742NaNNaNNaNNaNNaNNaNNaNNaN0.2026310.4034920.499945NaNNaNNaNNaNNaN0.214688
min1.000000NaNNaNNaNNaNNaNNaNNaNNaN0.0000000.0000000.000000NaNNaNNaNNaNNaN0.000000
25%4470.750000NaNNaNNaNNaNNaNNaNNaNNaN0.0000001.0000000.000000NaNNaNNaNNaNNaN0.000000
50%8940.500000NaNNaNNaNNaNNaNNaNNaNNaN0.0000001.0000000.000000NaNNaNNaNNaNNaN0.000000
75%13410.250000NaNNaNNaNNaNNaNNaNNaNNaN0.0000001.0000001.000000NaNNaNNaNNaNNaN0.000000
max17880.000000NaNNaNNaNNaNNaNNaNNaNNaN1.0000001.0000001.000000NaNNaNNaNNaNNaN1.000000
\n", "
" ], "text/plain": [ " job_id title location department \\\n", "count 17880.000000 17880 17880 17880 \n", "unique NaN 11231 3106 1338 \n", "top NaN English Teacher Abroad GB, LND, London \n", "freq NaN 311 718 11547 \n", "mean 8940.500000 NaN NaN NaN \n", "std 5161.655742 NaN NaN NaN \n", "min 1.000000 NaN NaN NaN \n", "25% 4470.750000 NaN NaN NaN \n", "50% 8940.500000 NaN NaN NaN \n", "75% 13410.250000 NaN NaN NaN \n", "max 17880.000000 NaN NaN NaN \n", "\n", " salary_range company_profile \\\n", "count 17880 17880 \n", "unique 875 1710 \n", "top \n", "freq 15012 3308 \n", "mean NaN NaN \n", "std NaN NaN \n", "min NaN NaN \n", "25% NaN NaN \n", "50% NaN NaN \n", "75% NaN NaN \n", "max NaN NaN \n", "\n", " description requirements \\\n", "count 17880 17880 \n", "unique 14802 11969 \n", "top Play with kids, get paid for it Love travel? J... \n", "freq 379 2695 \n", "mean NaN NaN \n", "std NaN NaN \n", "min NaN NaN \n", "25% NaN NaN \n", "50% NaN NaN \n", "75% NaN NaN \n", "max NaN NaN \n", "\n", " benefits telecommuting has_company_logo has_questions \\\n", "count 17880 17880.000000 17880.000000 17880.000000 \n", "unique 6206 NaN NaN NaN \n", "top NaN NaN NaN \n", "freq 7210 NaN NaN NaN \n", "mean NaN 0.042897 0.795302 0.491723 \n", "std NaN 0.202631 0.403492 0.499945 \n", "min NaN 0.000000 0.000000 0.000000 \n", "25% NaN 0.000000 1.000000 0.000000 \n", "50% NaN 0.000000 1.000000 0.000000 \n", "75% NaN 0.000000 1.000000 1.000000 \n", "max NaN 1.000000 1.000000 1.000000 \n", "\n", " employment_type required_experience required_education industry \\\n", "count 17880 17880 17880 17880 \n", "unique 6 8 14 132 \n", "top Full-time \n", "freq 11620 7050 8105 4903 \n", "mean NaN NaN NaN NaN \n", "std NaN NaN NaN NaN \n", "min NaN NaN NaN NaN \n", "25% NaN NaN NaN NaN \n", "50% NaN NaN NaN NaN \n", "75% NaN NaN NaN NaN \n", "max NaN NaN NaN NaN \n", "\n", " function fraudulent \n", "count 17880 17880.000000 \n", "unique 38 NaN \n", "top NaN \n", "freq 6455 NaN \n", "mean NaN 
0.048434 \n", "std NaN 0.214688 \n", "min NaN 0.000000 \n", "25% NaN 0.000000 \n", "50% NaN 0.000000 \n", "75% NaN 0.000000 \n", "max NaN 1.000000 " ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.describe(include='all')" ] },
{ "cell_type": "code", "execution_count": 38, "id": "4b0e77a4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "job_id 8940.5\n", "telecommuting 0.0\n", "has_company_logo 1.0\n", "has_questions 0.0\n", "fraudulent 0.0\n", "dtype: float64" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# numeric_only=True: the frame also holds text columns. Older pandas skipped them\n", "# silently; newer pandas versions raise a TypeError without the flag.\n", "data.median(numeric_only=True)" ] },
{ "cell_type": "code", "execution_count": null, "id": "5a1d8ec7", "metadata": {}, "outputs": [], "source": [ "# Explicit %pip magic: a bare `pip install` line only works while automagic is enabled.\n", "%pip install -U scikit-learn" ] },
{ "cell_type": "code", "execution_count": 40, "id": "50813795", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "English Teacher Abroad 230\n", "Customer Service Associate 106\n", "Graduates: English Teacher Abroad (Conversational) 96\n", "English Teacher Abroad 71\n", "Software Engineer 67\n", " ... \n", "Jr. Flash & HTML Developer 1\n", " RNs Needed ASAP - ER & ICU - APPLY NOW!! 1\n", "Summer interns - 2014/15 1\n", "Full-Stack Engineer 1\n", "Contract Product Designer 1\n", "Name: title, Length: 8461, dtype: int64" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import train_test_split\n", "import sklearn\n", "# Set 5000 rows aside, then split them evenly into dev and test.\n", "# random_state pins both shuffles so the split is reproducible.\n", "# NOTE(review): neither split is stratified on `fraudulent`, a rare label\n", "# (mean 0.048 in the describe() table); consider stratify= if class balance matters.\n", "data_train, data_holdout = train_test_split(data, test_size=5000, random_state=1)\n", "data_dev, data_test = train_test_split(data_holdout, test_size=2500, random_state=1)\n", "data_train[\"title\"].value_counts()" ] },
{ "cell_type": "code", "execution_count": 41, "id": "ea3c9f2e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "12880\n", "2500\n", "2500\n" ] } ], "source": [ "print(len(data_train))\n", "print(len(data_dev))\n", "print(len(data_test))\n", "# The three splits must partition the full dataset.\n", "assert len(data_train) + len(data_dev) + len(data_test) == len(data)" ] } ],
"metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }