{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "5e2107a5", "metadata": {}, "outputs": [], "source": [ "#Skrypt do ściagnięcia zbiory danych\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "bcc889e5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: kaggle in /home/mikolaj/.local/lib/python3.8/site-packages (1.5.12)\n", "Requirement already satisfied: tqdm in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (4.59.0)\n", "Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.22)\n", "Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.11.0)\n", "Requirement already satisfied: requests in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (2.25.1)\n", "Requirement already satisfied: python-dateutil in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (2.8.1)\n", "Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2018.1.18)\n", "Requirement already satisfied: python-slugify in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n", "Requirement already satisfied: text-unidecode>=1.3 in /home/mikolaj/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n", "Requirement already satisfied: idna<3,>=2.5 in /home/mikolaj/.local/lib/python3.8/site-packages (from requests->kaggle) (2.10)\n", "Requirement already satisfied: chardet<5,>=3.0.2 in /usr/lib/python3/dist-packages (from requests->kaggle) (3.0.4)\n", "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n", "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n", "Requirement already satisfied: pandas in /home/mikolaj/.local/lib/python3.8/site-packages (1.1.5)\n", "Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3/dist-packages 
(from pandas) (2018.3)\n", "Requirement already satisfied: numpy>=1.15.4 in /home/mikolaj/.local/lib/python3.8/site-packages (from pandas) (1.19.5)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in /home/mikolaj/.local/lib/python3.8/site-packages (from pandas) (2.8.1)\n", "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.7.3->pandas) (1.11.0)\n", "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n", "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "# Dependencies: the Kaggle API client (used to download the dataset) and pandas.\n", "# %pip (instead of !pip) installs into the environment of the running kernel.\n", "# Versions are pinned to the ones shown in the recorded output of this cell.\n", "%pip install --user kaggle==1.5.12\n", "%pip install --user pandas==1.1.5" ] }, { "cell_type": "code", "execution_count": 17, "id": "02a4034f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/mikolaj/.kaggle/kaggle.json'\n", "Downloading real-or-fake-fake-jobposting-prediction.zip to /home/mikolaj/ai_tech/inzynieria\n", " 99%|█████████████████████████████████████▊| 16.0M/16.1M [00:01<00:00, 10.2MB/s]\n", "100%|██████████████████████████████████████| 16.1M/16.1M [00:01<00:00, 9.54MB/s]\n" ] } ], "source": [ "# For the command below to work you need a ~/.kaggle/kaggle.json file containing your Kaggle API token.\n", "# Instructions: https://www.kaggle.com/docs/api\n", "# Keep the token private (chmod 600 ~/.kaggle/kaggle.json); otherwise the Kaggle CLI warns that the key is readable by other users.\n", "!kaggle datasets download -d shivamb/real-or-fake-fake-jobposting-prediction" ] }, { "cell_type": "code", "execution_count": 7, "id": "5035aef0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Archive: real-or-fake-fake-jobposting-prediction.zip\n", " inflating: fake_job_postings.csv \n" ] } ], "source": [ "# Extract the downloaded archive; -o overwrites existing files without prompting, so the cell can be re-run.\n", "!unzip -o real-or-fake-fake-jobposting-prediction.zip" ] }, { "cell_type": "code", "execution_count": 19, "id": "14344d2f", "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "Collecting seaborn\n", " Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)\n", " |████████████████████████████████| 292 kB 1.8 MB/s \n", "\u001b[?25hRequirement already satisfied: numpy>=1.15 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.19.5)\n", "Requirement already satisfied: scipy>=1.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.7.2)\n", "Requirement already satisfied: pandas>=0.23 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.1.5)\n", "Requirement already satisfied: matplotlib>=2.2 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (3.4.2)\n", "Requirement already satisfied: pyparsing>=2.2.1 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n", "Requirement already satisfied: pillow>=6.2.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (8.2.0)\n", "Requirement already satisfied: python-dateutil>=2.7 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.8.1)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.1)\n", "Requirement already satisfied: cycler>=0.10 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n", "Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3/dist-packages (from pandas>=0.23->seaborn) (2018.3)\n", "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.11.0)\n", "Installing collected packages: seaborn\n", "Successfully installed seaborn-0.11.2\n", "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n", "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "!pip install --user 
seaborn" ] }, { "cell_type": "code", "execution_count": 43, "id": "0f5ebfab", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | job_id | \n", "title | \n", "location | \n", "department | \n", "salary_range | \n", "company_profile | \n", "description | \n", "requirements | \n", "benefits | \n", "telecommuting | \n", "has_company_logo | \n", "has_questions | \n", "employment_type | \n", "required_experience | \n", "required_education | \n", "industry | \n", "function | \n", "fraudulent | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "Marketing Intern | \n", "US, NY, New York | \n", "Marketing | \n", "NaN | \n", "We're Food52, and we've created a groundbreaki... | \n", "Food52, a fast-growing, James Beard Award-winn... | \n", "Experience with content management systems a m... | \n", "NaN | \n", "0 | \n", "1 | \n", "0 | \n", "Other | \n", "Internship | \n", "NaN | \n", "NaN | \n", "Marketing | \n", "0 | \n", "
1 | \n", "2 | \n", "Customer Service - Cloud Video Production | \n", "NZ, , Auckland | \n", "Success | \n", "NaN | \n", "90 Seconds, the worlds Cloud Video Production ... | \n", "Organised - Focused - Vibrant - Awesome!Do you... | \n", "What we expect from you:Your key responsibilit... | \n", "What you will get from usThrough being part of... | \n", "0 | \n", "1 | \n", "0 | \n", "Full-time | \n", "Not Applicable | \n", "NaN | \n", "Marketing and Advertising | \n", "Customer Service | \n", "0 | \n", "
2 | \n", "3 | \n", "Commissioning Machinery Assistant (CMA) | \n", "US, IA, Wever | \n", "NaN | \n", "NaN | \n", "Valor Services provides Workforce Solutions th... | \n", "Our client, located in Houston, is actively se... | \n", "Implement pre-commissioning and commissioning ... | \n", "NaN | \n", "0 | \n", "1 | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "
3 | \n", "4 | \n", "Account Executive - Washington DC | \n", "US, DC, Washington | \n", "Sales | \n", "NaN | \n", "Our passion for improving quality of life thro... | \n", "THE COMPANY: ESRI – Environmental Systems Rese... | \n", "EDUCATION: Bachelor’s or Master’s in GIS, busi... | \n", "Our culture is anything but corporate—we have ... | \n", "0 | \n", "1 | \n", "0 | \n", "Full-time | \n", "Mid-Senior level | \n", "Bachelor's Degree | \n", "Computer Software | \n", "Sales | \n", "0 | \n", "
4 | \n", "5 | \n", "Bill Review Manager | \n", "US, FL, Fort Worth | \n", "NaN | \n", "NaN | \n", "SpotSource Solutions LLC is a Global Human Cap... | \n", "JOB TITLE: Itemization Review ManagerLOCATION:... | \n", "QUALIFICATIONS:RN license in the State of Texa... | \n", "Full Benefits Offered | \n", "0 | \n", "1 | \n", "1 | \n", "Full-time | \n", "Mid-Senior level | \n", "Bachelor's Degree | \n", "Hospital & Health Care | \n", "Health Care Provider | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
17875 | \n", "17876 | \n", "Account Director - Distribution | \n", "CA, ON, Toronto | \n", "Sales | \n", "NaN | \n", "Vend is looking for some awesome new talent to... | \n", "Just in case this is the first time you’ve vis... | \n", "To ace this role you:Will eat comprehensive St... | \n", "What can you expect from us?We have an open cu... | \n", "0 | \n", "1 | \n", "1 | \n", "Full-time | \n", "Mid-Senior level | \n", "NaN | \n", "Computer Software | \n", "Sales | \n", "0 | \n", "
17876 | \n", "17877 | \n", "Payroll Accountant | \n", "US, PA, Philadelphia | \n", "Accounting | \n", "NaN | \n", "WebLinc is the e-commerce platform and service... | \n", "The Payroll Accountant will focus primarily on... | \n", "- B.A. or B.S. in Accounting- Desire to have f... | \n", "Health & WellnessMedical planPrescription ... | \n", "0 | \n", "1 | \n", "1 | \n", "Full-time | \n", "Mid-Senior level | \n", "Bachelor's Degree | \n", "Internet | \n", "Accounting/Auditing | \n", "0 | \n", "
17877 | \n", "17878 | \n", "Project Cost Control Staff Engineer - Cost Con... | \n", "US, TX, Houston | \n", "NaN | \n", "NaN | \n", "We Provide Full Time Permanent Positions for m... | \n", "Experienced Project Cost Control Staff Enginee... | \n", "At least 12 years professional experience.Abil... | \n", "NaN | \n", "0 | \n", "0 | \n", "0 | \n", "Full-time | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "
17878 | \n", "17879 | \n", "Graphic Designer | \n", "NG, LA, Lagos | \n", "NaN | \n", "NaN | \n", "NaN | \n", "Nemsia Studios is looking for an experienced v... | \n", "1. Must be fluent in the latest versions of Co... | \n", "Competitive salary (compensation will be based... | \n", "0 | \n", "0 | \n", "1 | \n", "Contract | \n", "Not Applicable | \n", "Professional | \n", "Graphic Design | \n", "Design | \n", "0 | \n", "
17879 | \n", "17880 | \n", "Web Application Developers | \n", "NZ, N, Wellington | \n", "Engineering | \n", "NaN | \n", "Vend is looking for some awesome new talent to... | \n", "Who are we?Vend is an award winning web based ... | \n", "We want to hear from you if:You have an in-dep... | \n", "NaN | \n", "0 | \n", "1 | \n", "1 | \n", "Full-time | \n", "Mid-Senior level | \n", "NaN | \n", "Computer Software | \n", "Engineering | \n", "0 | \n", "
17880 rows × 18 columns
\n", "\n", " | job_id | \n", "title | \n", "location | \n", "department | \n", "salary_range | \n", "company_profile | \n", "description | \n", "requirements | \n", "benefits | \n", "telecommuting | \n", "has_company_logo | \n", "has_questions | \n", "employment_type | \n", "required_experience | \n", "required_education | \n", "industry | \n", "function | \n", "fraudulent | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | \n", "17880.000000 | \n", "17880 | \n", "17534 | \n", "6333 | \n", "2868 | \n", "14572 | \n", "17879 | \n", "15185 | \n", "10670 | \n", "17880.000000 | \n", "17880.000000 | \n", "17880.000000 | \n", "14409 | \n", "10830 | \n", "9775 | \n", "12977 | \n", "11425 | \n", "17880.000000 | \n", "
unique | \n", "NaN | \n", "11231 | \n", "3105 | \n", "1337 | \n", "874 | \n", "1709 | \n", "14801 | \n", "11968 | \n", "6205 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "5 | \n", "7 | \n", "13 | \n", "131 | \n", "37 | \n", "NaN | \n", "
top | \n", "NaN | \n", "English Teacher Abroad | \n", "GB, LND, London | \n", "Sales | \n", "0-0 | \n", "We help teachers get safe & secure jobs ab... | \n", "Play with kids, get paid for it Love travel? J... | \n", "University degree required. TEFL / TESOL / CEL... | \n", "See job description | \n", "NaN | \n", "NaN | \n", "NaN | \n", "Full-time | \n", "Mid-Senior level | \n", "Bachelor's Degree | \n", "Information Technology and Services | \n", "Information Technology | \n", "NaN | \n", "
freq | \n", "NaN | \n", "311 | \n", "718 | \n", "551 | \n", "142 | \n", "726 | \n", "379 | \n", "410 | \n", "726 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "11620 | \n", "3809 | \n", "5145 | \n", "1734 | \n", "1749 | \n", "NaN | \n", "
mean | \n", "8940.500000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.042897 | \n", "0.795302 | \n", "0.491723 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.048434 | \n", "
std | \n", "5161.655742 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.202631 | \n", "0.403492 | \n", "0.499945 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.214688 | \n", "
min | \n", "1.000000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.000000 | \n", "
25% | \n", "4470.750000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.000000 | \n", "1.000000 | \n", "0.000000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.000000 | \n", "
50% | \n", "8940.500000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.000000 | \n", "1.000000 | \n", "0.000000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.000000 | \n", "
75% | \n", "13410.250000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.000000 | \n", "1.000000 | \n", "1.000000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.000000 | \n", "
max | \n", "17880.000000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "1.000000 | \n", "