diff --git a/download_data.ipynb b/download_data.ipynb index c4b700c..10d9433 100644 --- a/download_data.ipynb +++ b/download_data.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "id": "5e2107a5", "metadata": {}, "outputs": [], @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "id": "bcc889e5", "metadata": {}, "outputs": [ @@ -20,25 +20,38 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: kaggle in /home/mikolaj/.local/lib/python3.8/site-packages (1.5.12)\n", - "Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.22)\n", - "Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.11.0)\n", - "Requirement already satisfied: requests in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (2.25.1)\n", - "Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2018.1.18)\n", - "Requirement already satisfied: python-slugify in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n", - "Requirement already satisfied: python-dateutil in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (2.8.1)\n", - "Requirement already satisfied: tqdm in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (4.59.0)\n", - "Requirement already satisfied: text-unidecode>=1.3 in /home/mikolaj/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n", - "Requirement already satisfied: chardet<5,>=3.0.2 in /usr/lib/python3/dist-packages (from requests->kaggle) (3.0.4)\n", - "Requirement already satisfied: idna<3,>=2.5 in /home/mikolaj/.local/lib/python3.8/site-packages (from requests->kaggle) (2.10)\n", - "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n", + "Collecting kaggle\n", + " Using cached kaggle-1.5.12.tar.gz (58 kB)\n", + "Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2019.11.28)\n", + "Requirement already satisfied: python-dateutil in /usr/lib/python3/dist-packages (from kaggle) (2.7.3)\n", + "Collecting python-slugify\n", + " Using cached python_slugify-6.1.1-py2.py3-none-any.whl (9.1 kB)\n", + "Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from kaggle) (2.22.0)\n", + "Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.14.0)\n", + "Collecting tqdm\n", + " Downloading tqdm-4.63.0-py2.py3-none-any.whl (76 kB)\n", + "\u001b[K |████████████████████████████████| 76 kB 1.7 MB/s eta 0:00:011\n", + "\u001b[?25hRequirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.25.8)\n", + "Collecting text-unidecode>=1.3\n", + " Using cached text_unidecode-1.3-py2.py3-none-any.whl (78 kB)\n", + "Building wheels for collected packages: kaggle\n", + " Building wheel for kaggle (setup.py) ... \u001b[?25ldone\n", + "\u001b[?25h Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73052 sha256=02172fba1c0ec42884ad8bcbb3c3b99749f529299444b00aaa946e78b9dfcb1f\n", + " Stored in directory: /home/students/s444463/.cache/pip/wheels/29/da/11/144cc25aebdaeb4931b231e25fd34b394e6a5725cbb2f50106\n", + "Successfully built kaggle\n", + "Installing collected packages: text-unidecode, tqdm, python-slugify, kaggle\n", + "\u001b[33m WARNING: The script tqdm is installed in '/home/students/s444463/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n", + "\u001b[33m WARNING: The script kaggle is installed in '/home/students/s444463/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n", + "Successfully installed kaggle-1.5.12 python-slugify-6.1.1 text-unidecode-1.3 tqdm-4.63.0\n", + "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n", - "Requirement already satisfied: pandas in /home/mikolaj/.local/lib/python3.8/site-packages (1.1.5)\n", - "Requirement already satisfied: numpy>=1.15.4 in /home/mikolaj/.local/lib/python3.8/site-packages (from pandas) (1.19.5)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3/dist-packages (from pandas) (2018.3)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /home/mikolaj/.local/lib/python3.8/site-packages (from pandas) (2.8.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.7.3->pandas) (1.11.0)\n", - "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n", + "Requirement already satisfied: pandas in /usr/lib/python3/dist-packages (0.25.3)\n", + "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", + "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n", + "Requirement already satisfied: numpy in /usr/lib/python3/dist-packages (1.17.4)\n", + "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n" ] } @@ -51,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "id": "02a4034f", "metadata": {}, "outputs": [ @@ -59,8 +72,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/mikolaj/.kaggle/kaggle.json'\n", - "real-or-fake-fake-jobposting-prediction.zip: Skipping, found more recently modified local copy (use --force to force download)\n" + "/bin/bash: kaggle: command not found\r\n" ] } ], @@ -72,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "id": "5035aef0", "metadata": {}, "outputs": [ @@ -80,8 +92,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Archive: real-or-fake-fake-jobposting-prediction.zip\n", - " inflating: fake_job_postings.csv \n" + "unzip: cannot find or open real-or-fake-fake-jobposting-prediction.zip, real-or-fake-fake-jobposting-prediction.zip.zip or real-or-fake-fake-jobposting-prediction.zip.ZIP.\r\n" ] } ], @@ -91,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "id": "14344d2f", "metadata": {}, "outputs": [ @@ -99,19 +110,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: seaborn in /home/mikolaj/.local/lib/python3.8/site-packages (0.11.2)\n", - "Requirement already satisfied: scipy>=1.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.7.2)\n", - "Requirement already satisfied: matplotlib>=2.2 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (3.4.2)\n", - "Requirement already satisfied: numpy>=1.15 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.19.5)\n", - "Requirement already satisfied: pandas>=0.23 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.1.5)\n", - "Requirement already satisfied: python-dateutil>=2.7 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.8.1)\n", - "Requirement already satisfied: pillow>=6.2.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (8.2.0)\n", - "Requirement already satisfied: cycler>=0.10 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.1)\n", - "Requirement already satisfied: pyparsing>=2.2.1 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3/dist-packages (from pandas>=0.23->seaborn) (2018.3)\n", - "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.11.0)\n", - "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n", + "Collecting seaborn\n", + " Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)\n", + "\u001b[K |████████████████████████████████| 292 kB 2.8 MB/s eta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: scipy>=1.0 in /usr/lib/python3/dist-packages (from seaborn) (1.3.3)\n", + "Requirement already satisfied: pandas>=0.23 in /usr/lib/python3/dist-packages (from seaborn) (0.25.3)\n", + "Requirement already satisfied: numpy>=1.15 in /usr/lib/python3/dist-packages (from seaborn) (1.17.4)\n", + "Requirement already satisfied: matplotlib>=2.2 in /home/students/s444463/.local/lib/python3.8/site-packages (from seaborn) (3.4.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.2)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=2.2->seaborn) (8.3.2)\n", + "Requirement already satisfied: cycler>=0.10 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n", + "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/lib/python3/dist-packages (from matplotlib>=2.2->seaborn) (2.7.3)\n", + "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.14.0)\n", + "Installing collected packages: seaborn\n", + "Successfully installed seaborn-0.11.2\n", + "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n" ] } @@ -122,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 11, "id": "0f5ebfab", "metadata": {}, "outputs": [ @@ -525,7 +539,7 @@ "[17880 rows x 18 columns]" ] }, - "execution_count": 22, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -538,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "id": "edbf49da", "metadata": {}, "outputs": [ @@ -557,412 +571,20 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 13, "id": "bc594582", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - " | job_id | \n", - "title | \n", - "location | \n", - "department | \n", - "salary_range | \n", - "company_profile | \n", - "description | \n", - "requirements | \n", - "benefits | \n", - "telecommuting | \n", - "has_company_logo | \n", - "has_questions | \n", - "employment_type | \n", - "required_experience | \n", - "required_education | \n", - "industry | \n", - "function | \n", - "fraudulent | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "1 | \n", - "Marketing Intern | \n", - "US, NY, New York | \n", - "Marketing | \n", - "\n", - " | We're Food52, and we've created a groundbreaki... | \n", - "Food52, a fast-growing, James Beard Award-winn... | \n", - "Experience with content management systems a m... | \n", - "\n", - " | 0 | \n", - "1 | \n", - "0 | \n", - "Other | \n", - "Internship | \n", - "\n", - " | \n", - " | Marketing | \n", - "0 | \n", - "
1 | \n", - "2 | \n", - "Customer Service - Cloud Video Production | \n", - "NZ, , Auckland | \n", - "Success | \n", - "\n", - " | 90 Seconds, the worlds Cloud Video Production ... | \n", - "Organised - Focused - Vibrant - Awesome!Do you... | \n", - "What we expect from you:Your key responsibilit... | \n", - "What you will get from usThrough being part of... | \n", - "0 | \n", - "1 | \n", - "0 | \n", - "Full-time | \n", - "Not Applicable | \n", - "\n", - " | Marketing and Advertising | \n", - "Customer Service | \n", - "0 | \n", - "
2 | \n", - "3 | \n", - "Commissioning Machinery Assistant (CMA) | \n", - "US, IA, Wever | \n", - "\n", - " | \n", - " | Valor Services provides Workforce Solutions th... | \n", - "Our client, located in Houston, is actively se... | \n", - "Implement pre-commissioning and commissioning ... | \n", - "\n", - " | 0 | \n", - "1 | \n", - "0 | \n", - "\n", - " | \n", - " | \n", - " | \n", - " | \n", - " | 0 | \n", - "
3 | \n", - "4 | \n", - "Account Executive - Washington DC | \n", - "US, DC, Washington | \n", - "Sales | \n", - "\n", - " | Our passion for improving quality of life thro... | \n", - "THE COMPANY: ESRI – Environmental Systems Rese... | \n", - "EDUCATION: Bachelor’s or Master’s in GIS, busi... | \n", - "Our culture is anything but corporate—we have ... | \n", - "0 | \n", - "1 | \n", - "0 | \n", - "Full-time | \n", - "Mid-Senior level | \n", - "Bachelor's Degree | \n", - "Computer Software | \n", - "Sales | \n", - "0 | \n", - "
4 | \n", - "5 | \n", - "Bill Review Manager | \n", - "US, FL, Fort Worth | \n", - "\n", - " | \n", - " | SpotSource Solutions LLC is a Global Human Cap... | \n", - "JOB TITLE: Itemization Review ManagerLOCATION:... | \n", - "QUALIFICATIONS:RN license in the State of Texa... | \n", - "Full Benefits Offered | \n", - "0 | \n", - "1 | \n", - "1 | \n", - "Full-time | \n", - "Mid-Senior level | \n", - "Bachelor's Degree | \n", - "Hospital & Health Care | \n", - "Health Care Provider | \n", - "0 | \n", - "
... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "
17875 | \n", - "17876 | \n", - "Account Director - Distribution | \n", - "CA, ON, Toronto | \n", - "Sales | \n", - "\n", - " | Vend is looking for some awesome new talent to... | \n", - "Just in case this is the first time you’ve vis... | \n", - "To ace this role you:Will eat comprehensive St... | \n", - "What can you expect from us?We have an open cu... | \n", - "0 | \n", - "1 | \n", - "1 | \n", - "Full-time | \n", - "Mid-Senior level | \n", - "\n", - " | Computer Software | \n", - "Sales | \n", - "0 | \n", - "
17876 | \n", - "17877 | \n", - "Payroll Accountant | \n", - "US, PA, Philadelphia | \n", - "Accounting | \n", - "\n", - " | WebLinc is the e-commerce platform and service... | \n", - "The Payroll Accountant will focus primarily on... | \n", - "- B.A. or B.S. in Accounting- Desire to have f... | \n", - "Health & WellnessMedical planPrescription ... | \n", - "0 | \n", - "1 | \n", - "1 | \n", - "Full-time | \n", - "Mid-Senior level | \n", - "Bachelor's Degree | \n", - "Internet | \n", - "Accounting/Auditing | \n", - "0 | \n", - "
17877 | \n", - "17878 | \n", - "Project Cost Control Staff Engineer - Cost Con... | \n", - "US, TX, Houston | \n", - "\n", - " | \n", - " | We Provide Full Time Permanent Positions for m... | \n", - "Experienced Project Cost Control Staff Enginee... | \n", - "At least 12 years professional experience.Abil... | \n", - "\n", - " | 0 | \n", - "0 | \n", - "0 | \n", - "Full-time | \n", - "\n", - " | \n", - " | \n", - " | \n", - " | 0 | \n", - "
17878 | \n", - "17879 | \n", - "Graphic Designer | \n", - "NG, LA, Lagos | \n", - "\n", - " | \n", - " | \n", - " | Nemsia Studios is looking for an experienced v... | \n", - "1. Must be fluent in the latest versions of Co... | \n", - "Competitive salary (compensation will be based... | \n", - "0 | \n", - "0 | \n", - "1 | \n", - "Contract | \n", - "Not Applicable | \n", - "Professional | \n", - "Graphic Design | \n", - "Design | \n", - "0 | \n", - "
17879 | \n", - "17880 | \n", - "Web Application Developers | \n", - "NZ, N, Wellington | \n", - "Engineering | \n", - "\n", - " | Vend is looking for some awesome new talent to... | \n", - "Who are we?Vend is an award winning web based ... | \n", - "We want to hear from you if:You have an in-dep... | \n", - "\n", - " | 0 | \n", - "1 | \n", - "1 | \n", - "Full-time | \n", - "Mid-Senior level | \n", - "\n", - " | Computer Software | \n", - "Engineering | \n", - "0 | \n", - "
17880 rows × 18 columns
\n", - "