This commit is contained in:
Mikołaj Pokrywka 2022-03-27 20:01:28 +02:00
commit 21879ae350
3 changed files with 140 additions and 135 deletions

10
Jenkinsfile vendored Normal file
View File

@ -0,0 +1,10 @@
// Minimal declarative pipeline: a single stage that prints a greeting to the build log.
pipeline {
    // Run on any available agent.
    agent any

    stages {
        stage('Stage 1') {
            steps {
                // The echo step, invoked with its named `message` parameter.
                echo message: 'Hello world!'
            }
        }
    }
}

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 28,
"id": "5e2107a5",
"metadata": {},
"outputs": [],
@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 29,
"id": "bcc889e5",
"metadata": {},
"outputs": [
@ -20,25 +20,22 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: kaggle in /home/mikolaj/.local/lib/python3.8/site-packages (1.5.12)\n",
"Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.22)\n",
"Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.11.0)\n",
"Requirement already satisfied: requests in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (2.25.1)\n",
"Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2018.1.18)\n",
"Requirement already satisfied: python-slugify in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n",
"Requirement already satisfied: python-dateutil in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (2.8.1)\n",
"Requirement already satisfied: tqdm in /home/mikolaj/.local/lib/python3.8/site-packages (from kaggle) (4.59.0)\n",
"Requirement already satisfied: text-unidecode>=1.3 in /home/mikolaj/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n",
"Requirement already satisfied: chardet<5,>=3.0.2 in /usr/lib/python3/dist-packages (from requests->kaggle) (3.0.4)\n",
"Requirement already satisfied: idna<3,>=2.5 in /home/mikolaj/.local/lib/python3.8/site-packages (from requests->kaggle) (2.10)\n",
"\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
"Requirement already satisfied: kaggle in /home/students/s444463/.local/lib/python3.8/site-packages (1.5.12)\n",
"Requirement already satisfied: tqdm in /home/students/s444463/.local/lib/python3.8/site-packages (from kaggle) (4.63.0)\n",
"Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.25.8)\n",
"Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.14.0)\n",
"Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2019.11.28)\n",
"Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from kaggle) (2.22.0)\n",
"Requirement already satisfied: python-slugify in /home/students/s444463/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n",
"Requirement already satisfied: python-dateutil in /usr/lib/python3/dist-packages (from kaggle) (2.7.3)\n",
"Requirement already satisfied: text-unidecode>=1.3 in /home/students/s444463/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n",
"\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
"Requirement already satisfied: pandas in /home/mikolaj/.local/lib/python3.8/site-packages (1.1.5)\n",
"Requirement already satisfied: numpy>=1.15.4 in /home/mikolaj/.local/lib/python3.8/site-packages (from pandas) (1.19.5)\n",
"Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3/dist-packages (from pandas) (2018.3)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in /home/mikolaj/.local/lib/python3.8/site-packages (from pandas) (2.8.1)\n",
"Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.7.3->pandas) (1.11.0)\n",
"\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
"Requirement already satisfied: pandas in /usr/lib/python3/dist-packages (0.25.3)\n",
"\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
"Requirement already satisfied: numpy in /usr/lib/python3/dist-packages (1.17.4)\n",
"\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
]
}
@ -51,7 +48,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 30,
"id": "02a4034f",
"metadata": {},
"outputs": [
@ -59,8 +56,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/mikolaj/.kaggle/kaggle.json'\n",
"real-or-fake-fake-jobposting-prediction.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
"/bin/bash: kaggle: command not found\r\n"
]
}
],
@ -72,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 31,
"id": "5035aef0",
"metadata": {},
"outputs": [
@ -80,8 +76,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: real-or-fake-fake-jobposting-prediction.zip\n",
" inflating: fake_job_postings.csv \n"
"unzip: cannot find or open real-or-fake-fake-jobposting-prediction.zip, real-or-fake-fake-jobposting-prediction.zip.zip or real-or-fake-fake-jobposting-prediction.zip.ZIP.\r\n"
]
}
],
@ -91,7 +86,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 32,
"id": "14344d2f",
"metadata": {},
"outputs": [
@ -99,19 +94,18 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: seaborn in /home/mikolaj/.local/lib/python3.8/site-packages (0.11.2)\n",
"Requirement already satisfied: scipy>=1.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.7.2)\n",
"Requirement already satisfied: matplotlib>=2.2 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (3.4.2)\n",
"Requirement already satisfied: numpy>=1.15 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.19.5)\n",
"Requirement already satisfied: pandas>=0.23 in /home/mikolaj/.local/lib/python3.8/site-packages (from seaborn) (1.1.5)\n",
"Requirement already satisfied: python-dateutil>=2.7 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.8.1)\n",
"Requirement already satisfied: pillow>=6.2.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (8.2.0)\n",
"Requirement already satisfied: cycler>=0.10 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.1)\n",
"Requirement already satisfied: pyparsing>=2.2.1 in /home/mikolaj/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n",
"Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3/dist-packages (from pandas>=0.23->seaborn) (2018.3)\n",
"Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.11.0)\n",
"\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
"Requirement already satisfied: seaborn in /home/students/s444463/.local/lib/python3.8/site-packages (0.11.2)\n",
"Requirement already satisfied: numpy>=1.15 in /usr/lib/python3/dist-packages (from seaborn) (1.17.4)\n",
"Requirement already satisfied: scipy>=1.0 in /usr/lib/python3/dist-packages (from seaborn) (1.3.3)\n",
"Requirement already satisfied: matplotlib>=2.2 in /home/students/s444463/.local/lib/python3.8/site-packages (from seaborn) (3.4.3)\n",
"Requirement already satisfied: pandas>=0.23 in /usr/lib/python3/dist-packages (from seaborn) (0.25.3)\n",
"Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=2.2->seaborn) (8.3.2)\n",
"Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n",
"Requirement already satisfied: cycler>=0.10 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.2)\n",
"Requirement already satisfied: python-dateutil>=2.7 in /usr/lib/python3/dist-packages (from matplotlib>=2.2->seaborn) (2.7.3)\n",
"Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.14.0)\n",
"\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
]
}
@ -122,7 +116,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 33,
"id": "0f5ebfab",
"metadata": {},
"outputs": [
@ -525,7 +519,7 @@
"[17880 rows x 18 columns]"
]
},
"execution_count": 22,
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
@ -538,7 +532,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 34,
"id": "edbf49da",
"metadata": {},
"outputs": [
@ -557,8 +551,40 @@
},
{
"cell_type": "code",
"execution_count": 29,
"id": "bc594582",
"execution_count": 35,
"id": "e60b3f32",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Sales 551\n",
"Engineering 487\n",
"Marketing 401\n",
"Operations 270\n",
"IT 225\n",
" ... \n",
"Capoo 1\n",
"Engineering - Hardware 1\n",
"Utilities 1\n",
"i 1\n",
"TECH 1\n",
"Name: department, Length: 1337, dtype: int64"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[\"department\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "ddb2fc38",
"metadata": {},
"outputs": [
{
@ -960,51 +986,20 @@
"[17880 rows x 18 columns]"
]
},
"execution_count": 29,
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"data = data.replace(np.nan, '', regex=True)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "e60b3f32",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" 11547\n",
"Sales 551\n",
"Engineering 487\n",
"Marketing 401\n",
"Operations 270\n",
" ... \n",
"Pricing 1\n",
"Mobility 1\n",
"Housekeeping 1\n",
"An Impact Engine Company 1\n",
"Trainee 1\n",
"Name: department, Length: 1338, dtype: int64"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[\"department\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 37,
"id": "c5ac75f5",
"metadata": {},
"outputs": [
@ -1365,19 +1360,18 @@
"max NaN 1.000000 "
]
},
"execution_count": 32,
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = data.replace(np.nan, '', regex=True)\n",
"data.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 38,
"id": "4b0e77a4",
"metadata": {},
"outputs": [
@ -1392,7 +1386,7 @@
"dtype: float64"
]
},
"execution_count": 39,
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
@ -1403,7 +1397,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 39,
"id": "5a1d8ec7",
"metadata": {},
"outputs": [
@ -1412,17 +1406,12 @@
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Collecting scikit-learn\n",
" Downloading scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)\n",
" |████████████████████████████████| 26.7 MB 8.8 MB/s \n",
"\u001b[?25hRequirement already satisfied: numpy>=1.14.6 in /home/mikolaj/.local/lib/python3.8/site-packages (from scikit-learn) (1.19.5)\n",
"Requirement already satisfied: scipy>=1.1.0 in /home/mikolaj/.local/lib/python3.8/site-packages (from scikit-learn) (1.7.2)\n",
"Collecting threadpoolctl>=2.0.0\n",
" Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)\n",
"Requirement already satisfied: joblib>=0.11 in /home/mikolaj/.local/lib/python3.8/site-packages (from scikit-learn) (1.1.0)\n",
"Installing collected packages: threadpoolctl, scikit-learn\n",
"Successfully installed scikit-learn-1.0.2 threadpoolctl-3.1.0\n",
"\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
"Requirement already satisfied: scikit-learn in /home/students/s444463/.local/lib/python3.8/site-packages (1.0.2)\n",
"Requirement already satisfied: numpy>=1.14.6 in /usr/lib/python3/dist-packages (from scikit-learn) (1.17.4)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /home/students/s444463/.local/lib/python3.8/site-packages (from scikit-learn) (3.1.0)\n",
"Requirement already satisfied: joblib>=0.11 in /usr/lib/python3/dist-packages (from scikit-learn) (0.14.0)\n",
"Requirement already satisfied: scipy>=1.1.0 in /usr/lib/python3/dist-packages (from scikit-learn) (1.3.3)\n",
"\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
@ -1434,28 +1423,28 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 40,
"id": "50813795",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"English Teacher Abroad 235\n",
"Customer Service Associate 110\n",
"Graduates: English Teacher Abroad (Conversational) 104\n",
"English Teacher Abroad 72\n",
"Software Engineer 68\n",
" ... \n",
"Manager-Plastics Mfg Engineering - Full Time Permanent Job 1\n",
"Ruby on Rails Developer/Programmer 1\n",
"Appliance Technician 1\n",
"Need Oracle Fusion HCM Resource 1\n",
"Recruitment specialist 1\n",
"Name: title, Length: 8761, dtype: int64"
"English Teacher Abroad 230\n",
"Customer Service Associate 106\n",
"Graduates: English Teacher Abroad (Conversational) 96\n",
"English Teacher Abroad 71\n",
"Software Engineer 67\n",
" ... \n",
"Jr. Flash & HTML Developer 1\n",
" RNs Needed ASAP - ER & ICU - APPLY NOW!! 1\n",
"Summer interns - 2014/15 1\n",
"Full-Stack Engineer 1\n",
"Contract Product Designer 1\n",
"Name: title, Length: 8461, dtype: int64"
]
},
"execution_count": 26,
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
@ -1463,55 +1452,57 @@
"source": [
"from sklearn.model_selection import train_test_split\n",
"import sklearn\n",
"data_train, data_test = train_test_split(data, random_state=1)\n",
"data_train, data_test = train_test_split(data, test_size=5000, random_state=1)\n",
"data_dev, data_test = train_test_split(data_test, test_size=2500, random_state=1)\n",
"data_train[\"title\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 41,
"id": "ea3c9f2e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.3333333333333333"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"12880\n",
"2500\n",
"2500\n"
]
}
],
"source": [
"data_test.size/data_train.size"
"print(len(data_train))\n",
"print(len(data_dev))\n",
"print(len(data_test))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 42,
"id": "b20cc27a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"English Teacher Abroad 235\n",
"Customer Service Associate 110\n",
"Graduates: English Teacher Abroad (Conversational) 104\n",
"English Teacher Abroad 72\n",
"Software Engineer 68\n",
" ... \n",
"Manager-Plastics Mfg Engineering - Full Time Permanent Job 1\n",
"Ruby on Rails Developer/Programmer 1\n",
"Appliance Technician 1\n",
"Need Oracle Fusion HCM Resource 1\n",
"Recruitment specialist 1\n",
"Name: title, Length: 8761, dtype: int64"
"English Teacher Abroad 230\n",
"Customer Service Associate 106\n",
"Graduates: English Teacher Abroad (Conversational) 96\n",
"English Teacher Abroad 71\n",
"Software Engineer 67\n",
" ... \n",
"Jr. Flash & HTML Developer 1\n",
" RNs Needed ASAP - ER & ICU - APPLY NOW!! 1\n",
"Summer interns - 2014/15 1\n",
"Full-Stack Engineer 1\n",
"Contract Product Designer 1\n",
"Name: title, Length: 8461, dtype: int64"
]
},
"execution_count": 32,
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
@ -1537,7 +1528,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.8.10"
}
},
"nbformat": 4,

4
process_data.sh Executable file
View File

@ -0,0 +1,4 @@
#!/bin/bash
# Prints a greeting, lists the current working directory, then prints a closing note.

# Opening banner.
printf '%s\n' "welcome"

# List the contents of the directory the script is invoked from.
ls

# Closing message emitted after the listing.
printf '%s\n' "this is the whole list of dir"