diff --git a/.gitignore b/.gitignore index 1d63b15..39618d9 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ data_dev.csv data_train.csv data.csv data_not_shuf.csv -data_not_cutted.csv \ No newline at end of file +data_not_cutted.csv +venv \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..440e401 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,34 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/ium_444463.iml b/.idea/ium_444463.iml new file mode 100644 index 0000000..1dcb0e8 --- /dev/null +++ b/.idea/ium_444463.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..ae9a2c3 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..a0b5093 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..f6ea053 --- /dev/null +++ b/main.py @@ 
-0,0 +1,114 @@
+"""Train a small feed-forward classifier on TF-IDF features of job titles.
+
+The script reads ``fake_job_postings.csv``, vectorises the ``title`` column,
+fits a two-layer network that predicts the ``fraudulent`` column and plots
+the loss and accuracy measured on the dev split after every epoch.
+"""
+import pandas as pd
+import numpy as np
+import scipy
+import torch
+from sklearn.model_selection import train_test_split
+import kaggle
+from sklearn.feature_extraction.text import TfidfVectorizer
+from torch import nn
+from torch import optim
+import matplotlib.pyplot as plt
+
+
+def to_dense_tensor(sparse_matrix):
+    """Return the SciPy sparse matrix as a dense float32 tensor."""
+    # ``toarray`` yields a plain ndarray; ``todense`` would give the legacy
+    # ``numpy.matrix`` type instead.
+    return torch.tensor(sparse_matrix.toarray()).float()
+
+
+def main():
+    """Train the classifier, report per-epoch metrics and plot them."""
+    # The CSV can be fetched once from Kaggle with:
+    # kaggle.api.authenticate()
+    # kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='.',
+    #                                   unzip=True)
+
+    data = pd.read_csv('fake_job_postings.csv', engine='python')
+    data = data.replace(np.nan, '', regex=True)
+
+    # Hold out 3000 rows, then split them evenly into dev and test parts.
+    # The test part is not used by this script.
+    data_train, data_rest = train_test_split(data, test_size=3000, random_state=1)
+    data_dev, data_test = train_test_split(data_rest, test_size=1500, random_state=1)
+
+    # Fit the TF-IDF vocabulary on the training titles only and reuse it for
+    # the dev titles.
+    vectorizer = TfidfVectorizer()
+    x_train = to_dense_tensor(vectorizer.fit_transform(np.array(data_train["title"])))
+    x_dev = to_dense_tensor(vectorizer.transform(np.array(data_dev["title"])))
+
+    y_train = torch.tensor(np.array(data_train["fraudulent"]), dtype=torch.long)
+    y_dev = torch.tensor(np.array(data_dev["fraudulent"]), dtype=torch.long)
+
+    # NLLLoss indexes the output columns by label, so the last layer needs one
+    # unit per label id (0..max), not one per distinct title.
+    # Assumes "fraudulent" holds integer class ids starting at 0 - TODO confirm.
+    num_classes = int(data["fraudulent"].max()) + 1
+
+    model = nn.Sequential(
+        nn.Linear(x_train.shape[1], 64),
+        nn.ReLU(),
+        nn.Linear(64, num_classes),
+        nn.LogSoftmax(dim=1))
+
+    criterion = nn.NLLLoss()
+    optimizer = optim.Adam(model.parameters(), lr=0.002)
+
+    train_losses = []
+    test_losses = []
+    test_accuracies = []
+
+    epochs = 5
+    for e in range(epochs):
+        # Full-batch training: one optimizer step per epoch.
+        optimizer.zero_grad()
+        loss = criterion(model(x_train), y_train)
+        loss.backward()
+        optimizer.step()
+
+        train_loss = loss.item()
+        train_losses.append(train_loss)
+
+        # Turn off gradients for validation, saves memory and computations
+        model.eval()
+        with torch.no_grad():
+            log_ps = model(x_dev)
+            # Store plain floats so the histories can be plotted directly.
+            test_loss = criterion(log_ps, y_dev).item()
+            predicted = log_ps.argmax(dim=1)
+            test_accuracy = (predicted == y_dev).float().mean().item()
+        model.train()
+
+        test_losses.append(test_loss)
+        test_accuracies.append(test_accuracy)
+
+        print(f"Epoch: {e + 1}/{epochs}.. ",
+              f"Training Loss: {train_loss:.3f}.. ",
+              f"Test Loss: {test_loss:.3f}.. ",
+              f"Test Accuracy: {test_accuracy:.3f}")
+
+    plt.figure(figsize=(12, 5))
+    plt.subplot(121)
+    plt.xlabel('epochs')
+    plt.ylabel('negative log likelihood loss')
+    plt.plot(train_losses, label='Training loss')
+    plt.plot(test_losses, label='Validation loss')
+    plt.legend(frameon=False)
+    plt.subplot(122)
+    plt.xlabel('epochs')
+    plt.ylabel('test accuracy')
+    plt.plot(test_accuracies)
+    plt.show()
+
+    print('Succes')
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index c37d049..d46299b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,7 @@
 pandas
 numpy
-kaggle
\ No newline at end of file
+kaggle
+torch
+matplotlib
+scikit-learn
+scipy
diff --git a/Dockerfile b/stare_zadania/Dockerfile
similarity index 100%
rename from Dockerfile
rename to stare_zadania/Dockerfile
diff --git a/Jenkinsfile b/stare_zadania/Jenkinsfile
similarity index 100%
rename from Jenkinsfile
rename to stare_zadania/Jenkinsfile
diff --git a/Jenkinsfile.stats b/stare_zadania/Jenkinsfile.stats
similarity index 100%
rename from Jenkinsfile.stats
rename to stare_zadania/Jenkinsfile.stats
diff --git a/README.md b/stare_zadania/README.md
similarity index 100%
rename from README.md
rename to stare_zadania/README.md
diff
--git a/download_data.ipynb b/stare_zadania/download_data.ipynb similarity index 97% rename from download_data.ipynb rename to stare_zadania/download_data.ipynb index a4165ac..18e2aa5 100644 --- a/download_data.ipynb +++ b/stare_zadania/download_data.ipynb @@ -4,7 +4,11 @@ "cell_type": "code", "execution_count": 28, "id": "5e2107a5", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "#Skrypt do ściagnięcia zbiory danych\n" @@ -14,7 +18,11 @@ "cell_type": "code", "execution_count": 29, "id": "bcc889e5", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stdout", @@ -29,14 +37,14 @@ "Requirement already satisfied: python-slugify in /home/students/s444463/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n", "Requirement already satisfied: python-dateutil in /usr/lib/python3/dist-packages (from kaggle) (2.7.3)\n", "Requirement already satisfied: text-unidecode>=1.3 in /home/students/s444463/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n", - "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", - "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n", + "\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", + "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n", "Requirement already satisfied: pandas in /usr/lib/python3/dist-packages (0.25.3)\n", - "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", - "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n", + "\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", + "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' 
command.\u001B[0m\n", "Requirement already satisfied: numpy in /usr/lib/python3/dist-packages (1.17.4)\n", - "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", - "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n" + "\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", + "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n" ] } ], @@ -50,7 +58,11 @@ "cell_type": "code", "execution_count": 30, "id": "02a4034f", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stdout", @@ -70,7 +82,11 @@ "cell_type": "code", "execution_count": 31, "id": "5035aef0", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stdout", @@ -88,7 +104,11 @@ "cell_type": "code", "execution_count": 32, "id": "14344d2f", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stdout", @@ -105,8 +125,8 @@ "Requirement already satisfied: kiwisolver>=1.0.1 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.2)\n", "Requirement already satisfied: python-dateutil>=2.7 in /usr/lib/python3/dist-packages (from matplotlib>=2.2->seaborn) (2.7.3)\n", "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.14.0)\n", - "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", - "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n" + "\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", + "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n" ] } ], @@ -118,7 +138,11 @@ "cell_type": 
"code", "execution_count": 33, "id": "0f5ebfab", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "data": { @@ -534,7 +558,11 @@ "cell_type": "code", "execution_count": 34, "id": "edbf49da", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stdout", @@ -553,7 +581,11 @@ "cell_type": "code", "execution_count": 35, "id": "e60b3f32", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "data": { @@ -585,7 +617,11 @@ "cell_type": "code", "execution_count": 36, "id": "ddb2fc38", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "data": { @@ -1001,7 +1037,11 @@ "cell_type": "code", "execution_count": 37, "id": "c5ac75f5", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "data": { @@ -1373,7 +1413,11 @@ "cell_type": "code", "execution_count": 38, "id": "4b0e77a4", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "data": { @@ -1399,7 +1443,11 @@ "cell_type": "code", "execution_count": 39, "id": "5a1d8ec7", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stdout", @@ -1411,8 +1459,8 @@ "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/students/s444463/.local/lib/python3.8/site-packages (from scikit-learn) (3.1.0)\n", "Requirement already satisfied: joblib>=0.11 in /usr/lib/python3/dist-packages (from scikit-learn) (0.14.0)\n", "Requirement already satisfied: scipy>=1.1.0 in /usr/lib/python3/dist-packages (from scikit-learn) (1.3.3)\n", - "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", - "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n", + "\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n", + "You should consider 
upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } @@ -1425,7 +1473,11 @@ "cell_type": "code", "execution_count": 40, "id": "50813795", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "data": { @@ -1461,7 +1513,11 @@ "cell_type": "code", "execution_count": 41, "id": "ea3c9f2e", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stdout", @@ -1483,7 +1539,11 @@ "cell_type": "code", "execution_count": 42, "id": "b20cc27a", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "data": { @@ -1533,4 +1593,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/download_data_and_process.py b/stare_zadania/download_data_and_process.py similarity index 100% rename from download_data_and_process.py rename to stare_zadania/download_data_and_process.py diff --git a/process_data.sh b/stare_zadania/process_data.sh similarity index 100% rename from process_data.sh rename to stare_zadania/process_data.sh diff --git a/real-or-fake-fake-jobposting-prediction.zip b/stare_zadania/real-or-fake-fake-jobposting-prediction.zip similarity index 100% rename from real-or-fake-fake-jobposting-prediction.zip rename to stare_zadania/real-or-fake-fake-jobposting-prediction.zip diff --git a/stare_zadania/requirements.txt b/stare_zadania/requirements.txt new file mode 100644 index 0000000..c37d049 --- /dev/null +++ b/stare_zadania/requirements.txt @@ -0,0 +1,3 @@ +pandas +numpy +kaggle \ No newline at end of file diff --git a/stats.py b/stare_zadania/stats.py similarity index 100% rename from stats.py rename to stare_zadania/stats.py diff --git a/stats.sh b/stare_zadania/stats.sh similarity index 100% rename from stats.sh rename to stare_zadania/stats.sh