diff --git a/.gitignore b/.gitignore
index 1d63b15..39618d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ data_dev.csv
data_train.csv
data.csv
data_not_shuf.csv
-data_not_cutted.csv
\ No newline at end of file
+data_not_cutted.csv
+venv
\ No newline at end of file
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..440e401
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,34 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/ium_444463.iml b/.idea/ium_444463.iml
new file mode 100644
index 0000000..1dcb0e8
--- /dev/null
+++ b/.idea/ium_444463.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..ae9a2c3
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..a0b5093
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..f6ea053
--- /dev/null
+++ b/main.py
@@ -0,0 +1,113 @@
+import pandas as pd
+import numpy as np
+import scipy
+import torch
+import pandas as pd
+from sklearn.model_selection import train_test_split
+import kaggle
+from sklearn.feature_extraction.text import TfidfVectorizer
+from torch import nn
+from torch import optim
+import matplotlib.pyplot as plt
+
+if __name__ == "__main__":
+    # Train a small feed-forward classifier that predicts the `fraudulent`
+    # label of a job posting from the TF-IDF representation of its title.
+
+    # Optional download of the dataset through the Kaggle API:
+    # kaggle.api.authenticate()
+    # kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='.',
+    #                                   unzip=True)
+
+    data = pd.read_csv('fake_job_postings.csv', engine='python')
+    data = data.replace(np.nan, '', regex=True)
+
+    # Hold out 3000 rows, then split them evenly into dev and test (1500 each).
+    # Only train and dev are used below; the test split stays untouched.
+    data_train, data_test = train_test_split(data, test_size=3000, random_state=1)
+    data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1)
+
+    # Assumes `fraudulent` holds integer class ids starting at 0 (NLLLoss
+    # requires integer targets) - TODO confirm against the CSV.
+    y_train = torch.tensor(np.array(data_train["fraudulent"]))
+    y_dev = torch.tensor(np.array(data_dev["fraudulent"]))
+
+    # Fit the vocabulary on the training titles only; the dev titles are
+    # merely transformed with it.
+    vectorizer = TfidfVectorizer()
+    x_train = vectorizer.fit_transform(np.array(data_train["title"]))
+    x_dev = vectorizer.transform(np.array(data_dev["title"]))
+
+    # The network needs dense float32 input; `toarray()` yields a plain
+    # ndarray instead of the `np.matrix` returned by `todense()`.
+    x_train = torch.tensor(x_train.toarray()).float()
+    x_dev = torch.tensor(x_dev.toarray()).float()
+
+    # One output unit per label class, not per distinct title: NLLLoss needs
+    # every target to lie in [0, n_classes - 1], hence max label + 1.
+    n_classes = int(data["fraudulent"].max()) + 1
+
+    # Two-layer perceptron emitting log-probabilities.
+    model = nn.Sequential(
+        nn.Linear(x_train.shape[1], 64),
+        nn.ReLU(),
+        nn.Linear(64, n_classes),
+        nn.LogSoftmax(dim=1))
+
+    # LogSoftmax outputs paired with NLLLoss give the cross-entropy loss.
+    criterion = nn.NLLLoss()
+    optimizer = optim.Adam(model.parameters(), lr=0.002)
+
+    # Per-epoch metrics collected for the plots below.
+    train_losses = []
+    test_losses = []
+    test_accuracies = []
+
+    epochs = 5
+    for e in range(epochs):
+        # Full-batch training: one optimizer step per epoch.
+        optimizer.zero_grad()
+        output = model(x_train)
+        loss = criterion(output, y_train)
+        loss.backward()
+        optimizer.step()
+
+        train_loss = loss.item()
+        train_losses.append(train_loss)
+
+        # Evaluate on the dev split in eval mode with gradients turned off
+        # to save memory and computation.
+        model.eval()
+        with torch.no_grad():
+            log_ps = model(x_dev)
+            # `.item()` keeps plain floats in the history lists.
+            test_loss = criterion(log_ps, y_dev).item()
+            test_losses.append(test_loss)
+
+            # Predicted class = index of the highest log-probability.
+            top_class = log_ps.argmax(dim=1)
+            test_accuracy = (top_class == y_dev).float().mean().item()
+            test_accuracies.append(test_accuracy)
+
+        model.train()
+
+        print(f"Epoch: {e + 1}/{epochs}.. ",
+              f"Training Loss: {train_loss:.3f}.. ",
+              f"Test Loss: {test_loss:.3f}.. ",
+              f"Test Accuracy: {test_accuracy:.3f}")
+
+    # Left panel: loss curves; right panel: dev accuracy per epoch.
+    plt.figure(figsize=(12, 5))
+    plt.subplot(121)
+    plt.xlabel('epochs')
+    plt.ylabel('negative log likelihood loss')
+    plt.plot(train_losses, label='Training loss')
+    plt.plot(test_losses, label='Validation loss')
+    plt.legend(frameon=False)
+    plt.subplot(122)
+    plt.xlabel('epochs')
+    plt.ylabel('test accuracy')
+    plt.plot(test_accuracies)
+    plt.show()
+
+    print('Succes')
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index c37d049..d46299b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,7 @@
pandas
numpy
-kaggle
\ No newline at end of file
+kaggle
+torch
+matplotlib
+scikit-learn
+scipy
diff --git a/Dockerfile b/stare_zadania/Dockerfile
similarity index 100%
rename from Dockerfile
rename to stare_zadania/Dockerfile
diff --git a/Jenkinsfile b/stare_zadania/Jenkinsfile
similarity index 100%
rename from Jenkinsfile
rename to stare_zadania/Jenkinsfile
diff --git a/Jenkinsfile.stats b/stare_zadania/Jenkinsfile.stats
similarity index 100%
rename from Jenkinsfile.stats
rename to stare_zadania/Jenkinsfile.stats
diff --git a/README.md b/stare_zadania/README.md
similarity index 100%
rename from README.md
rename to stare_zadania/README.md
diff --git a/download_data.ipynb b/stare_zadania/download_data.ipynb
similarity index 97%
rename from download_data.ipynb
rename to stare_zadania/download_data.ipynb
index a4165ac..18e2aa5 100644
--- a/download_data.ipynb
+++ b/stare_zadania/download_data.ipynb
@@ -4,7 +4,11 @@
"cell_type": "code",
"execution_count": 28,
"id": "5e2107a5",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [],
"source": [
"#Skrypt do ściagnięcia zbiory danych\n"
@@ -14,7 +18,11 @@
"cell_type": "code",
"execution_count": 29,
"id": "bcc889e5",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -29,14 +37,14 @@
"Requirement already satisfied: python-slugify in /home/students/s444463/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n",
"Requirement already satisfied: python-dateutil in /usr/lib/python3/dist-packages (from kaggle) (2.7.3)\n",
"Requirement already satisfied: text-unidecode>=1.3 in /home/students/s444463/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n",
- "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
- "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
+ "\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
+ "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n",
"Requirement already satisfied: pandas in /usr/lib/python3/dist-packages (0.25.3)\n",
- "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
- "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
+ "\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
+ "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n",
"Requirement already satisfied: numpy in /usr/lib/python3/dist-packages (1.17.4)\n",
- "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
- "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
+ "\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
+ "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n"
]
}
],
@@ -50,7 +58,11 @@
"cell_type": "code",
"execution_count": 30,
"id": "02a4034f",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -70,7 +82,11 @@
"cell_type": "code",
"execution_count": 31,
"id": "5035aef0",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -88,7 +104,11 @@
"cell_type": "code",
"execution_count": 32,
"id": "14344d2f",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -105,8 +125,8 @@
"Requirement already satisfied: kiwisolver>=1.0.1 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.2)\n",
"Requirement already satisfied: python-dateutil>=2.7 in /usr/lib/python3/dist-packages (from matplotlib>=2.2->seaborn) (2.7.3)\n",
"Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.14.0)\n",
- "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
- "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
+ "\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
+ "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n"
]
}
],
@@ -118,7 +138,11 @@
"cell_type": "code",
"execution_count": 33,
"id": "0f5ebfab",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"data": {
@@ -534,7 +558,11 @@
"cell_type": "code",
"execution_count": 34,
"id": "edbf49da",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -553,7 +581,11 @@
"cell_type": "code",
"execution_count": 35,
"id": "e60b3f32",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"data": {
@@ -585,7 +617,11 @@
"cell_type": "code",
"execution_count": 36,
"id": "ddb2fc38",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"data": {
@@ -1001,7 +1037,11 @@
"cell_type": "code",
"execution_count": 37,
"id": "c5ac75f5",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"data": {
@@ -1373,7 +1413,11 @@
"cell_type": "code",
"execution_count": 38,
"id": "4b0e77a4",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"data": {
@@ -1399,7 +1443,11 @@
"cell_type": "code",
"execution_count": 39,
"id": "5a1d8ec7",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -1411,8 +1459,8 @@
"Requirement already satisfied: threadpoolctl>=2.0.0 in /home/students/s444463/.local/lib/python3.8/site-packages (from scikit-learn) (3.1.0)\n",
"Requirement already satisfied: joblib>=0.11 in /usr/lib/python3/dist-packages (from scikit-learn) (0.14.0)\n",
"Requirement already satisfied: scipy>=1.1.0 in /usr/lib/python3/dist-packages (from scikit-learn) (1.3.3)\n",
- "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
- "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
+ "\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
+ "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
@@ -1425,7 +1473,11 @@
"cell_type": "code",
"execution_count": 40,
"id": "50813795",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"data": {
@@ -1461,7 +1513,11 @@
"cell_type": "code",
"execution_count": 41,
"id": "ea3c9f2e",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -1483,7 +1539,11 @@
"cell_type": "code",
"execution_count": 42,
"id": "b20cc27a",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"data": {
@@ -1533,4 +1593,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/download_data_and_process.py b/stare_zadania/download_data_and_process.py
similarity index 100%
rename from download_data_and_process.py
rename to stare_zadania/download_data_and_process.py
diff --git a/process_data.sh b/stare_zadania/process_data.sh
similarity index 100%
rename from process_data.sh
rename to stare_zadania/process_data.sh
diff --git a/real-or-fake-fake-jobposting-prediction.zip b/stare_zadania/real-or-fake-fake-jobposting-prediction.zip
similarity index 100%
rename from real-or-fake-fake-jobposting-prediction.zip
rename to stare_zadania/real-or-fake-fake-jobposting-prediction.zip
diff --git a/stare_zadania/requirements.txt b/stare_zadania/requirements.txt
new file mode 100644
index 0000000..c37d049
--- /dev/null
+++ b/stare_zadania/requirements.txt
@@ -0,0 +1,3 @@
+pandas
+numpy
+kaggle
\ No newline at end of file
diff --git a/stats.py b/stare_zadania/stats.py
similarity index 100%
rename from stats.py
rename to stare_zadania/stats.py
diff --git a/stats.sh b/stare_zadania/stats.sh
similarity index 100%
rename from stats.sh
rename to stare_zadania/stats.sh