Compare commits

...

10 Commits

Author SHA1 Message Date
e2ce3a6b9f Prezentacja 2024-06-09 20:39:24 +02:00
4c143f2574 IUM_7 2024-06-01 18:26:54 +02:00
feef756ed0 IUM_7 2024-06-01 18:21:06 +02:00
0ff7d1c06f IUM_7 2024-06-01 17:40:19 +02:00
cc56865cc1 IUM_7 2024-06-01 17:24:14 +02:00
cf648b6c12 IUM_10 2024-05-26 11:38:15 +02:00
cb74efc384 IUM_10 2024-05-26 10:35:27 +02:00
dc7777ef23 IUM_10 2024-05-26 10:30:32 +02:00
e9194b950d IUM_09 2024-05-15 09:45:01 +02:00
f7b13459a3 IUM_08 2024-05-12 12:14:43 +02:00
61 changed files with 570305 additions and 64 deletions

3
.dvc/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
/config.local
/tmp
/cache

4
.dvc/config Normal file
View File

@ -0,0 +1,4 @@
[core]
remote = ium_ssh_remote
['remote "ium_ssh_remote"']
url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl

3
.dvcignore Normal file
View File

@ -0,0 +1,3 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore

4
.gitignore vendored
View File

@ -1,5 +1,5 @@
creditcardfraud.zip
creditcard.csv
data
model/model.keras
stats_data
/creditcard.csv
/creditcardfraud.zip

View File

@ -1,5 +1,5 @@
FROM ubuntu:latest
RUN apt update && apt install -y python3-pip
RUN apt update && apt install -y python3-pip git
RUN pip install pandas numpy scikit-learn tensorflow
RUN pip install pandas numpy scikit-learn tensorflow sacred pymongo --break-system-packages

BIN
IUM_12.pptx Normal file

Binary file not shown.

21
Jenkinsfile vendored
View File

@ -46,10 +46,29 @@ pipeline {
}
}
stage('Archive Artifacts') {
stage('Archive Artifacts from create-dataset') {
steps {
archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true
}
}
stage('Experiments') {
agent {
dockerfile {
reuseNode true
}
}
steps {
sh 'chmod +x sacred/sacred_train_evaluation.py'
sh 'python3 sacred/sacred_train_evaluation.py'
}
}
stage('Archive Artifacts from Experiments') {
steps {
archiveArtifacts artifacts: 'experiments/**/*.*', onlyIfSuccessful: true
}
}
}
}

View File

@ -1,42 +0,0 @@
#!/bin/bash
# Download the Kaggle credit-card-fraud dataset, shuffle its rows, and write
# three CSV files (full shuffled set, training split, 10000-row test split),
# each carrying the original header, into ./data.

# Stop at the first failing command (or unset variable / broken pipeline) so a
# failed download or unzip cannot cascade into rm/mv calls on missing files.
set -euo pipefail

# Install the Kaggle API
pip install kaggle

# Download the dataset from Kaggle
kaggle datasets download -d mlg-ulb/creditcardfraud

# Unzip the dataset (overwrite any previous extraction)
unzip -o creditcardfraud.zip

# Remove the zip file
rm creditcardfraud.zip

# Create a header file
head -n 1 creditcard.csv > creditcard_header.csv

# Remove the header from the dataset
tail -n +2 creditcard.csv > creditcard_no_header.csv

# Remove the original dataset
rm creditcard.csv

# Shuffle the dataset
shuf creditcard_no_header.csv > creditcard_shuf_no_header.csv

# Remove the unshuffled dataset
rm creditcard_no_header.csv

# Add the header back to the shuffled dataset
cat creditcard_header.csv creditcard_shuf_no_header.csv > creditcard_shuf.csv

# Split the dataset into training and testing:
# the first 10000 shuffled rows form the test set, the remainder the training set
tail -n +10001 creditcard_shuf_no_header.csv > creditcard_train_no_header.csv
head -n 10000 creditcard_shuf_no_header.csv > creditcard_test_no_header.csv

# Add the header back to the training and testing datasets
cat creditcard_header.csv creditcard_train_no_header.csv > creditcard_train.csv
cat creditcard_header.csv creditcard_test_no_header.csv > creditcard_test.csv

# Remove the intermediate files
rm creditcard_header.csv creditcard_shuf_no_header.csv creditcard_train_no_header.csv creditcard_test_no_header.csv

# Create a directory for the data
mkdir -p data

# Move the datasets to the data directory
mv creditcard_shuf.csv creditcard_train.csv creditcard_test.csv data/

5
creditcard.csv.dvc Normal file
View File

@ -0,0 +1,5 @@
outs:
- md5: e90efcb83d69faf99fcab8b0255024de
size: 150828752
hash: md5
path: creditcard.csv

5
creditcardfraud.zip.dvc Normal file
View File

@ -0,0 +1,5 @@
outs:
- md5: bf8e9842731ab6f9b8ab51e1a6741f8b
size: 69155672
hash: md5
path: creditcardfraud.zip

View File

@ -1,12 +0,0 @@
#!/bin/bash
# Record the line counts of the shuffled, training and testing CSVs in
# ./stats_data (one small text file per dataset).

# Stop at the first failing command so a missing input file does not leave
# partial statistics behind or trip the final mv.
set -euo pipefail

# Count the number of lines in the shuffled dataset
wc -l < data/creditcard_shuf.csv > stats.txt

# Count the number of lines in the training and testing datasets
wc -l < data/creditcard_train.csv > stats_train.txt
wc -l < data/creditcard_test.csv > stats_test.txt

# Create a directory for the statistics
mkdir -p stats_data

# Move the statistics to the stats directory
mv stats.txt stats_train.txt stats_test.txt stats_data/

94
dvc.lock Normal file
View File

@ -0,0 +1,94 @@
schema: '2.0'
stages:
prepare_data:
cmd: python ./create-dataset.py
deps:
- path: create-dataset.py
hash: md5
md5: 0903460139f5b57b9759f4de37b2d5e4
size: 1531
- path: creditcard.csv
hash: md5
md5: e90efcb83d69faf99fcab8b0255024de
size: 150828752
outs:
- path: data/X_test.csv
hash: md5
md5: 46ff52696af9a4c06f6b25639525dda6
size: 30947960
- path: data/X_train.csv
hash: md5
md5: 7505524c54858300bbd92094092a6c39
size: 92838653
- path: data/X_val.csv
hash: md5
md5: 4d078882cc1898640ddaf4ad9117f543
size: 30946540
- path: data/creditcard.csv
hash: md5
md5: 4b81435690147d1e624a8b06c5520629
size: 155302541
- path: data/y_test.csv
hash: md5
md5: a6bc4827feae19934c4021d1f10f5963
size: 170893
- path: data/y_train.csv
hash: md5
md5: 8112a5cf4faac882c421bcb7e3d42044
size: 512656
- path: data/y_val.csv
hash: md5
md5: 1155f648650986d8866eba603b86560c
size: 170893
train_model:
cmd: python ./train_model.py
deps:
- path: data/X_train.csv
hash: md5
md5: 7505524c54858300bbd92094092a6c39
size: 92838653
- path: data/X_val.csv
hash: md5
md5: 4d078882cc1898640ddaf4ad9117f543
size: 30946540
- path: data/y_train.csv
hash: md5
md5: 8112a5cf4faac882c421bcb7e3d42044
size: 512656
- path: data/y_val.csv
hash: md5
md5: 1155f648650986d8866eba603b86560c
size: 170893
- path: train_model.py
hash: md5
md5: 00b8bac043f4d7a56dec95f2f1bb1b49
size: 1540
outs:
- path: model/model.keras
hash: md5
md5: 1d1df55ad26a8c0689efa4a86a86c217
size: 1476738
evaluate_model:
cmd: python ./predict.py
deps:
- path: data/X_test.csv
hash: md5
md5: 46ff52696af9a4c06f6b25639525dda6
size: 30947960
- path: data/y_test.csv
hash: md5
md5: a6bc4827feae19934c4021d1f10f5963
size: 170893
- path: model/model.keras
hash: md5
md5: 1d1df55ad26a8c0689efa4a86a86c217
size: 1476738
- path: predict.py
hash: md5
md5: a61388aabf381779b38e2f32a4d0df7b
size: 660
outs:
- path: data/y_pred.csv
hash: md5
md5: be150c2fbf1914102b479edbe0a4cf43
size: 1481012

35
dvc.yaml Normal file
View File

@ -0,0 +1,35 @@
# DVC pipeline with three stages: prepare_data -> train_model -> evaluate_model.
stages:
  # Runs create-dataset.py on the raw creditcard.csv and produces the
  # train/validation/test feature and label CSVs under data/.
  prepare_data:
    cmd: python ./create-dataset.py
    deps:
      - create-dataset.py
      - creditcard.csv
    outs:
      - data/creditcard.csv
      - data/X_train.csv
      - data/X_val.csv
      - data/X_test.csv
      - data/y_train.csv
      - data/y_val.csv
      - data/y_test.csv

  # Runs train_model.py on the train/validation splits and produces the model.
  train_model:
    cmd: python ./train_model.py
    deps:
      - train_model.py
      - data/X_train.csv
      - data/X_val.csv
      - data/y_train.csv
      - data/y_val.csv
    outs:
      - model/model.keras

  # Runs predict.py with the trained model on the test split and writes the
  # predictions.
  evaluate_model:
    cmd: python ./predict.py
    deps:
      - predict.py
      - model/model.keras
      - data/X_test.csv
      - data/y_test.csv
    outs:
      - data/y_pred.csv

BIN
environment.yml Normal file

Binary file not shown.

View File

@ -0,0 +1,5 @@
{
"epochs": 5,
"learning_rate": 0.001,
"seed": 7929899
}

14
experiments/708/cout.txt Normal file

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,8 @@
{
"metrics": [
{
"id": "665b3cd5c1ae3ab5cc15d3d9",
"name": "accuracy"
}
]
}

View File

@ -0,0 +1,13 @@
{
"accuracy": {
"steps": [
0
],
"timestamps": [
"2024-06-01T15:23:02.056704"
],
"values": [
0.8217821782178217
]
}
}

BIN
experiments/708/model.keras Normal file

Binary file not shown.

102
experiments/708/run.json Normal file
View File

@ -0,0 +1,102 @@
{
"artifacts": [
"model.keras"
],
"command": "main",
"experiment": {
"base_dir": "C:\\Users\\skype\\source\\repos\\In\u017cynieria Uczenia Maszynowego\\sacred",
"dependencies": [
"keras==3.1.1",
"numpy==1.26.3",
"sacred==0.8.5",
"scikit-learn==1.4.1.post1"
],
"mainfile": "sacred_train_evaluation.py",
"name": "464913",
"repositories": [
{
"commit": "cf648b6c128aae353730cdad0c6972df3438c4cd",
"dirty": true,
"url": "https://git.wmi.amu.edu.pl/s464913/ium_464913.git"
}
],
"sources": [
[
"sacred_train_evaluation.py",
"_sources\\sacred_train_evaluation_69085ae4bcdbd49594dbaeed1ddb2e93.py"
]
]
},
"heartbeat": "2024-06-01T15:23:02.067455",
"host": {
"ENV": {},
"cpu": "AMD Ryzen 5 5500U with Radeon Graphics",
"hostname": "Dell",
"os": [
"Windows",
"Windows-11-10.0.22631-SP0"
],
"python_version": "3.12.3"
},
"meta": {
"command": "main",
"config_updates": {},
"named_configs": [],
"options": {
"--beat-interval": null,
"--capture": null,
"--comment": null,
"--debug": false,
"--enforce_clean": false,
"--file_storage": null,
"--force": false,
"--help": false,
"--id": null,
"--loglevel": null,
"--mongo_db": null,
"--name": null,
"--pdb": false,
"--print-config": false,
"--priority": null,
"--queue": false,
"--s3": null,
"--sql": null,
"--tiny_db": null,
"--unobserved": false,
"COMMAND": null,
"UPDATE": [],
"help": false,
"with": false
}
},
"resources": [
[
"C:\\Users\\skype\\source\\repos\\In\u017cynieria Uczenia Maszynowego\\data\\X_train.csv",
"experiments\\_resources\\X_train_7505524c54858300bbd92094092a6c39.csv"
],
[
"C:\\Users\\skype\\source\\repos\\In\u017cynieria Uczenia Maszynowego\\data\\X_val.csv",
"experiments\\_resources\\X_val_4d078882cc1898640ddaf4ad9117f543.csv"
],
[
"C:\\Users\\skype\\source\\repos\\In\u017cynieria Uczenia Maszynowego\\data\\y_train.csv",
"experiments\\_resources\\y_train_8112a5cf4faac882c421bcb7e3d42044.csv"
],
[
"C:\\Users\\skype\\source\\repos\\In\u017cynieria Uczenia Maszynowego\\data\\y_val.csv",
"experiments\\_resources\\y_val_1155f648650986d8866eba603b86560c.csv"
],
[
"C:\\Users\\skype\\source\\repos\\In\u017cynieria Uczenia Maszynowego\\data\\X_test.csv",
"experiments\\_resources\\X_test_46ff52696af9a4c06f6b25639525dda6.csv"
],
[
"C:\\Users\\skype\\source\\repos\\In\u017cynieria Uczenia Maszynowego\\data\\y_test.csv",
"experiments\\_resources\\y_test_a6bc4827feae19934c4021d1f10f5963.csv"
]
],
"result": null,
"start_time": "2024-06-01T15:20:05.925811",
"status": "COMPLETED",
"stop_time": "2024-06-01T15:23:02.065167"
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,100 @@
import os
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
from keras.models import Sequential
from keras.layers import BatchNormalization, Dropout, Dense, Flatten, Conv1D
from keras.optimizers import Adam
import pandas as pd
from sklearn.metrics import confusion_matrix
from sacred import Experiment
from sacred.observers import FileStorageObserver, MongoObserver
# Sacred experiment "464913": every run is reported to a MongoDB observer and
# to a file-storage observer rooted at ./experiments.
ex = Experiment("464913")

# SECURITY NOTE(review): the fallback URL below embeds a username and password
# in source control. Set SACRED_MONGO_URL to supply the connection string from
# the environment instead, and consider removing/rotating the hard-coded value.
# MongoObserver is constructed directly because MongoObserver.create(...) is
# deprecated in Sacred 0.8.
ex.observers.append(
    MongoObserver(
        url=os.environ.get(
            "SACRED_MONGO_URL",
            "mongodb://admin:IUM_2021@tzietkiewicz.vm.wmi.amu.edu.pl:27017",
        ),
        db_name="sacred",
    )
)
ex.observers.append(FileStorageObserver("experiments"))
# Default hyperparameters for the experiment. The local variable names below
# are the configuration keys that captured functions receive, so they must
# stay as they are.
@ex.config
def my_config():
    learning_rate = 0.001  # passed to the Adam optimizer
    epochs = 5  # passed to model.fit
def _read_resource_csv(run, path):
    """Load a CSV through ``run.open_resource`` and return it as a DataFrame.

    Opening the file via the run keeps it registered as a resource of the
    experiment; the ``with`` block closes the handle once pandas has parsed it.
    """
    with run.open_resource(path) as handle:
        return pd.read_csv(handle)


def _to_conv1d_input(frame):
    """Return the frame's values reshaped to ``(rows, columns, 1)`` for Conv1D."""
    values = frame.to_numpy()
    return values.reshape(values.shape[0], values.shape[1], 1)


@ex.capture
def train_and_evaluate(_run, learning_rate, epochs):
    """Train the 1-D convolutional classifier and log its test-set score.

    Args:
        _run: the Sacred run object; used to open data files as resources,
            attach the saved model as an artifact and log the metric.
        learning_rate: learning rate for the Adam optimizer.
        epochs: number of training epochs.

    Returns:
        None. Side effects: writes ``sacred/model.keras``, registers it as a
        run artifact and logs one scalar named ``"accuracy"``.
    """
    X_train = _read_resource_csv(_run, "data/X_train.csv")
    X_val = _read_resource_csv(_run, "data/X_val.csv")
    y_train = _read_resource_csv(_run, "data/y_train.csv")
    y_val = _read_resource_csv(_run, "data/y_val.csv")

    # Conv1D expects a trailing channel axis, hence (rows, columns, 1).
    X_train = _to_conv1d_input(X_train)
    X_val = _to_conv1d_input(X_val)
    y_train = y_train.to_numpy()
    y_val = y_val.to_numpy()

    model = Sequential(
        [
            Conv1D(32, 2, activation="relu", input_shape=X_train[0].shape),
            BatchNormalization(),
            Dropout(0.2),
            Conv1D(64, 2, activation="relu"),
            BatchNormalization(),
            Dropout(0.5),
            Flatten(),
            Dense(64, activation="relu"),
            Dropout(0.5),
            Dense(1, activation="sigmoid"),
        ]
    )

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )

    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        verbose=1,
    )

    # Paths are relative to the working directory the script is started from.
    model.save("sacred/model.keras")
    _run.add_artifact("sacred/model.keras")

    # Give the test features the same (rows, columns, 1) layout the model was
    # trained on instead of relying on implicit rank adjustment in predict().
    X_test = _to_conv1d_input(_read_resource_csv(_run, "data/X_test.csv"))
    y_test = _read_resource_csv(_run, "data/y_test.csv")

    # Threshold the sigmoid output at 0.5 to get boolean class predictions.
    y_pred = model.predict(X_test) >= 0.5

    cm = confusion_matrix(y_test, y_pred)
    # NOTE(review): this is cm[1, 1] / (cm[1, 0] + cm[1, 1]), i.e. the share of
    # true class-1 rows predicted as class 1 (recall), not overall accuracy.
    # The metric name is kept as "accuracy" so previously logged runs stay
    # comparable.
    accuracy = cm[1, 1] / (cm[1, 0] + cm[1, 1])
    _run.log_scalar("accuracy", accuracy)
@ex.automain
def main(learning_rate, epochs):
    """Experiment entry point: run one train-and-evaluate cycle.

    ``learning_rate`` and ``epochs`` are accepted here but not forwarded
    explicitly; ``train_and_evaluate`` is an ``ex.capture`` function and is
    called without arguments.
    """
    train_and_evaluate()

10
mlflow/MLproject Normal file
View File

@ -0,0 +1,10 @@
# MLflow project definition: one "main" entry point that trains and evaluates
# the model with a configurable learning rate and epoch count.
name: Credit card fraud MLFlow - s464913

conda_env: conda.yaml

entry_points:
  main:
    parameters:
      learning_rate:
        type: float
        default: 0.001
      epochs:
        type: int
        default: 5
    # Parameters are passed positionally: learning rate first, then epochs.
    command: 'python mlflow_train_evaluation.py {learning_rate} {epochs}'

11
mlflow/conda.yaml Normal file
View File

@ -0,0 +1,11 @@
# Conda environment for the MLflow project: Python 3.12 plus pip-installed
# training dependencies.
name: Credit card fraud MLFlow - s464913

channels:
  - defaults

dependencies:
  - python=3.12
  - pip
  # Packages below are installed with pip inside the conda environment.
  - pip:
      - mlflow
      - tensorflow
      - pandas
      - scikit-learn

View File

@ -0,0 +1,82 @@
import os
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
from keras.models import Sequential
from keras.layers import BatchNormalization, Dropout, Dense, Flatten, Conv1D
from keras.optimizers import Adam
import pandas as pd
import sys
import mlflow
from sklearn.metrics import confusion_matrix
# Use the tracking server named by MLFLOW_TRACKING_URI when it is set, and
# fall back to the local server on port 5000 otherwise, so the script is not
# pinned to one machine.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")
)
def _read_cli_params(argv):
    """Return ``(learning_rate, epochs)`` parsed from positional arguments.

    ``argv[1]`` is the learning rate and ``argv[2]`` the epoch count; a missing
    position falls back to 0.001 and 5 respectively.
    """
    learning_rate = float(argv[1]) if len(argv) > 1 else 0.001
    epochs = int(argv[2]) if len(argv) > 2 else 5
    return learning_rate, epochs


def _to_conv1d_input(frame):
    """Return the frame's values reshaped to ``(rows, columns, 1)`` for Conv1D."""
    values = frame.to_numpy()
    return values.reshape(values.shape[0], values.shape[1], 1)


def main():
    """Train the 1-D convolutional classifier and log the run to MLflow.

    Reads the learning rate and epoch count from ``sys.argv``, loads the
    train/validation/test CSVs from ``../data``, fits the model, and logs the
    two parameters plus one metric named ``"accuracy"`` to the active run.
    """
    # Parse the arguments first so a malformed command line fails before the
    # CSV files are loaded.
    learning_rate, epochs = _read_cli_params(sys.argv)

    X_train = pd.read_csv("../data/X_train.csv")
    X_val = pd.read_csv("../data/X_val.csv")
    y_train = pd.read_csv("../data/y_train.csv")
    y_val = pd.read_csv("../data/y_val.csv")

    # Conv1D expects a trailing channel axis, hence (rows, columns, 1).
    X_train = _to_conv1d_input(X_train)
    X_val = _to_conv1d_input(X_val)
    y_train = y_train.to_numpy()
    y_val = y_val.to_numpy()

    with mlflow.start_run() as run:
        print("MLflow run experiment_id: {0}".format(run.info.experiment_id))
        print("MLflow run artifact_uri: {0}".format(run.info.artifact_uri))

        # Log the hyperparameters up front so they are recorded even if
        # training fails part-way through.
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("epochs", epochs)

        model = Sequential(
            [
                Conv1D(32, 2, activation="relu", input_shape=X_train[0].shape),
                BatchNormalization(),
                Dropout(0.2),
                Conv1D(64, 2, activation="relu"),
                BatchNormalization(),
                Dropout(0.5),
                Flatten(),
                Dense(64, activation="relu"),
                Dropout(0.5),
                Dense(1, activation="sigmoid"),
            ]
        )

        model.compile(
            optimizer=Adam(learning_rate=learning_rate),
            loss="binary_crossentropy",
            metrics=["accuracy"],
        )

        model.fit(
            X_train,
            y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            verbose=1,
        )

        # Give the test features the same (rows, columns, 1) layout the model
        # was trained on instead of relying on implicit rank adjustment.
        X_test = _to_conv1d_input(pd.read_csv("../data/X_test.csv"))
        y_test = pd.read_csv("../data/y_test.csv")

        # Threshold the sigmoid output at 0.5 to get boolean predictions.
        y_pred = model.predict(X_test) >= 0.5

        cm = confusion_matrix(y_test, y_pred)
        # NOTE(review): this is cm[1, 1] / (cm[1, 0] + cm[1, 1]), i.e. the
        # share of true class-1 rows predicted as class 1 (recall), not overall
        # accuracy. The metric name is kept so earlier runs stay comparable.
        accuracy = cm[1, 1] / (cm[1, 0] + cm[1, 1])
        mlflow.log_metric("accuracy", accuracy)
# Run the training only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,15 @@
artifact_uri: mlflow-artifacts:/0/3c46f6c4b15743faa0119c4b9b804825/artifacts
end_time: 1715508788768
entry_point_name: ''
experiment_id: '0'
lifecycle_stage: active
run_id: 3c46f6c4b15743faa0119c4b9b804825
run_name: dapper-hog-137
run_uuid: 3c46f6c4b15743faa0119c4b9b804825
source_name: ''
source_type: 4
source_version: ''
start_time: 1715508594003
status: 3
tags: []
user_id: skype

View File

@ -0,0 +1 @@
1715508787882 0.8217821782178217 0

View File

@ -0,0 +1 @@
5

View File

@ -0,0 +1 @@
0.001

View File

@ -0,0 +1 @@
https://git.wmi.amu.edu.pl/s464913/ium_464913.git

View File

@ -0,0 +1 @@
dapper-hog-137

View File

@ -0,0 +1 @@
a6be9a729562db8c47bc5fec88ad8f5216af0cf3

View File

@ -0,0 +1 @@
https://git.wmi.amu.edu.pl/s464913/ium_464913.git

View File

@ -0,0 +1 @@
file://C:\Users\skype\source\repos\Inżynieria Uczenia Maszynowego#\mlflow

View File

@ -0,0 +1 @@
PROJECT

View File

@ -0,0 +1 @@
skype

View File

@ -0,0 +1,15 @@
artifact_uri: mlflow-artifacts:/0/706dcf453a0842aaa48647e15521bb7b/artifacts
end_time: 1715508573447
entry_point_name: ''
experiment_id: '0'
lifecycle_stage: active
run_id: 706dcf453a0842aaa48647e15521bb7b
run_name: loud-whale-40
run_uuid: 706dcf453a0842aaa48647e15521bb7b
source_name: ''
source_type: 4
source_version: ''
start_time: 1715508159092
status: 3
tags: []
user_id: skype

View File

@ -0,0 +1 @@
1715508572612 0.7524752475247525 0

View File

@ -0,0 +1 @@
7

View File

@ -0,0 +1 @@
0.001

View File

@ -0,0 +1 @@
https://git.wmi.amu.edu.pl/s464913/ium_464913.git

View File

@ -0,0 +1 @@
loud-whale-40

View File

@ -0,0 +1 @@
a6be9a729562db8c47bc5fec88ad8f5216af0cf3

View File

@ -0,0 +1 @@
https://git.wmi.amu.edu.pl/s464913/ium_464913.git

View File

@ -0,0 +1 @@
file://C:\Users\skype\source\repos\Inżynieria Uczenia Maszynowego#\mlflow

View File

@ -0,0 +1 @@
PROJECT

View File

@ -0,0 +1 @@
skype

View File

@ -0,0 +1,6 @@
artifact_location: mlflow-artifacts:/0
creation_time: 1715508147231
experiment_id: '0'
last_update_time: 1715508147231
lifecycle_stage: active
name: Default

BIN
sacred/model.keras Normal file

Binary file not shown.

View File

@ -0,0 +1,100 @@
import os
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
from keras.models import Sequential
from keras.layers import BatchNormalization, Dropout, Dense, Flatten, Conv1D
from keras.optimizers import Adam
import pandas as pd
from sklearn.metrics import confusion_matrix
from sacred import Experiment
from sacred.observers import FileStorageObserver, MongoObserver
# Sacred experiment "464913": every run is reported to a MongoDB observer and
# to a file-storage observer rooted at ./experiments.
ex = Experiment("464913")

# SECURITY NOTE(review): the fallback URL below embeds a username and password
# in source control. Set SACRED_MONGO_URL to supply the connection string from
# the environment instead, and consider removing/rotating the hard-coded value.
# MongoObserver is constructed directly because MongoObserver.create(...) is
# deprecated in Sacred 0.8.
ex.observers.append(
    MongoObserver(
        url=os.environ.get(
            "SACRED_MONGO_URL",
            "mongodb://admin:IUM_2021@tzietkiewicz.vm.wmi.amu.edu.pl:27017",
        ),
        db_name="sacred",
    )
)
ex.observers.append(FileStorageObserver("experiments"))
# Default hyperparameters for the experiment. The local variable names below
# are the configuration keys that captured functions receive, so they must
# stay as they are.
@ex.config
def my_config():
    learning_rate = 0.001  # passed to the Adam optimizer
    epochs = 5  # passed to model.fit
def _read_resource_csv(run, path):
    """Load a CSV through ``run.open_resource`` and return it as a DataFrame.

    Opening the file via the run keeps it registered as a resource of the
    experiment; the ``with`` block closes the handle once pandas has parsed it.
    """
    with run.open_resource(path) as handle:
        return pd.read_csv(handle)


def _to_conv1d_input(frame):
    """Return the frame's values reshaped to ``(rows, columns, 1)`` for Conv1D."""
    values = frame.to_numpy()
    return values.reshape(values.shape[0], values.shape[1], 1)


@ex.capture
def train_and_evaluate(_run, learning_rate, epochs):
    """Train the 1-D convolutional classifier and log its test-set score.

    Args:
        _run: the Sacred run object; used to open data files as resources,
            attach the saved model as an artifact and log the metric.
        learning_rate: learning rate for the Adam optimizer.
        epochs: number of training epochs.

    Returns:
        None. Side effects: writes ``sacred/model.keras``, registers it as a
        run artifact and logs one scalar named ``"accuracy"``.
    """
    X_train = _read_resource_csv(_run, "data/X_train.csv")
    X_val = _read_resource_csv(_run, "data/X_val.csv")
    y_train = _read_resource_csv(_run, "data/y_train.csv")
    y_val = _read_resource_csv(_run, "data/y_val.csv")

    # Conv1D expects a trailing channel axis, hence (rows, columns, 1).
    X_train = _to_conv1d_input(X_train)
    X_val = _to_conv1d_input(X_val)
    y_train = y_train.to_numpy()
    y_val = y_val.to_numpy()

    model = Sequential(
        [
            Conv1D(32, 2, activation="relu", input_shape=X_train[0].shape),
            BatchNormalization(),
            Dropout(0.2),
            Conv1D(64, 2, activation="relu"),
            BatchNormalization(),
            Dropout(0.5),
            Flatten(),
            Dense(64, activation="relu"),
            Dropout(0.5),
            Dense(1, activation="sigmoid"),
        ]
    )

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )

    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        verbose=1,
    )

    # Paths are relative to the working directory the script is started from.
    model.save("sacred/model.keras")
    _run.add_artifact("sacred/model.keras")

    # Give the test features the same (rows, columns, 1) layout the model was
    # trained on instead of relying on implicit rank adjustment in predict().
    X_test = _to_conv1d_input(_read_resource_csv(_run, "data/X_test.csv"))
    y_test = _read_resource_csv(_run, "data/y_test.csv")

    # Threshold the sigmoid output at 0.5 to get boolean class predictions.
    y_pred = model.predict(X_test) >= 0.5

    cm = confusion_matrix(y_test, y_pred)
    # NOTE(review): this is cm[1, 1] / (cm[1, 0] + cm[1, 1]), i.e. the share of
    # true class-1 rows predicted as class 1 (recall), not overall accuracy.
    # The metric name is kept as "accuracy" so previously logged runs stay
    # comparable.
    accuracy = cm[1, 1] / (cm[1, 0] + cm[1, 1])
    _run.log_scalar("accuracy", accuracy)
@ex.automain
def main(learning_rate, epochs):
    """Experiment entry point: run one train-and-evaluate cycle.

    ``learning_rate`` and ``epochs`` are accepted here but not forwarded
    explicitly; ``train_and_evaluate`` is an ``ex.capture`` function and is
    called without arguments.
    """
    train_and_evaluate()