diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..c421e3d --- /dev/null +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + remote = ium_ssh_remote +['remote "ium_ssh_remote"'] + url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl + user = ium-sftp diff --git a/.dvc/plots/confusion.json b/.dvc/plots/confusion.json new file mode 100644 index 0000000..af1b48d --- /dev/null +++ b/.dvc/plots/confusion.json @@ -0,0 +1,107 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "facet": { + "field": "rev", + "type": "nominal" + }, + "spec": { + "transform": [ + { + "aggregate": [ + { + "op": "count", + "as": "xy_count" + } + ], + "groupby": [ + "", + "" + ] + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "joinaggregate": [ + { + "op": "max", + "field": "xy_count", + "as": "max_count" + } + ], + "groupby": [] + }, + { + "calculate": "datum.xy_count / datum.max_count", + "as": "percent_of_max" + } + ], + "encoding": { + "x": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + }, + "y": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + } + }, + "layer": [ + { + "mark": "rect", + "width": 300, + "height": 300, + "encoding": { + "color": { + "field": "xy_count", + "type": "quantitative", + "title": "", + "scale": { + "domainMin": 0, + "nice": true + } + } + } + }, + { + "mark": "text", + "encoding": { + "text": { + "field": "xy_count", + "type": "quantitative" + }, + "color": { + "condition": { + "test": "datum.percent_of_max > 0.5", + "value": "white" + }, + "value": "black" + } + } + } + ] + } +} diff --git a/.dvc/plots/confusion_normalized.json b/.dvc/plots/confusion_normalized.json new file mode 100644 index 0000000..1d38849 --- /dev/null +++ b/.dvc/plots/confusion_normalized.json @@ -0,0 +1,112 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "facet": { + "field": "rev", + "type": "nominal" + }, + "spec": { + "transform": [ + { + "aggregate": [ + { + "op": "count", + "as": "xy_count" + } + ], + "groupby": [ + "", + "" + ] + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "joinaggregate": [ + { + "op": "sum", + "field": "xy_count", + "as": "sum_y" + } + ], + "groupby": [ + "" + ] + }, + { + "calculate": "datum.xy_count / datum.sum_y", + "as": "percent_of_y" + } + ], + "encoding": { + "x": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + }, + "y": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + } + }, + "layer": [ + { + "mark": "rect", + "width": 300, + "height": 300, + "encoding": { + "color": { + "field": "percent_of_y", + "type": "quantitative", + "title": "", + "scale": { + "domain": [ + 0, + 1 + ] + } + } + } + }, + { + "mark": "text", + "encoding": { + "text": { + "field": "percent_of_y", + "type": "quantitative", + "format": ".2f" + }, + "color": { + "condition": { + "test": "datum.percent_of_y > 0.5", + "value": "white" + }, + "value": "black" + } + } + } + ] + } +} diff --git a/.dvc/plots/default.json b/.dvc/plots/default.json new file mode 100644 index 0000000..9cf71ce --- /dev/null +++ b/.dvc/plots/default.json @@ -0,0 +1,31 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "mark": { + "type": "line" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + } +} diff --git a/.dvc/plots/linear.json b/.dvc/plots/linear.json new file mode 100644 index 0000000..65549f9 --- /dev/null +++ b/.dvc/plots/linear.json @@ -0,0 +1,116 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "layer": [ + { + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "layer": [ + { + "mark": "line" + }, + { + "selection": { + "label": { + "type": "single", + "nearest": true, + "on": "mouseover", + "encodings": [ + "x" + ], + "empty": "none", + "clear": "mouseout" + } + }, + "mark": "point", + "encoding": { + "opacity": { + "condition": { + "selection": "label", + "value": 1 + }, + "value": 0 + } + } + } + ] + }, + { + "transform": [ + { + "filter": { + "selection": "label" + } + } + ], + "layer": [ + { + "mark": { + "type": "rule", + "color": "gray" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative" + } + } + }, + { + "encoding": { + "text": { + "type": "quantitative", + "field": "" + }, + "x": { + "field": "", + "type": "quantitative" + }, + "y": { + "field": "", + "type": "quantitative" + } + }, + "layer": [ + { + "mark": { + "type": "text", + "align": "left", + "dx": 5, + "dy": -5 + }, + "encoding": { + "color": { + "type": "nominal", + "field": "rev" + } + } + } + ] + } + ] + } + ] +} diff --git a/.dvc/plots/scatter.json b/.dvc/plots/scatter.json new file mode 100644 index 0000000..9af9304 --- /dev/null +++ b/.dvc/plots/scatter.json @@ -0,0 +1,104 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "layer": [ + { + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "layer": [ + { + "mark": "point" + }, + { + "selection": { + "label": { + "type": "single", + "nearest": true, + "on": "mouseover", + "encodings": [ + "x" + ], + "empty": "none", + "clear": "mouseout" + } + }, + "mark": "point", + "encoding": { + "opacity": { + "condition": { + "selection": "label", + "value": 1 + }, + "value": 0 + } + } + } + ] + }, + { + "transform": [ + { + "filter": { + "selection": "label" + } + } + ], + "layer": [ + { + "encoding": { + "text": { + "type": "quantitative", + "field": "" + }, + "x": { + "field": "", + "type": "quantitative" + }, + "y": { + "field": "", + "type": "quantitative" + } + }, + "layer": [ + { + "mark": { + "type": "text", + "align": "left", + "dx": 5, + "dy": -5 + }, + "encoding": { + "color": { + "type": "nominal", + "field": "rev" + } + } + } + ] + } + ] + } + ] +} diff --git a/.dvc/plots/smooth.json b/.dvc/plots/smooth.json new file mode 100644 index 0000000..d497ce7 --- /dev/null +++ b/.dvc/plots/smooth.json @@ -0,0 +1,39 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "mark": { + "type": "line" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "transform": [ + { + "loess": "", + "on": "", + "groupby": [ + "rev" + ], + "bandwidth": 0.3 + } + ] +} diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ad61e40 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/train_dataset_dvc.csv +/test_dataset_dvc.csv +/logs_dvc.txt diff --git a/create_data.py b/create_data.py index 5a6aeac..902977a 100644 --- a/create_data.py +++ b/create_data.py @@ -58,4 +58,7 @@ test_dataset = pd.concat([X_test, y_test], axis=1) train_dataset.to_csv('train_dataset.csv', index=False) test_dataset.to_csv('test_dataset.csv', index=False) +train_dataset.to_csv('train_dataset_dvc.csv', index=False) +test_dataset.to_csv('test_dataset_dvc.csv', index=False) + print("Quiting create_data.py") diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 0000000..c12baea --- /dev/null +++ b/dvc.lock @@ -0,0 +1,31 @@ +schema: '2.0' +stages: + prepare: + cmd: python create_data.py + deps: + - path: KaggleV2-May-2016.csv + md5: cc55525de6e2b615aeba50095e8aaa95 + size: 10850063 + - path: create_data.py + md5: 0cefc3631ca8d15d62df548c92d53eb3 + size: 2437 + outs: + - path: test_dataset_dvc.csv + md5: b0d2e25243ff9f564546becf8464af55 + size: 836063 + - path: train_dataset_dvc.csv + md5: a17d28d659e1ba5f62a4203b7635ccfe + size: 3345298 + train: + cmd: python train_model.py + deps: + - path: test_dataset_dvc.csv + md5: b0d2e25243ff9f564546becf8464af55 + size: 836063 + - path: train_dataset_dvc.csv + md5: a17d28d659e1ba5f62a4203b7635ccfe + size: 3345298 + outs: + - path: logs_dvc.txt + md5: 506eda87493d4f7cc0ba2b7389b5a182 + size: 52 diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 0000000..c54ee92 --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,16 @@ +stages: + prepare: + cmd: python create_data.py + deps: + - create_data.py + - KaggleV2-May-2016.csv + outs: + - train_dataset_dvc.csv + - test_dataset_dvc.csv + train: + cmd: python train_model.py + deps: + - train_dataset_dvc.csv + - test_dataset_dvc.csv + outs: + - logs_dvc.txt diff --git a/logs.txt b/logs.txt index 890f69c..f22a020 100644 --- a/logs.txt +++ b/logs.txt @@ -1 +1,2 @@ -loss=0.48354023694992065, accuracy=79.3711829902737 +loss=0.48352938890457153, accuracy=79.37457588780819 +loss=0.483554482460022, accuracy=79.36213526351504 diff --git a/train_model.py b/train_model.py index 0c6c16c..5e1c428 100644 --- a/train_model.py +++ b/train_model.py @@ -6,8 +6,8 @@ import matplotlib import matplotlib.pyplot as plt import seaborn as sns -train_dataset = pd.read_csv('train_dataset.csv') -test_dataset = pd.read_csv('test_dataset.csv') +train_dataset = pd.read_csv('train_dataset_dvc.csv') +test_dataset = pd.read_csv('test_dataset_dvc.csv') X_train = train_dataset.drop(columns=['No-show']).to_numpy() X_test = test_dataset.drop(columns=['No-show']).to_numpy() @@ -79,4 +79,8 @@ print(f"Iteration: {iter}. \nTest - Loss: {loss_test.item()}. Accuracy: {accurac print(f"Train - Loss: {loss.item()}. Accuracy: {accuracy}\n") with open("logs.txt", "a") as myfile: - myfile.write(f"loss={loss.item()}, accuracy={accuracy}\n") \ No newline at end of file + myfile.write(f"loss={loss.item()}, accuracy={accuracy}\n") + +with open("logs_dvc.txt", "a") as myfile: + myfile.write(f"loss={loss.item()}, accuracy={accuracy}\n") +