diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..629b7f1 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,6 @@ +[core] + remote = my_local_remote +['remote "ium_ssh_remote"'] + url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl +['remote "my_local_remote"'] + url = .. diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8781153 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/Steel_industry_data.csv diff --git a/Steel_industry_data.csv.dvc b/Steel_industry_data.csv.dvc new file mode 100644 index 0000000..696706b --- /dev/null +++ b/Steel_industry_data.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: dc217c9856d659f8cf61d3156397e535 + size: 2731389 + path: Steel_industry_data.csv diff --git a/download.sh b/download.sh index 497211a..e7ef608 100644 --- a/download.sh +++ b/download.sh @@ -1,6 +1,3 @@ -kaggle datasets download -d csafrit2/steel-industry-energy-consumption --force -unzip -o steel-industry-energy-consumption.zip - head -n $CUTOFF Steel_industry_data.csv | tail -n +2 | cut -d, -f8 --complement | shuf > steel_industry_data_shuffled.csv number_of_lines=$(wc -l steel_industry_data_shuffled.csv | awk '{print $1}') test_len=$((number_of_lines/10))