diff --git a/Jenkinsfile b/Jenkinsfile
index 3b9d9c6..325c7e9 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -35,14 +35,19 @@ pipeline {
             steps {
                 withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                          "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
-                    sh 'chmod 777 ./data_processing.sh'
-                    sh "./data_processing.sh ${params.CUTOFF}"
+                    sh 'chmod 777 ./data_processing.py'
+                    sh "./data_processing.py ${params.CUTOFF}"
                 }
             }
         }
         stage('Artifacts') {
             steps {
-                archiveArtifacts artifacts: 'processed_data.txt'
+                script {
+                    def artifactsList = ['hp_train.csv', 'hp_dev.csv', 'hp_test.csv']
+                    artifactsList.each { artifact ->
+                        archiveArtifacts artifacts: artifact
+                    }
+                }
             }
         }
     }
diff --git a/data_processing.py b/data_processing.py
new file mode 100644
index 0000000..adb9b42
--- /dev/null
+++ b/data_processing.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+import subprocess
+import sys
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+# Download and unpack the dataset from Kaggle (credentials come from the environment)
+subprocess.run(["kaggle", "datasets", "download", "muhammadbinimran/housing-price-prediction-data", "--unzip"])
+housing_price_dataset = pd.read_csv('housing_price_dataset.csv')
+
+# CUTOFF (first CLI argument, passed by the pipeline) caps the number of rows processed
+if len(sys.argv) > 1:
+    housing_price_dataset = housing_price_dataset.head(int(sys.argv[1]))
+
+# Hold out 10% as the dev set, then a fixed 1000 rows as the test set
+hp_train_test, hp_dev = train_test_split(housing_price_dataset, test_size=0.1)
+hp_train, hp_test = train_test_split(hp_train_test, test_size=1000)
+
+# One-hot encode the categorical Neighborhood column
+hp_train = pd.get_dummies(hp_train, columns=['Neighborhood'])
+hp_dev = pd.get_dummies(hp_dev, columns=['Neighborhood'])
+hp_test = pd.get_dummies(hp_test, columns=['Neighborhood'])
+
+hp_train.to_csv('hp_train.csv', index=False)
+hp_dev.to_csv('hp_dev.csv', index=False)
+hp_test.to_csv('hp_test.csv', index=False)
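
For a quick local sanity check outside Jenkins, a minimal sketch like the one below can be run after data_processing.py finishes. It assumes only the three CSV file names produced above and the split sizes chosen in the script (10% dev, fixed 1000-row test), and that the full dataset was processed (no CUTOFF passed).

    import pandas as pd

    # Load the three splits written by data_processing.py
    train = pd.read_csv('hp_train.csv')
    dev = pd.read_csv('hp_dev.csv')
    test = pd.read_csv('hp_test.csv')

    total = len(train) + len(dev) + len(test)
    print(f"train={len(train)} dev={len(dev)} test={len(test)} total={total}")

    # test_size=1000 yields exactly 1000 test rows; dev should be ~10% of the whole dataset
    assert len(test) == 1000
    assert abs(len(dev) / total - 0.10) < 0.02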