diff --git a/Dockerfile b/Dockerfile
index 597b075..4887827 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,4 +11,4 @@ RUN pip install --user geopandas
 
 WORKDIR /app
 
-COPY ./get_dataset.sh ./
+COPY IUM_2.py ./
diff --git a/IUM_2.py b/IUM_2.py
new file mode 100644
index 0000000..ba0c5d4
--- /dev/null
+++ b/IUM_2.py
@@ -0,0 +1,84 @@
+import matplotlib.pyplot as plt
+import pandas as pd
+import kaggle
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+
+
+def download_file():
+    """Download and unzip the NASA meteorite-landings dataset into the cwd.
+
+    Credentials come from the Kaggle API configuration, e.g. the
+    KAGGLE_USERNAME / KAGGLE_KEY environment variables.
+    """
+    kaggle.api.authenticate()
+    kaggle.api.dataset_download_files('nasa/meteorite-landings', path='.', unzip=True)
+
+
+def split(data):
+    """Split data into train, test and validation frames (60/20/20).
+
+    The frames are returned in the order (train, test, validation).
+    """
+    meteorite_train, meteorite_test = train_test_split(data, test_size=0.2, random_state=1)
+    # 0.25 of the remaining 80% yields a validation set of 20% of all rows.
+    meteorite_train, meteorite_val = train_test_split(meteorite_train, test_size=0.25, random_state=1)
+    return meteorite_train, meteorite_test, meteorite_val
+
+
+def normalization(data, scaler=None):
+    """Return a copy of data with the 'mass' column standardized.
+
+    With scaler=None a new StandardScaler is fitted on data itself.
+    Pass an already fitted scaler to apply the training-set statistics
+    to the validation and test sets.
+    """
+    if scaler is None:
+        scaler = StandardScaler().fit(data[['mass']])
+    # Work on a copy: the splits are derived from the full frame, and
+    # assigning into them can trigger pandas' SettingWithCopyWarning.
+    data = data.copy()
+    data['mass'] = scaler.transform(data[['mass']])
+    return data
+
+
+def preprocessing(data):
+    """Return a cleaned copy of the raw meteorite data.
+
+    Rows without a latitude, with a year outside 860-2016, or located
+    exactly at (0, 0) are dropped. Missing masses of finds whose name
+    starts with 'Österplana' are set to 0.
+    """
+    data = data.dropna(subset=['reclat'])
+
+    incorrect_years = (data['year'] > 2016) | (data['year'] < 860)
+    incorrect_location = (data['reclat'] == 0) & (data['reclong'] == 0)
+    # Filter with a mask and copy, so the assignment below writes to an
+    # independent frame rather than to a slice of the caller's data.
+    data = data.loc[~(incorrect_years | incorrect_location)].copy()
+
+    missing_mass = data['mass'].isnull() & data['name'].str.startswith('Österplana', na=False)
+    data.loc[missing_mass, 'mass'] = 0
+    return data
+
+
+def main():
+    """Download, clean, split and normalize the data, then write the CSVs."""
+    download_file()
+    data = preprocessing(pd.read_csv("meteorite-landings.csv"))
+    meteorite_train, meteorite_test, meteorite_val = split(data)
+
+    # Fit the scaler on the training set only and reuse it for the other
+    # splits, so held-out statistics do not leak into the scaling.
+    scaler = StandardScaler().fit(meteorite_train[['mass']])
+    meteorite_train = normalization(meteorite_train, scaler)
+    meteorite_test = normalization(meteorite_test, scaler)
+    meteorite_val = normalization(meteorite_val, scaler)
+
+    meteorite_train.to_csv('meteorite_train.csv', encoding='utf-8')
+    meteorite_test.to_csv('meteorite_test.csv', encoding='utf-8')
+    meteorite_val.to_csv('meteorite_val.csv', encoding='utf-8')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/Jenkinsfile b/Jenkinsfile
index d299d7e..caae88a 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -6,6 +6,11 @@ pipeline {
         string(name: 'CUTOFF', defaultValue: '100', description: 'cut off number')
     }
     stages {
+        stage('Git Checkout') {
+            steps {
+                checkout scm
+            }
+        }
         stage('Build') {
             steps {
                 script {
@@ -13,9 +18,8 @@ pipeline {
                         "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
                         def customImage = docker.build("custom-image")
                         customImage.inside {
-                            checkout scm
-                            sh 'bash ./get_dataset.sh $CUTOFF'
-                            archiveArtifacts artifacts: 'artifacts/*', onlyIfSuccessful: true
+                            sh 'python3 ./IUM_2.py'
+                            archiveArtifacts artifacts: 'meteorite-landings.csv, meteorite_train.csv, meteorite_test.csv, meteorite_val.csv', onlyIfSuccessful: true
                         }
                     }
                 }
diff --git a/meteorite-landings.zip b/meteorite-landings.zip
deleted file mode 100644
index 8e31256..0000000
Binary files a/meteorite-landings.zip and /dev/null differ