diff --git a/Dockerfile b/Dockerfile
index 31fe6fa..8d205c1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,15 +2,16 @@ FROM ubuntu:latest
 
 WORKDIR /ium
 
-COPY ./skrypt.sh ./
-COPY ./statystyki.sh ./
+RUN apt update && apt install -y python3-pip unzip
 
-RUN chmod u+x ./skrypt.sh ./statystyki.sh
+RUN pip3 install --user kaggle
+RUN echo "export PATH=\"\$HOME/.local/bin:\$PATH\"" >> ~/.bashrc
 
-RUN apt update && apt install -y python3-pip zip
+RUN pip3 install --user pandas
+RUN pip3 install --user numpy
+RUN pip3 install --user scikit-learn
 
-RUN pip3 install kaggle
+COPY ./download_dataset.sh ./
+COPY ./process_dataset.py ./
 
-RUN mkdir ~/.kaggle
-RUN echo "{\"username\":\"kaerde\",\"key\":\"9831adcf128300863404c16935674937\"}" > ~/.kaggle/kaggle.json
-RUN echo "export PATH=\"\$HOME/.local/bin:\$PATH\"" >> ~/.bashrc
\ No newline at end of file
+RUN chmod u+x ./download_dataset.sh ./process_dataset.py
\ No newline at end of file
diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 0000000..82f4c17
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,32 @@
+pipeline {
+    parameters {
+        string(
+            defaultValue: 'kaerde',
+            description: 'Kaggle username',
+            name: 'KAGGLE_USERNAME',
+            trim: false
+        )
+        password(
+            defaultValue: 'e3fdd794699dbaf3ca7517dd8bb15d16',
+            description: 'Kaggle token taken from the kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
+            name: 'KAGGLE_KEY'
+        )
+    }
+    agent {
+        dockerfile {
+            additionalBuildArgs "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY}"
+        }
+    }
+    stages {
+        stage('Stage 1') {
+            steps {
+                echo 'Downloading dataset...'
+                sh './download_dataset.sh'
+                echo 'Dataset downloaded'
+                echo 'Processing dataset...'
+                sh 'python3 process_dataset.py'
+                echo 'Dataset processed'
+            }
+        }
+    }
+}
diff --git a/download_dataset.sh b/download_dataset.sh
new file mode 100644
index 0000000..bae3c01
--- /dev/null
+++ b/download_dataset.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+kaggle datasets download -d mterzolo/lego-sets
+unzip -o lego-sets.zip
diff --git a/process_dataset.py b/process_dataset.py
new file mode 100644
index 0000000..2384cc9
--- /dev/null
+++ b/process_dataset.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+# load the dataset, dropping rows with missing values along the way
+lego = pd.read_csv('lego_sets.csv').dropna()
+
+# list_price only needs two decimal places
+lego['list_price'] = lego['list_price'].round(2)
+
+# num_reviews, piece_count and prod_id can be stored as integers
+lego['num_reviews'] = lego['num_reviews'].apply(np.int64)
+lego['piece_count'] = lego['piece_count'].apply(np.int64)
+lego['prod_id'] = lego['prod_id'].apply(np.int64)
+
+# quick look at the data and its summary statistics
+print(lego)
+print(lego.describe(include='all'))
+
+# first split: carve out the training set
+lego_train, lego_rem = train_test_split(lego, train_size=0.8, random_state=1)
+
+# second split: divide the remainder into validation and test sets
+lego_valid, lego_test = train_test_split(lego_rem, test_size=0.5, random_state=1)
+
+# save the cleaned dataset and the three splits
+lego.to_csv('lego_sets_clean.csv', index=None, header=True)
+lego_train.to_csv('lego_sets_clean_train.csv', index=None, header=True)
+lego_valid.to_csv('lego_sets_clean_valid.csv', index=None, header=True)
+lego_test.to_csv('lego_sets_clean_test.csv', index=None, header=True)
\ No newline at end of file
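
Note on the credential flow: the Jenkinsfile forwards KAGGLE_USERNAME and KAGGLE_KEY as --build-arg values, but the Dockerfile declares no matching ARG (or ENV) entries, so those build args are currently not consumed anywhere; the kaggle CLI does, however, read KAGGLE_USERNAME and KAGGLE_KEY from the environment at run time. A minimal local smoke test under that assumption is sketched below; the image tag ium and the credential placeholders are illustrative only, and the PATH export repeats what the Dockerfile appends to ~/.bashrc (which a non-interactive sh never sources).

    # build the image from the patched Dockerfile (no build args needed until ARG entries exist)
    docker build -t ium .

    # run both pipeline steps by hand, passing the Kaggle credentials as runtime env vars
    docker run --rm \
        -e KAGGLE_USERNAME=<your-username> -e KAGGLE_KEY=<your-key> \
        ium sh -c 'export PATH="$HOME/.local/bin:$PATH"; ./download_dataset.sh && python3 process_dataset.py'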
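
Given the train_size=0.8 and test_size=0.5 arguments in process_dataset.py, the cleaned file and the three splits should hold roughly 100% / 80% / 10% / 10% of the rows (plus one header line each). A quick sanity check, run wherever the CSVs end up (e.g. the container working directory or the Jenkins workspace):

    wc -l lego_sets_clean.csv lego_sets_clean_train.csv lego_sets_clean_valid.csv lego_sets_clean_test.csv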