Dockerfile

This commit is contained in:
s444417 2022-04-03 12:24:05 +02:00
parent 55e68cc2cc
commit fe38ed7a4c
5 changed files with 119 additions and 17 deletions

3
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,3 @@
{
"python.terminal.executeInFileDir": true
}

26
Dockerfile Normal file
View File

@ -0,0 +1,26 @@
# syntax=docker/dockerfile:1
# Base image: pin an explicit Ubuntu release instead of :latest so builds are
# reproducible (the image contents cannot silently change under us).
FROM ubuntu:22.04

# Install Python and tooling in ONE layer: `apt-get update` is combined with
# `install` so the package index can never be stale, `--no-install-recommends`
# keeps the image small, and the apt lists are removed in the same layer that
# created them (cleanup in a later layer would not shrink the image).
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# Install Python packages system-wide. No `--user`: the build runs as root and
# ~/.local/bin is not on PATH, so --user installs would leave `kaggle`
# unreachable. `--no-cache-dir` avoids baking pip's download cache into the
# layer.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir kaggle pandas

# Create /app if it does not exist and make it the working directory for all
# subsequent RUN, CMD, ENTRYPOINT, COPY and ADD instructions.
WORKDIR /app

# Copy the startup script and the task script into the image.
COPY ./startscript1.sh ./
COPY ./src/task1python.py ./src/task1python.py

# Kaggle credentials supplied at build time by the Jenkins pipeline.
# SECURITY NOTE(review): build args are recorded in `docker history`; prefer a
# BuildKit secret mount (RUN --mount=type=secret,...) or runtime env vars.
ARG KAGGLE_USERNAME
ARG KAGGLE_KEY

# No default command: the Jenkins pipeline invokes ./startscript1.sh itself.
# RUN chmod u+x ./startscript1.sh
# RUN chmod u+x ./src/task1python.py

6
Jenkinsfile vendored
View File

@ -1,5 +1,4 @@
pipeline {
agent any
parameters{
string(
defaultValue: 'mikolaj2',
@ -17,6 +16,9 @@ pipeline {
name: 'KAGGLE_KEY'
)
}
agent {
dockerfile {additionalBuildArgs "additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" -t my-image"}
}
stages {
stage("Check out from version control") {
steps {
@ -26,7 +28,7 @@ pipeline {
stage("Shell Scripts") {
steps {
sh "chmod u+x ./startscript1.sh"
sh "KAGGLE_USERNAME=${KAGGLE_USERNAME} KAGGLE_KEY=${env.KAGGLE_KEY} CUTOFF=${CUTOFF} ./startscript1.sh"
sh "KAGGLE_USERNAME=${params.KAGGLE_USERNAME} KAGGLE_KEY=${params.KAGGLE_KEY} CUTOFF=${CUTOFF} ./startscript1.sh"
archiveArtifacts 'data.txt'
}
}

View File

@ -78,7 +78,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@ -95,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@ -104,15 +104,23 @@
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 29451 entries, 0 to 29450\n",
"Data columns (total 4 columns):\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 TARGET(PRICE_IN_LACS) 29451 non-null float64\n",
" 1 SQUARE_FT 29451 non-null float64\n",
" 2 BHK_NO. 29451 non-null int64 \n",
" 3 RESALE 29451 non-null int64 \n",
"dtypes: float64(2), int64(2)\n",
"memory usage: 920.5 KB\n"
" 0 POSTED_BY 29451 non-null object \n",
" 1 UNDER_CONSTRUCTION 29451 non-null int64 \n",
" 2 RERA 29451 non-null int64 \n",
" 3 BHK_NO. 29451 non-null int64 \n",
" 4 BHK_OR_RK 29451 non-null object \n",
" 5 SQUARE_FT 29451 non-null float64\n",
" 6 READY_TO_MOVE 29451 non-null int64 \n",
" 7 RESALE 29451 non-null int64 \n",
" 8 ADDRESS 29451 non-null object \n",
" 9 LONGITUDE 29451 non-null float64\n",
" 10 LATITUDE 29451 non-null float64\n",
" 11 TARGET(PRICE_IN_LACS) 29451 non-null float64\n",
"dtypes: float64(4), int64(5), object(3)\n",
"memory usage: 2.7+ MB\n"
]
}
],
@ -122,7 +130,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@ -177,7 +185,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@ -226,7 +234,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@ -235,7 +243,7 @@
"<AxesSubplot:>"
]
},
"execution_count": 11,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
},
@ -259,7 +267,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -362,7 +370,7 @@
"max 1.000000 1.000000 20.000000 1.000000"
]
},
"execution_count": 13,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}

63
src/task1python.py Normal file
View File

@ -0,0 +1,63 @@
import pandas as pd

# --- Split the training CSV 50/50 into Dev.csv and Test.csv -----------------
# NOTE(review): in the original script the path variables were named the
# opposite of the files they point to (`filePathTest` read Train.csv). The
# data actually being split here comes from Train.csv — confirm that is the
# intended source. The original also read Test.csv into an unused variable;
# that dead read has been removed.
sourcePath = "../Participants_Data_HPP/Train.csv"
sourceData = pd.read_csv(sourcePath)

# Half the row count; with an odd number of rows the final row is dropped
# (same slicing behavior as the original loop).
halfSize = len(sourceData.index) // 2

# First half becomes the Dev set, second half overwrites the Test set.
for i, name in enumerate(("Dev", "Test")):
    chunk = sourceData[halfSize * i : halfSize * (i + 1)]
    chunk.to_csv(f"../Participants_Data_HPP/{name}.csv", index=False)

# --- Basic exploration of the training set ----------------------------------
dataPath = "../Participants_Data_HPP/Train.csv"
data = pd.read_csv(dataPath)

# Summary statistics over every column (numeric and object).
description = data.describe(include="all")

# numeric_only=True is required on pandas >= 2.0, where corr() no longer
# silently drops object columns (POSTED_BY, BHK_OR_RK, ADDRESS); on older
# pandas it reproduces the previous default behavior.
corr = data.corr(numeric_only=True)

# Keep only the columns considered most significant for the task.
data = data[["TARGET(PRICE_IN_LACS)", "SQUARE_FT", "BHK_NO.", "RESALE"]]

# Min-max normalize price and flat area into [0, 1].
for columnName in ("TARGET(PRICE_IN_LACS)", "SQUARE_FT"):
    columnMin = data[columnName].min()
    columnMax = data[columnName].max()
    data[columnName] = (data[columnName] - columnMin) / (columnMax - columnMin)

print(description)
print(corr)
print(data.describe(include="all"))
print(data.head())