Dockerfile

commit fe38ed7a4c by s444417, 2022-04-03 12:24:05 +02:00
parent 55e68cc2cc
5 changed files with 119 additions and 17 deletions

.vscode/settings.json (new file, +3 lines)

@@ -0,0 +1,3 @@
{
    "python.terminal.executeInFileDir": true
}

Dockerfile (new file, +26 lines)

@@ -0,0 +1,26 @@
# Our image inherits from the latest Ubuntu image
FROM ubuntu:latest
# Install the required dependencies; note the "-y" flag (assume yes)
RUN apt update && apt install -y
RUN apt-get install -y python3
RUN apt-get install -y unzip
RUN apt-get install -y python3-pip
RUN pip install --upgrade pip
RUN pip install --user kaggle
RUN pip install --user pandas
# Create the /app directory in the container (if it does not exist) and switch to it
# (all subsequent RUN, CMD, ENTRYPOINT, COPY and ADD instructions will be executed there)
WORKDIR /app
# Copy our script into the /app directory in the container
COPY ./startscript1.sh ./
COPY ./src/task1python.py ./src/task1python.py
ARG KAGGLE_USERNAME
ARG KAGGLE_KEY
# Default command to run when the container starts
# RUN chmod u+x ./startscript1.sh
# RUN chmod u+x ./src/task1python.py
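The image installs the kaggle client and receives KAGGLE_USERNAME / KAGGLE_KEY, so the data download performed by startscript1.sh (whose contents are not part of this commit) presumably goes through the Kaggle API. A minimal Python sketch of such a download step, assuming the credentials are exported as environment variables; the dataset slug below is a placeholder, not taken from the repository:

import os
from kaggle.api.kaggle_api_extended import KaggleApi

# KaggleApi reads KAGGLE_USERNAME and KAGGLE_KEY from the environment,
# i.e. the values passed through the Docker build args / Jenkins parameters
api = KaggleApi()
api.authenticate()

# placeholder slug -- the real dataset name is not given in this commit
api.dataset_download_files("owner/house-price-prediction",
                           path="Participants_Data_HPP", unzip=True)
print(os.listdir("Participants_Data_HPP"))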

Jenkinsfile (6 changed lines)

@@ -1,5 +1,4 @@
 pipeline {
-    agent any
     parameters{
         string(
             defaultValue: 'mikolaj2',
@@ -17,6 +16,9 @@ pipeline {
             name: 'KAGGLE_KEY'
         )
     }
+    agent {
+        dockerfile { additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" -t my-image' }
+    }
     stages {
         stage("Check out from version control") {
             steps {
@@ -26,7 +28,7 @@ pipeline {
         stage("Shell Scripts") {
             steps {
                 sh "chmod u+x ./startscript1.sh"
-                sh "KAGGLE_USERNAME=${KAGGLE_USERNAME} KAGGLE_KEY=${env.KAGGLE_KEY} CUTOFF=${CUTOFF} ./startscript1.sh"
+                sh "KAGGLE_USERNAME=${params.KAGGLE_USERNAME} KAGGLE_KEY=${params.KAGGLE_KEY} CUTOFF=${CUTOFF} ./startscript1.sh"
                 archiveArtifacts 'data.txt'
             }
         }
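The shell step exports KAGGLE_USERNAME, KAGGLE_KEY and CUTOFF as environment variables for startscript1.sh. A hedged sketch of how a downstream Python step (for example src/task1python.py) could pick up the CUTOFF parameter; this handling is an assumption for illustration, not part of the committed script:

import os

# CUTOFF comes from the Jenkins pipeline as an environment variable;
# fall back to an assumed default when running outside the pipeline
cutoff = int(os.environ.get("CUTOFF", "500"))
print(f"using the first {cutoff} rows")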

Jupyter notebook (file name not shown)

@@ -78,7 +78,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -95,7 +95,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -104,15 +104,23 @@
      "text": [
       "<class 'pandas.core.frame.DataFrame'>\n",
       "RangeIndex: 29451 entries, 0 to 29450\n",
-      "Data columns (total 4 columns):\n",
+      "Data columns (total 12 columns):\n",
       " #   Column                 Non-Null Count  Dtype  \n",
       "---  ------                 --------------  -----  \n",
-      " 0   TARGET(PRICE_IN_LACS)  29451 non-null  float64\n",
-      " 1   SQUARE_FT              29451 non-null  float64\n",
-      " 2   BHK_NO.                29451 non-null  int64  \n",
-      " 3   RESALE                 29451 non-null  int64  \n",
-      "dtypes: float64(2), int64(2)\n",
-      "memory usage: 920.5 KB\n"
+      " 0   POSTED_BY              29451 non-null  object \n",
+      " 1   UNDER_CONSTRUCTION     29451 non-null  int64  \n",
+      " 2   RERA                   29451 non-null  int64  \n",
+      " 3   BHK_NO.                29451 non-null  int64  \n",
+      " 4   BHK_OR_RK              29451 non-null  object \n",
+      " 5   SQUARE_FT              29451 non-null  float64\n",
+      " 6   READY_TO_MOVE          29451 non-null  int64  \n",
+      " 7   RESALE                 29451 non-null  int64  \n",
+      " 8   ADDRESS                29451 non-null  object \n",
+      " 9   LONGITUDE              29451 non-null  float64\n",
+      " 10  LATITUDE               29451 non-null  float64\n",
+      " 11  TARGET(PRICE_IN_LACS)  29451 non-null  float64\n",
+      "dtypes: float64(4), int64(5), object(3)\n",
+      "memory usage: 2.7+ MB\n"
      ]
     }
    ],
@@ -122,7 +130,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -177,7 +185,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -226,7 +234,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -235,7 +243,7 @@
      "<AxesSubplot:>"
     ]
    },
-   "execution_count": 11,
+   "execution_count": 6,
    "metadata": {},
    "output_type": "execute_result"
   },
@@ -259,7 +267,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -362,7 +370,7 @@
    "max       1.000000  1.000000  20.000000  1.000000"
   ]
  },
- "execution_count": 13,
+ "execution_count": 7,
  "metadata": {},
  "output_type": "execute_result"
 }
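The df.info() output above changes because it is now called on the full 12-column frame; the earlier 4-column output corresponds to the reduced frame used later in src/task1python.py. A small sketch of that reduction (column names taken from the diff above, paths as used in the script):

import pandas as pd

data = pd.read_csv("../Participants_Data_HPP/Train.csv")
data.info()   # 12 columns, as in the new notebook output
reduced = data[['TARGET(PRICE_IN_LACS)', 'SQUARE_FT', 'BHK_NO.', 'RESALE']]
reduced.info()   # 4 columns, as in the old notebook output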

src/task1python.py (new file, +63 lines)

@@ -0,0 +1,63 @@
import pandas as pd
# paths
filePathTest = "../Participants_Data_HPP/Train.csv"
filePathTrain = "../Participants_Data_HPP/Test.csv"
dataTest = pd.read_csv(filePathTest)
dataTrain = pd.read_csv(filePathTrain)
number_lines = len(dataTest.index)
row_size = number_lines // 2
# loop through the data, writing each split to its own CSV file
# number of output csv files, each of row_size rows
k = 2
size = row_size
# split test data to test and dev
for i in range(k):
df = dataTest[size * i:size * (i + 1)]
name = ""
if i == 0:
name = "Dev"
else:
name = "Test"
    df.to_csv(f'../Participants_Data_HPP/{name}.csv', index=False)
#df_1 = pd.read_csv("../Participants_Data_HPP/Dev.csv")
#df_2 = pd.read_csv("../Participants_Data_HPP/Test.csv")
#df_2 = pd.read_csv("../Participants_Data_HPP/Train.csv")
dataPath = '../Participants_Data_HPP/Train.csv'
# data overview
data = pd.read_csv(dataPath)
description = data.describe(include="all")
corr = data.corr()
# select the most significant columns
data = data[['TARGET(PRICE_IN_LACS)', 'SQUARE_FT', 'BHK_NO.', 'RESALE']]
# normalize the price and floor-area columns using the min-max technique
columnName1 = 'TARGET(PRICE_IN_LACS)'
columnName2 = 'SQUARE_FT'
column1Min = data[columnName1].min()
column1Max = data[columnName1].max()
column2Min = data[columnName2].min()
column2Max = data[columnName2].max()
data[columnName1] = (data[columnName1] - column1Min) / (column1Max - column1Min)
data[columnName2] = (data[columnName2] - column2Min) / (column2Max - column2Min)
print(description)
print(corr)
print(data.describe(include="all"))
print(data.head())
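As a quick sanity check of the min-max scaling above (a minimal sketch reusing the data frame from the script), each normalized column should span exactly [0, 1]:

# after min-max scaling, every non-constant column ends at min 0.0 and max 1.0
for col in ['TARGET(PRICE_IN_LACS)', 'SQUARE_FT']:
    assert data[col].min() == 0.0 and data[col].max() == 1.0
print(data[['TARGET(PRICE_IN_LACS)', 'SQUARE_FT']].agg(['min', 'max']))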