Move Kaggle dataset preparation from image build time to pipeline run time.

The Dockerfile no longer downloads the dataset or runs init.py while building.
The Jenkinsfile runs init.py in a stage, archives atp_test.csv and
atp_train.csv, and then triggers the s444498-training/master job.
Jenkinsfile-training builds its agent from the Dockerfile, copies the dataset
artifacts, trains the model and archives model.zip. init.py downloads and
unzips the dataset itself when df_atp.csv is missing, and both Python scripts
use CSV files in the working directory instead of data/.

Review fixes folded into added lines (hunk sizes are unchanged):
PATH is set with ENV, because an export inside RUN ends with that build step;
the agent args string is double-quoted so Groovy interpolates the params;
subprocess.run uses check=True so a failed download or unzip stops the script.
The post-image blob ids on the index lines were not recomputed.

diff --git a/Dockerfile b/Dockerfile
index b7c6426..54624d5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,6 +2,7 @@
 FROM ubuntu:latest
 
 # Install required dependencies
+ENV PATH="$PATH:/root/.local/bin"
 RUN apt update
 RUN apt-get update
 RUN apt install -y figlet
@@ -13,19 +14,18 @@ RUN pip3 install pandas
 RUN pip3 install pillow --global-option="build_ext" --global-option="--disable-zlib" --global-option="--disable-jpeg"
 RUN pip3 install scikit-learn
 RUN pip3 install matplotlib
+RUN pip3 install torchvision
+
+# Args
+ARG KAGGLE_USERNAME
+ARG KAGGLE_KEY
+ENV IS_DOCKER=True
 
 # Create app directory in image
 WORKDIR /app
 
+# Copy the Jenkins workspace (build context) into /app
 COPY . .
 
-ARG KAGGLE_USERNAME
-ARG KAGGLE_KEY
-# Download kaggle dataset
-RUN kaggle datasets download -d hakeem/atp-and-wta-tennis-data
-RUN unzip -o atp-and-wta-tennis-data.zip
-
-# Script executed after docker run
-RUN python3 ./init.py
-RUN chmod a+rwx -R *
-RUN ls -la
\ No newline at end of file
+# Create the Kaggle config directory used for authentication
+RUN mkdir /.kaggle/ && chmod o+w /.kaggle
\ No newline at end of file
diff --git a/Jenkinsfile b/Jenkinsfile
index 8689984..83b362b 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,4 +1,10 @@
-pipeline {
+pipeline {
+    agent {
+        dockerfile {
+            additionalBuildArgs '-t ium'
+            args "-e KAGGLE_USERNAME=${params.KAGGLE_USERNAME} -e KAGGLE_KEY=${params.KAGGLE_KEY}"
+        }
+    }
     parameters {
         string (
             defaultValue: 'wirus006',
@@ -12,22 +18,23 @@ pipeline {
             name: 'KAGGLE_KEY'
         )
     }
-    agent {
-        dockerfile {
-            additionalBuildArgs "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY} -t s444498-create-dataset"
-        }
+    options {
+        copyArtifactPermission('s444498-training');
     }
     stages {
-        stage('Archive dataset') {
+        stage('Init datasets') {
             steps {
-                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
-                         "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
-                    sh 'echo hello world | figlet'
-                    sh 'chmod a+rwx -R *'
-                    sh 'pwd && ls'
-                    sh 'ls /app/data/'
-                    archiveArtifacts artifacts: '/app/data/*', onlyIfSuccessful: true
-                }
+                sh 'python3 init.py'
+            }
+        }
+        stage('Archive datasets') {
+            steps {
+                archiveArtifacts artifacts: 'atp_test.csv, atp_train.csv', onlyIfSuccessful: true
+            }
+        }
+        stage('Run training job') {
+            steps {
+                build job: "s444498-training/master"
             }
         }
     }
diff --git a/Jenkinsfile-training b/Jenkinsfile-training
index 0fd7f96..f8cf5dc 100644
--- a/Jenkinsfile-training
+++ b/Jenkinsfile-training
@@ -1,4 +1,8 @@
 pipeline {
+    agent {
+        dockerfile true
+    }
+
     parameters {
         string(
             defaultValue: '64',
@@ -12,47 +16,47 @@ pipeline {
             name: 'EPOCHS',
             trim: true
         )
-        gitParameter branchFilter: 'origin/(.*)', defaultValue: 'main', name: 'BRANCH', type: 'PT_BRANCH'
         buildSelector(
             defaultSelector: lastSuccessful(),
             description: 'Which build to use for copying artifacts',
             name: 'BUILD_SELECTOR'
         )
     }
-
-    agent {
-        docker {
-            image 's444498-create-dataset'
-        }
-    }
 
     stages {
+        stage('Copy artifacts') {
+            steps {
+                copyArtifacts fingerprintArtifacts: true, projectName: 's444498-create-dataset', selector: buildParameter('BUILD_SELECTOR')
+            }
+        }
         stage('Train model') {
             steps {
-                sh "python neutral_network.py -e ${params.EPOCHS} -b ${params.BATCHSIZE}"
+                sh "chmod u+x ./neutral_network.py"
+                sh "python3 neutral_network.py -e ${params.EPOCHS} -b ${params.BATCHSIZE}"
+            }
+        }
+        stage('Archive model') {
+            steps {
+                archiveArtifacts artifacts: "model.zip", onlyIfSuccessful: true
             }
         }
     }
 
-    environment {
-        NOTIFICATION_ADDRESS = 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
-    }
-
     post {
         success {
-            emailext body: 'SUCCESS', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
+            emailext body: "SUCCESS", subject: "s444498-training", to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms"
         }
 
         failure {
-            emailext body: 'FAILURE', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
+            emailext body: "FAILURE", subject: "s444498-training", to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms"
         }
 
         unstable {
-            emailext body: 'UNSTABLE', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
+            emailext body: 'UNSTABLE', subject: "s444498-training", to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms"
         }
 
         changed {
-            emailext body: 'CHANGED', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
-        }
+            emailext body: 'CHANGED', subject: "s444498-training", to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms"
+        }
     }
 }
\ No newline at end of file
diff --git a/init.py b/init.py
index 3f1fe4b..3743179 100644
--- a/init.py
+++ b/init.py
@@ -7,41 +7,36 @@ import matplotlib
 from pathlib import Path
 
 # Inicjalizacja danych
-
+file_exists = exists('./df_atp.csv')
+if not file_exists:
+    subprocess.run(["kaggle", "datasets", "download", "-d", "hakeem/atp-and-wta-tennis-data"], check=True)
+    subprocess.run(["unzip", "-o", "atp-and-wta-tennis-data.zip"], check=True)
 atp_data = pd.read_csv('df_atp.csv')
 print(atp_data)
 
 # Średnia ilość gemów w pierwszym secie zwycięzców meczu
-
 print(atp_data[["Winner", "W1"]].mean())
 
 # Minimalna ilość wygranych gemów w pierwszym secie osób wygrywających mecz
-
 print(atp_data[["Winner", "W1"]].min())
 
 # Maksymalna ilość wygranych gemów w pierwszym secie osób wygrywających mecz
-
 print(atp_data[["Winner", "W1"]].max())
 
 # Odchylenie standardowe wygranych gemów w pierwszym secie osób wygrywających mecz
-
 print(atp_data[["Winner", "W1"]].std())
 
 # Mediana wygranych gemów w pierwszym secie osób wygrywających mecz
-
 print(atp_data[["Winner", "W1"]].median())
 
 # Zmiana nazwy nienazwanej kolumny
-
 atp_data.rename(columns={'Unnamed: 0':'ID'}, inplace=True)
 
 # Jak często kto był zwycięzcą
-
 print(atp_data.groupby("Winner")["ID"].nunique())
 
 # Normalizacja rund -1: Finał, -2: Półfinał, -3: Ćwiartka, -4: Każdy z każdym
 # 1: pierwsza runda, 2: druga runda, 3: trzecia runda, 4: czwarta runda
-
 atp_data.loc[atp_data["Round"] == 'The Final', "Round"] = -1
 atp_data.loc[atp_data["Round"] == 'Semifinals', "Round"] = -2
 atp_data.loc[atp_data["Round"] == 'Quarterfinals', "Round"] = -3
@@ -53,28 +48,19 @@ atp_data.loc[atp_data["Round"] == '4th Round', "Round"] = 4
 print(atp_data["Round"])
 
 # Czyszczenie: W polu z datą zamienimy ######## na pustego stringa
-
 atp_data.loc[atp_data["Date"] == '########', "Date"] = ''
 print(atp_data["Date"])
 
 # Podział na podzbiory: trenujący, testowy, walidujący w proporcjach 6:2:2
-
 atp_train, atp_test = train_test_split(atp_data, test_size=0.4, random_state=1)
 atp_dev, atp_test = train_test_split(atp_test, test_size=0.5, random_state=1)
 
 # Wielkość zbioru i podzbiorów
-
 print("\nElements of total set: " + str(len(atp_data)))
 print("\nElements of test set: " + str(len(atp_test)))
 print("\nElements of dev set: " + str(len(atp_dev)))
 print("\nElements of train set: " + str(len(atp_train)))
 
 # Stworzenie plików z danymi trenującymi i testowymi
-
-filepath1 = Path('data/atp_test.csv')
-filepath2 = Path('data/atp_train.csv')
-filepath1.parent.mkdir(parents=True, exist_ok=True)
-filepath2.parent.mkdir(parents=True, exist_ok=True)
-
-atp_test.to_csv(filepath1)
-atp_train.to_csv(filepath2)
\ No newline at end of file
+atp_test.to_csv('atp_test.csv', encoding="utf-8", index=False)
+atp_train.to_csv('atp_train.csv', encoding="utf-8", index=False)
\ No newline at end of file
diff --git a/neutral_network.py b/neutral_network.py
index 65961d3..1b269bb 100644
--- a/neutral_network.py
+++ b/neutral_network.py
@@ -87,8 +87,8 @@ print(f"Using {device} device")
 args = setup_args()
 batch_size = args.batchSize
 
-plant_test = AtpDataset('data/atp_test.csv')
-plant_train = AtpDataset('data/atp_train.csv')
+plant_test = AtpDataset('atp_test.csv')
+plant_train = AtpDataset('atp_train.csv')
 train_dataloader = DataLoader(plant_train, batch_size=batch_size)
 test_dataloader = DataLoader(plant_test, batch_size=batch_size)
 