Jenkins-2 Task 1.

Jan Nowak 2021-05-07 20:16:31 +02:00
parent 8a09b3e485
commit 38d765d1f2
18 changed files with 57561 additions and 69 deletions

3
.gitignore vendored Normal file

@@ -0,0 +1,3 @@
venv
.ipynb_checkpoints
.vscode

27
Dockerfile Normal file

@@ -0,0 +1,27 @@
# Our image inherits from the Ubuntu focal base image
FROM ubuntu:focal
# Install the required dependencies. Note the "-y" flag (assume yes)
RUN apt update
RUN apt install -y python3 python3-pip dos2unix git
RUN pip3 install kaggle
RUN apt install -y unzip
RUN mkdir /.kaggle
RUN chmod -R 777 /.kaggle
#RUN export KAGGLE_CONFIG_DIR=~/.kaggle
COPY ./requirments.txt ./
RUN pip3 install -r requirments.txt
RUN pip3 install torch==1.8.1+cpu torchvision==0.9.1+cpu torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
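# The pinned torch/torchvision/torchaudio wheels above are the CPU-only builds (+cpu), which is assumed to be enough for the Jenkins agents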
# Create the /app directory in the container (if it does not exist) and switch to it (all subsequent RUN, CMD, ENTRYPOINT, COPY and ADD commands will run there)
WORKDIR /app
# Copy our script into the /app directory in the container
# COPY ./skrypt.sh ./
# RUN chmod +x skrypt.sh
# RUN dos2unix skrypt.sh
COPY ./dlgssdpytorch.py ./
RUN chmod +x dlgssdpytorch.py
COPY ./create_dataset.py ./
RUN chmod +x create_dataset.py
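# A local smoke test (a sketch; the image tag matches the Jenkinsfiles, and the
# KAGGLE_* variables mirror the credentials the pipeline injects):
#   docker build -t rokoch/ium:01 .
#   docker run -e KAGGLE_USERNAME=... -e KAGGLE_KEY=... rokoch/ium:01 python3 /app/create_dataset.py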

51291
Global_Superstore2.csv Normal file

File diff suppressed because it is too large

BIN
Global_Superstore2.csv.zip Normal file

Binary file not shown.

29
Jenkins_train Normal file

@@ -0,0 +1,29 @@
pipeline {
    agent any
    stages {
        stage('checkout') {
            steps {
                checkout([$class: 'GitSCM', branches: [[name: '*/master']], doGenerateSubmoduleConfigurations: false, extensions: [], submoduleCfg: [], userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s426206/ium_426206.git']]])
            }
        }
        stage('docker') {
            steps {
                script {
                    def img = docker.build('rokoch/ium:01')
                    img.inside {
                        sh 'chmod +x dlgssdpytorch.py'
                        sh 'python3 ./dlgssdpytorch.py'
                    }
                }
            }
        }
        stage('end') {
            steps {
                // Archive the result
                archiveArtifacts 'model.pt'
            }
        }
    }
}

97
Jenkinsfile vendored

@@ -1,46 +1,51 @@
pipeline {
    agent any
    parameters {
        string(
            defaultValue: '0',
            description: 'Allows defining the cutoff size of the dataset.',
            name: 'CUTOFF',
            trim: false
        )
        string(
            defaultValue: 'rokoch',
            description: 'Kaggle username',
            name: 'KAGGLE_USERNAME',
            trim: false
        )
        password(
            defaultValue: '',
            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
            name: 'KAGGLE_KEY'
        )
    }
    stages {
        stage('checkout') {
            steps {
                checkout([$class: 'GitSCM', branches: [[name: '*/master']], doGenerateSubmoduleConfigurations: false, extensions: [], submoduleCfg: [], userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s426206/ium_426206.git']]])
            }
        }
        stage('sh') {
            steps {
                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                         "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
                    sh "chmod +x skrypt.sh"
                    sh "./skrypt.sh ${params.CUTOFF} | tee output.txt"
                }
            }
        }
        stage('end') {
            steps {
                // Archive the result
                archiveArtifacts 'output.txt,Global_Superstore22.csv,Global_Superstore2.csv.dev,Global_Superstore2.csv.test,Global_Superstore2.csv.train'
            }
        }
    }
}
pipeline {
    agent any
    parameters {
        string(
            defaultValue: '',
            description: 'Training parameters.',
            name: 'PARAMETRY',
            trim: false
        )
        string(
            defaultValue: 'rokoch',
            description: 'Kaggle username',
            name: 'KAGGLE_USERNAME',
            trim: false
        )
        password(
            defaultValue: '',
            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
            name: 'KAGGLE_KEY'
        )
    }
    stages {
        stage('checkout') {
            steps {
                checkout([$class: 'GitSCM', branches: [[name: '*/master']], doGenerateSubmoduleConfigurations: false, extensions: [], submoduleCfg: [], userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s426206/ium_426206.git']]])
            }
        }
        stage('docker') {
            steps {
                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                         "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
                    script {
                        def img = docker.build('rokoch/ium:01')
                        img.inside {
                            sh 'chmod +x create_dataset.py'
                            sh 'python3 ./create_dataset.py $PARAMETRY'
                        }
                    }
                }
            }
        }
        stage('end') {
            steps {
                // Archive the result
                archiveArtifacts 'train_dataset.pt,val_dataset.pt'
            }
        }
    }
}

32
Jenkinsfile_stats Normal file

@@ -0,0 +1,32 @@
pipeline {
    //agent { docker { image 'rokoch/ium:01' } }
    agent any
    parameters {
        buildSelector(
            defaultSelector: lastSuccessful(),
            description: 'Which build to use for copying artifacts',
            name: 'BUILD_SELECTOR')
    }
    stages {
        stage('Copy artifact') {
            steps {
                copyArtifacts filter: 'Global_Superstore22.csv,Global_Superstore2.csv.dev,Global_Superstore2.csv.test,Global_Superstore2.csv.train', fingerprintArtifacts: false, projectName: 's426206-create-dataset', selector: buildParameter('BUILD_SELECTOR')
            }
        }
        stage('Clone repo') {
            steps {
                script {
                    //docker.withRegistry("https://hub.docker.com/r/rokoch/ium"){
                    docker.image("rokoch/ium:01").inside {
                        sh 'rm -rf ium_426206'
                        sh 'git clone https://git.wmi.amu.edu.pl/s426206/ium_426206.git'
                        sh "chmod +x ium_426206/stats.sh"
                        sh "ium_426206/stats.sh | tee output.txt"
                        archiveArtifacts 'output.txt'
                    }
                    //}
                }
            }
        }
    }
}

README.md

@@ -1,3 +1,3 @@
# ium_426206
# ium_426206
Inżynieria Uczenia Maszynowego

59
create_dataset.py Normal file

@@ -0,0 +1,59 @@
import zipfile
import torch
import pandas as pd
import datetime
import numpy as np
from kaggle.api.kaggle_api_extended import KaggleApi
import torch.nn as nn
from torch.utils.data.dataset import random_split
from torch.utils.data import Dataset, TensorDataset
from sklearn import preprocessing
api = KaggleApi()
api.authenticate()
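# authenticate() reads the KAGGLE_USERNAME/KAGGLE_KEY environment variables
# (the ones the Jenkins pipeline injects) or a local kaggle.json file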
api.dataset_download_file('apoorvaappz/global-super-store-dataset',
file_name='Global_Superstore2.csv', path='./')
with zipfile.ZipFile('Global_Superstore2.csv.zip', 'r') as zipref:
    zipref.extractall('.')
data = pd.read_csv("Global_Superstore2.csv", header=0, sep=',')
data["Order Date"] = pd.to_datetime(data["Order Date"])
data = data.sort_values(by="Order Date")
#print(data)
byMonthsYears = {}
for index, row in data.iterrows():
    #datee = datetime.datetime.strptime(row['Order Date'], "%d-%m-%Y")
    #byMonthsYears.setdefault(datee.strftime("%m-%Y"), 0)
    #byMonthsYears[datee.strftime("%m-%Y")] += row['Sales']
    byMonthsYears.setdefault(row['Order Date'].strftime("%d-%m-%Y"), 0)
    byMonthsYears[row['Order Date'].strftime("%d-%m-%Y")] += row['Sales']
df = data.groupby('Order Date').agg({'Customer Name':'count', 'Sales': 'sum'}).reset_index().rename(columns={'Sales':'Sales sum', 'Customer Name':'Sales count'})
# data normalization
flcols = df[['Sales count', 'Sales sum']].columns
x = df[['Sales count', 'Sales sum']].values
# min_max_scaler = preprocessing.MinMaxScaler()
max_abs_scaler = preprocessing.MaxAbsScaler()
# x_scaled = min_max_scaler.fit_transform(x)
x_scaled = max_abs_scaler.fit_transform(x)
normcols = pd.DataFrame(x_scaled, columns=flcols)
for col in flcols:
    df[col] = normcols[col]
#df.to_csv('mms_norm.csv')
x_tensor = torch.tensor(df['Sales sum'].values).float()
y_tensor = torch.tensor(df['Sales count'].values).float()
dataset = TensorDataset(x_tensor, y_tensor)
# Derive the second split from the first so the lengths always sum to
# len(dataset); random_split raises an error otherwise
n_train = int(len(dataset) * 0.8)
lengths = [n_train, len(dataset) - n_train]
train_dataset, val_dataset = random_split(dataset, lengths)
torch.save(train_dataset, 'train_dataset.pt')
torch.save(val_dataset, 'val_dataset.pt')
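
A minimal sketch of consuming the artifacts produced above (it assumes train_dataset.pt and val_dataset.pt sit in the working directory, as archived by the Jenkinsfile):

import torch
from torch.utils.data import DataLoader

# Load the serialized TensorDataset splits written by create_dataset.py
train_dataset = torch.load('train_dataset.pt')
val_dataset = torch.load('val_dataset.pt')
print(len(train_dataset), len(val_dataset))
# Fetch one (x, y) pair to confirm the tensor layout
x, y = next(iter(DataLoader(dataset=train_dataset)))
print(x, y)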

180
dlgssdpytorch copy.py Normal file

@@ -0,0 +1,180 @@
import zipfile
import torch
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
import numpy as np
from kaggle.api.kaggle_api_extended import KaggleApi
import torch.nn as nn
import torch.optim as optim
from torch.utils.data.dataset import random_split
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torchviz import make_dot
from sklearn import preprocessing
# api = KaggleApi()
# api.authenticate()
# api.dataset_download_file('apoorvaappz/global-super-store-dataset',
# file_name='Global_Superstore2.csv', path='./')
# with zipfile.ZipFile('Global_Superstore2.csv.zip', 'r') as zipref:
# zipref.extractall('.')
data = pd.read_csv("Global_Superstore2.csv", header=0, sep=',')
data["Order Date"] = pd.to_datetime(data["Order Date"])
data = data.sort_values(by="Order Date")
#print(data)
byMonthsYears = {}
for index, row in data.iterrows():
    #datee = datetime.datetime.strptime(row['Order Date'], "%d-%m-%Y")
    #byMonthsYears.setdefault(datee.strftime("%m-%Y"), 0)
    #byMonthsYears[datee.strftime("%m-%Y")] += row['Sales']
    byMonthsYears.setdefault(row['Order Date'].strftime("%d-%m-%Y"), 0)
    byMonthsYears[row['Order Date'].strftime("%d-%m-%Y")] += row['Sales']
df = data.groupby('Order Date').agg({'Customer Name':'count', 'Sales': 'sum'}).reset_index().rename(columns={'Sales':'Sales sum', 'Customer Name':'Sales count'})
# data normalization
flcols = df[['Sales count', 'Sales sum']].columns
x = df[['Sales count', 'Sales sum']].values
# min_max_scaler = preprocessing.MinMaxScaler()
max_abs_scaler = preprocessing.MaxAbsScaler()
# x_scaled = min_max_scaler.fit_transform(x)
x_scaled = max_abs_scaler.fit_transform(x)
normcols = pd.DataFrame(x_scaled, columns=flcols)
for col in flcols:
    df[col] = normcols[col]
df.to_csv('mms_norm.csv')
exit()
# fig, ax = plt.subplots()
# fig.set_figheight(15)
# fig.set_figwidth(20)
# ax.scatter(df['Month and Year'], df['Sum of sales'])
#plt.show()
# # Data Generation
# np.random.seed(42)
# x = np.random.rand(100, 1)
# y = 1 + 2 * x + .1 * np.random.randn(100, 1)
# # Shuffles the indices
# idx = np.arange(100)
# np.random.shuffle(idx)
# # Uses first 80 random indices for train
# train_idx = idx[:80]
# # Uses the remaining indices for validation
# val_idx = idx[80:]
# # Generates train and validation sets
# x_train, y_train = x[train_idx], y[train_idx]
# x_val, y_val = x[val_idx], y[val_idx]
# x_tensor = torch.from_numpy(x_train).float()
# y_tensor = torch.from_numpy(y_train).float()
x_tensor = torch.tensor(df['Sales sum'].values).float()
y_tensor = torch.tensor(df['Sales count'].values).float()
dataset = TensorDataset(x_tensor, y_tensor)
#torch.manual_seed(42)
# Derive the second split from the first so the lengths always sum to
# len(dataset); random_split raises an error otherwise
n_train = int(len(dataset) * 0.8)
lengths = [n_train, len(dataset) - n_train]
train_dataset, val_dataset = random_split(dataset, lengths)
train_loader = DataLoader(dataset=train_dataset)
val_loader = DataLoader(dataset=val_dataset)
class LayerLinearRegression(nn.Module):
    def __init__(self):
        super().__init__()
        # Instead of our custom parameters, we use a Linear layer with single input and single output
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        # Now it only takes a call to the layer to make predictions
        return self.linear(x)
model = LayerLinearRegression()
# Checks model's parameters
#print(model.state_dict())
lr = 1e-3
n_epochs = 100
loss_fn = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=lr)
def make_train_step(model, loss_fn, optimizer):
    # Builds function that performs a step in the train loop
    def train_step(x, y):
        # Sets model to TRAIN mode
        model.train()
        # Makes predictions
        yhat = model(x)
        # Computes loss
        loss = loss_fn(y, yhat)
        # Computes gradients
        loss.backward()
        # Updates parameters and zeroes gradients
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss
        return loss.item()
    # Returns the function that will be called inside the train loop
    return train_step
# Creates the train_step function for our model, loss function and optimizer
train_step = make_train_step(model, loss_fn, optimizer)
training_losses = []
validation_losses = []
print(model.state_dict())
# For each epoch...
for epoch in range(n_epochs):
    losses = []
    # Uses loader to fetch one mini-batch for training
    for x_batch, y_batch in train_loader:
        # NOW, sends the mini-batch data to the device
        # so it matches location of the MODEL
        # x_batch = x_batch.to(device)
        # y_batch = y_batch.to(device)
        # One step of training
        loss = train_step(x_batch, y_batch)
        losses.append(loss)
    training_loss = np.mean(losses)
    training_losses.append(training_loss)
    # After finishing training steps for all mini-batches,
    # it is time for evaluation!
    # We tell PyTorch to NOT use autograd...
    # Do you remember why?
    with torch.no_grad():
        val_losses = []
        # Uses loader to fetch one mini-batch for validation
        for x_val, y_val in val_loader:
            # Again, sends data to same device as model
            # x_val = x_val.to(device)
            # y_val = y_val.to(device)
            # What is that?!
            model.eval()
            # Makes predictions
            yhat = model(x_val)
            # Computes validation loss
            val_loss = loss_fn(y_val, yhat)
            val_losses.append(val_loss.item())
        validation_loss = np.mean(val_losses)
        validation_losses.append(validation_loss)
    print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")
# Checks model's parameters
print(model.state_dict())
print(np.mean(losses))
print(np.mean(val_losses))

115
dlgssdpytorch.py Normal file

@@ -0,0 +1,115 @@
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset, DataLoader
import argparse
parser = argparse.ArgumentParser(description='Model training program')
parser.add_argument('-l', '--lr', type=float, default=1e-3, help="Learning rate (lr)", required=False)
parser.add_argument('-e', '--epochs', type=int, default=100, help="Number of epochs", required=False)
args = parser.parse_args()
lr = args.lr
n_epochs = args.epochs
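# Example invocation (illustrative values, not from the repo):
#   python3 dlgssdpytorch.py --lr 0.01 --epochs 200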
train_dataset = torch.load('train_dataset.pt')
val_dataset = torch.load('val_dataset.pt')
train_loader = DataLoader(dataset=train_dataset)
val_loader = DataLoader(dataset=val_dataset)
class LayerLinearRegression(nn.Module):
    def __init__(self):
        super().__init__()
        # Instead of our custom parameters, we use a Linear layer with single input and single output
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        # Now it only takes a call to the layer to make predictions
        return self.linear(x)
model = LayerLinearRegression()
# Checks model's parameters
#print(model.state_dict())
loss_fn = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=lr)
def make_train_step(model, loss_fn, optimizer):
    # Builds function that performs a step in the train loop
    def train_step(x, y):
        # Sets model to TRAIN mode
        model.train()
        # Makes predictions
        yhat = model(x)
        # Computes loss
        loss = loss_fn(y, yhat)
        # Computes gradients
        loss.backward()
        # Updates parameters and zeroes gradients
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss
        return loss.item()
    # Returns the function that will be called inside the train loop
    return train_step
# Creates the train_step function for our model, loss function and optimizer
train_step = make_train_step(model, loss_fn, optimizer)
training_losses = []
validation_losses = []
#print(model.state_dict())
# For each epoch...
for epoch in range(n_epochs):
    losses = []
    # Uses loader to fetch one mini-batch for training
    for x_batch, y_batch in train_loader:
        # NOW, sends the mini-batch data to the device
        # so it matches location of the MODEL
        # x_batch = x_batch.to(device)
        # y_batch = y_batch.to(device)
        # One step of training
        loss = train_step(x_batch, y_batch)
        losses.append(loss)
    training_loss = np.mean(losses)
    training_losses.append(training_loss)
    # After finishing training steps for all mini-batches,
    # it is time for evaluation!
    # We tell PyTorch to NOT use autograd...
    # Do you remember why?
    with torch.no_grad():
        val_losses = []
        # Uses loader to fetch one mini-batch for validation
        for x_val, y_val in val_loader:
            # Again, sends data to same device as model
            # x_val = x_val.to(device)
            # y_val = y_val.to(device)
            model.eval()
            # Makes predictions
            yhat = model(x_val)
            # Computes validation loss
            val_loss = loss_fn(y_val, yhat)
            val_losses.append(val_loss.item())
        validation_loss = np.mean(val_losses)
        validation_losses.append(validation_loss)
    print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")
# Checks model's parameters
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor])
# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])
print("Mean squared error for training: ", np.mean(losses))
print("Mean squared error for validating: ", np.mean(val_losses))
torch.save(model, 'model.pt')
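
A minimal inference sketch (assumptions: model.pt comes from the script above, and the input is a single MaxAbs-normalized 'Sales sum' value; the class is re-declared because torch.save stored the full pickled module):

import torch
import torch.nn as nn

# torch.save(model, ...) pickles the module by reference, so the same class
# definition has to be available when loading
class LayerLinearRegression(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        return self.linear(x)

model = torch.load('model.pt')
model.eval()
with torch.no_grad():
    # A hypothetical normalized 'Sales sum' value
    print(model(torch.tensor([0.5])))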

1431
mms.csv Normal file

File diff suppressed because it is too large

1431
mms.csv.bak Normal file

File diff suppressed because it is too large

1431
mms_norm.csv Normal file

File diff suppressed because it is too large

1431
mms_norm.csv.bak Normal file

File diff suppressed because it is too large

27
requirments.txt Normal file

@@ -0,0 +1,27 @@
certifi==2020.12.5
chardet==4.0.0
cycler==0.10.0
graphviz==0.16
idna==2.10
joblib==1.0.1
kaggle==1.5.12
kiwisolver==1.3.1
matplotlib==3.4.1
numpy==1.20.2
pandas==1.2.4
Pillow==8.2.0
pyparsing==2.4.7
python-dateutil==2.8.1
python-slugify==4.0.1
pytz==2021.1
requests==2.25.1
scikit-learn==0.24.1
scipy==1.6.2
six==1.15.0
sklearn==0.0
text-unidecode==1.3
threadpoolctl==2.1.0
torchviz==0.0.2
tqdm==4.60.0
typing-extensions==3.7.4.3
urllib3==1.26.4

skrypt.sh

@@ -1,7 +1,7 @@
#!/bin/bash
kaggle datasets download -d apoorvaappz/global-super-store-dataset
unzip -o global-super-store-dataset.zip
if [ $1 = "0" ]; then
if [[ $1 = "0" ]]; then
CUTOFF=51291
cp Global_Superstore2.csv Global_Superstore22.csv
else
@@ -16,4 +16,4 @@ head -n $((prop1*2)) Global_Superstore2.csv.shuf | tail -n $prop1 > Global_Super
tail -n +$((prop1*2+1)) Global_Superstore2.csv.shuf > Global_Superstore2.csv.train
rm Global_Superstore2.csv.shuf
# Check that the line counts match:
wc -l Global_Superstore2*
wc -l Global_Superstore2*

(Jupyter notebook diff; filename not shown)

@@ -3,7 +3,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "eligible-business",
"id": "strange-teens",
"metadata": {
"tags": []
},
@@ -18,7 +18,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "limited-memorial",
"id": "another-accessory",
"metadata": {},
"outputs": [],
"source": [
@@ -28,7 +28,7 @@
{
"cell_type": "code",
"execution_count": 158,
"id": "aware-allah",
"id": "valid-malta",
"metadata": {},
"outputs": [
{
@@ -48,7 +48,7 @@
{
"cell_type": "code",
"execution_count": 159,
"id": "drawn-financing",
"id": "noble-compilation",
"metadata": {},
"outputs": [
{
@@ -448,7 +448,7 @@
{
"cell_type": "code",
"execution_count": 160,
"id": "boring-consumption",
"id": "multiple-council",
"metadata": {},
"outputs": [],
"source": [
@@ -458,7 +458,7 @@
{
"cell_type": "code",
"execution_count": 161,
"id": "cathedral-frank",
"id": "green-trunk",
"metadata": {},
"outputs": [],
"source": [
@@ -468,7 +468,7 @@
{
"cell_type": "code",
"execution_count": 162,
"id": "satisfactory-venice",
"id": "operating-catalyst",
"metadata": {},
"outputs": [
{
@@ -489,7 +489,7 @@
{
"cell_type": "code",
"execution_count": 163,
"id": "united-climate",
"id": "female-landscape",
"metadata": {},
"outputs": [
{
@@ -510,7 +510,7 @@
{
"cell_type": "code",
"execution_count": 164,
"id": "institutional-corpus",
"id": "thirty-auckland",
"metadata": {},
"outputs": [
{
@@ -531,7 +531,7 @@
{
"cell_type": "code",
"execution_count": 165,
"id": "caroline-shannon",
"id": "mysterious-alignment",
"metadata": {},
"outputs": [
{
@@ -552,7 +552,7 @@
{
"cell_type": "code",
"execution_count": 166,
"id": "sublime-quarter",
"id": "stone-combining",
"metadata": {},
"outputs": [
{
@@ -950,7 +950,7 @@
{
"cell_type": "code",
"execution_count": 167,
"id": "committed-disease",
"id": "demanding-milwaukee",
"metadata": {
"tags": []
},
@@ -990,7 +990,7 @@
{
"cell_type": "code",
"execution_count": 168,
"id": "corporate-fisher",
"id": "above-script",
"metadata": {
"tags": []
},
@@ -1026,7 +1026,7 @@
{
"cell_type": "code",
"execution_count": 169,
"id": "revised-study",
"id": "abroad-durham",
"metadata": {},
"outputs": [
{
@@ -1047,7 +1047,7 @@
{
"cell_type": "code",
"execution_count": 170,
"id": "checked-thought",
"id": "centered-realtor",
"metadata": {},
"outputs": [],
"source": [
@@ -1058,7 +1058,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "assigned-mobility",
"id": "relevant-receptor",
"metadata": {},
"outputs": [],
"source": [
@@ -1076,7 +1076,7 @@
{
"cell_type": "code",
"execution_count": 172,
"id": "stunning-metallic",
"id": "informal-unemployment",
"metadata": {},
"outputs": [
{
@@ -1474,7 +1474,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "boolean-calgary",
"id": "reserved-cookie",
"metadata": {},
"outputs": [],
"source": []
@@ -1496,7 +1496,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
"version": "3.8.5"
}
},
"nbformat": 4,