diff --git a/Dockerfile b/Dockerfile
index 9771c31..b181f3c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,6 +13,7 @@ WORKDIR /app
 # Skopiujmy nasz skrypt do katalogu /app w kontenerze
 COPY ./skrypt_download.py ./
 COPY ./skrypt_stat.py ./
+COPY ./IUM_05.py ./
 
 RUN mkdir /.kaggle
 RUN chmod -R 777 /.kaggle
diff --git a/FootballModel.pth b/FootballModel.pth
new file mode 100644
index 0000000..a7908f8
Binary files /dev/null and b/FootballModel.pth differ
diff --git a/IUM_05.py b/IUM_05.py
new file mode 100644
index 0000000..207bab2
--- /dev/null
+++ b/IUM_05.py
@@ -0,0 +1,161 @@
+"""Train a single-layer linear model on the match data in results.csv.
+
+The script encodes and scales the columns, fits the model with SGD to
+predict the last column, prints a prediction for every held-out sample and
+saves the weights to FootballModel.pth.
+"""
+from sklearn.model_selection import train_test_split
+import torch
+import torch.nn as nn
+import pandas as pd
+import torch.nn.functional as F
+from torch.utils.data import DataLoader, TensorDataset, random_split
+from sklearn import preprocessing
+
+results = pd.read_csv('results.csv')
+# dropna() returns a new frame, so the result must be assigned back.
+results = results.dropna()
+
+# Lower-case the text columns.
+for column in ['home_team', 'away_team', 'tournament', 'city', 'country']:
+    results[column] = results[column].str.lower()
+
+categorical_cols = results.select_dtypes(include=object).columns.values
+# Encode categoricals once, before splitting, so that a given value maps to
+# the same integer code in both the training and the testing split.
+for col in categorical_cols:
+    results[col] = results[col].astype('category').cat.codes
+
+# 40% of the rows are used for training, the remaining 60% for testing.
+train, test = train_test_split(results, test_size=1 - 0.4)
+
+# The first column is skipped; the last column is the prediction target.
+input_cols = train.columns.values[1:-1]
+output_cols = train.columns.values[-1:]
+
+# Fit the scaler on the training split only and reuse it for the test split.
+min_max_scaler = preprocessing.MinMaxScaler().fit(train)
+
+
+def dataframe_to_arrays(dataframe):
+    """Return (inputs, targets) numpy arrays for ``dataframe``.
+
+    Columns are scaled with the module-level ``min_max_scaler``; the frame
+    passed in is not modified.
+    """
+    scaled = pd.DataFrame(
+        min_max_scaler.transform(dataframe), columns=dataframe.columns
+    )
+    return scaled[input_cols].to_numpy(), scaled[output_cols].to_numpy()
+
+
+inputs_array_training, targets_array_training = dataframe_to_arrays(train)
+inputs_array_testing, targets_array_testing = dataframe_to_arrays(test)
+
+inputs_training = torch.from_numpy(inputs_array_training).type(torch.float32)
+targets_training = torch.from_numpy(targets_array_training).type(torch.float32)
+
+inputs_testing = torch.from_numpy(inputs_array_testing).type(torch.float32)
+targets_testing = torch.from_numpy(targets_array_testing).type(torch.float32)
+
+train_dataset = TensorDataset(inputs_training, targets_training)
+val_dataset = TensorDataset(inputs_testing, targets_testing)
+
+batch_size = 64
+train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
+val_loader = DataLoader(val_dataset, batch_size * 2)
+
+input_size = len(input_cols)
+output_size = len(output_cols)
+
+
+class FootbalModel(nn.Module):
+    """Single linear layer trained with mean-squared-error loss."""
+
+    def __init__(self):
+        super().__init__()
+        self.linear = nn.Linear(input_size, output_size)
+
+    def forward(self, xb):
+        return self.linear(xb)
+
+    def training_step(self, batch):
+        """Return the MSE loss for one training batch."""
+        inputs, targets = batch
+        return F.mse_loss(self(inputs), targets)
+
+    def validation_step(self, batch):
+        """Return the detached MSE loss for one validation batch."""
+        inputs, targets = batch
+        loss = F.mse_loss(self(inputs), targets)
+        return {'val_loss': loss.detach()}
+
+    def validation_epoch_end(self, outputs):
+        """Average the per-batch losses into one epoch-level value."""
+        batch_losses = [x['val_loss'] for x in outputs]
+        return {'val_loss': torch.stack(batch_losses).mean().item()}
+
+    def epoch_end(self, epoch, result, num_epochs):
+        """Print the validation loss every 20th epoch and on the last one."""
+        if (epoch + 1) % 20 == 0 or epoch == num_epochs - 1:
+            print("Epoch [{}], val_loss: {:.4f}".format(epoch + 1, result['val_loss']))
+
+
+model = FootbalModel()
+
+
+def evaluate(model, val_loader):
+    """Return the mean validation loss of ``model`` over ``val_loader``."""
+    # Evaluation needs no gradients, so skip building the autograd graph.
+    with torch.no_grad():
+        outputs = [model.validation_step(batch) for batch in val_loader]
+    return model.validation_epoch_end(outputs)
+
+
+def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
+    """Train ``model`` and return the per-epoch validation results."""
+    history = []
+    optimizer = opt_func(model.parameters(), lr)
+    for epoch in range(epochs):
+        # Training phase
+        for batch in train_loader:
+            loss = model.training_step(batch)
+            loss.backward()
+            optimizer.step()
+            optimizer.zero_grad()
+        # Validation phase
+        result = evaluate(model, val_loader)
+        model.epoch_end(epoch, result, epochs)
+        history.append(result)
+    return history
+
+
+# Validation loss of the untrained model.
+result = evaluate(model, val_loader)
+
+epochs = 100
+lr = 1e-6
+history3 = fit(epochs, lr, model, train_loader, val_loader)
+
+
+def predict_single(input, target, model):
+    """Print the model output for one sample and its thresholded label.
+
+    ``target`` is accepted but not used.
+    """
+    # Add a leading batch dimension for the model call, then drop it again.
+    inputs = input.unsqueeze(0)
+    with torch.no_grad():
+        predictions = model(inputs).squeeze(0)
+    prediction = predictions[0]
+    print("Prediction:", prediction)
+    if prediction >= 0.5:
+        print('Neutral')
+    else:
+        print('not neutral')
+
+
+for sample, target in zip(inputs_testing, targets_testing):
+    predict_single(sample, target, model)
+
+torch.save(model.state_dict(), 'FootballModel.pth')
diff --git a/Jenkinsfile b/Jenkinsfile
index 4ac64d7..735a57b 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -27,10 +27,12 @@ node {
         def image = docker.build("s434732/ium")
         image.inside {
             sh 'python3 ./skrypt_download.py'
+            sh 'python3 ./IUM_05.py > model.txt'
 
             archiveArtifacts "train.csv"
             archiveArtifacts "test.csv"
             archiveArtifacts "valid.csv"
+            archiveArtifacts 'model.txt'
         }
 
 
diff --git a/skrypt_download.py b/skrypt_download.py
index 2dfa1c5..2286ffa 100644
--- a/skrypt_download.py
+++ b/skrypt_download.py
@@ -15,7 +15,7 @@ results.dropna()
 #normalizacja itp
 for collumn in ['home_team', 'away_team', 'tournament', 'city', 'country']:
     results[collumn] = results[collumn].str.lower()
-    
+
 # Podział zbioru 6:1:1
 train, test = train_test_split(results, test_size= 1 - 0.6)
 