commit 96b5908f344e4c9265af064d1a535270c530b672
Author: s444439
Date:   Wed Apr 19 17:21:39 2023 +0200

    add script and jenkinsfile

diff --git a/JenkinsFile b/JenkinsFile
new file mode 100644
index 0000000..dff4ed9
--- /dev/null
+++ b/JenkinsFile
@@ -0,0 +1,35 @@
+pipeline {
+    agent any
+    parameters{
+        string(
+            defaultValue: '0',
+            description: 'CUTOFF',
+            name: 'CUTOFF',
+            trim: false
+        )
+    }
+    stages {
+        stage('Clone repository') {
+            steps {
+                git 'https://git.wmi.amu.edu.pl/s444439/ium_z444439'
+            }
+        }
+
+        stage('Run shell script') {
+            steps {
+                sh 'mkdir -p dataset'
+                sh 'curl -o dataset/adult.data https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
+                sh 'sed -i "s/ //g" dataset/adult.data'
+                sh 'cut -d"," -f1,3-5,9-10,13 dataset/adult.data > dataset/processed_data.csv'
+                sh 'python script.py'
+            }
+        }
+    }
+
+    post {
+        always {
+            // script.py writes the split csv files to the workspace root, not results/
+            archiveArtifacts 'adult*.csv'
+        }
+    }
+}
\ No newline at end of file
diff --git a/script.py b/script.py
new file mode 100644
index 0000000..4dbaaa5
--- /dev/null
+++ b/script.py
@@ -0,0 +1,123 @@
+import os
+import urllib.request
+
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+
+
+def download_file():
+    # Download the raw Adult data set and convert it to csv.
+    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
+    filename = "adult.data"
+    urllib.request.urlretrieve(url, filename)
+    csv_file = convert_data_to_csv()
+    return csv_file
+
+
+def convert_data_to_csv():
+    data_file = "adult.data"
+    csv_file = "adult.csv"
+    df = pd.read_csv(data_file, header=None)
+    df.to_csv(csv_file, index=False)
+    delete_data_file()
+    return csv_file
+
+
+def delete_data_file():
+    filename = "adult.data"
+    os.remove(filename)
+
+
+def add_subsets_to_csv_file(data):
+    data.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation",
+                    "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week",
+                    "native-country", "income"]
+
+    # 80/20 split, then 75/25 on the remainder, giving a 60/20/20 train/dev/test split.
+    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
+    train_data, dev_data = train_test_split(train_data, test_size=0.25, random_state=42)
+
+    train_data.to_csv("adult_train.csv", index=False)
+    dev_data.to_csv("adult_dev.csv", index=False)
+    test_data.to_csv("adult_test.csv", index=False)
+
+    print("Data set: ", data.shape)
+    print("Train data set: ", train_data.shape)
+    print("Dev data set: ", dev_data.shape)
+    print("Test data set: ", test_data.shape)
+    return data
+
+
+def check_if_data_set_has_division_into_subsets(file_name):
+    data = pd.read_csv(file_name)
+
+    # A freshly converted csv still has numeric headers; name the columns and split it.
+    if "income" not in data.columns:
+        data_set = add_subsets_to_csv_file(data)
+        data_set.to_csv(file_name, index=False)
+
+
+def get_statistics(data):
+    train_data = pd.read_csv("adult_train.csv", dtype={"income": "category"})
+    dev_data = pd.read_csv("adult_dev.csv", dtype={"income": "category"})
+    test_data = pd.read_csv("adult_test.csv", dtype={"income": "category"})
+
+    print("Data set size: ", len(data))
+    print("Train set size: ", len(train_data))
+    print("Dev set size: ", len(dev_data))
+    print("Test set size: ", len(test_data))
+    print("Mean age: ", np.mean(data["age"]))
+    print("Min age: ", np.min(data["age"]))
+    print("Max age: ", np.max(data["age"]))
+    print("Age standard deviation: ", np.std(data["age"]))
+    print("Median age: ", np.median(data["age"]))
+
+    freq_dist_all = data['income'].value_counts()
+    print('Class label frequency on the full data set:')
+    print(freq_dist_all)
+
+    freq_dist_train = train_data['income'].value_counts()
+    print('Class label frequency on the train set:')
+    print(freq_dist_train)
+
+    freq_dist_test = test_data['income'].value_counts()
+    print('Class label frequency on the test set:')
+    print(freq_dist_test)
+
+    freq_dist_dev = dev_data['income'].value_counts()
+    print('Class label frequency on the dev set:')
+    print(freq_dist_dev)
+
+
+def normalization(data):
+    numeric_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
+    numeric_data = data[numeric_features]
+
+    # Standardize the numeric columns to zero mean and unit variance.
+    scaler = StandardScaler()
+    normalized_data = scaler.fit_transform(numeric_data)
+
+    data[numeric_features] = normalized_data
+
+    print(data.head())
+
+
+def clean(data):
+    # Drop rows with missing values (encoded as '?') and duplicates.
+    data.replace('?', np.nan, inplace=True)
+    data.dropna(inplace=True)
+    data.drop_duplicates(inplace=True)
+    data[['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']] = data[
+        ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']].apply(pd.to_numeric)
+
+
+if __name__ == '__main__':
+    csv_file_name = download_file()
+    check_if_data_set_has_division_into_subsets(csv_file_name)
+    data = pd.read_csv(csv_file_name, dtype={"income": "category"})
+    get_statistics(data)
+    # Clean before scaling so missing values and duplicates do not affect the fit.
+    clean(data)
+    normalization(data)