"""Turn a delimited CSV file into a fully numeric, min-max scaled table.

Run as a script, this reads ``olympics-124-years-datasettill-2020/Data.csv``
and writes the processed frame to ``processed_data.csv``.
"""

from typing import Sequence

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Columns replaced by one indicator column per distinct value.
CATEGORICAL_COLUMNS = ("Sex", "Medal")

# Columns removed before scaling.
# NOTE(review): presumably these are the non-numeric / redundant columns of
# the dataset -- confirm against the actual CSV header.
DROPPED_COLUMNS = (
    "Name",
    "Team",
    "NOC",
    "Games",
    "Year",
    "Season",
    "City",
    "Sport",
    "Event",
)

INPUT_PATH = 'olympics-124-years-datasettill-2020/Data.csv'
OUTPUT_PATH = 'processed_data.csv'


def prepare_data(
    file_path,
    *,
    sep: str = ";",
    categorical_columns: Sequence[str] = CATEGORICAL_COLUMNS,
    drop_columns: Sequence[str] = DROPPED_COLUMNS,
    fill_value=0,
) -> pd.DataFrame:
    """Load a CSV file and return it encoded, cleaned and min-max scaled.

    The steps run in this order: read the file, one-hot encode
    ``categorical_columns``, drop ``drop_columns``, replace missing values
    with ``fill_value``, then scale every remaining column with
    ``MinMaxScaler`` (default settings).

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.
        sep: Field delimiter of the input file.
        categorical_columns: Columns to expand with ``pandas.get_dummies``.
        drop_columns: Columns to remove after encoding.
        fill_value: Replacement for missing values before scaling.

    Returns:
        A new DataFrame holding the scaled values, with the same column
        labels and index as the frame that was fed to the scaler.

    Raises:
        KeyError: If a name in ``categorical_columns`` or ``drop_columns``
            is not present in the file.
    """
    data = pd.read_csv(file_path, sep=sep)

    # Encode first, then drop: the order matters if the two lists overlap.
    data = pd.get_dummies(data, columns=list(categorical_columns))
    data = data.drop(columns=list(drop_columns))

    # NOTE(review): filling gaps with a constant before min-max scaling lets
    # that constant become a column's minimum (or maximum) -- confirm this is
    # the intended treatment of missing values.
    data = data.fillna(fill_value)

    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(data)

    # fit_transform returns a bare array; reattach the labels and the index.
    return pd.DataFrame(scaled, columns=data.columns, index=data.index)


def main() -> None:
    """Process the default input file and write the result as CSV."""
    data = prepare_data(INPUT_PATH)
    data.to_csv(OUTPUT_PATH, index=False)


if __name__ == "__main__":
    main()