from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential      # use TensorFlow's Keras module
from tensorflow.keras.layers import Dense           # use TensorFlow's Keras module
from tensorflow.keras.utils import to_categorical   # same tf.keras namespace as the rest

# Download and extract the dataset from Kaggle
api = KaggleApi()
api.authenticate()
api.dataset_download_files('dansbecker/powerlifting-database', path='./data')
with zipfile.ZipFile('./data/powerlifting-database.zip', 'r') as zip_ref:
    zip_ref.extractall('./data')


def get_simplified_age(age):
    """Map an age in years to one of 9 decade buckets (0: 0-9, 1: 10-19, ..., 8: 80+).

    NaN is passed through so incomplete rows can be dropped later."""
    if pd.isna(age):
        return np.nan
    return int(min(max(age // 10, 0), 8))


def plot_loss_tf(history):
    fig, ax = plt.subplots(1, 1, figsize=(4, 3))
    fig.canvas.toolbar_visible = False   # only has an effect with interactive backends
    fig.canvas.header_visible = False
    fig.canvas.footer_visible = False
    ax.plot(history.history['loss'], label='loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('loss (cost)')
    ax.legend()
    ax.grid(True)
    plt.show()


# Load the CSV data
powerlifters_stats = pd.read_csv('data/openpowerlifting.csv', engine='python',
                                 encoding='ISO-8859-1', sep=',')

# Drop the columns that are not used as features
columns_to_drop = ['MeetID', 'Name', 'Sex', 'Equipment', 'Division', 'Squat4Kg',
                   'BestSquatKg', 'Bench4Kg', 'BestBenchKg', 'Deadlift4Kg',
                   'BestDeadliftKg', 'TotalKg', 'Place', 'Wilks', 'WeightClassKg']
powerlifters_stats = powerlifters_stats.drop(columns_to_drop, axis=1)

# Bucket the ages into decades, then drop rows with missing values so that
# neither the features nor the targets contain NaN in any of the splits
powerlifters_stats['Age'] = powerlifters_stats['Age'].apply(get_simplified_age)
powerlifters_stats = powerlifters_stats.dropna()
powerlifters_stats['Age'] = powerlifters_stats['Age'].astype(int)

# Split the data into features (X) and target (y)
X = powerlifters_stats.drop(columns=['Age'])
y = powerlifters_stats['Age']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

# Split the data into train (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

# Sanity-check the target buckets
print(np.unique(y_train))
print(y_train.dtype)

# One-hot encode the targets of all three splits (9 age buckets, 0-8)
y_train = to_categorical(y_train, num_classes=9)
y_val = to_categorical(y_val, num_classes=9)
y_test = to_categorical(y_test, num_classes=9)

# Create a Sequential model
model = Sequential(
    [
        Dense(100, input_dim=X_train.shape[1], activation='relu'),
        Dense(70, activation='relu'),
        Dense(50, activation='relu'),
        Dense(9, activation='softmax'),   # one output unit per age bucket
    ],
    name="Players_model"
)

# Compile the model
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=500,
    validation_data=(X_val, y_val)
)

# Plot the training loss
plot_loss_tf(history)

# Evaluate the model on the held-out test set
print('Evaluating...')
accuracy = model.evaluate(X_test, y_test)[1]
print(f"accuracy: {accuracy}")