132 lines
3.8 KiB
Python
132 lines
3.8 KiB
Python
|
from kaggle.api.kaggle_api_extended import KaggleApi
|
||
|
import zipfile
|
||
|
from sklearn.model_selection import train_test_split
|
||
|
from sklearn.preprocessing import StandardScaler
|
||
|
import pandas as pd
|
||
|
import numpy as np
|
||
|
import tensorflow as tf
|
||
|
from tensorflow.keras.models import Sequential # Use TensorFlow's Keras module
|
||
|
from tensorflow.keras.layers import Dense # Use TensorFlow's Keras module
|
||
|
import matplotlib.pyplot as plt
|
||
|
from keras.utils import to_categorical # Use Keras's to_categorical function
|
||
|
|
||
|
|
||
|
# Create a Kaggle API client and authenticate it before any download call.
api = KaggleApi()
api.authenticate()

# Download the powerlifting dataset archive into ./data.
api.dataset_download_files(
    'dansbecker/powerlifting-database',
    path='./data',
)

# Unpack the downloaded archive next to it so the CSV can be read later.
with zipfile.ZipFile('./data/powerlifting-database.zip', mode='r') as archive:
    archive.extractall(path='./data')
|
||
|
|
||
|
|
||
|
def get_simplified_age(age):
    """Map an age to a coarse age-bucket index.

    Ages in [0, 80) are grouped by decade into buckets 0-7, and ages in
    [80, 100) share bucket 8.  Any value outside [0, 100) is returned
    unchanged; that includes NaN, because every comparison against NaN is
    false.

    Args:
        age: Numeric age.

    Returns:
        The bucket index (int, 0-8) for ages in [0, 100), otherwise ``age``
        itself.
    """
    # Exclusive upper bound of each bucket; the position is the bucket index.
    upper_bounds = (10, 20, 30, 40, 50, 60, 70, 80, 100)
    if 0 <= age < 100:
        for bucket, upper in enumerate(upper_bounds):
            if age < upper:
                return bucket
    # Out-of-range (or NaN) values pass through untouched.
    return age
|
||
|
|
||
|
|
||
|
def plot_loss_tf(history):
    """Plot the training loss recorded in a fit history and show the figure.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            a ``'loss'`` sequence (this script passes the value returned by
            ``model.fit``).
    """
    figure, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))

    # Switch off the canvas toolbar/header/footer flags.
    # NOTE(review): presumably only widget-style backends honour these
    # attributes; confirm for the backend in use.
    for flag in ('toolbar_visible', 'header_visible', 'footer_visible'):
        setattr(figure.canvas, flag, False)

    loss_values = history.history['loss']
    axes.plot(loss_values, label='loss')
    axes.set_xlabel('Epoch')
    axes.set_ylabel('loss (cost)')
    axes.legend()
    axes.grid(True)

    plt.show()
|
||
|
|
||
|
# Number of age buckets produced by get_simplified_age (labels 0-8). It must
# equal the width of the model's softmax output layer, and every one-hot
# encoding below must use the same value.
NUM_AGE_CLASSES = 9

# Load your CSV data
powerlifters_stats = pd.read_csv('data/openpowerlifting.csv', engine='python', encoding='ISO-8859-1', sep=',')

# Drop unnecessary columns
columns_to_drop = ['MeetID', 'Name', 'Sex', 'Equipment', 'Division', 'Squat4Kg', 'BestSquatKg',
                   'Bench4Kg', 'BestBenchKg', 'Deadlift4Kg', 'BestDeadliftKg', 'TotalKg', 'Place', 'Wilks',
                   'WeightClassKg']
powerlifters_stats = powerlifters_stats.drop(columns=columns_to_drop)

# Apply the age simplification function
powerlifters_stats['Age'] = powerlifters_stats['Age'].apply(get_simplified_age)

# Keep only rows whose age bucket is a valid class label.
# get_simplified_age returns missing ages (NaN) and ages outside [0, 100)
# unchanged, and neither can be one-hot encoded. Filtering here, before the
# split, cleans the validation and test targets as well as the training ones
# (previously only the training split was cleaned).
has_valid_bucket = powerlifters_stats['Age'].between(0, NUM_AGE_CLASSES - 1)
powerlifters_stats = powerlifters_stats[has_valid_bucket]

# Split your data into features (X) and target (y)
# NOTE(review): rows with missing *feature* values are still kept; confirm
# the remaining feature columns contain no NaNs, or drop/impute them.
X = powerlifters_stats.drop(columns=['Age'])
y = powerlifters_stats['Age'].astype(int)

# Split the data into train (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

# Standardize the features.
# The scaler is fitted on the training split only, so validation/test
# statistics do not leak into training; the same fitted transform is then
# applied to the other two splits.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_val = pd.DataFrame(scaler.transform(X_val), columns=X.columns, index=X_val.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

# Show which classes are present in the training targets, and their dtype
unique_values = np.unique(y_train)
print(unique_values)
print(y_train.dtypes)

# Convert the target variables to categorical (one-hot), using the same
# number of classes for every split so the shapes match the model output.
y_train = to_categorical(y_train, num_classes=NUM_AGE_CLASSES)
y_val = to_categorical(y_val, num_classes=NUM_AGE_CLASSES)
y_test = to_categorical(y_test, num_classes=NUM_AGE_CLASSES)
|
||
|
|
||
|
# Build the classifier incrementally: three ReLU hidden layers (100, 70, 50
# units) followed by a 9-way softmax output.
model = Sequential(name="Players_model")
model.add(Dense(100, input_dim=X_train.shape[1], activation='relu'))
for hidden_units in (70, 50):
    model.add(Dense(hidden_units, activation='relu'))
# 9 output units to match the number of age categories
model.add(Dense(9, activation='softmax'))

# Compile the model: categorical cross-entropy loss, Adam optimizer, and
# accuracy reported as the metric.
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)
|
||
|
|
||
|
# Fit the model for 500 epochs, reporting metrics on the validation split
# after each epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=500,
)

# Plot the training loss
plot_loss_tf(history)

# Evaluate on the held-out test split. Index 1 of the result is used as the
# accuracy (the model is compiled with the single metric 'accuracy').
print('Evaluating...')
test_metrics = model.evaluate(X_test, y_test)
accuracy = test_metrics[1]
print(f"accuracy: {accuracy}")
|