# ium_z486867/iumz_486867.py
#
# Source-viewer metadata: 132 lines, 3.8 KiB, Python.
# Last change: 2023-09-30 00:06:47 +02:00
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential # Use TensorFlow's Keras module
from tensorflow.keras.layers import Dense # Use TensorFlow's Keras module
import matplotlib.pyplot as plt
from keras.utils import to_categorical # Use Keras's to_categorical function
# Authenticate against Kaggle, then fetch the dataset archive into ./data.
api = KaggleApi()
api.authenticate()
api.dataset_download_files(
    'dansbecker/powerlifting-database',
    path='./data',
)

# Unpack the downloaded archive next to it so the CSV can be read later.
with zipfile.ZipFile('./data/powerlifting-database.zip', 'r') as zip_ref:
    zip_ref.extractall('./data')
def get_simplified_age(age):
    """Map an age in years to a decade-wide class index in the range 0-8.

    Buckets are ``[0, 10) -> 0``, ``[10, 20) -> 1``, ..., ``[70, 80) -> 7``.
    The last bucket is open-ended: every age of 80 or more maps to ``8``, so
    the result is always a valid index for a 9-class encoding.

    Args:
        age: Age in years (int or float). May be missing (``None`` / NaN).

    Returns:
        The integer class index. A missing ``age`` is returned unchanged and
        a negative ``age`` yields NaN, so callers can drop both with a single
        missing-value check.
    """
    # Missing values are passed through untouched for the caller to filter.
    if pd.isna(age):
        return age
    # A negative age is invalid data; report it as missing rather than
    # leaking a negative number into the class labels.
    if age < 0:
        return float('nan')
    # Open-ended top bucket: without it, ages >= 100 would come back as raw
    # ages and be mistaken for class indices far outside 0..8.
    if age >= 80:
        return 8
    return int(age // 10)
def plot_loss_tf(history):
    """Draw the per-epoch training loss stored in a Keras ``History`` object.

    Args:
        history: Object whose ``history`` dict holds a ``'loss'`` sequence,
            as returned by ``model.fit``.
    """
    figure, axes = plt.subplots(1, 1, figsize=(4, 3))

    # Switch off the optional canvas decorations (toolbar, header, footer).
    canvas = figure.canvas
    for decoration in ('toolbar_visible', 'header_visible', 'footer_visible'):
        setattr(canvas, decoration, False)

    loss_per_epoch = history.history['loss']
    axes.plot(loss_per_epoch, label='loss')
    axes.set_xlabel('Epoch')
    axes.set_ylabel('loss (cost)')
    axes.legend()
    axes.grid(True)

    plt.show()
# Read the powerlifting results CSV from the data directory.
powerlifters_stats = pd.read_csv(
    'data/openpowerlifting.csv',
    engine='python',
    encoding='ISO-8859-1',
    sep=',',
)

# Columns that are not used as model inputs.
columns_to_drop = [
    'MeetID', 'Name', 'Sex', 'Equipment', 'Division',
    'Squat4Kg', 'BestSquatKg',
    'Bench4Kg', 'BestBenchKg',
    'Deadlift4Kg', 'BestDeadliftKg',
    'TotalKg', 'Place', 'Wilks', 'WeightClassKg',
]
powerlifters_stats = powerlifters_stats.drop(columns=columns_to_drop)

# Replace each raw age with its bucket index.
powerlifters_stats['Age'] = powerlifters_stats['Age'].apply(get_simplified_age)

# The bucketed age is the prediction target; every remaining column is a feature.
y = powerlifters_stats['Age']
X = powerlifters_stats.drop(columns=['Age'])
# Number of age classes (labels 0..8). Must equal the width of the model's
# softmax output layer and be identical for every split.
NUM_CLASSES = 9

# Keep only rows that can be used for training AND evaluation:
#   * the label must be present and inside 0..NUM_CLASSES-1 (between() is
#     False for NaN, so missing labels are rejected by the same test);
#   * no feature may be missing, because StandardScaler passes NaN through
#     and the network cannot train on NaN inputs.
# Filtering before the split keeps the validation and test sets clean too,
# not just the training set.
label_ok = y.between(0, NUM_CLASSES - 1)
features_ok = X.notna().all(axis=1)
usable_rows = label_ok & features_ok
X_clean = X.loc[usable_rows]
y_clean = y.loc[usable_rows].astype(int)

# Split the data into train (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(
    X_clean, y_clean, test_size=0.3, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)

# Standardize the features. The scaler is fitted on the training split only
# so that no statistics from the validation/test data leak into training.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_val = pd.DataFrame(
    scaler.transform(X_val), columns=X_val.columns, index=X_val.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Quick sanity output: which classes occur in the training labels, and their dtype
unique_values = np.unique(y_train)
print(unique_values)
print(y_train.dtypes)

# Convert the target variables to one-hot vectors of the same width everywhere
y_train = to_categorical(y_train, num_classes=NUM_CLASSES)
y_val = to_categorical(y_val, num_classes=NUM_CLASSES)
y_test = to_categorical(y_test, num_classes=NUM_CLASSES)
# Assemble the classifier layer by layer: three ReLU hidden layers followed
# by a 9-way softmax (one output unit per age category).
hidden_layer_sizes = (100, 70, 50)

model = Sequential(name="Players_model")
model.add(
    Dense(hidden_layer_sizes[0], input_dim=X_train.shape[1], activation='relu')
)
for layer_units in hidden_layer_sizes[1:]:
    model.add(Dense(layer_units, activation='relu'))
model.add(Dense(9, activation='softmax'))

# Configure training: categorical cross-entropy loss, Adam optimizer,
# accuracy reported as the metric.
loss_function = tf.keras.losses.CategoricalCrossentropy()
adam_optimizer = tf.keras.optimizers.Adam()
model.compile(
    loss=loss_function,
    optimizer=adam_optimizer,
    metrics=['accuracy'],
)
# Fit the network for 500 epochs, checking it against the validation split.
validation_split = (X_val, y_val)
history = model.fit(X_train, y_train, epochs=500, validation_data=validation_split)

# Show how the training loss developed over the epochs.
plot_loss_tf(history)

# Score the trained model on the test split.
print('Evaluating...')
test_scores = model.evaluate(X_test, y_test)
# Index 0 is the loss; index 1 is the value this script reports as accuracy.
accuracy = test_scores[1]
print(f"accuracy: {accuracy}")