diff --git a/.gitignore b/.gitignore
index e71ac37..b10d4b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ openpowerlifting.csv
 openpowerlifting-2024-01-06-4c732975.csv
 .idea
 .ipynb_checkpoints
+powerlifting_test_predictions.csv
diff --git a/05.ipynb b/05.ipynb
new file mode 100644
index 0000000..3bc689f
--- /dev/null
+++ b/05.ipynb
@@ -0,0 +1,82 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true,
+    "is_executing": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "import tensorflow as tf\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.layers import Dense\n",
+    "\n",
+    "# Load the data\n",
+    "data = pd.read_csv('openpowerlifting.csv')\n",
+    "\n",
+    "# Use the best-lift and total columns; drop rows with missing values\n",
+    "data = data.dropna(subset=['Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg', 'TotalKg'])\n",
+    "features = data[['Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg']]\n",
+    "target = data['TotalKg']\n",
+    "\n",
+    "# Split into training and test sets\n",
+    "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)\n",
+    "\n",
+    "# Standardize the features\n",
+    "scaler = StandardScaler()\n",
+    "X_train = scaler.fit_transform(X_train)\n",
+    "X_test = scaler.transform(X_test)  # Reuse the scaler fitted on the training data\n",
+    "\n",
+    "# Build the model\n",
+    "model = Sequential([\n",
+    "    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),\n",
+    "    Dense(64, activation='relu'),\n",
+    "    Dense(1)\n",
+    "])\n",
+    "\n",
+    "model.compile(optimizer='adam', loss='mse', metrics=['mae'])\n",
+    "\n",
+    "# Train the model; validation_split is used instead of a separate validation set\n",
+    "model.fit(X_train, y_train, epochs=10, validation_split=0.1)\n",
+    "\n",
+    "# Save the model\n",
+    "model.save('powerlifting_model.h5')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/Dockerfile b/Dockerfile
index f16a550..9496042 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@ FROM ubuntu:latest
 
 RUN apt-get update && apt-get install -y python3-pip unzip coreutils
 
-RUN pip install --user kaggle pandas
+RUN pip install --user kaggle pandas scikit-learn tensorflow
 
 WORKDIR /app
 
diff --git a/model.py b/model.py
new file mode 100644
index 0000000..796f46f
--- /dev/null
+++ b/model.py
@@ -0,0 +1,44 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense
+
+data = pd.read_csv('openpowerlifting.csv')
+
+# Keep only the modelling columns and drop rows with missing values
+data = data[['Sex', 'Age', 'BodyweightKg', 'TotalKg']].dropna()
+
+features = data[['Sex', 'Age', 'BodyweightKg']]
+target = data['TotalKg']
+
+X_train, X_test, y_train, y_test = train_test_split(features, target,
+                                                    test_size=0.2, random_state=42)
+
+preprocessor = ColumnTransformer(
+    transformers=[
+        ('num', StandardScaler(), ['Age', 'BodyweightKg']),
+        ('cat', OneHotEncoder(), ['Sex'])
+    ]
+)
+
+pipeline = Pipeline(steps=[
+    ('preprocessor', preprocessor),
+    ('model', Sequential([
+        # No fixed input_dim: Keras infers the input size from the preprocessed
+        # data on the first fit call, so this stays correct however many columns
+        # OneHotEncoder produces for 'Sex'
+        Dense(64, activation='relu'),
+        Dense(64, activation='relu'),
+        Dense(1)
+    ]))
+])
+
+pipeline['model'].compile(optimizer='adam', loss='mse', metrics=['mae'])
+
+# Fit parameters after the model__ prefix are routed to the Keras fit call
+pipeline.fit(X_train, y_train, model__epochs=10, model__validation_split=0.1)
+
+pipeline['model'].save('powerlifting_model.h5')
diff --git a/powerlifting_model.h5 b/powerlifting_model.h5
new file mode 100644
index 0000000..b6f051e
Binary files /dev/null and b/powerlifting_model.h5 differ
diff --git a/predict.py b/predict.py
new file mode 100644
index 0000000..270e7a7
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,31 @@
+import pandas as pd
+import tensorflow as tf
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.model_selection import train_test_split
+
+loaded_model = tf.keras.models.load_model('powerlifting_model.h5')
+
+data = pd.read_csv('openpowerlifting.csv')
+data = data[['Sex', 'Age', 'BodyweightKg', 'TotalKg']].dropna()  # Drop rows with missing values
+features = data[['Sex', 'Age', 'BodyweightKg']]
+target = data['TotalKg']
+
+# The same test_size and random_state as in model.py reproduce the training split
+X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+
+preprocessor = ColumnTransformer(
+    transformers=[
+        ('num', StandardScaler(), ['Age', 'BodyweightKg']),
+        ('cat', OneHotEncoder(), ['Sex'])
+    ]
+)
+# Fit on the training data only: fitting on X_test would leak test statistics
+# and diverge from the preprocessing the model saw during training
+preprocessor.fit(X_train)
+X_test_transformed = preprocessor.transform(X_test)
+
+predictions = loaded_model.predict(X_test_transformed)
+
+predictions_df = pd.DataFrame(predictions, columns=['predicted_TotalKg'])
+predictions_df.to_csv('powerlifting_test_predictions.csv', index=False)
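
Note: the Pipeline in model.py runs because scikit-learn only requires the final step to implement fit, but only the Keras network is saved at the end, which forces predict.py to rebuild and refit the ColumnTransformer. A more robust variant, sketched below, wraps the network in scikeras's KerasRegressor so the whole pipeline is one scikit-learn estimator that can be persisted as a single artifact. This is a sketch only: it assumes the scikeras package is installed (it is not in the Dockerfile above), and it reuses preprocessor, X_train and y_train from model.py.

import joblib
from scikeras.wrappers import KerasRegressor
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def build_net(meta):
    # scikeras passes dataset metadata to the builder, so the input size
    # adapts to however many columns the preprocessor emits
    return Sequential([
        Dense(64, activation='relu', input_shape=(meta['n_features_in_'],)),
        Dense(64, activation='relu'),
        Dense(1)
    ])

# preprocessor, X_train, y_train: the objects defined in model.py
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', KerasRegressor(model=build_net, loss='mse', optimizer='adam',
                             epochs=10, verbose=1))
])
pipeline.fit(X_train, y_train)
# scikeras makes the wrapper picklable, so preprocessor and net land in one file
joblib.dump(pipeline, 'powerlifting_pipeline.joblib')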
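
Since predict.py reconstructs the same train/test split, y_test is available alongside the saved predictions, so a short evaluation makes the output file easy to sanity-check. A minimal sketch using the names defined in predict.py:

from sklearn.metrics import mean_absolute_error

# predictions and y_test come from predict.py above
mae = mean_absolute_error(y_test, predictions)
print(f'Test MAE: {mae:.2f} kg')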