Containerise the dataset pipeline.

* Add a Dockerfile, requirements.txt and .gitignore.
* get_dataset.py: fetch the Kaggle dataset with the kaggle command-line tool
  instead of opendatasets; credentials are taken from the command line.
* Move stats/get_stats.py to the repository root.

Blob ids ("index" lines) are omitted for files whose contents were revised
during review.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..915cc65
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/venv/
+/kaggle.json
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,18 @@
+FROM python:3.11
+
+# Refresh the package index and install the toolchain in ONE layer, so a
+# cached "apt-get update" layer is never combined with a later install.
+RUN apt-get update \
+    && apt-get -y upgrade \
+    && apt-get install -y build-essential
+
+RUN python -m pip install --upgrade pip
+COPY requirements.txt /tmp
+RUN python -m pip install -r /tmp/requirements.txt
+
+# Absolute path: the working directory must not depend on whatever
+# directory the base image happens to leave us in.
+WORKDIR /app
+
+COPY ./get_dataset.py ./
+COPY ./get_stats.py ./
diff --git a/get_dataset.py b/get_dataset.py
--- a/get_dataset.py
+++ b/get_dataset.py
@@ -1,10 +1,32 @@
-import opendatasets as od
 import pandas as pd
 from sklearn import preprocessing
 from sklearn.model_selection import train_test_split
+import sys
+import os
+import subprocess
 
-od.download("https://www.kaggle.com/datasets/nikhil7280/student-performance-multiple-linear-regression/code")
-data = pd.read_csv("student-performance-multiple-linear-regression/Student_Performance.csv")
+# The Kaggle username and API key are the first two command-line arguments.
+# NOTE(review): values passed via argv are visible in the process list;
+# consider supplying them through the environment instead.
+if len(sys.argv) < 3:
+    sys.exit("usage: python get_dataset.py <kaggle_username> <kaggle_key>")
+
+# Exported so the kaggle command launched below inherits them.
+os.environ["KAGGLE_USERNAME"] = sys.argv[1]
+os.environ["KAGGLE_KEY"] = sys.argv[2]
+
+# check=True raises CalledProcessError when the download fails, rather than
+# continuing and failing later on a missing Student_Performance.csv.
+subprocess.run(
+    [
+        "kaggle", "datasets", "download",
+        "-d", "nikhil7280/student-performance-multiple-linear-regression",
+        "--unzip",
+    ],
+    check=True,
+)
+
+data = pd.read_csv("Student_Performance.csv")
 print(data.head())
 data.drop_duplicates(inplace=True)
 data["Extracurricular Activities"] = data["Extracurricular Activities"].replace({'Yes': 1, 'No': 0})
diff --git a/stats/get_stats.py b/get_stats.py
similarity index 100%
rename from stats/get_stats.py
rename to get_stats.py
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+kaggle
+pandas
+scikit-learn