From 00b57384c2da1d071a407c801bf50c3d16334a06 Mon Sep 17 00:00:00 2001
From: Sheaza
Date: Tue, 2 Apr 2024 19:43:25 +0200
Subject: [PATCH] fix script download

---
Notes (this area is free-form commentary, not part of the commit):

What the patch does:
  * get_dataset.py: drops the opendatasets download and instead shells out to
    the `kaggle` CLI; KAGGLE_USERNAME / KAGGLE_KEY are taken from sys.argv[1]
    and sys.argv[2] and exported through os.environ before the call. The CSV
    is then read from the working directory ("Student_Performance.csv").
  * stats/get_stats.py is moved to the repository root unchanged.
  * Adds a Dockerfile (python:3.11 base, installs requirements.txt, copies the
    two scripts), a requirements.txt (kaggle, pandas, scikit-learn) and a
    .gitignore for /venv/ and /kaggle.json.

Review notes:
  * The return value of os.system(...) is not checked, so a failed download
    only surfaces later when pd.read_csv cannot find the file.
  * sys.argv[1] / sys.argv[2] are read without an argument-count check; the
    script raises IndexError when started with fewer than two arguments.
  * The Kaggle key is passed on the command line -- presumably visible in the
    process list / shell history; confirm this is acceptable.
  * requirements.txt has no trailing newline and no version pins.

 .gitignore                         |  2 ++
 Dockerfile                         | 13 +++++++++++++
 get_dataset.py                     | 11 ++++++++---
 stats/get_stats.py => get_stats.py |  0
 requirements.txt                   |  3 +++
 5 files changed, 26 insertions(+), 3 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 rename stats/get_stats.py => get_stats.py (100%)
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..915cc65
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/venv/
+/kaggle.json
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..e0c8dfb
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,13 @@
+FROM python:3.11
+
+RUN apt-get update && apt-get -y upgrade
+RUN apt-get install -y build-essential
+
+RUN python -m pip install --upgrade pip
+COPY requirements.txt /tmp
+RUN python -m pip install -r /tmp/requirements.txt
+
+WORKDIR ./app
+
+COPY ./get_dataset.py ./
+COPY ./get_stats.py ./
diff --git a/get_dataset.py b/get_dataset.py
index 0543413..61cb483 100644
--- a/get_dataset.py
+++ b/get_dataset.py
@@ -1,10 +1,15 @@
-import opendatasets as od
 import pandas as pd
 from sklearn import preprocessing
 from sklearn.model_selection import train_test_split
+import sys
+import os
 
-od.download("https://www.kaggle.com/datasets/nikhil7280/student-performance-multiple-linear-regression/code")
-data = pd.read_csv("student-performance-multiple-linear-regression/Student_Performance.csv")
+os.environ["KAGGLE_USERNAME"] = sys.argv[1]
+os.environ["KAGGLE_KEY"] = sys.argv[2]
+
+os.system("kaggle datasets download -d nikhil7280/student-performance-multiple-linear-regression --unzip")
+
+data = pd.read_csv("Student_Performance.csv")
 print(data.head())
 data.drop_duplicates(inplace=True)
 data["Extracurricular Activities"] = data["Extracurricular Activities"].replace({'Yes': 1, 'No': 0})
diff --git a/stats/get_stats.py b/get_stats.py
similarity index 100%
rename from stats/get_stats.py
rename to get_stats.py
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..fef1509
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+kaggle
+pandas
+scikit-learn
\ No newline at end of file