diff --git a/Dockerfile b/Dockerfile
new file mode 100644
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,24 @@
+# Image for the Udemy finance-courses dataset scripts (init.py + stats.py).
+# Pin the base image: on newer Ubuntu releases (23.04 and later) a system-wide
+# `pip3 install` is rejected as an externally managed environment (PEP 668).
+FROM ubuntu:22.04
+
+# Install all OS packages in one layer so the package index is never stale,
+# and drop the apt lists afterwards to keep the layer small.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        python3 python3-pip nano curl \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install kaggle pandas scikit-learn matplotlib
+# NOTE(review): --user installs into the build user's home directory; confirm
+# the runtime user can actually import it, or drop the flag.
+RUN pip3 install --user wget
+
+WORKDIR /app
+
+COPY ./init.py ./
+COPY ./stats.py ./
+
+# World-writable; presumably the Kaggle client's config dir when HOME is / - TODO confirm.
+RUN mkdir /.kaggle && chmod -R 777 /.kaggle
diff --git a/init.py b/init.py
new file mode 100644
--- /dev/null
+++ b/init.py
@@ -0,0 +1,47 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn import preprocessing
+import kaggle
+
+# Download and unpack the Udemy finance/accounting courses dataset into the
+# current directory (authenticates against the Kaggle API first).
+kaggle.api.authenticate()
+kaggle.api.dataset_download_files('jilkothari/finance-accounting-courses-udemy-13k-course', path='.', unzip=True)
+
+courses = pd.read_csv('courses.csv')
+
+# Keep only the relevant columns and overwrite courses.csv with the
+# trimmed table.
+imp_col = ['id', 'title', 'url', 'is_paid', 'num_subscribers', 'rating', 'num_reviews', 'created']
+courses = courses[imp_col]
+courses.to_csv("courses.csv", index=False)
+
+# Drop courses whose rating is 0 and courses with fewer than 10 reviews
+rating_col = 'rating'
+num_reviews_col = 'num_reviews'
+courses = courses[courses[rating_col] != 0]
+courses = courses[courses[num_reviews_col] >= 10]
+
+# Round numbers to one decimal place and format 'title' to a specific schema
+courses = courses.round(1)
+courses['title'] = courses['title'].str.lower()
+courses['title'] = courses['title'].str.replace(" ", "_")
+
+# Delete artifacts (rows with any missing value)
+courses = courses.dropna()
+
+# Split dataset into 60% 20% 20% - train, valid, test.
+# Shuffle once and slice by position; np.split on a DataFrame relies on
+# DataFrame.swapaxes, which is deprecated in recent pandas releases.
+shuffled = courses.sample(frac=1)
+train_end = int(.6 * len(shuffled))
+valid_end = int(.8 * len(shuffled))
+courses_train = shuffled.iloc[:train_end]
+courses_validate = shuffled.iloc[train_end:valid_end]
+courses_test = shuffled.iloc[valid_end:]
+
+# Create new csv
+courses_train.to_csv("train.csv", index=False)
+courses_validate.to_csv("valid.csv", index=False)
+courses_test.to_csv("test.csv", index=False)
diff --git a/stats.py b/stats.py
new file mode 100644
--- /dev/null
+++ b/stats.py
@@ -0,0 +1,11 @@
+# Report the size of the train/test/valid CSV splits.
+import pandas as pd
+
+train = pd.read_csv('train.csv')
+test = pd.read_csv('test.csv')
+valid = pd.read_csv('valid.csv')
+
+# DataFrame.size is the number of cells (rows * columns), not the row count.
+print("train size: ", train.size)
+print("test size: ", test.size)
+print("valid size: ", valid.size)