add docker file and python scripts
This commit is contained in:
parent
4db0f002e0
commit
d1b2e90c9c
16
Dockerfile
Normal file
16
Dockerfile
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
# Image that bundles Python 3 plus the data-science packages needed by
# init.py and stats.py, which are copied into /app.

# Pinned release instead of `latest` so rebuilds are reproducible.
# NOTE(review): newer Ubuntu releases mark the system Python as externally
# managed (PEP 668), which makes the plain `pip3 install` below fail; bump the
# tag only together with a virtualenv-based install.
FROM ubuntu:22.04

# All OS packages are installed in a single layer, in the same RUN as
# `apt-get update`, so a cached package index can never go stale relative to
# an install step. The index files are removed afterwards to keep the layer small.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        curl \
        nano \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies. `wget` is installed system-wide (not with --user) so it
# is importable regardless of which user the container runs as.
RUN pip3 install --no-cache-dir \
        kaggle \
        pandas \
        scikit-learn \
        matplotlib \
        wget

WORKDIR /app

COPY ./init.py ./stats.py ./

# NOTE(review): /.kaggle is made world-writable, presumably so the Kaggle
# client can use it when the container runs under an arbitrary non-root UID
# with HOME=/ - confirm against how the image is actually launched.
RUN mkdir -p /.kaggle && chmod -R 777 /.kaggle
|
38
init.py
Normal file
38
init.py
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn import preprocessing
|
||||||
|
import kaggle
|
||||||
|
|
||||||
|
# Authenticate with the Kaggle API, then download and unzip the dataset into
# the current directory.
kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    'jilkothari/finance-accounting-courses-udemy-13k-course',
    path='.',
    unzip=True,
)

courses = pd.read_csv('courses.csv')

# Keep only the columns listed in imp_col and overwrite courses.csv with the
# reduced table. The frame is already in memory, so it is not read back.
imp_col = ['id', 'title', 'url', 'is_paid', 'num_subscribers', 'rating', 'num_reviews', 'created']
courses = courses[imp_col]
courses.to_csv("courses.csv", index=False)

# Drop courses whose rating is 0 and courses with fewer than 10 reviews.
# Rows with missing values in these columns pass both comparisons here and are
# removed by the dropna() step further down.
rating_col = 'rating'
num_reviews_col = 'num_reviews'
is_unrated = courses[rating_col] == 0
has_few_reviews = courses[num_reviews_col] < 10
courses = courses[~is_unrated & ~has_few_reviews]

# Round numeric columns to one decimal place and normalise 'title' to
# lower-case text with underscores instead of spaces.
courses = courses.round(1)
# regex=False: the space is a literal, independent of the pandas-version
# default for the `regex` parameter.
courses['title'] = courses['title'].str.lower().str.replace(" ", "_", regex=False)

# Drop every row that still contains a missing value.
courses = courses.dropna()

# Shuffle, then split 60% / 20% / 20% into train, validation and test sets.
# Positional slicing replaces np.split, which goes through a deprecated
# DataFrame code path in recent pandas; the resulting partitions are the same.
# NOTE: the shuffle is unseeded, so the split differs between runs.
shuffled = courses.sample(frac=1)
train_end = int(.6 * len(shuffled))
valid_end = int(.8 * len(shuffled))
courses_train = shuffled.iloc[:train_end]
courses_validate = shuffled.iloc[train_end:valid_end]
courses_test = shuffled.iloc[valid_end:]

# Write each partition to its own CSV file.
courses_train.to_csv("train.csv", index=False)
courses_validate.to_csv("valid.csv", index=False)
courses_test.to_csv("test.csv", index=False)
|
Loading…
Reference in New Issue
Block a user