# Prepare the Udemy finance & accounting courses dataset: download it from
# Kaggle, clean it, and write train/valid/test CSV splits.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import kaggle

# Pipeline: download the dataset -> keep relevant columns -> filter rows ->
# normalise values -> shuffle and split 60/20/20 -> write the three CSV files.

# Download and unzip the dataset into the current working directory.
kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    'jilkothari/finance-accounting-courses-udemy-13k-course',
    path='.',
    unzip=True,
)

courses = pd.read_csv('udemy_output_All_Finance__Accounting_p1_p626.csv')

# Keep only the columns of interest; all other columns are discarded.
imp_col = ['id', 'title', 'url', 'is_paid', 'num_subscribers', 'rating', 'num_reviews', 'created']
courses = courses[imp_col]

# Remove courses whose rating is 0 and courses with fewer than 10 reviews.
rating_col = 'rating'
num_reviews_col = 'num_reviews'
courses = courses[courses[rating_col] != 0]
# Negated "< 10" (rather than ">= 10") so rows with a missing review count
# are left for the dropna() step below, exactly as before.
courses = courses[~(courses[num_reviews_col] < 10)]

# Round numeric values to one decimal place.
courses = courses.round(1)

# Format 'title' to a fixed schema: lower case, spaces replaced by underscores.
# regex=False makes the replacement a literal one regardless of the pandas
# version's default for the `regex` argument.
courses['title'] = courses['title'].str.lower().str.replace(" ", "_", regex=False)

# Drop every row that still contains a missing value.
courses = courses.dropna()

# Shuffle, then split 60% / 20% / 20% into train / valid / test.
# Explicit positional slicing is used instead of np.split(), which goes
# through the deprecated DataFrame.swapaxes when given a DataFrame.
shuffled = courses.sample(frac=1)
train_end = int(.6 * len(shuffled))
valid_end = int(.8 * len(shuffled))
courses_train = shuffled.iloc[:train_end]
courses_validate = shuffled.iloc[train_end:valid_end]
courses_test = shuffled.iloc[valid_end:]

# Write each split to its own CSV file, without the index column.
courses_train.to_csv("train.csv", index=False)
courses_validate.to_csv("valid.csv", index=False)
courses_test.to_csv("test.csv", index=False)