Use the Netflix shows dataset instead of the sports car prices dataset
This commit is contained in:
parent
be3e041321
commit
3714b5a301
22
main.py
22
main.py
@@ -6,18 +6,24 @@ pd.set_option('display.max_columns', 100)
|
||||
|
||||
api = KaggleApi()
|
||||
api.authenticate()
|
||||
api.dataset_download_files('rkiattisak/sports-car-prices-dataset', path='./data')
|
||||
with zipfile.ZipFile('./data/sports-car-prices-dataset.zip', 'r') as zip_ref:
|
||||
api.dataset_download_files('shivamb/netflix-shows', path='./data')
|
||||
with zipfile.ZipFile('./data/netflix-shows.zip', 'r') as zip_ref:
|
||||
zip_ref.extractall('./data')
|
||||
|
||||
cars = pd.read_csv('./data/Sport car price.csv')
|
||||
netflix = pd.read_csv('./data/netflix_titles.csv')
|
||||
|
||||
cars.dropna(inplace=True)
|
||||
netflix['release_year'] = (netflix['release_year'] - netflix['release_year'].min()) / (netflix['release_year'].max() - netflix['release_year'].min())
|
||||
|
||||
netflix.dropna(inplace=True)
|
||||
|
||||
random_seed = 42
|
||||
train_data, test_data = train_test_split(cars, test_size=0.2, random_state=random_seed)
|
||||
train_data, test_data = train_test_split(netflix, test_size=0.2, random_state=random_seed)
|
||||
train_data, dev_data = train_test_split(train_data, test_size=0.25, random_state=random_seed)
|
||||
|
||||
train_data['release_year'] = (train_data['release_year'] - train_data['release_year'].min()) / (train_data['release_year'].max() - train_data['release_year'].min())
|
||||
dev_data['release_year'] = (dev_data['release_year'] - dev_data['release_year'].min()) / (dev_data['release_year'].max() - dev_data['release_year'].min())
|
||||
test_data['release_year'] = (test_data['release_year'] - test_data['release_year'].min()) / (test_data['release_year'].max() - test_data['release_year'].min())
|
||||
|
||||
train_stats = train_data.describe(include='all')
|
||||
print(f"\nTraining set statistics:\n{train_stats}")
|
||||
dev_stats = dev_data.describe(include='all')
|
||||
@@ -25,3 +31,9 @@ print(f"\nDevelopment set statistics:\n{dev_stats}")
|
||||
test_stats = test_data.describe(include='all')
|
||||
print(f"\nTest set statistics:\n{test_stats}")
|
||||
|
||||
train_class_dist = train_data["type"].value_counts()
|
||||
print(f"\nTraining set class distribution:\n{train_class_dist}")
|
||||
dev_class_dist = dev_data["type"].value_counts()
|
||||
print(f"\nDevelopment set class distribution:\n{dev_class_dist}")
|
||||
test_class_dist = test_data["type"].value_counts()
|
||||
print(f"\nTest set class distribution:\n{test_class_dist}")
|
||||
|
Loading…
Reference in New Issue
Block a user