ium_z486867/main.py

34 lines
1.3 KiB
Python
Raw Normal View History

2023-03-25 12:49:56 +01:00
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
from sklearn.model_selection import train_test_split
import pandas as pd
# Allow pandas to render up to 100 columns, so the wide describe() tables
# printed later in this script are not cut down to the default column limit.
pd.options.display.max_columns = 100

# Build the Kaggle API client and authenticate it before any dataset call.
api = KaggleApi()
api.authenticate()
2023-03-25 13:17:37 +01:00
# Download the "shivamb/netflix-shows" dataset archive from Kaggle into ./data.
# Relies on the already-authenticated `api` client.
api.dataset_download_files('shivamb/netflix-shows', path='./data')

# Unpack the downloaded archive into the same directory. The extract call must
# live inside the `with` body so the archive handle is open while extracting
# and is closed afterwards, even if extraction raises.
with zipfile.ZipFile('./data/netflix-shows.zip', 'r') as zip_ref:
    zip_ref.extractall('./data')

# Load the extracted CSV into the DataFrame used by the rest of the script.
netflix = pd.read_csv('./data/netflix_titles.csv')
2023-03-25 12:49:56 +01:00
2023-03-25 13:17:37 +01:00
# Remove incomplete records: with pandas' defaults, dropna() discards every row
# that contains at least one missing value. `inplace=True` mutates `netflix`.
netflix.dropna(inplace=True)

# One fixed seed is reused for both splits so the partition is reproducible.
random_seed = 42

# First split: hold out 20% of the cleaned rows as the test set.
train_data, test_data = train_test_split(netflix, test_size=0.2, random_state=random_seed)

# Second split: take 25% of the remaining 80% as the dev set. That is
# 0.25 * 0.8 = 0.2 of the cleaned data, so the final proportions are roughly
# 60% train / 20% dev / 20% test.
train_data, dev_data = train_test_split(train_data, test_size=0.25, random_state=random_seed)
# Summarise every column (numeric and non-numeric alike) of each split.
train_stats, dev_stats, test_stats = (
    subset.describe(include='all')
    for subset in (train_data, dev_data, test_data)
)

# Report the summaries in train -> dev -> test order.
print(f"\nTraining set statistics:\n{train_stats}")
print(f"\nDevelopment set statistics:\n{dev_stats}")
print(f"\nTest set statistics:\n{test_stats}")
2023-03-25 13:17:37 +01:00
# Count how many rows of each "type" value ended up in each split.
train_class_dist, dev_class_dist, test_class_dist = (
    subset["type"].value_counts()
    for subset in (train_data, dev_data, test_data)
)

# Report the counts in train -> dev -> test order.
print(f"\nTraining set class distribution:\n{train_class_dist}")
print(f"\nDevelopment set class distribution:\n{dev_class_dist}")
print(f"\nTest set class distribution:\n{test_class_dist}")