"""Explore ``who_suicide_statistics.csv`` as shuffled train/validate/test splits.

The script loads the CSV, shuffles it with a fixed seed, cuts it 60/20/20,
prints summary statistics and per-country row counts for each split, draws
one bar chart per split, normalises the ``age`` column, and finally drops
rows that contain missing values.
"""
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

sc = pd.read_csv('who_suicide_statistics.csv')

# Shuffle reproducibly, then cut at the 60% and 80% marks. Positional slicing
# plus an explicit copy gives three independent frames, so the column
# assignments and in-place dropna below never touch a view of ``sc``
# (and we avoid np.split's deprecated DataFrame code path).
_shuffled = sc.sample(frac=1, random_state=42)
_train_end = int(.6 * len(sc))
_validate_end = int(.8 * len(sc))
train = _shuffled.iloc[:_train_end].copy()
validate = _shuffled.iloc[_train_end:_validate_end].copy()
test = _shuffled.iloc[_validate_end:].copy()

# NOTE: DataFrame.size is the number of cells (rows x columns), not rows.
print("Train set: ", train.size)
print("Validate set: ", validate.size)
print("Test set: ", test.size)

_splits = (train, validate, test)

# Summary statistics and per-country row counts for every split.
for _split in _splits:
    print(_split.describe(include='all'))
    print(_split.country.value_counts())

# One figure per split; without an explicit Axes, successive Series plots
# are drawn on top of each other on the current axes.
for _split in _splits:
    _, _ax = plt.subplots()
    _split['country'].value_counts().plot.bar(ax=_ax)

# Remove the literal " years" suffix from the age band (e.g. "5-14 years"
# -> "5-14"). str.rstrip('years') would strip a *set of characters* and
# leave a trailing space; the vectorised .str accessor also tolerates NaN.
for _split in _splits:
    _split['age'] = _split['age'].str.replace(r'\s*years$', '', regex=True)

# Report missing values per column before discarding incomplete rows.
for _split in _splits:
    print(_split.isnull().sum())

for _split in _splits:
    _split.dropna(inplace=True)

for _split in _splits:
    print(_split)