# Provenance: reconstructed from git patch
#   commit  171fd416dc6616a5e5dfe2bb01d9cdb984013c36
#   author  Maciej Sobkowiak
#   date    Mon, 12 Apr 2021 00:09:28 +0200
#   subject "Added Data python script"
# The patch creates the new file preprocesing_python.py with the script below.
"""Split the WHO suicide statistics CSV into train/validate/test and clean it.

Steps, in order:
  1. Load ``who_suicide_statistics.csv`` from the current working directory.
  2. Shuffle the rows (fixed seed 42) and cut them 60% / 20% / 20%.
  3. Print the row count, ``describe()`` and per-country counts of each split.
  4. Draw a per-country bar chart for each split.
  5. Remove the "years" suffix from the ``age`` column.
  6. Print the per-column null counts, drop rows with nulls, print the result.
"""

# NOTE(review): `sys`, `kaggle` and `numpy` are not referenced below. They are
# kept because the original script imported them; confirm whether importing
# `kaggle` is relied on for a side effect before removing it.
import sys
import kaggle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

sc = pd.read_csv('who_suicide_statistics.csv')

# Shuffle once, then slice at the 60% and 80% marks. Each split is an explicit
# copy so the column assignment and in-place dropna below operate on
# independent frames instead of slices of `shuffled` (which would trigger
# pandas' SettingWithCopyWarning).
shuffled = sc.sample(frac=1, random_state=42)
train_end = int(.6 * len(sc))
validate_end = int(.8 * len(sc))

train = shuffled.iloc[:train_end].copy()
validate = shuffled.iloc[train_end:validate_end].copy()
test = shuffled.iloc[validate_end:].copy()

# Fixed reporting order used by every step below: train, validate, test.
splits = (
    ("Train", train),
    ("Validate", validate),
    ("Test", test),
)

# Report the number of rows; DataFrame.size would be rows * columns.
print("Train set: ", len(train))
print("Validate set: ", len(validate))
print("Test set: ", len(test))

for _, part in splits:
    print(part.describe(include='all'))
    print(part.country.value_counts())

# One figure per split; without an explicit `ax` all three bar charts would be
# drawn on top of each other on the same axes.
# NOTE(review): the figures are created but never shown or saved, as in the
# original script; add plt.show() or fig.savefig(...) if they are needed.
for label, part in splits:
    _, ax = plt.subplots()
    part['country'].value_counts().plot.bar(ax=ax, title=label + " set")

# Remove the literal trailing "years" together with the whitespace before it
# ("5-14 years" -> "5-14"). str.rstrip('years') treats its argument as a set
# of characters and leaves the trailing space behind. The .str accessor also
# passes missing values through instead of raising.
for _, part in splits:
    part['age'] = part['age'].str.replace(r'\s*years$', '', regex=True)

for _, part in splits:
    print(part.isnull().sum())

# Drop incomplete rows in place so `train`, `validate` and `test` themselves
# are updated.
for _, part in splits:
    part.dropna(inplace=True)

for _, part in splits:
    print(part)