37 lines
868 B
Python
37 lines
868 B
Python
import sys
|
|
import pandas as pd
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
|
|
# Ordinal codes for age-group and sex labels (presumably the values found in
# the 'age' and 'sex' columns of the input CSV -- confirm against the data).
# The pipeline below one-hot encodes those columns instead, so these mappings
# are not used here; they are kept so the module still exposes the same names.
age = {"5-14 years": 0, "15-24 years": 1, "25-34 years": 2,
       "35-54 years": 3, "55-74 years": 4, "75+ years": 5}

sex = {"male": 0, "female": 1}


def split_dataset(frame, train_frac=.6, validate_frac=.8, random_state=42):
    """Shuffle *frame* and cut it into train, validate and test parts.

    Args:
        frame: DataFrame to split.
        train_frac: Cumulative fraction of rows at which the train part ends.
        validate_frac: Cumulative fraction of rows at which the validate
            part ends; the remaining rows form the test part.
        random_state: Seed passed to ``DataFrame.sample`` so the shuffle is
            reproducible.

    Returns:
        A ``(train, validate, test)`` tuple of DataFrames.
    """
    shuffled = frame.sample(frac=1, random_state=random_state)
    total = len(frame)
    train_end = int(train_frac * total)
    validate_end = int(validate_frac * total)
    # Positional slicing instead of np.split: NumPy splits a DataFrame via
    # DataFrame.swapaxes, which pandas has deprecated. The resulting row
    # sets are the same as np.split(shuffled, [train_end, validate_end]).
    return (shuffled.iloc[:train_end],
            shuffled.iloc[train_end:validate_end],
            shuffled.iloc[validate_end:])


def main(argv=None):
    """Prepare train/validate/test CSV files from the statistics CSV.

    Reads 'who_suicide_statistics.csv', drops rows with missing values,
    one-hot encodes the 'age', 'sex' and 'country' columns, keeps the first
    CUTOFF rows and writes a shuffled 60/20/20 split to 'train.csv',
    'validate.csv' and 'test.csv' in the current directory.

    Args:
        argv: Command-line vector; ``argv[1]`` is the CUTOFF row count.
            Defaults to ``sys.argv``.

    Raises:
        SystemExit: If CUTOFF is missing or is not an integer.
    """
    if argv is None:
        argv = sys.argv
    try:
        cutoff = int(argv[1])
    except (IndexError, ValueError):
        prog = argv[0] if argv else "script"
        sys.exit("usage: {} CUTOFF  (CUTOFF must be an integer)".format(prog))

    sc = pd.read_csv('who_suicide_statistics.csv')

    # Drop incomplete records.
    sc = sc.dropna()

    # One-hot encode the categorical columns. With an empty prefix and
    # separator, each new column is named after the category value itself.
    sc = pd.get_dummies(
        sc, columns=['age', 'sex', 'country'], prefix='', prefix_sep='')

    # Apply the cutoff after encoding, so the set of columns does not depend
    # on which rows survive the cutoff.
    sc = sc.head(cutoff)

    # Split into train, validate and test sets.
    train, validate, test = split_dataset(sc)

    # Write the three sets to disk without the index column.
    train.to_csv('train.csv', index=False)
    validate.to_csv('validate.csv', index=False)
    test.to_csv('test.csv', index=False)


if __name__ == "__main__":
    main()