ium_444421/preparation.ipynb
2022-03-20 19:23:11 +01:00

655 KiB
Raw Blame History

!kaggle datasets download -d tejashvi14/travel-insurance-prediction-data
!unzip -o travel-insurance-prediction-data.zip
import pandas as pd

# Load the downloaded CSV into a DataFrame; the first CSV column is used as
# the row index rather than as a data column.
travel_insurance = pd.read_csv(
    "TravelInsurancePrediction.csv",
    index_col=0,
)
# Bare expression: the notebook renders the frame as the cell output.
travel_insurance
Age Employment Type GraduateOrNot AnnualIncome FamilyMembers ChronicDiseases FrequentFlyer EverTravelledAbroad TravelInsurance
0 31 Government Sector Yes 400000 6 1 No No 0
1 31 Private Sector/Self Employed Yes 1250000 7 0 No No 0
2 34 Private Sector/Self Employed Yes 500000 4 1 No No 1
3 28 Private Sector/Self Employed Yes 700000 3 1 No No 0
4 28 Private Sector/Self Employed Yes 700000 8 1 Yes No 0
5 25 Private Sector/Self Employed No 1150000 4 0 No No 0
6 31 Government Sector Yes 1300000 4 0 No No 0
7 31 Private Sector/Self Employed Yes 1350000 3 0 Yes Yes 1
8 28 Private Sector/Self Employed Yes 1450000 6 1 Yes Yes 1
9 33 Government Sector Yes 800000 3 0 Yes No 0
10 31 Government Sector Yes 400000 9 1 No No 0
11 26 Private Sector/Self Employed Yes 1400000 5 0 Yes Yes 1
12 32 Government Sector Yes 850000 6 0 No No 1
13 31 Government Sector Yes 1500000 6 0 Yes Yes 1
14 31 Government Sector Yes 400000 3 0 No No 0
15 34 Private Sector/Self Employed Yes 700000 7 0 No No 0
16 28 Private Sector/Self Employed Yes 1150000 4 1 No No 0
17 28 Private Sector/Self Employed Yes 800000 7 0 No No 1
18 29 Private Sector/Self Employed Yes 1050000 5 1 No No 1
19 34 Private Sector/Self Employed Yes 1500000 2 0 Yes Yes 1
20 28 Private Sector/Self Employed Yes 1150000 6 0 Yes No 0
21 29 Private Sector/Self Employed Yes 350000 3 0 No No 1
22 31 Private Sector/Self Employed Yes 1100000 4 0 No No 0
23 28 Government Sector Yes 600000 9 0 No No 1
24 31 Government Sector Yes 1500000 7 0 Yes Yes 0
25 35 Private Sector/Self Employed No 800000 4 0 No Yes 0
26 34 Private Sector/Self Employed No 1300000 6 0 Yes No 1
27 28 Private Sector/Self Employed Yes 1250000 2 0 Yes No 0
28 26 Private Sector/Self Employed Yes 600000 4 0 Yes No 0
29 31 Private Sector/Self Employed Yes 900000 3 0 No No 0
... ... ... ... ... ... ... ... ... ...
1957 31 Private Sector/Self Employed Yes 1250000 7 0 No No 0
1958 31 Government Sector Yes 1300000 3 1 No No 0
1959 28 Private Sector/Self Employed No 550000 6 0 No No 0
1960 28 Government Sector Yes 300000 6 0 No No 1
1961 26 Government Sector Yes 500000 3 0 No No 0
1962 25 Private Sector/Self Employed No 1150000 7 0 No No 0
1963 33 Government Sector Yes 1100000 4 0 No No 1
1964 28 Government Sector Yes 1100000 4 0 No No 0
1965 30 Private Sector/Self Employed No 550000 3 0 No No 1
1966 34 Private Sector/Self Employed Yes 700000 3 0 Yes No 0
1967 34 Private Sector/Self Employed Yes 700000 3 1 No No 0
1968 28 Government Sector Yes 300000 5 1 No No 0
1969 31 Private Sector/Self Employed Yes 500000 4 1 No No 0
1970 29 Private Sector/Self Employed Yes 500000 4 0 No No 1
1971 31 Private Sector/Self Employed Yes 950000 5 0 No No 0
1972 28 Private Sector/Self Employed Yes 800000 5 1 No No 0
1973 31 Private Sector/Self Employed Yes 1350000 3 0 No Yes 1
1974 28 Private Sector/Self Employed Yes 1200000 6 0 Yes No 0
1975 26 Private Sector/Self Employed Yes 1400000 9 0 No Yes 1
1976 32 Government Sector Yes 900000 6 0 No No 0
1977 25 Private Sector/Self Employed No 1350000 6 0 No Yes 1
1978 34 Private Sector/Self Employed Yes 700000 5 0 No No 1
1979 28 Private Sector/Self Employed Yes 1100000 4 1 No No 0
1980 25 Private Sector/Self Employed No 1150000 5 1 No No 0
1981 27 Government Sector Yes 850000 3 0 No No 1
1982 33 Private Sector/Self Employed Yes 1500000 4 0 Yes Yes 1
1983 28 Private Sector/Self Employed Yes 1750000 5 1 No Yes 0
1984 28 Private Sector/Self Employed Yes 1150000 6 1 No No 0
1985 34 Private Sector/Self Employed Yes 1000000 6 0 Yes Yes 1
1986 34 Private Sector/Self Employed Yes 500000 4 0 No No 0

1987 rows × 9 columns

# Remove rows that contain any missing value.
# dropna() returns a new DataFrame by default, so calling it as a bare
# statement discarded the result and left the incomplete rows in place;
# inplace=True makes the removal actually happen on travel_insurance.
travel_insurance.dropna(axis='index', how='any', inplace=True)
# Normalise the data: lower-case every text (object-dtype) column so that
# category labels compare equal regardless of their original casing.
for column in travel_insurance.columns:
    if travel_insurance[column].dtype == 'object':
        travel_insurance[column] = travel_insurance[column].str.lower()

# Bare expression: the notebook renders the cleaned frame as the cell output.
travel_insurance
Age Employment Type GraduateOrNot AnnualIncome FamilyMembers ChronicDiseases FrequentFlyer EverTravelledAbroad TravelInsurance
0 31 government sector yes 400000 6 1 no no 0
1 31 private sector/self employed yes 1250000 7 0 no no 0
2 34 private sector/self employed yes 500000 4 1 no no 1
3 28 private sector/self employed yes 700000 3 1 no no 0
4 28 private sector/self employed yes 700000 8 1 yes no 0
5 25 private sector/self employed no 1150000 4 0 no no 0
6 31 government sector yes 1300000 4 0 no no 0
7 31 private sector/self employed yes 1350000 3 0 yes yes 1
8 28 private sector/self employed yes 1450000 6 1 yes yes 1
9 33 government sector yes 800000 3 0 yes no 0
10 31 government sector yes 400000 9 1 no no 0
11 26 private sector/self employed yes 1400000 5 0 yes yes 1
12 32 government sector yes 850000 6 0 no no 1
13 31 government sector yes 1500000 6 0 yes yes 1
14 31 government sector yes 400000 3 0 no no 0
15 34 private sector/self employed yes 700000 7 0 no no 0
16 28 private sector/self employed yes 1150000 4 1 no no 0
17 28 private sector/self employed yes 800000 7 0 no no 1
18 29 private sector/self employed yes 1050000 5 1 no no 1
19 34 private sector/self employed yes 1500000 2 0 yes yes 1
20 28 private sector/self employed yes 1150000 6 0 yes no 0
21 29 private sector/self employed yes 350000 3 0 no no 1
22 31 private sector/self employed yes 1100000 4 0 no no 0
23 28 government sector yes 600000 9 0 no no 1
24 31 government sector yes 1500000 7 0 yes yes 0
25 35 private sector/self employed no 800000 4 0 no yes 0
26 34 private sector/self employed no 1300000 6 0 yes no 1
27 28 private sector/self employed yes 1250000 2 0 yes no 0
28 26 private sector/self employed yes 600000 4 0 yes no 0
29 31 private sector/self employed yes 900000 3 0 no no 0
... ... ... ... ... ... ... ... ... ...
1957 31 private sector/self employed yes 1250000 7 0 no no 0
1958 31 government sector yes 1300000 3 1 no no 0
1959 28 private sector/self employed no 550000 6 0 no no 0
1960 28 government sector yes 300000 6 0 no no 1
1961 26 government sector yes 500000 3 0 no no 0
1962 25 private sector/self employed no 1150000 7 0 no no 0
1963 33 government sector yes 1100000 4 0 no no 1
1964 28 government sector yes 1100000 4 0 no no 0
1965 30 private sector/self employed no 550000 3 0 no no 1
1966 34 private sector/self employed yes 700000 3 0 yes no 0
1967 34 private sector/self employed yes 700000 3 1 no no 0
1968 28 government sector yes 300000 5 1 no no 0
1969 31 private sector/self employed yes 500000 4 1 no no 0
1970 29 private sector/self employed yes 500000 4 0 no no 1
1971 31 private sector/self employed yes 950000 5 0 no no 0
1972 28 private sector/self employed yes 800000 5 1 no no 0
1973 31 private sector/self employed yes 1350000 3 0 no yes 1
1974 28 private sector/self employed yes 1200000 6 0 yes no 0
1975 26 private sector/self employed yes 1400000 9 0 no yes 1
1976 32 government sector yes 900000 6 0 no no 0
1977 25 private sector/self employed no 1350000 6 0 no yes 1
1978 34 private sector/self employed yes 700000 5 0 no no 1
1979 28 private sector/self employed yes 1100000 4 1 no no 0
1980 25 private sector/self employed no 1150000 5 1 no no 0
1981 27 government sector yes 850000 3 0 no no 1
1982 33 private sector/self employed yes 1500000 4 0 yes yes 1
1983 28 private sector/self employed yes 1750000 5 1 no yes 0
1984 28 private sector/self employed yes 1150000 6 1 no no 0
1985 34 private sector/self employed yes 1000000 6 0 yes yes 1
1986 34 private sector/self employed yes 500000 4 0 no no 0

1987 rows × 9 columns

# Split the data into train/dev/test subsets (60% / 20% / 20%).
import sklearn
from sklearn.model_selection import train_test_split

# Step 1: keep 60% of the rows for training and set the other 40% aside.
travel_insurance_train, travel_insurance_rest = train_test_split(
    travel_insurance,
    test_size=0.4,
    random_state=1,
)
# Step 2: halve the set-aside rows into the test and dev subsets.
travel_insurance_test, travel_insurance_dev = train_test_split(
    travel_insurance_rest,
    test_size=0.5,
    random_state=1,
)
# Bare expression: the notebook renders the summary of the full frame.
travel_insurance.describe(include='all')
Age Employment Type GraduateOrNot AnnualIncome FamilyMembers ChronicDiseases FrequentFlyer EverTravelledAbroad TravelInsurance
count 1987.0 1987 1988 1.987000e+03 1988.000000 1988.000000 1988 1988 1988.000000
unique 11.0 2 2 NaN NaN NaN 2 2 NaN
top 28.0 private sector/self employed yes NaN NaN NaN no no NaN
freq 506.0 1417 1693 NaN NaN NaN 1571 1608 NaN
mean NaN NaN NaN 9.327630e+05 4.753018 0.277666 NaN NaN 0.357646
std NaN NaN NaN 3.768557e+05 1.609254 0.447960 NaN NaN 0.479428
min NaN NaN NaN 3.000000e+05 2.000000 0.000000 NaN NaN 0.000000
25% NaN NaN NaN 6.000000e+05 4.000000 0.000000 NaN NaN 0.000000
50% NaN NaN NaN 9.000000e+05 5.000000 0.000000 NaN NaN 0.000000
75% NaN NaN NaN 1.250000e+06 6.000000 1.000000 NaN NaN 1.000000
max NaN NaN NaN 1.800000e+06 9.000000 1.000000 NaN NaN 1.000000
# Report summary information about a given dataset.

import seaborn as sns

def printInformation(data):
    """Print the size and summary statistics of ``data`` and draw a pair plot.

    Prints the number of rows, then the mean, minimum, maximum, standard
    deviation and median of the numeric columns, and finally renders a
    seaborn pair plot coloured by the ``TravelInsurance`` column.

    Args:
        data: pandas DataFrame to summarise. It must contain a
            ``TravelInsurance`` column, which is used as the plot hue.

    Returns:
        None. All results are printed or plotted.
    """
    print(f'Size (rows): {len(data)}\n')
    # Every statistic is restricted to numeric columns. Previously only
    # min/max were, so the five reports could cover different column sets,
    # and on pandas 2.0+ mean/std/median raise TypeError on text columns.
    mean_value = data.mean(numeric_only=True)
    min_value = data.min(numeric_only=True)
    max_value = data.max(numeric_only=True)
    std_value = data.std(numeric_only=True)
    median_value = data.median(numeric_only=True)
    print(f'(mean)\n{mean_value}', f'(min)\n{min_value}', f'(max)\n{max_value}', f'(std)\n{std_value}', f'(median)\n{median_value}', sep="\n\n")
    sns.pairplot(data=data, hue="TravelInsurance")
# Summarise the complete, unsplit dataset.
printInformation(travel_insurance)
Size (rows): 1988

(mean)
Age                    29.650226
AnnualIncome       932762.959235
FamilyMembers           4.753018
ChronicDiseases         0.277666
TravelInsurance         0.357646
dtype: float64

(min)
AnnualIncome       300000.0
FamilyMembers           2.0
ChronicDiseases         0.0
TravelInsurance         0.0
dtype: float64

(max)
AnnualIncome       1800000.0
FamilyMembers            9.0
ChronicDiseases          1.0
TravelInsurance          1.0
dtype: float64

(std)
Age                     2.913308
AnnualIncome       376855.684748
FamilyMembers           1.609254
ChronicDiseases         0.447960
TravelInsurance         0.479428
dtype: float64

(median)
Age                    29.0
AnnualIncome       900000.0
FamilyMembers           5.0
ChronicDiseases         0.0
TravelInsurance         0.0
dtype: float64
# Summarise the training subset.
printInformation(travel_insurance_train)
Size (rows): 1192

(mean)
Age                    29.744128
AnnualIncome       933095.637584
FamilyMembers           4.814597
ChronicDiseases         0.269295
TravelInsurance         0.349832
dtype: float64

(min)
Age                    25
AnnualIncome       300000
FamilyMembers           2
ChronicDiseases         0
TravelInsurance         0
dtype: int64

(max)
Age                     35
AnnualIncome       1800000
FamilyMembers            9
ChronicDiseases          1
TravelInsurance          1
dtype: int64

(std)
Age                     2.923164
AnnualIncome       376856.587539
FamilyMembers           1.619576
ChronicDiseases         0.443780
TravelInsurance         0.477117
dtype: float64

(median)
Age                    29.0
AnnualIncome       900000.0
FamilyMembers           5.0
ChronicDiseases         0.0
TravelInsurance         0.0
dtype: float64
# Summarise the test subset.
printInformation(travel_insurance_test)
Size (rows): 397

(mean)
Age                    29.561713
AnnualIncome       916120.906801
FamilyMembers           4.622166
ChronicDiseases         0.287154
TravelInsurance         0.362720
dtype: float64

(min)
Age                    25
AnnualIncome       300000
FamilyMembers           2
ChronicDiseases         0
TravelInsurance         0
dtype: int64

(max)
Age                     35
AnnualIncome       1750000
FamilyMembers            9
ChronicDiseases          1
TravelInsurance          1
dtype: int64

(std)
Age                     2.850544
AnnualIncome       379748.807848
FamilyMembers           1.627824
ChronicDiseases         0.453005
TravelInsurance         0.481392
dtype: float64

(median)
Age                    29.0
AnnualIncome       850000.0
FamilyMembers           4.0
ChronicDiseases         0.0
TravelInsurance         0.0
dtype: float64
# Summarise the dev (validation) subset.
printInformation(travel_insurance_dev)
Size (rows): 398

(mean)
Age                    29.457286
AnnualIncome       948366.834171
FamilyMembers           4.698492
ChronicDiseases         0.293970
TravelInsurance         0.374372
dtype: float64

(min)
Age                    25
AnnualIncome       300000
FamilyMembers           2
ChronicDiseases         0
TravelInsurance         0
dtype: int64

(max)
Age                     35
AnnualIncome       1800000
FamilyMembers            9
ChronicDiseases          1
TravelInsurance          1
dtype: int64

(std)
Age                     2.940542
AnnualIncome       374204.238614
FamilyMembers           1.554889
ChronicDiseases         0.456152
TravelInsurance         0.484569
dtype: float64

(median)
Age                     28.0
AnnualIncome       1000000.0
FamilyMembers            4.0
ChronicDiseases          0.0
TravelInsurance          0.0
dtype: float64