655 KiB
655 KiB
# Fetch the dataset archive with the Kaggle command-line tool, then unpack it
# into the working directory (unzip's -o overwrites existing files without
# prompting, so the cell can be re-run safely).
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle datasets download -d tejashvi14/travel-insurance-prediction-data
!unzip -o travel-insurance-prediction-data.zip
import pandas as pd
# Load the data set; the first CSV column is used as the row index.
travel_insurance = pd.read_csv(
    'TravelInsurancePrediction.csv',
    index_col=0,
)
# Bare expression so the notebook renders the frame as the cell output.
travel_insurance
Age | Employment Type | GraduateOrNot | AnnualIncome | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance | |
---|---|---|---|---|---|---|---|---|---|
0 | 31 | Government Sector | Yes | 400000 | 6 | 1 | No | No | 0 |
1 | 31 | Private Sector/Self Employed | Yes | 1250000 | 7 | 0 | No | No | 0 |
2 | 34 | Private Sector/Self Employed | Yes | 500000 | 4 | 1 | No | No | 1 |
3 | 28 | Private Sector/Self Employed | Yes | 700000 | 3 | 1 | No | No | 0 |
4 | 28 | Private Sector/Self Employed | Yes | 700000 | 8 | 1 | Yes | No | 0 |
5 | 25 | Private Sector/Self Employed | No | 1150000 | 4 | 0 | No | No | 0 |
6 | 31 | Government Sector | Yes | 1300000 | 4 | 0 | No | No | 0 |
7 | 31 | Private Sector/Self Employed | Yes | 1350000 | 3 | 0 | Yes | Yes | 1 |
8 | 28 | Private Sector/Self Employed | Yes | 1450000 | 6 | 1 | Yes | Yes | 1 |
9 | 33 | Government Sector | Yes | 800000 | 3 | 0 | Yes | No | 0 |
10 | 31 | Government Sector | Yes | 400000 | 9 | 1 | No | No | 0 |
11 | 26 | Private Sector/Self Employed | Yes | 1400000 | 5 | 0 | Yes | Yes | 1 |
12 | 32 | Government Sector | Yes | 850000 | 6 | 0 | No | No | 1 |
13 | 31 | Government Sector | Yes | 1500000 | 6 | 0 | Yes | Yes | 1 |
14 | 31 | Government Sector | Yes | 400000 | 3 | 0 | No | No | 0 |
15 | 34 | Private Sector/Self Employed | Yes | 700000 | 7 | 0 | No | No | 0 |
16 | 28 | Private Sector/Self Employed | Yes | 1150000 | 4 | 1 | No | No | 0 |
17 | 28 | Private Sector/Self Employed | Yes | 800000 | 7 | 0 | No | No | 1 |
18 | 29 | Private Sector/Self Employed | Yes | 1050000 | 5 | 1 | No | No | 1 |
19 | 34 | Private Sector/Self Employed | Yes | 1500000 | 2 | 0 | Yes | Yes | 1 |
20 | 28 | Private Sector/Self Employed | Yes | 1150000 | 6 | 0 | Yes | No | 0 |
21 | 29 | Private Sector/Self Employed | Yes | 350000 | 3 | 0 | No | No | 1 |
22 | 31 | Private Sector/Self Employed | Yes | 1100000 | 4 | 0 | No | No | 0 |
23 | 28 | Government Sector | Yes | 600000 | 9 | 0 | No | No | 1 |
24 | 31 | Government Sector | Yes | 1500000 | 7 | 0 | Yes | Yes | 0 |
25 | 35 | Private Sector/Self Employed | No | 800000 | 4 | 0 | No | Yes | 0 |
26 | 34 | Private Sector/Self Employed | No | 1300000 | 6 | 0 | Yes | No | 1 |
27 | 28 | Private Sector/Self Employed | Yes | 1250000 | 2 | 0 | Yes | No | 0 |
28 | 26 | Private Sector/Self Employed | Yes | 600000 | 4 | 0 | Yes | No | 0 |
29 | 31 | Private Sector/Self Employed | Yes | 900000 | 3 | 0 | No | No | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1957 | 31 | Private Sector/Self Employed | Yes | 1250000 | 7 | 0 | No | No | 0 |
1958 | 31 | Government Sector | Yes | 1300000 | 3 | 1 | No | No | 0 |
1959 | 28 | Private Sector/Self Employed | No | 550000 | 6 | 0 | No | No | 0 |
1960 | 28 | Government Sector | Yes | 300000 | 6 | 0 | No | No | 1 |
1961 | 26 | Government Sector | Yes | 500000 | 3 | 0 | No | No | 0 |
1962 | 25 | Private Sector/Self Employed | No | 1150000 | 7 | 0 | No | No | 0 |
1963 | 33 | Government Sector | Yes | 1100000 | 4 | 0 | No | No | 1 |
1964 | 28 | Government Sector | Yes | 1100000 | 4 | 0 | No | No | 0 |
1965 | 30 | Private Sector/Self Employed | No | 550000 | 3 | 0 | No | No | 1 |
1966 | 34 | Private Sector/Self Employed | Yes | 700000 | 3 | 0 | Yes | No | 0 |
1967 | 34 | Private Sector/Self Employed | Yes | 700000 | 3 | 1 | No | No | 0 |
1968 | 28 | Government Sector | Yes | 300000 | 5 | 1 | No | No | 0 |
1969 | 31 | Private Sector/Self Employed | Yes | 500000 | 4 | 1 | No | No | 0 |
1970 | 29 | Private Sector/Self Employed | Yes | 500000 | 4 | 0 | No | No | 1 |
1971 | 31 | Private Sector/Self Employed | Yes | 950000 | 5 | 0 | No | No | 0 |
1972 | 28 | Private Sector/Self Employed | Yes | 800000 | 5 | 1 | No | No | 0 |
1973 | 31 | Private Sector/Self Employed | Yes | 1350000 | 3 | 0 | No | Yes | 1 |
1974 | 28 | Private Sector/Self Employed | Yes | 1200000 | 6 | 0 | Yes | No | 0 |
1975 | 26 | Private Sector/Self Employed | Yes | 1400000 | 9 | 0 | No | Yes | 1 |
1976 | 32 | Government Sector | Yes | 900000 | 6 | 0 | No | No | 0 |
1977 | 25 | Private Sector/Self Employed | No | 1350000 | 6 | 0 | No | Yes | 1 |
1978 | 34 | Private Sector/Self Employed | Yes | 700000 | 5 | 0 | No | No | 1 |
1979 | 28 | Private Sector/Self Employed | Yes | 1100000 | 4 | 1 | No | No | 0 |
1980 | 25 | Private Sector/Self Employed | No | 1150000 | 5 | 1 | No | No | 0 |
1981 | 27 | Government Sector | Yes | 850000 | 3 | 0 | No | No | 1 |
1982 | 33 | Private Sector/Self Employed | Yes | 1500000 | 4 | 0 | Yes | Yes | 1 |
1983 | 28 | Private Sector/Self Employed | Yes | 1750000 | 5 | 1 | No | Yes | 0 |
1984 | 28 | Private Sector/Self Employed | Yes | 1150000 | 6 | 1 | No | No | 0 |
1985 | 34 | Private Sector/Self Employed | Yes | 1000000 | 6 | 0 | Yes | Yes | 1 |
1986 | 34 | Private Sector/Self Employed | Yes | 500000 | 4 | 0 | No | No | 0 |
1987 rows × 9 columns
# Remove rows that contain any missing value.
# DataFrame.dropna returns a new frame instead of modifying the caller, so the
# result has to be assigned back; previously it was discarded and no row was
# ever dropped.
travel_insurance = travel_insurance.dropna(axis='index', how='any')

# Normalise the data: lower-case every text (object-dtype) column so that the
# same category is always spelled the same way.
for column in travel_insurance.columns:
    if travel_insurance[column].dtype == 'object':
        travel_insurance[column] = travel_insurance[column].str.lower()

# Bare expression so the notebook renders the cleaned frame as the cell output.
travel_insurance
Age | Employment Type | GraduateOrNot | AnnualIncome | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance | |
---|---|---|---|---|---|---|---|---|---|
0 | 31 | government sector | yes | 400000 | 6 | 1 | no | no | 0 |
1 | 31 | private sector/self employed | yes | 1250000 | 7 | 0 | no | no | 0 |
2 | 34 | private sector/self employed | yes | 500000 | 4 | 1 | no | no | 1 |
3 | 28 | private sector/self employed | yes | 700000 | 3 | 1 | no | no | 0 |
4 | 28 | private sector/self employed | yes | 700000 | 8 | 1 | yes | no | 0 |
5 | 25 | private sector/self employed | no | 1150000 | 4 | 0 | no | no | 0 |
6 | 31 | government sector | yes | 1300000 | 4 | 0 | no | no | 0 |
7 | 31 | private sector/self employed | yes | 1350000 | 3 | 0 | yes | yes | 1 |
8 | 28 | private sector/self employed | yes | 1450000 | 6 | 1 | yes | yes | 1 |
9 | 33 | government sector | yes | 800000 | 3 | 0 | yes | no | 0 |
10 | 31 | government sector | yes | 400000 | 9 | 1 | no | no | 0 |
11 | 26 | private sector/self employed | yes | 1400000 | 5 | 0 | yes | yes | 1 |
12 | 32 | government sector | yes | 850000 | 6 | 0 | no | no | 1 |
13 | 31 | government sector | yes | 1500000 | 6 | 0 | yes | yes | 1 |
14 | 31 | government sector | yes | 400000 | 3 | 0 | no | no | 0 |
15 | 34 | private sector/self employed | yes | 700000 | 7 | 0 | no | no | 0 |
16 | 28 | private sector/self employed | yes | 1150000 | 4 | 1 | no | no | 0 |
17 | 28 | private sector/self employed | yes | 800000 | 7 | 0 | no | no | 1 |
18 | 29 | private sector/self employed | yes | 1050000 | 5 | 1 | no | no | 1 |
19 | 34 | private sector/self employed | yes | 1500000 | 2 | 0 | yes | yes | 1 |
20 | 28 | private sector/self employed | yes | 1150000 | 6 | 0 | yes | no | 0 |
21 | 29 | private sector/self employed | yes | 350000 | 3 | 0 | no | no | 1 |
22 | 31 | private sector/self employed | yes | 1100000 | 4 | 0 | no | no | 0 |
23 | 28 | government sector | yes | 600000 | 9 | 0 | no | no | 1 |
24 | 31 | government sector | yes | 1500000 | 7 | 0 | yes | yes | 0 |
25 | 35 | private sector/self employed | no | 800000 | 4 | 0 | no | yes | 0 |
26 | 34 | private sector/self employed | no | 1300000 | 6 | 0 | yes | no | 1 |
27 | 28 | private sector/self employed | yes | 1250000 | 2 | 0 | yes | no | 0 |
28 | 26 | private sector/self employed | yes | 600000 | 4 | 0 | yes | no | 0 |
29 | 31 | private sector/self employed | yes | 900000 | 3 | 0 | no | no | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1957 | 31 | private sector/self employed | yes | 1250000 | 7 | 0 | no | no | 0 |
1958 | 31 | government sector | yes | 1300000 | 3 | 1 | no | no | 0 |
1959 | 28 | private sector/self employed | no | 550000 | 6 | 0 | no | no | 0 |
1960 | 28 | government sector | yes | 300000 | 6 | 0 | no | no | 1 |
1961 | 26 | government sector | yes | 500000 | 3 | 0 | no | no | 0 |
1962 | 25 | private sector/self employed | no | 1150000 | 7 | 0 | no | no | 0 |
1963 | 33 | government sector | yes | 1100000 | 4 | 0 | no | no | 1 |
1964 | 28 | government sector | yes | 1100000 | 4 | 0 | no | no | 0 |
1965 | 30 | private sector/self employed | no | 550000 | 3 | 0 | no | no | 1 |
1966 | 34 | private sector/self employed | yes | 700000 | 3 | 0 | yes | no | 0 |
1967 | 34 | private sector/self employed | yes | 700000 | 3 | 1 | no | no | 0 |
1968 | 28 | government sector | yes | 300000 | 5 | 1 | no | no | 0 |
1969 | 31 | private sector/self employed | yes | 500000 | 4 | 1 | no | no | 0 |
1970 | 29 | private sector/self employed | yes | 500000 | 4 | 0 | no | no | 1 |
1971 | 31 | private sector/self employed | yes | 950000 | 5 | 0 | no | no | 0 |
1972 | 28 | private sector/self employed | yes | 800000 | 5 | 1 | no | no | 0 |
1973 | 31 | private sector/self employed | yes | 1350000 | 3 | 0 | no | yes | 1 |
1974 | 28 | private sector/self employed | yes | 1200000 | 6 | 0 | yes | no | 0 |
1975 | 26 | private sector/self employed | yes | 1400000 | 9 | 0 | no | yes | 1 |
1976 | 32 | government sector | yes | 900000 | 6 | 0 | no | no | 0 |
1977 | 25 | private sector/self employed | no | 1350000 | 6 | 0 | no | yes | 1 |
1978 | 34 | private sector/self employed | yes | 700000 | 5 | 0 | no | no | 1 |
1979 | 28 | private sector/self employed | yes | 1100000 | 4 | 1 | no | no | 0 |
1980 | 25 | private sector/self employed | no | 1150000 | 5 | 1 | no | no | 0 |
1981 | 27 | government sector | yes | 850000 | 3 | 0 | no | no | 1 |
1982 | 33 | private sector/self employed | yes | 1500000 | 4 | 0 | yes | yes | 1 |
1983 | 28 | private sector/self employed | yes | 1750000 | 5 | 1 | no | yes | 0 |
1984 | 28 | private sector/self employed | yes | 1150000 | 6 | 1 | no | no | 0 |
1985 | 34 | private sector/self employed | yes | 1000000 | 6 | 0 | yes | yes | 1 |
1986 | 34 | private sector/self employed | yes | 500000 | 4 | 0 | no | no | 0 |
1987 rows × 9 columns
# Split into train/dev/test subsets.
import sklearn
from sklearn.model_selection import train_test_split

# Step 1: keep 60% of the rows for training and hold out the remaining 40%.
travel_insurance_train, travel_insurance_rest = train_test_split(
    travel_insurance,
    test_size=0.4,
    random_state=1,
)
# Step 2: cut the held-out part in half, giving the test and dev subsets.
# A fixed random_state keeps both splits reproducible between runs.
travel_insurance_test, travel_insurance_dev = train_test_split(
    travel_insurance_rest,
    test_size=0.5,
    random_state=1,
)

# Summary of every column (numeric and categorical) of the full data set.
travel_insurance.describe(include='all')
Age | Employment Type | GraduateOrNot | AnnualIncome | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance | |
---|---|---|---|---|---|---|---|---|---|
count | 1987.0 | 1987 | 1988 | 1.987000e+03 | 1988.000000 | 1988.000000 | 1988 | 1988 | 1988.000000 |
unique | 11.0 | 2 | 2 | NaN | NaN | NaN | 2 | 2 | NaN |
top | 28.0 | private sector/self employed | yes | NaN | NaN | NaN | no | no | NaN |
freq | 506.0 | 1417 | 1693 | NaN | NaN | NaN | 1571 | 1608 | NaN |
mean | NaN | NaN | NaN | 9.327630e+05 | 4.753018 | 0.277666 | NaN | NaN | 0.357646 |
std | NaN | NaN | NaN | 3.768557e+05 | 1.609254 | 0.447960 | NaN | NaN | 0.479428 |
min | NaN | NaN | NaN | 3.000000e+05 | 2.000000 | 0.000000 | NaN | NaN | 0.000000 |
25% | NaN | NaN | NaN | 6.000000e+05 | 4.000000 | 0.000000 | NaN | NaN | 0.000000 |
50% | NaN | NaN | NaN | 9.000000e+05 | 5.000000 | 0.000000 | NaN | NaN | 0.000000 |
75% | NaN | NaN | NaN | 1.250000e+06 | 6.000000 | 1.000000 | NaN | NaN | 1.000000 |
max | NaN | NaN | NaN | 1.800000e+06 | 9.000000 | 1.000000 | NaN | NaN | 1.000000 |
# Report information about a given data set
import seaborn as sns
def printInformation(data):
    """Print summary statistics for a data set and draw its pair plot.

    Prints the number of rows followed by the per-column mean, minimum,
    maximum, standard deviation and median of the numeric columns, then
    draws a seaborn pair plot coloured by the ``TravelInsurance`` column.

    Args:
        data: pandas DataFrame to describe. It must contain a
            ``TravelInsurance`` column, which is used as the plot hue.
    """
    print(f'Size (rows): {len(data)}\n')

    # Every aggregate is restricted to numeric columns. Previously only
    # min/max were; mean/std/median fail with a TypeError on text columns
    # (e.g. "Employment Type") in pandas >= 2.0, where numeric_only defaults
    # to False.
    mean_value = data.mean(numeric_only=True)
    min_value = data.min(numeric_only=True)
    max_value = data.max(numeric_only=True)
    std_value = data.std(numeric_only=True)
    median_value = data.median(numeric_only=True)

    print(f'(mean)\n{mean_value}', f'(min)\n{min_value}', f'(max)\n{max_value}', f'(std)\n{std_value}', f'(median)\n{median_value}', sep="\n\n")

    # Pairwise relationships between the columns, coloured by the target label.
    sns.pairplot(data=data, hue="TravelInsurance")
# Statistics and pair plot for the complete data set.
printInformation(travel_insurance)
Size (rows): 1988 (mean) Age 29.650226 AnnualIncome 932762.959235 FamilyMembers 4.753018 ChronicDiseases 0.277666 TravelInsurance 0.357646 dtype: float64 (min) AnnualIncome 300000.0 FamilyMembers 2.0 ChronicDiseases 0.0 TravelInsurance 0.0 dtype: float64 (max) AnnualIncome 1800000.0 FamilyMembers 9.0 ChronicDiseases 1.0 TravelInsurance 1.0 dtype: float64 (std) Age 2.913308 AnnualIncome 376855.684748 FamilyMembers 1.609254 ChronicDiseases 0.447960 TravelInsurance 0.479428 dtype: float64 (median) Age 29.0 AnnualIncome 900000.0 FamilyMembers 5.0 ChronicDiseases 0.0 TravelInsurance 0.0 dtype: float64
# Statistics and pair plot for the training subset.
printInformation(travel_insurance_train)
Size (rows): 1192 (mean) Age 29.744128 AnnualIncome 933095.637584 FamilyMembers 4.814597 ChronicDiseases 0.269295 TravelInsurance 0.349832 dtype: float64 (min) Age 25 AnnualIncome 300000 FamilyMembers 2 ChronicDiseases 0 TravelInsurance 0 dtype: int64 (max) Age 35 AnnualIncome 1800000 FamilyMembers 9 ChronicDiseases 1 TravelInsurance 1 dtype: int64 (std) Age 2.923164 AnnualIncome 376856.587539 FamilyMembers 1.619576 ChronicDiseases 0.443780 TravelInsurance 0.477117 dtype: float64 (median) Age 29.0 AnnualIncome 900000.0 FamilyMembers 5.0 ChronicDiseases 0.0 TravelInsurance 0.0 dtype: float64
# Statistics and pair plot for the test subset.
printInformation(travel_insurance_test)
Size (rows): 397 (mean) Age 29.561713 AnnualIncome 916120.906801 FamilyMembers 4.622166 ChronicDiseases 0.287154 TravelInsurance 0.362720 dtype: float64 (min) Age 25 AnnualIncome 300000 FamilyMembers 2 ChronicDiseases 0 TravelInsurance 0 dtype: int64 (max) Age 35 AnnualIncome 1750000 FamilyMembers 9 ChronicDiseases 1 TravelInsurance 1 dtype: int64 (std) Age 2.850544 AnnualIncome 379748.807848 FamilyMembers 1.627824 ChronicDiseases 0.453005 TravelInsurance 0.481392 dtype: float64 (median) Age 29.0 AnnualIncome 850000.0 FamilyMembers 4.0 ChronicDiseases 0.0 TravelInsurance 0.0 dtype: float64
# Statistics and pair plot for the dev (validation) subset.
printInformation(travel_insurance_dev)
Size (rows): 398 (mean) Age 29.457286 AnnualIncome 948366.834171 FamilyMembers 4.698492 ChronicDiseases 0.293970 TravelInsurance 0.374372 dtype: float64 (min) Age 25 AnnualIncome 300000 FamilyMembers 2 ChronicDiseases 0 TravelInsurance 0 dtype: int64 (max) Age 35 AnnualIncome 1800000 FamilyMembers 9 ChronicDiseases 1 TravelInsurance 1 dtype: int64 (std) Age 2.940542 AnnualIncome 374204.238614 FamilyMembers 1.554889 ChronicDiseases 0.456152 TravelInsurance 0.484569 dtype: float64 (median) Age 28.0 AnnualIncome 1000000.0 FamilyMembers 4.0 ChronicDiseases 0.0 TravelInsurance 0.0 dtype: float64