ium_470618/dane.ipynb
2023-03-21 22:00:29 +01:00

57 KiB

# Download the Titanic competition archive (titanic.zip) using the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle competitions download -c titanic
Downloading titanic.zip to /home/gedin/Studia/InzUczeniaMaszynowego/zadania
100%|███████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 212kB/s]
100%|███████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 212kB/s]
!unzip titanic.zip
Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               

Dane o pliku

# Line counts of the two CSV files. For train.csv the count includes the
# header line: 892 lines here vs. 891 rows reported by df.describe().
!wc -l train.csv
!wc -l test.csv
892 train.csv
419 test.csv
import pandas as pd
# Load train.csv into a DataFrame (column names come from the file's first line).
df = pd.read_csv("train.csv")
# Display the first five rows as a quick sanity check of the parsed columns.
df.head(5)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Summary statistics for the numeric columns.
# Age reports count 714 versus 891 for the other columns, i.e. it has missing values.
df.describe()
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
# Draw one histogram per selected column to inspect the value distributions.
df.hist(column=["Survived", "Pclass"])
array([[<Axes: title={'center': 'Survived'}>,
        <Axes: title={'center': 'Pclass'}>]], dtype=object)
# Count how many rows fall into each distinct Embarked value.
embarked = df.value_counts(subset="Embarked")
# NOTE: Embarked is planned to be converted to a one-hot encoding later.
embarked.plot(kind="bar")
<Axes: xlabel='Embarked'>
# TODO: handle missing values (Age and Cabin contain NaN), e.g. via
# df.dropna() or df.fillna(...)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Min-max scale the selected numeric columns in place: (x - min) / (max - min).
# pandas min()/max() skip NaN by default, so missing entries remain NaN.
columns_to_normalize = ['Age', 'Fare']
for colname in columns_to_normalize:
    col_min = df[colname].min()
    col_range = df[colname].max() - col_min
    df[colname] = (df[colname] - col_min) / col_range
# Display the first rows to confirm the scaled values.
df.head(5)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 0.271174 1 0 A/5 21171 0.014151 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 0.472229 1 0 PC 17599 0.139136 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 0.321438 0 0 STON/O2. 3101282 0.015469 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 0.434531 1 0 113803 0.103644 C123 S
4 5 0 3 Allen, Mr. William Henry male 0.434531 0 0 373450 0.015713 NaN S