uczenie_maszynowe_zadania/cw_4/.ipynb_checkpoints/main-checkpoint.ipynb
2023-07-04 20:42:14 +02:00

17 KiB

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the tab-separated Titanic file into a DataFrame and preview the
# first five rows (the notebook displays the value of the last expression).
data = pd.read_csv('titanic.tsv', delimiter='\t')
data.head(5)
Survived PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 530 2 Hocking\t Mr. Richard George male 23.0 2 1 29104 11.5000 NaN S
1 0 466 3 Goncalves\t Mr. Manuel Estanslas male 38.0 0 0 SOTON/O.Q. 3101306 7.0500 NaN S
2 0 753 3 Vande Velde\t Mr. Johannes Joseph male 33.0 0 0 345780 9.5000 NaN S
3 0 855 2 Carter\t Mrs. Ernest Courtenay (Lilian Hughes) female 44.0 1 0 244252 26.0000 NaN S
4 0 333 1 Graham\t Mr. George Edward male 38.0 0 1 PC 17582 153.4625 C91 S
# Encode 'Sex' as a binary flag: 1 for 'male', 0 for every other value.
data['Sex'] = data['Sex'].eq('male').astype('int64')

# Flag rows whose name contains the literal substring 'Mr.' (1) versus all
# other names (0); the raw 'Name' column is dropped once the flag exists.
data['Name_to_num'] = [
    1 if 'Mr.' in full_name else 0
    for full_name in data['Name']
]
data.drop(columns=['Name'], inplace=True)
data.head()
Survived PassengerId Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked Name_to_num
0 0 530 2 1 23.0 2 1 29104 11.5000 NaN S 1
1 0 466 3 1 38.0 0 0 SOTON/O.Q. 3101306 7.0500 NaN S 1
2 0 753 3 1 33.0 0 0 345780 9.5000 NaN S 1
3 0 855 2 0 44.0 1 0 244252 26.0000 NaN S 0
4 0 333 1 1 38.0 0 1 PC 17582 153.4625 C91 S 1
# Give missing cabins an explicit placeholder token so the vectorizer
# receives a string for every row.
data['Cabin'] = data['Cabin'].fillna('Undefined')

# Collapse each row's TF-IDF vector into a single number (the sum of its
# weights) and store that number as the new 'Cabin' value.
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(data['Cabin']).toarray()
vector_sum = [row.sum() for row in vector]
data['Cabin'] = vector_sum
# Missing embarkation values get their own 'Undefined' category, then the
# column is one-hot encoded into Embarked_* indicator columns.
data['Embarked'] = data['Embarked'].fillna('Undefined')
data = pd.get_dummies(data, columns=['Embarked'])

# Fill missing ages with the median of the observed ages.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
age_frame = data[['Age']]
data[['Age']] = imputer.fit_transform(age_frame)

# Show how often each age value occurs after imputation.
data['Age'].value_counts()
28.00    139
22.00     20
21.00     19
24.00     19
19.00     17
        ... 
61.00      1
70.50      1
0.75       1
10.00      1
46.00      1
Name: Age, Length: 82, dtype: int64
# Give missing tickets an explicit placeholder token so the vectorizer
# receives a string for every row.
data['Ticket'] = data['Ticket'].fillna('Undefined')

# Replace each ticket string with the sum of its TF-IDF weights, yielding
# one numeric value per row.
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(data['Ticket']).toarray()
vector_sum = [row.sum() for row in vector]
data['Ticket'] = vector_sum
data.head()
Survived PassengerId Pclass Sex Age SibSp Parch Ticket Fare Cabin Name_to_num Embarked_C Embarked_Q Embarked_S Embarked_Undefined
0 0 530 2 1 23.0 2 1 1.000000 11.5000 1.0 1 0 0 1 0
1 0 466 3 1 38.0 0 0 1.391284 7.0500 1.0 1 0 0 1 0
2 0 753 3 1 33.0 0 0 1.000000 9.5000 1.0 1 0 0 1 0
3 0 855 2 0 44.0 1 0 1.000000 26.0000 1.0 0 0 0 1 0
4 0 333 1 1 38.0 0 1 1.365721 153.4625 1.0 1 0 0 1 0