17 KiB
17 KiB
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the tab-separated Titanic passenger table into a DataFrame.
data = pd.read_csv('titanic.tsv', delimiter='\t')
# Preview the first rows of the freshly loaded table.
data.head()
Survived | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 530 | 2 | Hocking\t Mr. Richard George | male | 23.0 | 2 | 1 | 29104 | 11.5000 | NaN | S |
1 | 0 | 466 | 3 | Goncalves\t Mr. Manuel Estanslas | male | 38.0 | 0 | 0 | SOTON/O.Q. 3101306 | 7.0500 | NaN | S |
2 | 0 | 753 | 3 | Vande Velde\t Mr. Johannes Joseph | male | 33.0 | 0 | 0 | 345780 | 9.5000 | NaN | S |
3 | 0 | 855 | 2 | Carter\t Mrs. Ernest Courtenay (Lilian Hughes) | female | 44.0 | 1 | 0 | 244252 | 26.0000 | NaN | S |
4 | 0 | 333 | 1 | Graham\t Mr. George Edward | male | 38.0 | 0 | 1 | PC 17582 | 153.4625 | C91 | S |
# Encode sex as a binary flag: 1 for 'male', 0 for anything else
# (a missing value compares unequal to 'male', so it also becomes 0).
data['Sex'] = (data['Sex'] == 'male').astype('int64')
# Flag passengers whose name contains the literal substring 'Mr.'.
# 'Mrs.' does not match because of the 's' before the dot.
# regex=False keeps the dot literal; na=False maps a missing name to 0
# instead of failing on the non-string value.
data['Name_to_num'] = (
    data['Name'].str.contains('Mr.', regex=False, na=False).astype('int64')
)
# The raw name is dropped once the title flag has been extracted.
del data['Name']
data.head()
Survived | PassengerId | Pclass | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Name_to_num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 530 | 2 | 1 | 23.0 | 2 | 1 | 29104 | 11.5000 | NaN | S | 1 |
1 | 0 | 466 | 3 | 1 | 38.0 | 0 | 0 | SOTON/O.Q. 3101306 | 7.0500 | NaN | S | 1 |
2 | 0 | 753 | 3 | 1 | 33.0 | 0 | 0 | 345780 | 9.5000 | NaN | S | 1 |
3 | 0 | 855 | 2 | 0 | 44.0 | 1 | 0 | 244252 | 26.0000 | NaN | S | 0 |
4 | 0 | 333 | 1 | 1 | 38.0 | 0 | 1 | PC 17582 | 153.4625 | C91 | S | 1 |
# Collapse the text Cabin column into a single numeric feature: the sum of
# each row's TF-IDF weights.
# Missing cabins are first replaced by the placeholder string 'Undefined'
# so the vectorizer only ever receives text.
data['Cabin'] = data['Cabin'].replace(np.nan, 'Undefined')
vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary and encodes the column in one pass
# instead of tokenizing the same column twice (fit, then transform).
vector = vectorizer.fit_transform(data['Cabin']).toarray()
# Sum along axis 1 (one total per passenger row) in NumPy rather than
# looping over the rows in Python.
vector_sum = vector.sum(axis=1)
data['Cabin'] = vector_sum
# Give missing embarkation values an explicit 'Undefined' label so that
# they end up in an indicator column of their own.
data['Embarked'] = data['Embarked'].replace(to_replace=np.nan, value='Undefined')
# Expand Embarked into one indicator column per distinct label.
data = pd.get_dummies(data=data, columns=['Embarked'])
# Replace missing ages with the median of the Age column.
age_frame = data[['Age']]
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
data[['Age']] = imputer.fit_transform(age_frame)
# Show how often each age value occurs after imputation.
data['Age'].value_counts()
28.00 139 22.00 20 21.00 19 24.00 19 19.00 17 ... 61.00 1 70.50 1 0.75 1 10.00 1 46.00 1 Name: Age, Length: 82, dtype: int64
# Collapse the text Ticket column into a single numeric feature: the sum of
# each row's TF-IDF weights.
# Missing tickets are first replaced by the placeholder string 'Undefined'
# so the vectorizer only ever receives text.
data['Ticket'] = data['Ticket'].replace(np.nan, 'Undefined')
vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary and encodes the column in one pass
# instead of tokenizing the same column twice (fit, then transform).
vector = vectorizer.fit_transform(data['Ticket']).toarray()
# Sum along axis 1 (one total per passenger row) in NumPy rather than
# looping over the rows in Python.
vector_sum = vector.sum(axis=1)
data['Ticket'] = vector_sum
data.head()
Survived | PassengerId | Pclass | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Name_to_num | Embarked_C | Embarked_Q | Embarked_S | Embarked_Undefined | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 530 | 2 | 1 | 23.0 | 2 | 1 | 1.000000 | 11.5000 | 1.0 | 1 | 0 | 0 | 1 | 0 |
1 | 0 | 466 | 3 | 1 | 38.0 | 0 | 0 | 1.391284 | 7.0500 | 1.0 | 1 | 0 | 0 | 1 | 0 |
2 | 0 | 753 | 3 | 1 | 33.0 | 0 | 0 | 1.000000 | 9.5000 | 1.0 | 1 | 0 | 0 | 1 | 0 |
3 | 0 | 855 | 2 | 0 | 44.0 | 1 | 0 | 1.000000 | 26.0000 | 1.0 | 0 | 0 | 0 | 1 | 0 |
4 | 0 | 333 | 1 | 1 | 38.0 | 0 | 1 | 1.365721 | 153.4625 | 1.0 | 1 | 0 | 0 | 1 | 0 |