🐧

Kaggle Titanic 머신러닝 입문기

import pandas as pd

Load Dataset

# Read the training data, using the PassengerId column as the row index.
train = pd.read_csv(
    "data/train.csv",
    index_col="PassengerId",
)

# Print the (rows, columns) shape, then preview the first rows.
print(train.shape)
train.head()
(891, 11)
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

Explore

%matplotlib inline
import seaborn as sns
# Bar chart of passenger counts per embarkation port, split by Survived.
sns.countplot(
    x="Embarked",
    hue="Survived",
    data=train,
)
<matplotlib.axes._subplots.AxesSubplot at 0x23ddebd21d0>

이미지

# Keep only the passengers whose fare is below 100.
is_low_fare = train["Fare"] < 100
low_fare = train[is_low_fare]
low_fare.head()
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Scatter plot of Age against Fare for the low-fare passengers, coloured by
# Survived; fit_reg=False turns off the regression line.
sns.lmplot(
    x="Age",
    y="Fare",
    hue="Survived",
    data=low_fare,
    fit_reg=False,
)
<seaborn.axisgrid.FacetGrid at 0x23ddeba74e0>

png

Reload Dataset

# Read the training data again from disk, indexed by PassengerId, so that
# preprocessing starts from the raw file contents.
train = pd.read_csv(
    "data/train.csv",
    index_col="PassengerId",
)

# Print the (rows, columns) shape, then preview the first rows.
print(train.shape)
train.head()
(891, 11)
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Read the test data, using the PassengerId column as the row index.
test = pd.read_csv(
    "data/test.csv",
    index_col="PassengerId",
)

# Print the (rows, columns) shape, then preview the first rows.
print(test.shape)
test.head()
(418, 10)
Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S

Preprocessing

Encode Sex

# Encode the passenger's sex as an integer feature: male -> 0, female -> 1.
# The whole column is rebuilt in one pass instead of writing integers into
# the existing string column through .loc: that approach leaves an
# object-dtype column and is rejected by pandas versions that store text in
# a dedicated string dtype.
# Values that are not in the table (including already encoded 0/1) are kept
# as they are, so re-running this cell is harmless.
sex_codes = {"male": 0, "female": 1}
train["Sex"] = train["Sex"].map(lambda value: sex_codes.get(value, value))

# The shape must be unchanged; preview the encoded column.
print(train.shape)
train.head()
(891, 11)
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
1 0 3 Braund, Mr. Owen Harris 0 22.0 1 0 A/5 21171 7.2500 NaN S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0 PC 17599 71.2833 C85 C
3 1 3 Heikkinen, Miss. Laina 1 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0 113803 53.1000 C123 S
5 0 3 Allen, Mr. William Henry 0 35.0 0 0 373450 8.0500 NaN S
# Encode the passenger's sex as an integer feature: male -> 0, female -> 1.
# The whole column is rebuilt in one pass instead of writing integers into
# the existing string column through .loc: that approach leaves an
# object-dtype column and is rejected by pandas versions that store text in
# a dedicated string dtype.
# Values that are not in the table (including already encoded 0/1) are kept
# as they are, so re-running this cell is harmless.
sex_codes = {"male": 0, "female": 1}
test["Sex"] = test["Sex"].map(lambda value: sex_codes.get(value, value))

# The shape must be unchanged; preview the encoded column.
print(test.shape)
test.head()
(418, 10)
Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
892 3 Kelly, Mr. James 0 34.5 0 0 330911 7.8292 NaN Q
893 3 Wilkes, Mrs. James (Ellen Needs) 1 47.0 1 0 363272 7.0000 NaN S
894 2 Myles, Mr. Thomas Francis 0 62.0 0 0 240276 9.6875 NaN Q
895 3 Wirz, Mr. Albert 0 27.0 0 0 315154 8.6625 NaN S
896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 1 22.0 1 1 3101298 12.2875 NaN S

Fill in missing fare

# An empty cell is represented as NaN (Not a Number) / null.
# Replace every missing Fare in the test set with 0.
test["Fare"] = test["Fare"].fillna(0)

# Display the rows that still have a missing Fare (expected: none).
test[test["Fare"].isna()]
Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId

Encode Embarked

# One-hot encode the embarkation port as three boolean columns.
# Booleans act as numbers: True == 1, False == 0.
for port in ("C", "S", "Q"):
    train["Embarked_" + port] = train["Embarked"] == port

# Compare the original column with the three indicator columns.
train[["Embarked", "Embarked_C", "Embarked_S", "Embarked_Q"]].head()
Embarked Embarked_C Embarked_S Embarked_Q
PassengerId
1 S False True False
2 C True False False
3 S False True False
4 S False True False
5 S False True False
# One-hot encode the embarkation port as three boolean columns, using the
# same column names as the training set.
for port in ("C", "S", "Q"):
    test["Embarked_" + port] = test["Embarked"] == port

# Compare the original column with the three indicator columns.
test[["Embarked", "Embarked_C", "Embarked_S", "Embarked_Q"]].head()
Embarked Embarked_C Embarked_S Embarked_Q
PassengerId
892 Q False False True
893 S False True False
894 Q False False True
895 S False True False
896 S False True False

Train

# Columns fed to the model as input features.
feature_names = [
    "Pclass",
    "Sex",
    "Fare",
    "Embarked_C",
    "Embarked_S",
    "Embarked_Q",
]

feature_names
['Pclass', 'Sex', 'Fare', 'Embarked_C', 'Embarked_S', 'Embarked_Q']
# Feature matrix: the selected feature columns of the training set.
X_train = train.loc[:, feature_names]

# Print the (rows, columns) shape, then preview the first rows.
print(X_train.shape)
X_train.head()
(891, 6)
Pclass Sex Fare Embarked_C Embarked_S Embarked_Q
PassengerId
1 3 0 7.2500 False True False
2 1 1 71.2833 True False False
3 3 1 7.9250 False True False
4 1 1 53.1000 False True False
5 3 0 8.0500 False True False
# Name of the column the model learns to predict.
label_name = "Survived"

# Label vector: one value per training passenger.
y_train = train.loc[:, label_name]

print(y_train.shape)
y_train.head()
(891,)





PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64
from sklearn.tree import DecisionTreeClassifier

# Decision tree capped at a depth of 5; every other hyperparameter keeps
# its default value.
# NOTE(review): random_state is not set, so repeated fits may not produce
# the exact same tree -- confirm whether reproducibility matters here.
model = DecisionTreeClassifier(
    max_depth=5,
)
model
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
# Fit the tree on the training features and labels.
model.fit(X=X_train, y=y_train)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

Visualize

%load_ext gvmagic
from sklearn.tree import export_graphviz

# Export the fitted tree as Graphviz DOT source; out_file=None makes
# export_graphviz return the source as a string instead of writing a file.
# class_names are given in ascending class order: 0 -> Perish, 1 -> Survived.
dot_tree = export_graphviz(
    model,
    out_file=None,
    feature_names=feature_names,
    class_names=["Perish", "Survived"],
)

%dotstr dot_tree

svg

Predict

# Feature matrix for the test set, with the same columns used for training.
X_test = test.loc[:, feature_names]

# Print the (rows, columns) shape, then preview the first rows.
print(X_test.shape)
X_test.head()
(418, 6)
Pclass Sex Fare Embarked_C Embarked_S Embarked_Q
PassengerId
892 3 0 7.8292 False False True
893 3 1 7.0000 False True False
894 2 0 9.6875 False False True
895 3 0 8.6625 False True False
896 3 1 12.2875 False True False
# Predict the label of every passenger in the test set.
predictions = model.predict(X_test)

# One prediction per test row; show the first ten.
print(predictions.shape)
predictions[:10]
(418,)





array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0], dtype=int64)

Submit

# Use the sample submission file as a template, indexed by PassengerId.
submission = pd.read_csv(
    "data/gender_submission.csv",
    index_col="PassengerId",
)

# Overwrite the Survived column with the model's predictions.
# NOTE(review): the assignment is positional, so it assumes the template
# rows are in the same order as the test set -- confirm.
submission["Survived"] = predictions

print(submission.shape)
submission.head()
(418, 1)
Survived
PassengerId
892 0
893 0
894 0
895 0
896 1
# Write the submission (PassengerId index plus Survived column) to CSV.
submission.to_csv(path_or_buf="data/decision-tree.csv")

png

Kaggle에 제출한 결과, 점수(정확도)는 0.78947로 나름 만족스럽게 나왔다.

yoon.homme
yoon.homme

기술과 커뮤니케이션의 힘이 세상을 바꾼다고 믿습니다.

편리한 세상으로 나아가기 위해 고민하고 개발합니다.