🐧

Kaggle Titanic 머신러닝 입문기

import pandas as pd

Load Dataset

train = pd.read_csv("data/train.csv", index_col="PassengerId")

print(train.shape)
train.head()

(891, 11)

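| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |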

Explore

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import seaborn as sns
# Count passengers per "Embarked" value, with separate bars per "Survived" value.
sns.countplot(data=train, x="Embarked", hue="Survived")

<matplotlib.axes._subplots.AxesSubplot at 0x23ddebd21d0>

이미지
# Keep only the passengers whose fare is below 100.
is_low_fare = train["Fare"] < 100
low_fare = train[is_low_fare]
low_fare.head()
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
PassengerId
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
503Allen, Mr. William Henrymale35.0003734508.0500NaNS
sns.lmplot(data=low_fare, x="Age", y="Fare", hue="Survived", fit_reg=False)
    <seaborn.axisgrid.FacetGrid at 0x23ddeba74e0>
png

Reload Dataset

train = pd.read_csv("data/train.csv", index_col="PassengerId")

print(train.shape)
train.head()
    (891, 11)
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
PassengerId
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
503Allen, Mr. William Henrymale35.0003734508.0500NaNS
test = pd.read_csv("data/test.csv", index_col="PassengerId")

print(test.shape)
test.head()

(418, 10)

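| PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|
| 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
| 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
| 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
| 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
| 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |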

Preprocessing

Encode Sex

# Encode Sex as an integer feature: male -> 0, female -> 1.
# A single map over the whole column replaces the two in-place .loc writes,
# which put ints into a string column and left it holding a mix of ints and
# strings between the two assignments.
# Run this on the freshly loaded frame: any value that is not a key of the
# mapping (including already-encoded 0/1) becomes NaN.
train["Sex"] = train["Sex"].map({"male": 0, "female": 1})

print(train.shape)
train.head()

(891, 11)

SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
PassengerId
103Braund, Mr. Owen Harris022.010A/5 211717.2500NaNS
211Cumings, Mrs. John Bradley (Florence Briggs Th...138.010PC 1759971.2833C85C
313Heikkinen, Miss. Laina126.000STON/O2. 31012827.9250NaNS
411Futrelle, Mrs. Jacques Heath (Lily May Peel)135.01011380353.1000C123S
503Allen, Mr. William Henry035.0003734508.0500NaNS
# Encode Sex as an integer feature: male -> 0, female -> 1.
# A single map over the whole column replaces the two in-place .loc writes,
# which put ints into a string column and left it holding a mix of ints and
# strings between the two assignments.
# Run this on the freshly loaded frame: any value that is not a key of the
# mapping (including already-encoded 0/1) becomes NaN.
test["Sex"] = test["Sex"].map({"male": 0, "female": 1})

print(test.shape)
test.head()

(418, 10)

PclassNameSexAgeSibSpParchTicketFareCabinEmbarked
PassengerId
8923Kelly, Mr. James034.5003309117.8292NaNQ
8933Wilkes, Mrs. James (Ellen Needs)147.0103632727.0000NaNS
8942Myles, Mr. Thomas Francis062.0002402769.6875NaNQ
8953Wirz, Mr. Albert027.0003151548.6625NaNS
8963Hirvonen, Mrs. Alexander (Helga E Lindqvist)122.011310129812.2875NaNS

Fill in missing fare

# A missing value appears as NaN (Not a Number) / null.
# Substitute 0 for every missing fare in the test set.
test["Fare"] = test["Fare"].fillna(0)

# Select the rows whose fare is still missing, to check the fill.
test[test["Fare"].isnull()]
PclassNameSexAgeSibSpParchTicketFareCabinEmbarked
PassengerId

Encode Embarked

# One boolean column per embarkation code (True == 1, False == 0).
for port in ("C", "S", "Q"):
    train["Embarked_" + port] = train["Embarked"] == port

train[["Embarked", "Embarked_C", "Embarked_S", "Embarked_Q"]].head()
EmbarkedEmbarked_CEmbarked_SEmbarked_Q
PassengerId
1SFalseTrueFalse
2CTrueFalseFalse
3SFalseTrueFalse
4SFalseTrueFalse
5SFalseTrueFalse
# One boolean column per embarkation code (True == 1, False == 0).
for port in ("C", "S", "Q"):
    test["Embarked_" + port] = test["Embarked"] == port

test[["Embarked", "Embarked_C", "Embarked_S", "Embarked_Q"]].head()
EmbarkedEmbarked_CEmbarked_SEmbarked_Q
PassengerId
892QFalseFalseTrue
893SFalseTrueFalse
894QFalseFalseTrue
895SFalseTrueFalse
896SFalseTrueFalse

Train

feature_names = ["Pclass", "Sex", "Fare",
                 "Embarked_C", "Embarked_S", "Embarked_Q"]

feature_names

['Pclass', 'Sex', 'Fare', 'Embarked_C', 'Embarked_S', 'Embarked_Q']

X_train = train[feature_names]

print(X_train.shape)
X_train.head()

(891, 6)

PclassSexFareEmbarked_CEmbarked_SEmbarked_Q
PassengerId
1307.2500FalseTrueFalse
21171.2833TrueFalseFalse
3317.9250FalseTrueFalse
41153.1000FalseTrueFalse
5308.0500FalseTrueFalse
label_name = "Survived"

y_train = train[label_name]

print(y_train.shape)
y_train.head()

(891,)

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

from sklearn.tree import DecisionTreeClassifier

# Decision tree capped at depth 5; every other hyperparameter keeps its default.
# NOTE(review): random_state is not set, so the fitted tree (and the score)
# may differ between runs — consider pinning it for reproducibility.
model = DecisionTreeClassifier(max_depth=5)
model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best')

model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best')

Visualize

# Load the gvmagic IPython extension (presumably the provider of the %dotstr
# magic used at the end of this cell — confirm against the gvmagic docs).
%load_ext gvmagic
from sklearn.tree import export_graphviz

# Export the fitted tree as Graphviz DOT source. With out_file=None the DOT
# text is returned as a string instead of being written to a file.
dot_tree = export_graphviz(model,
                           feature_names=feature_names,
                           class_names=["Perish", "Survived"],
                           out_file=None)

# Render the DOT string held in dot_tree.
%dotstr dot_tree
svg

Predict

X_test = test[feature_names]

print(X_test.shape)
X_test.head()

(418, 6)

PclassSexFareEmbarked_CEmbarked_SEmbarked_Q
PassengerId
892307.8292FalseFalseTrue
893317.0000FalseTrueFalse
894209.6875FalseFalseTrue
895308.6625FalseTrueFalse
8963112.2875FalseTrueFalse
predictions = model.predict(X_test)

print(predictions.shape)
predictions[0:10]

(418,)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0], dtype=int64)

Submit

# Load the gender_submission.csv template (indexed by PassengerId) and
# overwrite its Survived column with the model's predictions. The values are
# assigned by position, so predictions must be in the template's row order.
template = pd.read_csv("data/gender_submission.csv", index_col="PassengerId")
submission = template.assign(Survived=predictions)

print(submission.shape)
submission.head()

(418, 1)

Survived
PassengerId
8920
8930
8940
8950
8961
submission.to_csv("data/decision-tree.csv")
png

결과는 0.78947로 나름 만족스럽게 나왔다.

yoon.homme
yoon.homme

기술과 커뮤니케이션의 힘이 세상을 바꾼다고 믿습니다.

편리한 세상으로 나아가기 위해 고민하고 개발합니다.