[Machine Learning] 타이타닉 생존자 머신러닝 -2-

Posted Nov 29, 2023

By 엉덩희 5 min read

In [4]:

# Load the Titanic train/test splits from the local Dataset folder.
train_csv_path = 'Dataset/Titanic/train.csv'
test_csv_path = 'Dataset/Titanic/test.csv'
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

Pclass 분석

In [9]:

# Group by Pclass and count the rows to get the number of passengers in each class.
(
    df_train[['Pclass', 'Survived']]
    .groupby('Pclass')
    .count()
)

Out[9]:

	Survived
Pclass
1	216
2	184
3	491

In [10]:

# Survived is 0/1, so summing it per class gives the number of survivors in that class.
(
    df_train[['Pclass', 'Survived']]
    .groupby('Pclass')
    .sum()
)

Out[10]:

	Survived
Pclass
1	136
2	87
3	119

In [12]:

# Cross-tabulate Pclass against Survived (with row/column totals) and shade the
# cells with a colormap through the Styler so large counts stand out.
pd.crosstab(
    index=df_train['Pclass'],
    columns=df_train['Survived'],
    margins=True,
).style.background_gradient(cmap='summer_r')

Out[12]:

Survived	0	1	All
Pclass
1	80	136	216
2	97	87	184
3	372	119	491
All	549	342	891

In [19]:

# Survived only holds 0 and 1, so its per-class mean is the survival rate.
# Sort the rates from highest to lowest and draw them as a bar chart.
(
    df_train[['Pclass', 'Survived']]
    .groupby('Pclass')
    .mean()
    .sort_values('Survived', ascending=False)
    .plot.bar()
)

Out[19]:

<AxesSubplot:xlabel='Pclass'>

In [17]:

# Two panels for Pclass: a pandas bar chart with the passenger count per class,
# and a seaborn count plot that splits every class into survivors and non-survivors.

y_position = 1.02  # lift the titles slightly above the axes
f, ax = plt.subplots(1, 2, figsize=(18, 8))

# The colors are applied in value_counts() order, i.e. most frequent class first.
df_train['Pclass'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title('Number of Passengers By Pclass', y=y_position)
ax[0].set_ylabel('Count')

# Pass the column as x=...: recent seaborn releases no longer accept it positionally.
sns.countplot(x='Pclass', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Pclass: Survived vs Dead', y=y_position)

plt.show()

Sex 분석

In [24]:

# Left panel: survival rate per sex (mean of the 0/1 Survived column).
# Right panel: number of survivors and non-survivors for each sex.

f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title("Survived vs Sex")
# Pass the column as x=...: recent seaborn releases no longer accept it positionally.
sns.countplot(x='Sex', data=df_train, hue='Survived', ax=ax[1])
ax[1].set_title("Sex: Survived vs Dead")
plt.show()

In [27]:

# Survival rate per sex via mean(), listed from highest to lowest.
(
    df_train[['Sex', 'Survived']]
    .groupby('Sex', as_index=False)
    .mean()
    .sort_values('Survived', ascending=False)
)

Out[27]:

	Sex	Survived
0	female	0.742038
1	male	0.188908

In [28]:

# Sex x Survived contingency table with totals, shaded with the reversed Blues colormap.
pd.crosstab(
    index=df_train['Sex'],
    columns=df_train['Survived'],
    margins=True,
).style.background_gradient(cmap='Blues_r')

Out[28]:

Survived	0	1	All
Sex
female	81	233	314
male	468	109	577
All	549	342	891

Both Sex and Pclass

In [31]:

# Inspect two features at once: survival rate per Pclass, one line per Sex.
# factorplot() was renamed to catplot() and its `size` argument to `height`;
# kind='point' reproduces the point plot that factorplot drew by default.
sns.catplot(x='Pclass', y='Survived', hue='Sex', data=df_train,
            kind='point', height=6, aspect=1.5)

Out[31]:

<seaborn.axisgrid.FacetGrid at 0x1f87f1d1b80>

In [36]:

# col="Pclass" draws one facet per passenger class, each showing survival rate by Sex.
# Uses catplot(kind='point', height=...) in place of the removed factorplot(size=...).
# The former `satureation` keyword was a misspelling and is not a point-plot option,
# so it is dropped.
sns.catplot(x="Sex", y='Survived', col="Pclass", data=df_train,
            kind='point', height=9, aspect=1)

Out[36]:

<seaborn.axisgrid.FacetGrid at 0x1f87efa93d0>

Age 분석

In [41]:

# Age summary: oldest, youngest and mean passenger age, each printed to one decimal.
age_column = df_train['Age']
for template, value in (
    ('제일 나이 많은 탑승객: {:.1f}years', age_column.max()),
    ('제일 어린 탑승객: {:.1f}years', age_column.min()),
    ('탑승객 평균 나이: {:.1f}years', age_column.mean()),
):
    print(template.format(value))

제일 나이 많은 탑승객: 80.0years
제일 어린 탑승객: 0.4years
탑승객 평균 나이: 29.7years

In [45]:

# Age density (KDE) curves for survivors and non-survivors on a shared axis.

fig, ax = plt.subplots(1, 1, figsize=(9, 5))
for outcome in (1, 0):  # drawn in the same order as the legend entries below
    sns.kdeplot(df_train.loc[df_train['Survived'] == outcome, 'Age'], ax=ax)
plt.legend(['Survived==1', 'Survived==0'])
plt.show()

In [47]:

# Age distribution within each passenger class, one KDE curve per class.
plt.figure(figsize=(8, 6))
for pclass in (1, 2, 3):  # same order as the legend labels
    df_train.loc[df_train['Pclass'] == pclass, 'Age'].plot(kind='kde')

plt.xlabel('Age')
plt.title("Age Distribution within classes")
plt.legend(['1st Class', '2nd Class', '3rd Class'])
plt.show()

In [48]:

# Survival rate as the age cut-off grows: for each threshold x (1..80), the share
# of survivors among passengers whose Age is strictly below x. Rows with a missing
# Age never satisfy the comparison, so they are left out.
age_thresholds = list(range(1, 81))
cummulate_survival_ratio = []
for i in age_thresholds:
    # Filter once per threshold instead of building the same selection twice.
    survived_below = df_train.loc[df_train['Age'] < i, 'Survived']
    # Survived is 0/1, so the mean of the selection is its survival rate.
    cummulate_survival_ratio.append(survived_below.mean())

plt.figure(figsize=(8,6))
# Pass the thresholds as x so each point sits at its own cut-off (1..80);
# plotting the list alone would place the points at positions 0..79.
plt.plot(age_thresholds, cummulate_survival_ratio)
plt.title("Survival rate change depending on range of Age", y=1.02)
plt.ylabel("Survival rate")
plt.xlabel("Range of Age(0~x)")
plt.show()

Pclass, Sex and Age

In [49]:

# Split violin plots of Age by survival outcome: per Pclass (left) and per Sex (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# x/y are passed by keyword: recent seaborn releases reject positional column names.
# NOTE(review): newer seaborn spells `scale` as `density_norm` — confirm against the
# installed version before renaming.
sns.violinplot(x="Pclass", y="Age", hue='Survived', data=df_train, scale="count", split=True, ax=ax[0])
ax[0].set_title("Pclass and Age vs Survived")
ax[0].set_yticks(range(0, 110, 10))
sns.violinplot(x="Sex", y="Age", hue="Survived", data=df_train, scale="count", split=True, ax=ax[1])
ax[1].set_title("Sex and Age vs Survived")
ax[1].set_yticks(range(0, 110, 10))
plt.show()

Embarked 분석

In [51]:

# Survival rate per port of embarkation, sorted from highest to lowest, as a bar chart.
f, ax = plt.subplots(1, 1, figsize=(7, 7))
(
    df_train[['Embarked', 'Survived']]
    .groupby('Embarked')
    .mean()
    .sort_values('Survived', ascending=False)
    .plot.bar(ax=ax)
)

Out[51]:

<AxesSubplot:xlabel='Embarked'>

In [55]:

# How the port of embarkation relates to other features, as a 2x2 grid of count plots.
# Every column name is passed as x=...: recent seaborn releases no longer accept
# it positionally.

f, ax = plt.subplots(2, 2, figsize=(20, 15))
sns.countplot(x='Embarked', data=df_train, ax=ax[0, 0])
ax[0, 0].set_title('(1) No. Of Passengers Boarded')

sns.countplot(x="Embarked", hue='Sex', data=df_train, ax=ax[0, 1])
ax[0, 1].set_title("(2) Male_Female Split for Embarked")

sns.countplot(x="Embarked", hue="Survived", data=df_train, ax=ax[1, 0])
ax[1, 0].set_title("(3) Embarked vs Survived")

sns.countplot(x="Embarked", hue="Pclass", data=df_train, ax=ax[1, 1])
ax[1, 1].set_title("(4) embarked vs Pclass")

# Extra spacing so the titles do not collide with the neighbouring panels.
plt.subplots_adjust(wspace=0.2, hspace=0.5)

plt.show()

Family - SibSp + Parch

In [56]:

# FamilySize = siblings/spouses + parents/children + 1 (the passenger themself),
# added to both the train and the test frame.
for frame in (df_train, df_test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

In [57]:

# Report the largest and smallest family size found in the training data.
family_size = df_train['FamilySize']
print("Maximum size of Family: ", family_size.max())
print("Minimum size of Family: ", family_size.min())

Maximum size of Family:  11
Minimum size of Family:  1

In [60]:

# Three panels for FamilySize: passenger count, count split by survival outcome,
# and survival rate (mean of the 0/1 Survived column) sorted from highest to lowest.
f, ax = plt.subplots(1, 3, figsize=(40, 10))
# x=... is required: recent seaborn releases no longer accept the column positionally.
sns.countplot(x='FamilySize', data=df_train, ax=ax[0])
ax[0].set_title("(1) No. Of Family Boarded")

sns.countplot(x='FamilySize', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title("(2) Survived countplot depending on FamilySize", y=1.02)

df_train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=True).mean().sort_values(by="Survived", ascending=False).plot.bar(ax=ax[2])
ax[2].set_title("(3) Survived rate depending on FamilySize", y=1.02)

plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

Fare

In [61]:

# Look at the Fare distribution; the legend label reports the column's skewness.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
skew_label = 'Skewness : {:.2f}'.format(df_train['Fare'].skew())
g = sns.distplot(df_train['Fare'], color='b', label=skew_label, ax=ax)
g = g.legend(loc='best')  # g is rebound from the Axes to the Legend

In [62]:

# Replace missing Fare values in the test set with the test set's mean Fare.
test_fare_mean = df_test['Fare'].mean()
df_test['Fare'] = df_test['Fare'].fillna(test_fare_mean)

In [64]:

# Replace missing Fare values in the training set with the training set's mean Fare.
train_fare_mean = df_train['Fare'].mean()
df_train['Fare'] = df_train['Fare'].fillna(train_fare_mean)

In [65]:

def _log_or_zero(fare):
    """Return the natural log of a positive fare; any other value maps to 0."""
    if fare > 0:
        return np.log(fare)
    return 0


# Log-transform Fare in both frames. The columns are overwritten in place, so
# running this cell a second time would apply the log again.
df_train['Fare'] = df_train['Fare'].map(_log_or_zero)
df_test['Fare'] = df_test['Fare'].map(_log_or_zero)

In [66]:

# Fare was log-transformed to tame its extreme values; plot the transformed
# distribution with its skewness shown in the legend.
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
skew_label = 'Skewness : {:.2f}'.format(df_train['Fare'].skew())
g = sns.distplot(df_train['Fare'], color='b', label=skew_label, ax=ax)
g = g.legend(loc='best')  # g is rebound from the Axes to the Legend

plt.show()

In [67]:

# Widen the notebook's main container to 90% of the browser window.
# display/HTML are imported from the public IPython.display module; importing
# display from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))

Bigdata-AI, Kaggle

This post is licensed under CC BY 4.0 by the author.