[Machine Learning] 타이타닉 생존자 머신러닝 -2-
In [4]:
1
2
# Load the Titanic training and test splits from the local Dataset folder.
train_csv = 'Dataset/Titanic/train.csv'
test_csv = 'Dataset/Titanic/test.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
Pclass 분석
In [9]:
1
# Group the rows by Pclass and count the non-null Survived entries,
# i.e. the number of passengers in each ticket class.
pclass_groups = df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True)
pclass_groups.count()
Out[9]:
Survived | |
---|---|
Pclass | |
1 | 216 |
2 | 184 |
3 | 491 |
In [10]:
1
# Survived holds 0/1 flags, so summing it within each Pclass group yields
# the number of survivors in that class.
df_train.groupby('Pclass')[['Survived']].sum()
Out[10]:
Survived | |
---|---|
Pclass | |
1 | 136 |
2 | 87 |
3 | 119 |
In [12]:
1
2
# pd.crosstab gives the Pclass x Survived contingency table in one call
# (margins=True appends the "All" totals); the Styler's background_gradient
# shades the cells with the chosen colormap.
pclass_vs_survived = pd.crosstab(df_train['Pclass'], df_train['Survived'], margins=True)
pclass_vs_survived.style.background_gradient(cmap='summer_r')
Out[12]:
Survived | 0 | 1 | All |
---|---|---|---|
Pclass | |||
1 | 80 | 136 | 216 |
2 | 97 | 87 | 184 |
3 | 372 | 119 | 491 |
All | 549 | 342 | 891 |
In [19]:
1
2
# Because Survived is 0/1, its per-class mean is the survival rate.
# Sort the rates from highest to lowest and draw them as a bar chart.
survival_rate_by_pclass = (
    df_train[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=True)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
survival_rate_by_pclass.plot.bar()
Out[19]:
1
<AxesSubplot:xlabel='Pclass'>
In [17]:
1
2
3
4
5
6
7
8
9
10
11
12
13
# Two-panel view of Pclass: the left bar plot shows the passenger count per
# class, the right seaborn count plot splits each class into survivors
# and non-survivors.
y_position = 1.02  # lift the titles slightly above the axes
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train['Pclass'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title('Number of Passengers By Pclass', y=y_position)
ax[0].set_ylabel('Count')
# The column must be passed as x=...: seaborn deprecated positional data
# variables in 0.11 and made them keyword-only in 0.12, where a positional
# string would be taken as `data` and raise a TypeError.
sns.countplot(x='Pclass', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Pclass: Survived vs Dead', y=y_position)
plt.show()
Sex 분석
In [24]:
1
2
3
4
5
6
7
8
# Left: survival rate per sex (mean of the 0/1 Survived flag).
# Right: count of survivors and non-survivors for each sex.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title("Survived vs Sex")
# Use x=... explicitly; positional data variables are deprecated in
# seaborn 0.11 and rejected from 0.12 onwards.
sns.countplot(x='Sex', data=df_train, hue='Survived', ax=ax[1])
ax[1].set_title("Sex: Survived vs Dead")
plt.show()
In [27]:
1
2
# Survival rate per sex via the group mean, listed from highest to lowest.
# as_index=False keeps Sex as a regular column in the result.
sex_survival = df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()
sex_survival.sort_values(by="Survived", ascending=False)
Out[27]:
Sex | Survived | |
---|---|---|
0 | female | 0.742038 |
1 | male | 0.188908 |
In [28]:
1
# Sex x Survived contingency table with "All" totals, shaded with Blues_r.
sex_vs_survived = pd.crosstab(df_train['Sex'], df_train['Survived'], margins=True)
sex_vs_survived.style.background_gradient(cmap='Blues_r')
Out[28]:
Survived | 0 | 1 | All |
---|---|---|---|
Sex | |||
female | 81 | 233 | 314 |
male | 468 | 109 | 577 |
All | 549 | 342 | 891 |
Both Sex and Pclass
In [31]:
1
2
# Point plot of the survival rate (mean of Survived) per Pclass, with one
# line per Sex, to inspect two features at once.
# sns.factorplot was renamed to catplot in seaborn 0.9 (and later removed),
# and its `size` argument became `height`; kind='point' keeps the plot type
# that factorplot drew by default.
sns.catplot(x='Pclass', y='Survived', hue='Sex', data=df_train,
            kind='point', height=6, aspect=1.5)
Out[31]:
1
<seaborn.axisgrid.FacetGrid at 0x1f87f1d1b80>
In [36]:
1
2
# col="Pclass" facets the plot so each ticket class gets its own panel of
# survival rate by Sex.
# Replaces the deprecated sns.factorplot(..., size=...) with catplot(...,
# height=...). The original also passed a misspelled `satureation=.5`
# keyword; it is not an option of the point plot and is dropped here
# rather than forwarded as an unknown argument.
sns.catplot(x="Sex", y='Survived', col="Pclass", data=df_train,
            kind='point', height=9, aspect=1)
Out[36]:
1
<seaborn.axisgrid.FacetGrid at 0x1f87efa93d0>
Age 분석
In [41]:
1
2
3
4
# Age overview: print the oldest, youngest and mean passenger age,
# each formatted to one decimal place.
age = df_train['Age']
for template, value in (
    ('제일 나이 많은 탑승객: {:.1f}years', age.max()),
    ('제일 어린 탑승객: {:.1f}years', age.min()),
    ('탑승객 평균 나이: {:.1f}years', age.mean()),
):
    print(template.format(value))
1
2
3
제일 나이 많은 탑승객: 80.0years
제일 어린 탑승객: 0.4years
탑승객 평균 나이: 29.7years
In [45]:
1
2
3
4
5
6
7
# Age distribution by outcome: one kernel density estimate per Survived
# value, drawn on the same axes (survivors first so the legend order matches).
fig, ax = plt.subplots(1, 1, figsize=(9, 5))
for outcome in (1, 0):
    sns.kdeplot(df_train.loc[df_train['Survived'] == outcome, 'Age'], ax=ax)
plt.legend(['Survived==1', 'Survived==0'])
plt.show()
In [47]:
1
2
3
4
5
6
7
8
9
10
# Age distribution within each ticket class: one KDE curve per Pclass,
# drawn in class order so the legend labels line up.
plt.figure(figsize=(8, 6))
for pclass in (1, 2, 3):
    df_train.loc[df_train['Pclass'] == pclass, 'Age'].plot(kind='kde')
plt.xlabel('Age')
plt.title("Age Distribution within classes")
plt.legend(['1st Class', '2nd Class', '3rd Class'])
plt.show()
In [48]:
1
2
3
4
5
6
7
8
9
10
11
# Cumulative survival rate by age: for every upper bound x in 1..80, the
# share of survivors among passengers with Age < x.
age_upper_bounds = list(range(1, 81))
cummulate_survival_ratio = []
for age_bound in age_upper_bounds:
    # Filter once per bound and reuse the subset for both the numerator
    # (sum of the 0/1 Survived flags) and the denominator (subset size).
    survived_below = df_train.loc[df_train['Age'] < age_bound, 'Survived']
    cummulate_survival_ratio.append(survived_below.sum() / len(survived_below))
plt.figure(figsize=(8, 6))
# Plot against the actual age bounds. Plotting the list alone would use the
# list index (0..79) as x, shifting the curve one year to the left of the
# "0~x" range named in the x label.
plt.plot(age_upper_bounds, cummulate_survival_ratio)
plt.title("Survival rate change depending on range of Age", y=1.02)
plt.ylabel("Survival rate")
plt.xlabel("Range of Age(0~x)")
plt.show()
Pclass, Sex and Age
In [49]:
1
2
3
4
5
6
7
8
9
# Split violin plots of Age by outcome: per Pclass on the left, per Sex on
# the right. split=True puts the two Survived values on the two halves of
# each violin.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# x/y are passed by keyword: seaborn deprecated positional data variables in
# 0.11 and made them keyword-only in 0.12, where this call would otherwise
# raise a TypeError.
sns.violinplot(x="Pclass", y="Age", hue='Survived', data=df_train, scale="count", split=True, ax=ax[0])
ax[0].set_title("Pclass and Age vs Survived")
ax[0].set_yticks(range(0, 110, 10))
sns.violinplot(x="Sex", y="Age", hue="Survived", data=df_train, scale="count", split=True, ax=ax[1])
ax[1].set_title("Sex and Age vs Survived")
ax[1].set_yticks(range(0, 110, 10))
plt.show()
Embarked 분석
In [51]:
1
2
3
# Survival rate per port of embarkation, sorted from highest to lowest and
# drawn as a bar chart on a dedicated axes.
f, ax = plt.subplots(1, 1, figsize=(7, 7))
embarked_survival = (
    df_train[['Embarked', 'Survived']]
    .groupby(['Embarked'], as_index=True)
    .mean()
    .sort_values(by="Survived", ascending=False)
)
embarked_survival.plot.bar(ax=ax)
Out[51]:
1
<AxesSubplot:xlabel='Embarked'>
In [55]:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Relationship between Embarked and other features, as a 2x2 grid of count
# plots: overall counts, then counts split by Sex, Survived and Pclass.
f, ax = plt.subplots(2, 2, figsize=(20, 15))
# The column is passed as x=... throughout: positional data variables are
# deprecated in seaborn 0.11 and rejected from 0.12 onwards.
sns.countplot(x='Embarked', data=df_train, ax=ax[0, 0])
ax[0, 0].set_title('(1) No. Of Passengers Boarded')
sns.countplot(x="Embarked", hue='Sex', data=df_train, ax=ax[0, 1])
ax[0, 1].set_title("(2) Male_Female Split for Embarked")
sns.countplot(x="Embarked", hue="Survived", data=df_train, ax=ax[1, 0])
ax[1, 0].set_title("(3) Embarked vs Survived")
sns.countplot(x="Embarked", hue="Pclass", data=df_train, ax=ax[1, 1])
ax[1, 1].set_title("(4) embarked vs Pclass")
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()
Family - SibSp + Parch
In [56]:
1
2
# FamilySize = siblings/spouses + parents/children + 1 (the passenger
# themselves); added to both the train and the test frame.
for frame in (df_train, df_test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
In [57]:
1
2
# Report the largest and smallest FamilySize in the training data.
family_size = df_train['FamilySize']
print("Maximum size of Family: ", family_size.max())
print("Minimum size of Family: ", family_size.min())
1
2
Maximum size of Family: 11
Minimum size of Family: 1
In [60]:
1
2
3
4
5
6
7
8
9
10
11
12
# Three panels for FamilySize: (1) passenger count per size, (2) the same
# counts split by Survived, (3) survival rate per size, sorted descending.
f, ax = plt.subplots(1, 3, figsize=(40, 10))
# The column is passed as x=...: positional data variables are deprecated in
# seaborn 0.11 and rejected from 0.12 onwards.
sns.countplot(x='FamilySize', data=df_train, ax=ax[0])
ax[0].set_title("(1) No. Of Family Boarded")
sns.countplot(x='FamilySize', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title("(2) Survived countplot depending on FamilySize", y=1.02)
df_train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=True).mean().sort_values(by="Survived", ascending=False).plot.bar(ax=ax[2])
ax[2].set_title("(3) Survived rate depending on FamilySize", y=1.02)
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()
Fare
In [61]:
1
2
3
4
# Distribution of the raw Fare values; the legend label reports the
# skewness of the column.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
fare_label = 'Skewness : {:.2f}'.format(df_train['Fare'].skew())
g = sns.distplot(df_train['Fare'], color='b', label=fare_label, ax=ax)
# g is rebound from the plot's axes to the legend object, as in the original.
g = g.legend(loc='best')
In [62]:
1
2
# Replace missing Fare values in the test set with the mean test-set Fare.
test_fare_mean = df_test['Fare'].mean()
df_test['Fare'] = df_test['Fare'].fillna(test_fare_mean)
In [64]:
1
# Replace missing Fare values in the training set with the mean training Fare.
train_fare_mean = df_train['Fare'].mean()
df_train['Fare'] = df_train['Fare'].fillna(train_fare_mean)
In [65]:
1
2
def _log_fare(fare):
    """Return the natural log of a positive fare, and 0 for any other value."""
    if fare > 0:
        return np.log(fare)
    return 0


# Overwrite Fare with its log in both frames.
# NOTE: this replaces the column in place, so running the cell a second
# time would take the log of the already-transformed values.
for frame in (df_train, df_test):
    frame['Fare'] = frame['Fare'].map(_log_fare)
In [66]:
1
2
3
4
5
6
# Fare was log-transformed to tame its extreme values; plot the transformed
# distribution with its skewness in the legend.
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
log_fare_label = 'Skewness : {:.2f}'.format(df_train['Fare'].skew())
g = sns.distplot(df_train['Fare'], color='b', label=log_fare_label, ax=ax)
# g is rebound from the plot's axes to the legend object, as in the original.
g = g.legend(loc='best')
plt.show()
In [67]:
1
2
# Widen the notebook's main container to 90% of the browser width by
# injecting a CSS rule into the page.
# display/HTML are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))
This post is licensed under CC BY 4.0 by the author.
Comments powered by Disqus.