[Machine Learning] DieTanic EDA 예측 -1-
In [4]:
1
# Preview the first five passenger records.
data.head(n=5)
Out[4]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th… | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
In [4]:
1
2
3
# Count the missing values in every column.
data.isna().sum()
Out[4]:
1
2
3
4
5
6
7
8
9
10
11
12
13
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
In [5]:
1
2
3
4
5
6
7
8
# Overall survival: share of each outcome (pie, left) next to raw counts (bars, right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
data['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct="%1.1f%%", ax=ax[0], shadow=True)
ax[0].set_title('Survived')
ax[0].set_ylabel('')
# seaborn >= 0.12 only accepts `data` positionally, so the column is passed as x=.
sns.countplot(x='Survived', data=data, ax=ax[1])
ax[1].set_title('Survived')
plt.show()
In [9]:
1
# Number of passengers in each (Sex, Survived) combination.
survived_by_sex = data.groupby(['Sex', 'Survived'])['Survived']
survived_by_sex.count()
Out[9]:
1
2
3
4
5
6
Sex Survived
female 0 81
1 233
male 0 468
1 109
Name: Survived, dtype: int64
In [10]:
1
2
3
4
5
6
# Left: mean survival rate per sex. Right: survivor / non-survivor counts per sex.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
data[['Sex', 'Survived']].groupby(['Sex']).mean().plot.bar(ax=ax[0])
ax[0].set_title('Survived vs Sex')
# seaborn >= 0.12 only accepts `data` positionally, so the column is passed as x=.
sns.countplot(x='Sex', hue='Survived', data=data, ax=ax[1])
ax[1].set_title('Sex: Survived vs Dead')
plt.show()
In [12]:
1
# Passenger class against survival outcome, with row/column totals.
pclass_vs_survived = pd.crosstab(index=data['Pclass'], columns=data['Survived'], margins=True)
pclass_vs_survived.style.background_gradient(cmap="Blues_r")
Out[12]:
Survived | 0 | 1 | All |
---|---|---|---|
Pclass | |||
1 | 80 | 136 | 216 |
2 | 97 | 87 | 184 |
3 | 372 | 119 | 491 |
All | 549 | 342 | 891 |
In [5]:
1
2
3
4
5
6
7
# Left: passengers per class. Right: survivor / non-survivor counts per class.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# Colours are applied in value_counts() order (most frequent class first).
data['Pclass'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title("Number Of Passengers By Pclass")
ax[0].set_ylabel('Count')
# seaborn >= 0.12 only accepts `data` positionally, so the column is passed as x=.
sns.countplot(x='Pclass', hue="Survived", data=data, ax=ax[1])
ax[1].set_title('Pclass: Survived vs Dead')
plt.show()
In [14]:
1
# (Sex, Survived) rows against passenger class columns, with totals.
sex_survived_vs_pclass = pd.crosstab(
    index=[data['Sex'], data['Survived']],
    columns=data['Pclass'],
    margins=True,
)
sex_survived_vs_pclass.style.background_gradient(cmap='Blues_r')
Out[14]:
Pclass | 1 | 2 | 3 | All | |
---|---|---|---|---|---|
Sex | Survived | ||||
female | 0 | 3 | 6 | 72 | 81 |
1 | 91 | 70 | 72 | 233 | |
male | 0 | 77 | 91 | 300 | 468 |
1 | 45 | 17 | 47 | 109 | |
All | 216 | 184 | 491 | 891 |
In [15]:
1
2
# Survival rate per class, one line per sex.
# factorplot was renamed to catplot in seaborn 0.9 and has since been removed;
# kind='point' reproduces factorplot's default point plot.
sns.catplot(x='Pclass', y='Survived', hue="Sex", kind='point', data=data)
plt.show()
In [16]:
1
2
3
# Headline statistics of the Age column (missing values are skipped by pandas).
age = data['Age']
oldest, youngest, average = age.max(), age.min(), age.mean()
print("Oldest Passesnger was of: ", oldest, "Years")
print('Youngest Passeneger was of: ', youngest, 'Years')
print('Average Age on the ship: ', average, 'Years')
1
2
3
Oldest Passesnger was of: 80.0 Years
Youngest Passeneger was of: 0.42 Years
Average Age on the ship: 29.69911764705882 Years
In [19]:
1
2
3
4
5
6
7
8
9
10
# Age distribution split by survival, grouped by class (left) and by sex (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# seaborn >= 0.12 requires x / y to be passed by keyword.
sns.violinplot(x='Pclass', y='Age', hue="Survived", data=data, split=True, ax=ax[0])
ax[0].set_title("Pclass and Age vs Survived")
ax[0].set_yticks(range(0, 110, 10))
sns.violinplot(x='Sex', y='Age', hue="Survived", data=data, split=True, ax=ax[1])
ax[1].set_title("Sex and Age vs Survived")
ax[1].set_yticks(range(0, 110, 10))
plt.show()
In [7]:
1
2
3
4
5
6
7
# Extract the title (the run of letters immediately before a '.') from each name.
# One vectorised call covers every row, so no loop or placeholder column is needed;
# expand=False returns a Series, and the raw string keeps '\.' a valid regex escape.
data['Initial'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
data['Initial']
Out[7]:
1
2
3
4
5
6
7
8
9
10
11
12
0 Mr
1 Mrs
2 Miss
3 Mrs
4 Mr
...
886 Rev
887 Miss
888 Miss
889 Mr
890 Mr
Name: Initial, Length: 891, dtype: object
In [24]:
1
2
# Check how each title is distributed across the sexes.
title_by_sex = pd.crosstab(index=data['Initial'], columns=data['Sex']).transpose()
title_by_sex.style.background_gradient(cmap="Blues_r")
Out[24]:
Initial | Capt | Col | Countess | Don | Dr | Jonkheer | Lady | Major | Master | Miss | Mlle | Mme | Mr | Mrs | Ms | Rev | Sir |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Sex | |||||||||||||||||
female | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 182 | 2 | 1 | 0 | 125 | 1 | 0 | 0 |
male | 1 | 2 | 0 | 1 | 6 | 1 | 0 | 2 | 40 | 0 | 0 | 0 | 517 | 0 | 0 | 6 | 1 |
In [8]:
1
# Collapse rare titles into the five main groups (Master is left untouched).
title_map = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
}
# Assign the result back: an in-place replace on data['Initial'] acts on a
# temporary object under pandas Copy-on-Write and would leave `data` unchanged.
data['Initial'] = data['Initial'].replace(title_map)
In [14]:
1
2
# Mean age within each title group.
age_by_title = data.groupby("Initial")['Age']
age_by_title.mean()
Out[14]:
1
2
3
4
5
6
7
Initial
Master 4.574167
Miss 21.860000
Mr 32.739609
Mrs 35.981818
Other 45.888889
Name: Age, dtype: float64
In [18]:
1
2
3
4
5
6
7
# Fill the missing ages of each title group with that group's rounded mean age.
fill_age_by_title = {'Mr': 33, 'Mrs': 36, 'Master': 5, 'Miss': 22, 'Other': 46}
for title, fill_age in fill_age_by_title.items():
    missing_in_group = data['Age'].isnull() & (data['Initial'] == title)
    data.loc[missing_in_group, 'Age'] = fill_age
In [19]:
1
# Tally missing vs. present Age values after the imputation step.
age_is_missing = data['Age'].isna()
age_is_missing.value_counts()
Out[19]:
1
2
False 891
Name: Age, dtype: int64
In [21]:
1
2
3
4
5
6
7
8
9
10
11
# Age histograms side by side: non-survivors (left, red) and survivors (right, green).
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
x1 = list(range(0, 85, 5))
x2 = list(range(0, 85, 5))
panels = [
    (0, ax[0], 'red', "Survived = 0", x1),
    (1, ax[1], 'green', "Survived = 1", x2),
]
for outcome, axis, colour, title, ticks in panels:
    ages = data[data["Survived"] == outcome]['Age']
    ages.plot.hist(ax=axis, bins=20, edgecolor='black', color=colour)
    axis.set_title(title)
    axis.set_xticks(ticks)
plt.show()
In [22]:
1
2
# Survival rate per class, one panel per title group.
# factorplot was renamed to catplot in seaborn 0.9 and has since been removed;
# kind='point' reproduces factorplot's default point plot.
sns.catplot(x='Pclass', y='Survived', col='Initial', kind='point', data=data)
plt.show()
In [23]:
1
# (Embarked, Pclass) rows against (Sex, Survived) columns, with totals.
embarked_breakdown = pd.crosstab(
    index=[data['Embarked'], data['Pclass']],
    columns=[data['Sex'], data['Survived']],
    margins=True,
)
embarked_breakdown.style.background_gradient(cmap='summer_r')
Out[23]:
Sex | female | male | All | |||
---|---|---|---|---|---|---|
Survived | 0 | 1 | 0 | 1 | ||
Embarked | Pclass | |||||
C | 1 | 1 | 42 | 25 | 17 | 85 |
2 | 0 | 7 | 8 | 2 | 17 | |
3 | 8 | 15 | 33 | 10 | 66 | |
Q | 1 | 0 | 1 | 1 | 0 | 2 |
2 | 0 | 2 | 1 | 0 | 3 | |
3 | 9 | 24 | 36 | 3 | 72 | |
S | 1 | 2 | 46 | 51 | 28 | 127 |
2 | 6 | 61 | 82 | 15 | 164 | |
3 | 55 | 33 | 231 | 34 | 353 | |
All | 81 | 231 | 468 | 109 | 889 |
In [24]:
1
2
3
4
# Survival rate per port of embarkation.
# factorplot was renamed to catplot in seaborn 0.9 and has since been removed;
# kind='point' reproduces factorplot's default point plot.
sns.catplot(x='Embarked', y='Survived', kind='point', data=data)
# catplot creates its own figure; resize that figure after the fact.
fig = plt.gcf()
fig.set_size_inches(5, 3)
plt.show()
In [26]:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# 2x2 grid of Embarked counts: overall, by sex, by survival and by class.
f, ax = plt.subplots(2, 2, figsize=(20, 15))
# seaborn >= 0.12 only accepts `data` positionally, so the column is passed as x=.
sns.countplot(x='Embarked', data=data, ax=ax[0, 0])
ax[0, 0].set_title("No. Of Passengers Boarded")
sns.countplot(x="Embarked", hue="Sex", data=data, ax=ax[0, 1])
ax[0, 1].set_title("Male-Female Split for Embarked")
sns.countplot(x='Embarked', hue="Survived", data=data, ax=ax[1, 0])
ax[1, 0].set_title("Embarked vs Survived")
sns.countplot(x="Embarked", hue="Pclass", data=data, ax=ax[1, 1])
ax[1, 1].set_title("Embarked vs Pclass")
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()
In [27]:
1
2
# Survival rate per class and sex, one panel per port of embarkation.
# factorplot was renamed to catplot in seaborn 0.9 and has since been removed;
# kind='point' reproduces factorplot's default point plot.
sns.catplot(x="Pclass", y='Survived', hue='Sex', col='Embarked', kind='point', data=data)
plt.show()
In [28]:
1
# Fill the missing Embarked values with 'S'.
# Assign the result back: an in-place fillna on data['Embarked'] acts on a
# temporary object under pandas Copy-on-Write and would leave `data` unchanged.
data['Embarked'] = data['Embarked'].fillna('S')
In [30]:
1
# True if any Embarked value is still missing.
data['Embarked'].isna().any()
Out[30]:
1
False
In [32]:
1
# Number of siblings/spouses aboard against survival outcome.
sibsp_vs_survived = pd.crosstab(index=[data['SibSp']], columns=data['Survived'])
sibsp_vs_survived.style.background_gradient(cmap="summer_r")
Out[32]:
Survived | 0 | 1 |
---|---|---|
SibSp | ||
0 | 398 | 210 |
1 | 97 | 112 |
2 | 15 | 13 |
3 | 12 | 4 |
4 | 15 | 3 |
5 | 5 | 0 |
8 | 7 | 0 |
In [45]:
1
2
3
4
5
6
7
# Survival rate by SibSp as bars (left) and as a point plot (right).
f, ax = plt.subplots(1, 2, figsize=(20, 8))
# seaborn >= 0.12 requires x / y to be passed by keyword.
sns.barplot(x='SibSp', y='Survived', data=data, ax=ax[0])
ax[0].set_title("SibSp vs Survived")
# factorplot/catplot are figure-level and cannot draw on an existing axis;
# pointplot is the axes-level equivalent that honours ax=.
sns.pointplot(x='SibSp', y='Survived', data=data, ax=ax[1])
ax[1].set_title("SibSp vs Survived")
plt.show()
In [47]:
1
# Number of siblings/spouses aboard against passenger class.
sibsp_vs_pclass = pd.crosstab(index=data['SibSp'], columns=data['Pclass'])
sibsp_vs_pclass.style.background_gradient(cmap='Blues')
Out[47]:
Pclass | 1 | 2 | 3 |
---|---|---|---|
SibSp | |||
0 | 137 | 120 | 351 |
1 | 71 | 55 | 83 |
2 | 5 | 8 | 15 |
3 | 3 | 1 | 12 |
4 | 0 | 0 | 18 |
5 | 0 | 0 | 5 |
8 | 0 | 0 | 7 |
In [48]:
1
# Number of parents/children aboard against passenger class.
parch_vs_pclass = pd.crosstab(index=data['Parch'], columns=data['Pclass'])
parch_vs_pclass.style.background_gradient(cmap="Blues")
Out[48]:
Pclass | 1 | 2 | 3 |
---|---|---|---|
Parch | |||
0 | 163 | 134 | 381 |
1 | 31 | 32 | 55 |
2 | 21 | 16 | 43 |
3 | 0 | 2 | 3 |
4 | 1 | 0 | 3 |
5 | 0 | 0 | 5 |
6 | 0 | 0 | 1 |
In [55]:
1
2
3
4
5
6
7
8
9
# Survival rate by Parch as bars (left) and as a point plot (right).
f, ax = plt.subplots(1, 2, figsize=(20, 8))
# seaborn >= 0.12 requires x / y to be passed by keyword.
sns.barplot(x="Parch", y='Survived', data=data, ax=ax[0])
ax[0].set_title("Parch vs Survived")
# factorplot/catplot are figure-level and cannot draw on an existing axis;
# pointplot is the axes-level equivalent that honours ax=.
sns.pointplot(x='Parch', y="Survived", data=data, ax=ax[1])
ax[1].set_title("Parch vs Survived")
plt.show()
In [57]:
1
2
3
# Headline statistics of the Fare column.
fare = data['Fare']
highest, lowest, average = fare.max(), fare.min(), fare.mean()
print("Highest Fare was: ", highest)
print("Lowest Fare was: ", lowest)
print("Average Fare was: ", average)
1
2
3
Highest Fare was: 512.3292
Lowest Fare was: 0.0
Average Fare was: 32.204207968574636
In [58]:
1
2
3
4
5
6
7
8
9
10
11
# Fare distribution within each passenger class, one panel per class.
f, ax = plt.subplots(1, 3, figsize=(20, 8))
for axis, pclass in zip(ax, (1, 2, 3)):
    fares = data.loc[data['Pclass'] == pclass, 'Fare']
    # distplot is deprecated; a density-scaled histogram with a KDE overlay
    # (cut=3 was distplot's KDE default) is its documented replacement.
    sns.histplot(fares, kde=True, stat='density', kde_kws={'cut': 3}, ax=axis)
    axis.set_title(f"Fares in Pclass {pclass}")
plt.show()
In [59]:
1
2
# Widen the notebook container to 90% of the browser window.
# `display` lives in the public IPython.display module; importing it from
# IPython.core.display is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))
This post is licensed under CC BY 4.0 by the author.
Comments powered by Disqus.