Post

[Machine Learning] DieTanic EDA 예측 -1-

In [4]:

1
# Preview the first five rows of the passenger table.
data.head(n=5)

Out[4]:

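| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th… | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |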

In [4]:

1
2
3
# Count the missing values in every column.
missing_mask = data.isna()
missing_mask.sum()

Out[4]:

1
2
3
4
5
6
7
8
9
10
11
12
13
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:

1
2
3
4
5
6
7
8
# Overall survival balance: share of each outcome (pie, left) and raw counts (bars, right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
data['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct="%1.1f%%", ax=ax[0], shadow=True)
ax[0].set_title('Survived')
ax[0].set_ylabel('')  # drop the default y-label pandas puts next to the pie

# The column is passed as x=: seaborn >= 0.12 no longer accepts data
# variables as positional arguments.
sns.countplot(x='Survived', data=data, ax=ax[1])
ax[1].set_title('Survived')
plt.show()

In [9]:

1
# Number of passengers for each (Sex, Survived) combination.
by_sex_and_outcome = data.groupby(['Sex', 'Survived'])
by_sex_and_outcome['Survived'].count()

Out[9]:

1
2
3
4
5
6
Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: Survived, dtype: int64

In [10]:

1
2
3
4
5
6
# Left: mean of Survived per sex (i.e. the survival rate).
# Right: passenger counts per sex, split by survival outcome.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
data[['Sex', 'Survived']].groupby(['Sex']).mean().plot.bar(ax=ax[0])
ax[0].set_title('Survived vs Sex')

# Keyword x= is required: seaborn >= 0.12 rejects positional data variables.
sns.countplot(x='Sex', hue='Survived', data=data, ax=ax[1])
ax[1].set_title('Sex: Survived vs Dead')
plt.show()

In [12]:

1
# Survival counts per passenger class, including row/column totals ("All").
pclass_vs_survived = pd.crosstab(data['Pclass'], data['Survived'], margins=True)
pclass_vs_survived.style.background_gradient(cmap="Blues_r")

Out[12]:

| Pclass \ Survived | 0 | 1 | All |
|---|---|---|---|
| 1 | 80 | 136 | 216 |
| 2 | 97 | 87 | 184 |
| 3 | 372 | 119 | 491 |
| All | 549 | 342 | 891 |

In [5]:

1
2
3
4
5
6
7
# Left: number of passengers in each class.
# Right: the same counts split by survival outcome.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
data['Pclass'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title("Number Of Passengers By Pclass")
ax[0].set_ylabel('Count')

# Keyword x= is required: seaborn >= 0.12 rejects positional data variables.
sns.countplot(x='Pclass', hue="Survived", data=data, ax=ax[1])
ax[1].set_title('Pclass: Survived vs Dead')
plt.show()

In [14]:

1
# Passenger counts by (Sex, Survived) rows against Pclass columns, with totals.
sex_outcome_vs_pclass = pd.crosstab(
    [data['Sex'], data['Survived']],
    data['Pclass'],
    margins=True,
)
sex_outcome_vs_pclass.style.background_gradient(cmap='Blues_r')

Out[14]:

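| Sex | Survived | Pclass 1 | Pclass 2 | Pclass 3 | All |
|---|---|---|---|---|---|
| female | 0 | 3 | 6 | 72 | 81 |
| female | 1 | 91 | 70 | 72 | 233 |
| male | 0 | 77 | 91 | 300 | 468 |
| male | 1 | 45 | 17 | 47 | 109 |
| All | | 216 | 184 | 491 | 891 |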

In [15]:

1
2
# Point plot of mean Survived per Pclass, one line per sex.
# `factorplot` was renamed to `catplot` and later removed from seaborn;
# kind='point' reproduces factorplot's default plot type, and the columns are
# passed by keyword because positional data variables are no longer accepted.
sns.catplot(x='Pclass', y='Survived', hue="Sex", kind='point', data=data)
plt.show()

In [16]:

1
2
3
# Report the maximum, minimum and mean of the Age column.
ages = data['Age']
oldest = ages.max()
youngest = ages.min()
average = ages.mean()
print("Oldest Passesnger was of: ", oldest, "Years")
print('Youngest Passeneger was of: ', youngest, 'Years')
print('Average Age on the ship: ', average, 'Years')
1
2
3
Oldest Passesnger was of:  80.0 Years
Youngest Passeneger was of:  0.42 Years
Average Age on the ship:  29.69911764705882 Years

In [19]:

1
2
3
4
5
6
7
8
9
10
# Split violin plots of Age by survival outcome: grouped by Pclass (left)
# and by Sex (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))

# x=/y= are passed by keyword: seaborn >= 0.12 rejects positional data variables.
sns.violinplot(x='Pclass', y='Age', hue="Survived", data=data, split=True, ax=ax[0])
ax[0].set_title("Pclass and Age vs Survived")
ax[0].set_yticks(range(0, 110, 10))

sns.violinplot(x='Sex', y='Age', hue="Survived", data=data, split=True, ax=ax[1])
ax[1].set_title("Sex and Age vs Survived")
ax[1].set_yticks(range(0, 110, 10))
plt.show()

In [7]:

1
2
3
4
5
6
7
# Extract the honorific (title) from each passenger name.
#
# The pattern captures the run of letters directly before a period, e.g. "Mr"
# in "Braund, Mr. Owen Harris".
# - One vectorised .str.extract call handles every row; the previous loop over
#   the DataFrame's column names only repeated the same assignment.
# - The raw string keeps `\.` from being parsed as an invalid string escape.
# - expand=False returns a Series for the single capture group.
data['Initial'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

data['Initial']

Out[7]:

1
2
3
4
5
6
7
8
9
10
11
12
0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886     Rev
887    Miss
888    Miss
889      Mr
890      Mr
Name: Initial, Length: 891, dtype: object

In [24]:

1
2
# Cross-check each extracted title against the passenger's sex
# (transposed so that titles run across the columns).
title_vs_sex = pd.crosstab(data['Initial'], data['Sex'])
title_vs_sex.T.style.background_gradient(cmap="Blues_r")

Out[24]:

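| Sex \ Initial | Capt | Col | Countess | Don | Dr | Jonkheer | Lady | Major | Master | Miss | Mlle | Mme | Mr | Mrs | Ms | Rev | Sir |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| female | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 182 | 2 | 1 | 0 | 125 | 1 | 0 | 0 |
| male | 1 | 2 | 0 | 1 | 6 | 1 | 0 | 2 | 40 | 0 | 0 | 0 | 517 | 0 | 0 | 6 | 1 |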

In [8]:

1
# Collapse rare or foreign titles into the main groups (Miss / Mr / Mrs / Other).
# The result is assigned back to the column rather than using
# `data['Initial'].replace(..., inplace=True)`: that form mutates an
# intermediate Series and does not update `data` under pandas Copy-on-Write.
title_groups = {
    'Mlle': 'Miss',
    'Mme': 'Miss',
    'Ms': 'Miss',
    'Dr': 'Mr',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'Countess': 'Mrs',
    'Jonkheer': 'Other',
    'Col': 'Other',
    'Rev': 'Other',
    'Capt': 'Mr',
    'Sir': 'Mr',
    'Don': 'Mr',
}
data['Initial'] = data['Initial'].replace(title_groups)

In [14]:

1
2
# Mean age within each title group.
ages_by_title = data.groupby("Initial")['Age']
ages_by_title.mean()

Out[14]:

1
2
3
4
5
6
7
Initial
Master     4.574167
Miss      21.860000
Mr        32.739609
Mrs       35.981818
Other     45.888889
Name: Age, dtype: float64

In [18]:

1
2
3
4
5
6
7
# Fill missing ages with a fixed value chosen per title group
# (the values are the rounded group means of Age computed earlier).
fill_age_by_title = {'Mr': 33, 'Mrs': 36, 'Master': 5, 'Miss': 22, 'Other': 46}

for title, fill_age in fill_age_by_title.items():
    # Only rows that still lack an age AND carry this title are touched.
    needs_fill = data['Age'].isnull() & (data['Initial'] == title)
    data.loc[needs_fill, 'Age'] = fill_age

In [19]:

1
# Tally missing vs. present values in Age after the fill step.
age_is_missing = data['Age'].isna()
age_is_missing.value_counts()

Out[19]:

1
2
False    891
Name: Age, dtype: int64

In [21]:

1
2
3
4
5
6
7
8
9
10
11
# Age histograms, one panel per survival outcome (0 on the left in red,
# 1 on the right in green), both with ticks every 5 years from 0 to 80.
f, ax = plt.subplots(1, 2, figsize=(20, 10))
age_ticks = list(range(0, 85, 5))

for panel, outcome, shade in zip(ax, (0, 1), ('red', 'green')):
    outcome_ages = data.loc[data["Survived"] == outcome, 'Age']
    outcome_ages.plot.hist(ax=panel, bins=20, edgecolor='black', color=shade)
    panel.set_title("Survived = {}".format(outcome))
    panel.set_xticks(age_ticks)

plt.show()

In [22]:

1
2
# Mean Survived per Pclass, drawn as one point-plot facet per title group.
# `factorplot` was renamed to `catplot` and later removed from seaborn;
# kind='point' reproduces factorplot's default plot type, and the columns are
# passed by keyword because positional data variables are no longer accepted.
sns.catplot(x='Pclass', y='Survived', col='Initial', kind='point', data=data)
plt.show()

In [23]:

1
# Counts by (Embarked, Pclass) rows against (Sex, Survived) columns, with totals.
row_keys = [data['Embarked'], data['Pclass']]
column_keys = [data['Sex'], data['Survived']]
port_class_table = pd.crosstab(row_keys, column_keys, margins=True)
port_class_table.style.background_gradient(cmap='summer_r')

Out[23]:

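| Embarked | Pclass | female, Survived=0 | female, Survived=1 | male, Survived=0 | male, Survived=1 | All |
|---|---|---|---|---|---|---|
| C | 1 | 1 | 42 | 25 | 17 | 85 |
| C | 2 | 0 | 7 | 8 | 2 | 17 |
| C | 3 | 8 | 15 | 33 | 10 | 66 |
| Q | 1 | 0 | 1 | 1 | 0 | 2 |
| Q | 2 | 0 | 2 | 1 | 0 | 3 |
| Q | 3 | 9 | 24 | 36 | 3 | 72 |
| S | 1 | 2 | 46 | 51 | 28 | 127 |
| S | 2 | 6 | 61 | 82 | 15 | 164 |
| S | 3 | 55 | 33 | 231 | 34 | 353 |
| All | | 81 | 231 | 468 | 109 | 889 |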

In [24]:

1
2
3
4
# Mean Survived per port of embarkation, as a point plot.
# `factorplot` was renamed to `catplot` and later removed from seaborn;
# kind='point' reproduces factorplot's default plot type, and the columns are
# passed by keyword because positional data variables are no longer accepted.
sns.catplot(x='Embarked', y='Survived', kind='point', data=data)
fig = plt.gcf()
fig.set_size_inches(5, 3)  # shrink the figure catplot just created
plt.show()

In [26]:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# 2x2 grid of passenger counts per port of embarkation:
# overall, split by Sex, split by Survived, and split by Pclass.
# The column is always passed as x=: seaborn >= 0.12 rejects positional
# data variables.
f, ax = plt.subplots(2, 2, figsize=(20, 15))
sns.countplot(x='Embarked', data=data, ax=ax[0, 0])
ax[0, 0].set_title("No. Of Passengers Boarded")

sns.countplot(x="Embarked", hue="Sex", data=data, ax=ax[0, 1])
ax[0, 1].set_title("Male-Female Split for Embarked")

sns.countplot(x='Embarked', hue="Survived", data=data, ax=ax[1, 0])
ax[1, 0].set_title("Embarked vs Survived")

sns.countplot(x="Embarked", hue="Pclass", data=data, ax=ax[1, 1])
ax[1, 1].set_title("Embarked vs Pclass")

# Extra spacing so the titles of the lower row do not collide with the upper row.
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

In [27]:

1
2
# Mean Survived per Pclass with one line per sex, faceted by port of embarkation.
# `factorplot` was renamed to `catplot` and later removed from seaborn;
# kind='point' reproduces factorplot's default plot type, and the columns are
# passed by keyword because positional data variables are no longer accepted.
sns.catplot(x="Pclass", y='Survived', hue='Sex', col='Embarked', kind='point', data=data)
plt.show()

In [28]:

1
# Fill the missing Embarked values with 'S'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on `data['Embarked']`: that form mutates an
# intermediate Series and does not update `data` under pandas Copy-on-Write.
data['Embarked'] = data['Embarked'].fillna('S')

In [30]:

1
# True if any Embarked value is still missing.
embarked_is_missing = data['Embarked'].isna()
embarked_is_missing.any()

Out[30]:

1
False

In [32]:

1
# Survival counts for each SibSp value.
sibsp_vs_survived = pd.crosstab([data['SibSp']], data['Survived'])
sibsp_vs_survived.style.background_gradient(cmap="summer_r")

Out[32]:

| SibSp \ Survived | 0 | 1 |
|---|---|---|
| 0 | 398 | 210 |
| 1 | 97 | 112 |
| 2 | 15 | 13 |
| 3 | 12 | 4 |
| 4 | 15 | 3 |
| 5 | 5 | 0 |
| 8 | 7 | 0 |

In [45]:

1
2
3
4
5
6
7
# Mean Survived per SibSp value, as bars (left) and as a point plot (right).
f, ax = plt.subplots(1, 2, figsize=(20, 8))

# x=/y= are passed by keyword: seaborn >= 0.12 rejects positional data variables.
sns.barplot(x='SibSp', y='Survived', data=data, ax=ax[0])
ax[0].set_title("SibSp vs Survived")

# The axes-level pointplot is used here: factorplot/catplot are figure-level
# functions that ignore `ax=` and open their own figure, leaving ax[1] empty.
sns.pointplot(x='SibSp', y='Survived', data=data, ax=ax[1])
ax[1].set_title("SibSp vs Survived")
plt.show()

In [47]:

1
# Passenger counts for each SibSp value, broken down by Pclass.
sibsp_vs_pclass = pd.crosstab(data['SibSp'], data['Pclass'])
sibsp_vs_pclass.style.background_gradient(cmap='Blues')

Out[47]:

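| SibSp \ Pclass | 1 | 2 | 3 |
|---|---|---|---|
| 0 | 137 | 120 | 351 |
| 1 | 71 | 55 | 83 |
| 2 | 5 | 8 | 15 |
| 3 | 3 | 1 | 12 |
| 4 | 0 | 0 | 18 |
| 5 | 0 | 0 | 5 |
| 8 | 0 | 0 | 7 |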

In [48]:

1
# Passenger counts for each Parch value, broken down by Pclass.
parch_vs_pclass = pd.crosstab(data['Parch'], data['Pclass'])
parch_vs_pclass.style.background_gradient(cmap="Blues")

Out[48]:

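| Parch \ Pclass | 1 | 2 | 3 |
|---|---|---|---|
| 0 | 163 | 134 | 381 |
| 1 | 31 | 32 | 55 |
| 2 | 21 | 16 | 43 |
| 3 | 0 | 2 | 3 |
| 4 | 1 | 0 | 3 |
| 5 | 0 | 0 | 5 |
| 6 | 0 | 0 | 1 |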

In [55]:

1
2
3
4
5
6
7
8
9
# Mean Survived per Parch value, as bars (left) and as a point plot (right).
f, ax = plt.subplots(1, 2, figsize=(20, 8))

# x=/y= are passed by keyword: seaborn >= 0.12 rejects positional data variables.
sns.barplot(x="Parch", y='Survived', data=data, ax=ax[0])
ax[0].set_title("Parch vs Survived")

# The axes-level pointplot is used here: factorplot/catplot are figure-level
# functions that ignore `ax=` and open their own figure, leaving ax[1] empty.
sns.pointplot(x='Parch', y="Survived", data=data, ax=ax[1])
ax[1].set_title("Parch vs Survived")

plt.show()

In [57]:

1
2
3
# Report the maximum, minimum and mean of the Fare column.
fares = data['Fare']
highest_fare = fares.max()
lowest_fare = fares.min()
average_fare = fares.mean()
print("Highest Fare was: ", highest_fare)
print("Lowest Fare was: ", lowest_fare)
print("Average Fare was: ", average_fare)
1
2
3
Highest Fare was:  512.3292
Lowest Fare was:  0.0
Average Fare was:  32.204207968574636

In [58]:

1
2
3
4
5
6
7
8
9
10
11
# Fare distribution for each passenger class, one panel per class.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot(..., kde=True) once the minimum seaborn version is confirmed.
f, ax = plt.subplots(1, 3, figsize=(20, 8))

for panel, pclass in zip(ax, (1, 2, 3)):
    class_fares = data.loc[data['Pclass'] == pclass, 'Fare']
    sns.distplot(class_fares, ax=panel)
    panel.set_title("Fares in Pclass {}".format(pclass))

plt.show()

In [59]:

1
2
# Widen the notebook's main container to 90% of the browser window by
# injecting a CSS rule into the page.
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))
This post is licensed under CC BY 4.0 by the author.

Comments powered by Disqus.