[Machine Learning] DieTanic EDA 예측 -2-

Posted Nov 30, 2023

By 엉덩희 11 min read

In [4]:

data['Initial'].replace(['Mlle','Mme',"Ms",'Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'], inplace=True)

In [5]:

data.loc[(data.Age.isnull()) & (data.Initial=='Mr'),'Age'] =33
data.loc[(data.Age.isnull()) & (data.Initial =="Mrs"),'Age'] = 36
data.loc[(data.Age.isnull()) & (data.Initial == "Master"),"Age"] =5
data.loc[(data.Age.isnull()) & (data.Initial =='Miss'),"Age"] =22
data.loc[(data.Age.isnull()) & (data.Initial=='Other'),"Age"] =46

In [6]:

data['Embarked'].fillna('S', inplace=True)

In [7]:

# 데이터 피쳐 간 상관관계 계수 히트맵 출력
sns.heatmap(data.corr(),annot=True, cmap="RdYlGn", linewidths=0.2)
fig=plt.gcf()
fig.set_size_inches(10,8)
plt.show()

In [8]:

# 나이 분포를 임의로 나누어서 ordinal 값으로 새로운 피쳐를 만들었다. 사실 apply()메서드로 해도 되긴 하다.
data['Age_band'] = 0
data.loc[data['Age']<=16,'Age_band']=0
data.loc[(data['Age']>16)& (data['Age']<=32),'Age_band']=1
data.loc[(data['Age']>32)& (data['Age']<=48),'Age_band']=2
data.loc[(data['Age']>48)& (data['Age']<=60),'Age_band']=3
data.loc[data['Age']>64,'Age_band']=4
data.head(2)

Out[8]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked	Initial	Age_band
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S	Mr	1
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th…	female	38.0	1	0	PC 17599	71.2833	C85	C	Mrs	2

In [9]:

data['Age_band'].value_counts().to_frame().style.background_gradient(cmap='Blues')

Out[9]:

	Age_band
1	382
2	325
0	115
3	58
4	11

In [10]:

sns.factorplot("Age_band",'Survived',data=data, col='Pclass')
plt.show()

In [12]:

# 가족의 규모를 피쳐로 새로 만들어서 대입하고 혼자 온 사람들의 피쳐도 만들었다.

data['Family_Size']=0
data['Family_Size'] = data['Parch'] + data['SibSp'] # 가족 규모
data['Alone']=0
data.loc[data['Family_Size']==0, 'Alone'] =1 # 혼자 온 사람 1 대입

f,ax= plt.subplots(1,2, figsize=(18,6))
sns.factorplot('Family_Size','Survived', data=data, ax=ax[0])
ax[0].set_title("Family_Size vs Survived")

sns.factorplot("Alone","Survived",data= data , ax =ax[1])
ax[1].set_title("Alone vs Survived")
plt.close(1)
plt.show()

In [13]:

sns.factorplot("Alone",'Survived',data=data, hue='Sex',col='Pclass')
plt.show()

In [15]:

# qcut은 숫자 피쳐를 우리가 원하는 개수로 나뉘어준다.

data['Fare_Range']= pd.qcut(data['Fare'],4)
data.groupby(['Fare_Range'])['Survived'].mean().to_frame().style.background_gradient(cmap='Blues')

Out[15]:

	Survived
Fare_Range
(-0.001, 7.91]	0.197309
(7.91, 14.454]	0.303571
(14.454, 31.0]	0.454955
(31.0, 512.329]	0.581081

In [16]:

# Fare_Range를 다시 Age_Band처럼 나눈다.

data['Fare_cat']=0
data.loc[data['Fare']<=7.91,'Fare_cat'] =0
data.loc[(data['Fare']>7.91) & (data['Fare']<=14.454), 'Fare_cat'] =1
data.loc[(data["Fare"]>14.454)&(data['Fare']<=31.0),'Fare_cat']=2
data.loc[data['Fare']>31.0, 'Fare_cat']=3

In [17]:

sns.factorplot("Fare_cat",'Survived', data=data, hue='Sex')
plt.show()

In [20]:

data['Sex'].replace(['male','female'],[0,1],inplace=True)
data['Embarked'].replace(['S','C','Q'],[0,1,2],inplace=True)
data["Initial"].replace(['Mr','Mrs','Miss','Master','Other'],[0,1,2,3,4],inplace=True)

In [22]:

data.drop(['Name','Age','Ticket','Fare','Cabin','Fare_Range','PassengerId'],axis=1,inplace=True)
sns.heatmap(data.corr(),annot=True,cmap='RdYlGn',linewidths=0.2,annot_kws={'size':20})
fig=plt.gcf()
fig.set_size_inches(18,15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

머신러닝 모델 사용¶

In [24]:

from sklearn.linear_model import LogisticRegression #로지스틱 회귀
from sklearn import svm # 서포트 벡터 머신
from sklearn.ensemble import RandomForestClassifier # 랜덤 포레스트
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.naive_bayes import GaussianNB # 베이즈안
from sklearn.tree import DecisionTreeClassifier # 의사결정트리
from sklearn.model_selection import train_test_split # 훈련데이터와 검증데이터 분리
from sklearn import metrics # 정확도 측정
from sklearn.metrics import confusion_matrix # 행렬

In [25]:

train,test= train_test_split(data,test_size=0.3, random_state=0, stratify=data['Survived'])
train_X = train[train.columns[1:]]
train_Y = train[train.columns[:1]]
test_X=test[test.columns[1:]]
test_Y=test[test.columns[:1]]

X= data[data.columns[1:]]
Y= data['Survived']

In [31]:

# SVM 모델 사용하여 검증까지
model=svm.SVC(kernel='rbf', C=1,gamma=0.1)
model.fit(train_X,train_Y)
prediction1=model.predict(test_X)
print('Accuracy for rbf SVM is ',metrics.accuracy_score(prediction1,test_Y))

Accuracy for rbf SVM is  0.835820895522388

In [32]:

# 선형 SVM 모델 사용
model=svm.SVC(kernel='linear', C=0.1,gamma=0.1)
model.fit(train_X,train_Y)
prediction2=model.predict(test_X)
print("Accuracy for linear SVM is ", metrics.accuracy_score(prediction2,test_Y))

Accuracy for linear SVM is  0.8134328358208955

In [34]:

# 로지스틱 회귀 모델 사용
model= LogisticRegression()
model.fit(train_X,train_Y)
prediction3=model.predict(test_X)
print("The accuracy of the Logistic Regression is ", metrics.accuracy_score(prediction3,test_Y))

The accuracy of the Logistic Regression is  0.8171641791044776

In [35]:

# 의사결정 트리 모델 사용
model=DecisionTreeClassifier()
model.fit(train_X,train_Y)
prediction4= model.predict(test_X)
print("The accuracy of the Decission Tree is", metrics.accuracy_score(prediction4,test_Y))

The accuracy of the Decission Tree is 0.8022388059701493

In [38]:

# K 최근접 이웃 모델
model=KNeighborsClassifier()
model.fit(train_X,train_Y)
prediction5= model.predict(test_X)
print("The accuracy of the KNN is", metrics.accuracy_score(prediction5,test_Y))

The accuracy of the KNN is 0.8134328358208955

In [39]:

# KNN 의 상수 K에 값에 따른 정확도 알아보기

a_index=list(range(1,11))
a=pd.Series()
x=[0,1,2,3,4,5,6,7,8,9,10]
for i in list(range(1,11)):
    model=KNeighborsClassifier(n_neighbors=i)
    model.fit(train_X,train_Y)
    prediction= model.predict(test_X)
    a= a.append(pd.Series(metrics.accuracy_score(prediction,test_Y)))
plt.plot(a_index,a)
plt.xticks(x)
fig=plt.gcf()
fig.set_size_inches(12,6)
plt.show()

In [41]:

# 나이브 베이즈

model=GaussianNB()
model.fit(train_X,train_Y)
prediction6= model.predict(test_X)
print("The accuracy of the NaiveBayes is", metrics.accuracy_score(prediction6,test_Y))

The accuracy of the NaiveBayes is 0.8134328358208955

In [46]:

# 랜덤 포레스트
model =RandomForestClassifier(n_estimators=100)
model.fit(train_X,train_Y)
prediction7=model.predict(test_X)
print("The accuracy of the Random Forests is", metrics.accuracy_score(prediction7,test_Y))

The accuracy of the Random Forests is 0.8134328358208955

In [49]:

# KFold로 교차검증

from sklearn.model_selection import KFold # kfold 교차검증
from sklearn.model_selection import cross_val_score # score 평가
from sklearn.model_selection import cross_val_predict # 예측

kfold= KFold(n_splits=10, random_state=22, shuffle=True) # train 데이터를 10개로 나눔.

xyz=[]
accuracy=[]
std=[]

classifiers=['Linear Svm','Radial Svm','Logistic Regression','KNN','Decision Tree','Naive Bayes','Random Forest']
models=[svm.SVC(kernel='linear'),svm.SVC(kernel='rbf'),LogisticRegression(),KNeighborsClassifier(n_neighbors=9),DecisionTreeClassifier(),GaussianNB(),RandomForestClassifier(n_estimators=100)]

for i in models:
    model=i
    cv_result=cross_val_score(model,X,Y, cv=kfold, scoring='accuracy')
    cv_result=cv_result
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)

new_models_dataframe2= pd.DataFrame({'CV Mean':xyz,'Std': std}, index=classifiers)
new_models_dataframe2

Out[49]:

	CV Mean	Std
Linear Svm	0.782360	0.054343
Radial Svm	0.828377	0.057096
Logistic Regression	0.796929	0.040794
KNN	0.811511	0.041301
Decision Tree	0.797978	0.051001
Naive Bayes	0.795843	0.054861
Random Forest	0.813720	0.041955

In [52]:

accuracy

Out[52]:

[array([0.7       , 0.76404494, 0.75280899, 0.80898876, 0.73033708,
        0.76404494, 0.85393258, 0.75280899, 0.80898876, 0.88764045]),
 array([0.74444444, 0.7752809 , 0.80898876, 0.85393258, 0.7752809 ,
        0.79775281, 0.92134831, 0.83146067, 0.85393258, 0.92134831]),
 array([0.73333333, 0.7752809 , 0.80898876, 0.78651685, 0.76404494,
        0.7752809 , 0.86516854, 0.7752809 , 0.82022472, 0.86516854]),
 array([0.75555556, 0.79775281, 0.76404494, 0.82022472, 0.78651685,
        0.79775281, 0.88764045, 0.83146067, 0.79775281, 0.87640449]),
 array([0.8       , 0.83146067, 0.74157303, 0.80898876, 0.79775281,
        0.73033708, 0.83146067, 0.78651685, 0.74157303, 0.91011236]),
 array([0.7       , 0.78651685, 0.75280899, 0.78651685, 0.76404494,
        0.75280899, 0.86516854, 0.82022472, 0.84269663, 0.88764045]),
 array([0.78888889, 0.84269663, 0.75280899, 0.78651685, 0.79775281,
        0.79775281, 0.87640449, 0.83146067, 0.7752809 , 0.88764045])]

In [56]:

box.T

Out[56]:

	Linear Svm	Radial Svm	Logistic Regression	KNN	Decision Tree	Naive Bayes	Random Forest
0	0.700000	0.744444	0.733333	0.755556	0.800000	0.700000	0.788889
1	0.764045	0.775281	0.775281	0.797753	0.831461	0.786517	0.842697
2	0.752809	0.808989	0.808989	0.764045	0.741573	0.752809	0.752809
3	0.808989	0.853933	0.786517	0.820225	0.808989	0.786517	0.786517
4	0.730337	0.775281	0.764045	0.786517	0.797753	0.764045	0.797753
5	0.764045	0.797753	0.775281	0.797753	0.730337	0.752809	0.797753
6	0.853933	0.921348	0.865169	0.887640	0.831461	0.865169	0.876404
7	0.752809	0.831461	0.775281	0.831461	0.786517	0.820225	0.831461
8	0.808989	0.853933	0.820225	0.797753	0.741573	0.842697	0.775281
9	0.887640	0.921348	0.865169	0.876404	0.910112	0.887640	0.887640

In [54]:

# boxplot으로 각각의 모델의 정확도 통계값을 확인
plt.subplots(figsize=(12,6))
box= pd.DataFrame(accuracy,index=[classifiers])
box.T.boxplot()
plt.show()

In [57]:

# 각각 cv로 출력된 정확도 평균을 표로 출력
new_models_dataframe2['CV Mean'].plot.barh(width=0.8)
plt.title("Average CV Mean Accuracy")
fig = plt.gcf()
fig.set_size_inches(8,5)
plt.show()

In [60]:

# confusion 행렬로 실제 결과와 모델들이 판단한 결과를 히트맵으로 출력

f,ax= plt.subplots(3,3,figsize=(12,10))
y_pred = cross_val_predict(svm.SVC(kernel='rbf'),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[0,0], annot=True,fmt='2.0f')
ax[0,0].set_title("Matrix for rbf-SVM")

y_pred=cross_val_predict(svm.SVC(kernel='linear'),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred), ax=ax[0,1], annot=True,fmt='2.0f')
ax[0,1].set_title("Matrix for Linear-SVM")

y_pred = cross_val_predict(KNeighborsClassifier(n_neighbors=9),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[0,2],annot=True,fmt='2.0f')
ax[0,2].set_title('Matrix for KNN')

y_pred= cross_val_predict(RandomForestClassifier(n_estimators=100),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[1,0],annot=True,fmt='2.0f')
ax[1,0].set_title("Matrix for Random-Forests")

y_pred= cross_val_predict(LogisticRegression(),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[1,1],annot=True,fmt='2.0f')
ax[1,1].set_title("Matrix for Logistic Regression")

y_pred=cross_val_predict(DecisionTreeClassifier(),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred), ax=ax[1,2],annot=True,fmt='2.0f')
ax[1,2].set_title('Matrix for Decision Tree')

y_pred=cross_val_predict(GaussianNB(),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[2,0],annot=True,fmt='2.0f')
ax[2,0].set_title("Matrix for Naive Bayes")
plt.subplots_adjust(hspace=0.2, wspace=0.2)
plt.show()

In [62]:

# 하이퍼 매개변수로 모델 튜닝후 검증하여 적절한 매개변수 출력
# SVM
from sklearn.model_selection import GridSearchCV
C =  [0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
gamma= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
kernel=['rbf','linear']
hyper={'kernel':kernel,'C':C, 'gamma' :gamma}
gd= GridSearchCV(estimator=svm.SVC(),param_grid=hyper, verbose=True)
gd.fit(X,Y)
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
0.8282593685267716
SVC(C=0.6, gamma=0.1)

In [63]:

# 랜덤 포레스트에서 적절한 매개변수
n_estimators=range(100,1000,100)
hyper={'n_estimators': n_estimators}
gd=GridSearchCV(estimator=RandomForestClassifier(random_state=0),param_grid=hyper, verbose=True)
gd.fit(X,Y)
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
0.82045069361622
RandomForestClassifier(n_estimators=400, random_state=0)

In [65]:

# 앙상블 voting

from sklearn.ensemble import VotingClassifier
ensemble_lin_rbf=VotingClassifier(estimators=[('KNN',KNeighborsClassifier(n_neighbors=9)),
                                             ("RBF",svm.SVC(probability=True,kernel='rbf',C=0.6,gamma=0.1)),
                                             ("RFor",RandomForestClassifier(n_estimators=400,random_state=0)),
                                             ("LR",LogisticRegression(C=0.05)),
                                             ("DT",DecisionTreeClassifier(random_state=0)),
                                             ("NB",GaussianNB()),
                                             ("svm",svm.SVC(kernel='linear',probability=True))],
                                             voting='soft').fit(train_X,train_Y)
print("The accuracy for ensembled model is: ",ensemble_lin_rbf.score(test_X,test_Y))

cross=cross_val_score(ensemble_lin_rbf,X,Y, cv=10, scoring="accuracy")
print("The cross validated score is", cross.mean())

The accuracy for ensembled model is:  0.8283582089552238
The cross validated score is 0.8249188514357053

In [67]:

# 앙상블 bagging KNN

from sklearn.ensemble import BaggingClassifier
model=BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=9),random_state=0,n_estimators=400)
model.fit(train_X,train_Y)
prediction=model.predict(test_X)
print("The accuracy for bagged KNN is:", metrics.accuracy_score(prediction,test_Y))
result=cross_val_score(model,X,Y,cv=10,scoring='accuracy')
print("The cross validated score for bagged KNN is:", result.mean())

The accuracy for bagged KNN is: 0.8507462686567164
The cross validated score for bagged KNN is: 0.8093008739076154

In [69]:

# 앙상블 bagging 의사결정트리
model=BaggingClassifier(base_estimator=DecisionTreeClassifier(), random_state=0, n_estimators=100)
model.fit(train_X,train_Y)
prediction=model.predict(test_X)
print("The accuracy for bagged Decision Tree is:", metrics.accuracy_score(prediction,test_Y))
result=cross_val_score(model,X,Y,cv=10,scoring='accuracy')
print("The cross validated score for bagged Decision Tree is: ",result.mean())

The accuracy for bagged Decision Tree is: 0.8283582089552238
The cross validated score for bagged Decision Tree is:  0.8148938826466917

In [70]:

# 부스트 AdaBoost
from sklearn.ensemble import AdaBoostClassifier
ada=AdaBoostClassifier(n_estimators=200,random_state=0,learning_rate=0.1)
result= cross_val_score(ada,X,Y,cv=10,scoring='accuracy')
print("the cross validated score for AdaBoost is:",result.mean())

the cross validated score for AdaBoost is: 0.8260424469413234

In [71]:

# 부스트 Gradient
from sklearn.ensemble import GradientBoostingClassifier
grad=GradientBoostingClassifier(n_estimators=400, random_state=0,learning_rate=0.1)
result=cross_val_score(grad,X,Y,cv=10,scoring='accuracy')
print("The cross validated score for Gradient Boosting is:", result.mean())

The cross validated score for Gradient Boosting is: 0.8104119850187266

In [73]:

!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.6.0-py3-none-win_amd64.whl (126.1 MB)
     ------------------------------------- 126.1/126.1 MB 15.2 MB/s eta 0:00:00
Requirement already satisfied: numpy in c:\users\easyd\miniforge3\envs\easydong2\lib\site-packages (from xgboost) (1.22.3)
Requirement already satisfied: scipy in c:\users\easyd\miniforge3\envs\easydong2\lib\site-packages (from xgboost) (1.8.0)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.0

In [74]:

# XGBoost
import xgboost as xg
xgboost=xg.XGBClassifier(n_estimators=900,learning_rate=0.1)
result=cross_val_score(xgboost,X,Y,cv=10,scoring='accuracy')
print("The cross validated score for SGBoost is: ", result.mean())

The cross validated score for SGBoost is:  0.8160174781523095

In [77]:

# 하이퍼 매개변수 튜닝으로 가장 정확도가 높은 AdBoost를 튜닝

n_estimators=list(range(100,1100,100))
learn_rate=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
hyper={'n_estimators':n_estimators, 'learning_rate':learn_rate}
gd=GridSearchCV(estimator=AdaBoostClassifier(),param_grid=hyper,verbose=True)
gd.fit(X,Y)
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
0.8293892411022534
AdaBoostClassifier(learning_rate=0.1, n_estimators=100)

In [79]:

# confusion 행렬로 adaboost확인

ada= AdaBoostClassifier(n_estimators=100,random_state=0,learning_rate=0.1)
result= cross_val_predict(ada,X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,result),cmap="winter",annot=True,fmt="2.0f")
plt.show()

In [84]:

# 피처 중요도 확인
f,ax= plt.subplots(2,2,figsize=(15,12))
model=RandomForestClassifier(n_estimators=400, random_state=0)
model.fit(X,Y)
pd.Series(model.feature_importances_,X.columns).sort_values(ascending=True).plot.barh(width=0.8, ax=ax[0,0])
ax[0,0].set_title("Feature Importance in Random Forests")

model=AdaBoostClassifier(n_estimators=100,learning_rate=0.1,random_state=0)
model.fit(X,Y)
pd.Series(model.feature_importances_,X.columns).sort_values(ascending=True).plot.barh(width=0.8,ax=ax[0,1], color='#ddff11')
ax[0,1].set_title('Feature Importance in AdaBoost')

model=GradientBoostingClassifier(n_estimators=500,learning_rate=0.1,random_state=0)
model.fit(X,Y)
pd.Series(model.feature_importances_,X.columns).sort_values(ascending=True).plot.barh(width=0.8,ax=ax[1,0], cmap="RdYlGn_r")
ax[1,0].set_title("Feature Importance in Gradient Boosting")

model=xg.XGBClassifier(n_estimators=900,learning_rate=0.1)
model.fit(X,Y)
pd.Series(model.feature_importances_,X.columns).sort_values(ascending=True).plot.barh(width=0.8,ax=ax[1,1],color='#FD0F00')
ax[1,1].set_title("Feature Importance in XgBoost")
plt.show()

In [85]:

from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))

Bigdata-AI, Kaggle

This post is licensed under CC BY 4.0 by the author.

머신러닝 모델 사용¶

Trending Tags