Post

[Machine Learning] DieTanic EDA 예측 -2-

In [4]:

1
data['Initial'].replace(['Mlle','Mme',"Ms",'Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'], inplace=True)

In [5]:

1
2
3
4
5
data.loc[(data.Age.isnull()) & (data.Initial=='Mr'),'Age'] =33
data.loc[(data.Age.isnull()) & (data.Initial =="Mrs"),'Age'] = 36
data.loc[(data.Age.isnull()) & (data.Initial == "Master"),"Age"] =5
data.loc[(data.Age.isnull()) & (data.Initial =='Miss'),"Age"] =22
data.loc[(data.Age.isnull()) & (data.Initial=='Other'),"Age"] =46

In [6]:

1
data['Embarked'].fillna('S', inplace=True)

In [7]:

1
2
3
4
5
# 데이터 피쳐 간 상관관계 계수 히트맵 출력
sns.heatmap(data.corr(),annot=True, cmap="RdYlGn", linewidths=0.2)
fig=plt.gcf()
fig.set_size_inches(10,8)
plt.show()

In [8]:

1
2
3
4
5
6
7
8
# 나이 분포를 임의로 나누어서 ordinal 값으로 새로운 피쳐를 만들었다. 사실 apply()메서드로 해도 되긴 하다.
data['Age_band'] = 0
data.loc[data['Age']<=16,'Age_band']=0
data.loc[(data['Age']>16)& (data['Age']<=32),'Age_band']=1
data.loc[(data['Age']>32)& (data['Age']<=48),'Age_band']=2
data.loc[(data['Age']>48)& (data['Age']<=60),'Age_band']=3
data.loc[data['Age']>64,'Age_band']=4
data.head(2)

Out[8]:

 PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedInitialAge_band
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNSMr1
1211Cumings, Mrs. John Bradley (Florence Briggs Th…female38.010PC 1759971.2833C85CMrs2

In [9]:

1
data['Age_band'].value_counts().to_frame().style.background_gradient(cmap='Blues')

Out[9]:

 Age_band
1382
2325
0115
358
411

In [10]:

1
2
sns.factorplot("Age_band",'Survived',data=data, col='Pclass')
plt.show()

In [12]:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# 가족의 규모를 피쳐로 새로 만들어서 대입하고 혼자 온 사람들의 피쳐도 만들었다.

data['Family_Size']=0
data['Family_Size'] = data['Parch'] + data['SibSp'] # 가족 규모
data['Alone']=0
data.loc[data['Family_Size']==0, 'Alone'] =1 # 혼자 온 사람 1 대입

f,ax= plt.subplots(1,2, figsize=(18,6))
sns.factorplot('Family_Size','Survived', data=data, ax=ax[0])
ax[0].set_title("Family_Size vs Survived")

sns.factorplot("Alone","Survived",data= data , ax =ax[1])
ax[1].set_title("Alone vs Survived")
plt.close(1)
plt.show()

In [13]:

1
2
sns.factorplot("Alone",'Survived',data=data, hue='Sex',col='Pclass')
plt.show()

In [15]:

1
2
3
4
# qcut은 숫자 피쳐를 우리가 원하는 개수로 나뉘어준다.

data['Fare_Range']= pd.qcut(data['Fare'],4)
data.groupby(['Fare_Range'])['Survived'].mean().to_frame().style.background_gradient(cmap='Blues')

Out[15]:

 Survived
Fare_Range 
(-0.001, 7.91]0.197309
(7.91, 14.454]0.303571
(14.454, 31.0]0.454955
(31.0, 512.329]0.581081

In [16]:

1
2
3
4
5
6
7
# Fare_Range를 다시 Age_Band처럼 나눈다.

data['Fare_cat']=0
data.loc[data['Fare']<=7.91,'Fare_cat'] =0
data.loc[(data['Fare']>7.91) & (data['Fare']<=14.454), 'Fare_cat'] =1
data.loc[(data["Fare"]>14.454)&(data['Fare']<=31.0),'Fare_cat']=2
data.loc[data['Fare']>31.0, 'Fare_cat']=3

In [17]:

1
2
sns.factorplot("Fare_cat",'Survived', data=data, hue='Sex')
plt.show()

In [20]:

1
2
3
data['Sex'].replace(['male','female'],[0,1],inplace=True)
data['Embarked'].replace(['S','C','Q'],[0,1,2],inplace=True)
data["Initial"].replace(['Mr','Mrs','Miss','Master','Other'],[0,1,2,3,4],inplace=True)

In [22]:

1
2
3
4
5
6
7
data.drop(['Name','Age','Ticket','Fare','Cabin','Fare_Range','PassengerId'],axis=1,inplace=True)
sns.heatmap(data.corr(),annot=True,cmap='RdYlGn',linewidths=0.2,annot_kws={'size':20})
fig=plt.gcf()
fig.set_size_inches(18,15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

머신러닝 모델 사용

In [24]:

1
2
3
4
5
6
7
8
9
from sklearn.linear_model import LogisticRegression #로지스틱 회귀
from sklearn import svm # 서포트 벡터 머신
from sklearn.ensemble import RandomForestClassifier # 랜덤 포레스트
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.naive_bayes import GaussianNB # 베이즈안
from sklearn.tree import DecisionTreeClassifier # 의사결정트리
from sklearn.model_selection import train_test_split # 훈련데이터와 검증데이터 분리
from sklearn import metrics # 정확도 측정
from sklearn.metrics import confusion_matrix # 행렬

In [25]:

1
2
3
4
5
6
7
8
train,test= train_test_split(data,test_size=0.3, random_state=0, stratify=data['Survived'])
train_X = train[train.columns[1:]]
train_Y = train[train.columns[:1]]
test_X=test[test.columns[1:]]
test_Y=test[test.columns[:1]]

X= data[data.columns[1:]]
Y= data['Survived']

In [31]:

1
2
3
4
5
# SVM 모델 사용하여 검증까지
model=svm.SVC(kernel='rbf', C=1,gamma=0.1)
model.fit(train_X,train_Y)
prediction1=model.predict(test_X)
print('Accuracy for rbf SVM is ',metrics.accuracy_score(prediction1,test_Y))
1
Accuracy for rbf SVM is  0.835820895522388

In [32]:

1
2
3
4
5
# 선형 SVM 모델 사용
model=svm.SVC(kernel='linear', C=0.1,gamma=0.1)
model.fit(train_X,train_Y)
prediction2=model.predict(test_X)
print("Accuracy for linear SVM is ", metrics.accuracy_score(prediction2,test_Y))
1
Accuracy for linear SVM is  0.8134328358208955

In [34]:

1
2
3
4
5
# 로지스틱 회귀 모델 사용
model= LogisticRegression()
model.fit(train_X,train_Y)
prediction3=model.predict(test_X)
print("The accuracy of the Logistic Regression is ", metrics.accuracy_score(prediction3,test_Y))
1
The accuracy of the Logistic Regression is  0.8171641791044776

In [35]:

1
2
3
4
5
# 의사결정 트리 모델 사용
model=DecisionTreeClassifier()
model.fit(train_X,train_Y)
prediction4= model.predict(test_X)
print("The accuracy of the Decission Tree is", metrics.accuracy_score(prediction4,test_Y))
1
The accuracy of the Decission Tree is 0.8022388059701493

In [38]:

1
2
3
4
5
# K 최근접 이웃 모델
model=KNeighborsClassifier()
model.fit(train_X,train_Y)
prediction5= model.predict(test_X)
print("The accuracy of the KNN is", metrics.accuracy_score(prediction5,test_Y))
1
The accuracy of the KNN is 0.8134328358208955

In [39]:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# KNN 의 상수 K에 값에 따른 정확도 알아보기

a_index=list(range(1,11))
a=pd.Series()
x=[0,1,2,3,4,5,6,7,8,9,10]
for i in list(range(1,11)):
    model=KNeighborsClassifier(n_neighbors=i)
    model.fit(train_X,train_Y)
    prediction= model.predict(test_X)
    a= a.append(pd.Series(metrics.accuracy_score(prediction,test_Y)))
plt.plot(a_index,a)
plt.xticks(x)
fig=plt.gcf()
fig.set_size_inches(12,6)
plt.show()

In [41]:

1
2
3
4
5
6
# 나이브 베이즈

model=GaussianNB()
model.fit(train_X,train_Y)
prediction6= model.predict(test_X)
print("The accuracy of the NaiveBayes is", metrics.accuracy_score(prediction6,test_Y))
1
The accuracy of the NaiveBayes is 0.8134328358208955

In [46]:

1
2
3
4
5
# 랜덤 포레스트
model =RandomForestClassifier(n_estimators=100)
model.fit(train_X,train_Y)
prediction7=model.predict(test_X)
print("The accuracy of the Random Forests is", metrics.accuracy_score(prediction7,test_Y))
1
The accuracy of the Random Forests is 0.8134328358208955

In [49]:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# KFold로 교차검증

from sklearn.model_selection import KFold # kfold 교차검증
from sklearn.model_selection import cross_val_score # score 평가
from sklearn.model_selection import cross_val_predict # 예측

kfold= KFold(n_splits=10, random_state=22, shuffle=True) # train 데이터를 10개로 나눔.

xyz=[]
accuracy=[]
std=[]

classifiers=['Linear Svm','Radial Svm','Logistic Regression','KNN','Decision Tree','Naive Bayes','Random Forest']
models=[svm.SVC(kernel='linear'),svm.SVC(kernel='rbf'),LogisticRegression(),KNeighborsClassifier(n_neighbors=9),DecisionTreeClassifier(),GaussianNB(),RandomForestClassifier(n_estimators=100)]

for i in models:
    model=i
    cv_result=cross_val_score(model,X,Y, cv=kfold, scoring='accuracy')
    cv_result=cv_result
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)

new_models_dataframe2= pd.DataFrame({'CV Mean':xyz,'Std': std}, index=classifiers)
new_models_dataframe2

Out[49]:

 CV MeanStd
Linear Svm0.7823600.054343
Radial Svm0.8283770.057096
Logistic Regression0.7969290.040794
KNN0.8115110.041301
Decision Tree0.7979780.051001
Naive Bayes0.7958430.054861
Random Forest0.8137200.041955

In [52]:

1
accuracy

Out[52]:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
[array([0.7       , 0.76404494, 0.75280899, 0.80898876, 0.73033708,
        0.76404494, 0.85393258, 0.75280899, 0.80898876, 0.88764045]),
 array([0.74444444, 0.7752809 , 0.80898876, 0.85393258, 0.7752809 ,
        0.79775281, 0.92134831, 0.83146067, 0.85393258, 0.92134831]),
 array([0.73333333, 0.7752809 , 0.80898876, 0.78651685, 0.76404494,
        0.7752809 , 0.86516854, 0.7752809 , 0.82022472, 0.86516854]),
 array([0.75555556, 0.79775281, 0.76404494, 0.82022472, 0.78651685,
        0.79775281, 0.88764045, 0.83146067, 0.79775281, 0.87640449]),
 array([0.8       , 0.83146067, 0.74157303, 0.80898876, 0.79775281,
        0.73033708, 0.83146067, 0.78651685, 0.74157303, 0.91011236]),
 array([0.7       , 0.78651685, 0.75280899, 0.78651685, 0.76404494,
        0.75280899, 0.86516854, 0.82022472, 0.84269663, 0.88764045]),
 array([0.78888889, 0.84269663, 0.75280899, 0.78651685, 0.79775281,
        0.79775281, 0.87640449, 0.83146067, 0.7752809 , 0.88764045])]

In [56]:

1
box.T

Out[56]:

 Linear SvmRadial SvmLogistic RegressionKNNDecision TreeNaive BayesRandom Forest
00.7000000.7444440.7333330.7555560.8000000.7000000.788889
10.7640450.7752810.7752810.7977530.8314610.7865170.842697
20.7528090.8089890.8089890.7640450.7415730.7528090.752809
30.8089890.8539330.7865170.8202250.8089890.7865170.786517
40.7303370.7752810.7640450.7865170.7977530.7640450.797753
50.7640450.7977530.7752810.7977530.7303370.7528090.797753
60.8539330.9213480.8651690.8876400.8314610.8651690.876404
70.7528090.8314610.7752810.8314610.7865170.8202250.831461
80.8089890.8539330.8202250.7977530.7415730.8426970.775281
90.8876400.9213480.8651690.8764040.9101120.8876400.887640

In [54]:

1
2
3
4
5
# boxplot으로 각각의 모델의 정확도 통계값을 확인
plt.subplots(figsize=(12,6))
box= pd.DataFrame(accuracy,index=[classifiers])
box.T.boxplot()
plt.show()

In [57]:

1
2
3
4
5
6
# 각각 cv로 출력된 정확도 평균을 표로 출력
new_models_dataframe2['CV Mean'].plot.barh(width=0.8)
plt.title("Average CV Mean Accuracy")
fig = plt.gcf()
fig.set_size_inches(8,5)
plt.show()

In [60]:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# confusion 행렬로 실제 결과와 모델들이 판단한 결과를 히트맵으로 출력

f,ax= plt.subplots(3,3,figsize=(12,10))
y_pred = cross_val_predict(svm.SVC(kernel='rbf'),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[0,0], annot=True,fmt='2.0f')
ax[0,0].set_title("Matrix for rbf-SVM")

y_pred=cross_val_predict(svm.SVC(kernel='linear'),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred), ax=ax[0,1], annot=True,fmt='2.0f')
ax[0,1].set_title("Matrix for Linear-SVM")

y_pred = cross_val_predict(KNeighborsClassifier(n_neighbors=9),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[0,2],annot=True,fmt='2.0f')
ax[0,2].set_title('Matrix for KNN')

y_pred= cross_val_predict(RandomForestClassifier(n_estimators=100),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[1,0],annot=True,fmt='2.0f')
ax[1,0].set_title("Matrix for Random-Forests")

y_pred= cross_val_predict(LogisticRegression(),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[1,1],annot=True,fmt='2.0f')
ax[1,1].set_title("Matrix for Logistic Regression")

y_pred=cross_val_predict(DecisionTreeClassifier(),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred), ax=ax[1,2],annot=True,fmt='2.0f')
ax[1,2].set_title('Matrix for Decision Tree')

y_pred=cross_val_predict(GaussianNB(),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[2,0],annot=True,fmt='2.0f')
ax[2,0].set_title("Matrix for Naive Bayes")
plt.subplots_adjust(hspace=0.2, wspace=0.2)
plt.show()

In [62]:

1
2
3
4
5
6
7
8
9
10
11
# 하이퍼 매개변수로 모델 튜닝후 검증하여 적절한 매개변수 출력
# SVM
from sklearn.model_selection import GridSearchCV
C =  [0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
gamma= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
kernel=['rbf','linear']
hyper={'kernel':kernel,'C':C, 'gamma' :gamma}
gd= GridSearchCV(estimator=svm.SVC(),param_grid=hyper, verbose=True)
gd.fit(X,Y)
print(gd.best_score_)
print(gd.best_estimator_)
1
2
3
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
0.8282593685267716
SVC(C=0.6, gamma=0.1)

In [63]:

1
2
3
4
5
6
7
# 랜덤 포레스트에서 적절한 매개변수
n_estimators=range(100,1000,100)
hyper={'n_estimators': n_estimators}
gd=GridSearchCV(estimator=RandomForestClassifier(random_state=0),param_grid=hyper, verbose=True)
gd.fit(X,Y)
print(gd.best_score_)
print(gd.best_estimator_)
1
2
3
Fitting 5 folds for each of 9 candidates, totalling 45 fits
0.82045069361622
RandomForestClassifier(n_estimators=400, random_state=0)

In [65]:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# 앙상블 voting

from sklearn.ensemble import VotingClassifier
ensemble_lin_rbf=VotingClassifier(estimators=[('KNN',KNeighborsClassifier(n_neighbors=9)),
                                             ("RBF",svm.SVC(probability=True,kernel='rbf',C=0.6,gamma=0.1)),
                                             ("RFor",RandomForestClassifier(n_estimators=400,random_state=0)),
                                             ("LR",LogisticRegression(C=0.05)),
                                             ("DT",DecisionTreeClassifier(random_state=0)),
                                             ("NB",GaussianNB()),
                                             ("svm",svm.SVC(kernel='linear',probability=True))],
                                             voting='soft').fit(train_X,train_Y)
print("The accuracy for ensembled model is: ",ensemble_lin_rbf.score(test_X,test_Y))

cross=cross_val_score(ensemble_lin_rbf,X,Y, cv=10, scoring="accuracy")
print("The cross validated score is", cross.mean())
1
2
The accuracy for ensembled model is:  0.8283582089552238
The cross validated score is 0.8249188514357053

In [67]:

1
2
3
4
5
6
7
8
9
# 앙상블 bagging KNN

from sklearn.ensemble import BaggingClassifier
model=BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=9),random_state=0,n_estimators=400)
model.fit(train_X,train_Y)
prediction=model.predict(test_X)
print("The accuracy for bagged KNN is:", metrics.accuracy_score(prediction,test_Y))
result=cross_val_score(model,X,Y,cv=10,scoring='accuracy')
print("The cross validated score for bagged KNN is:", result.mean())
1
2
The accuracy for bagged KNN is: 0.8507462686567164
The cross validated score for bagged KNN is: 0.8093008739076154

In [69]:

1
2
3
4
5
6
7
# 앙상블 bagging 의사결정트리
model=BaggingClassifier(base_estimator=DecisionTreeClassifier(), random_state=0, n_estimators=100)
model.fit(train_X,train_Y)
prediction=model.predict(test_X)
print("The accuracy for bagged Decision Tree is:", metrics.accuracy_score(prediction,test_Y))
result=cross_val_score(model,X,Y,cv=10,scoring='accuracy')
print("The cross validated score for bagged Decision Tree is: ",result.mean())
1
2
The accuracy for bagged Decision Tree is: 0.8283582089552238
The cross validated score for bagged Decision Tree is:  0.8148938826466917

In [70]:

1
2
3
4
5
# 부스트 AdaBoost
from sklearn.ensemble import AdaBoostClassifier
ada=AdaBoostClassifier(n_estimators=200,random_state=0,learning_rate=0.1)
result= cross_val_score(ada,X,Y,cv=10,scoring='accuracy')
print("the cross validated score for AdaBoost is:",result.mean())
1
the cross validated score for AdaBoost is: 0.8260424469413234

In [71]:

1
2
3
4
5
# 부스트 Gradient
from sklearn.ensemble import GradientBoostingClassifier
grad=GradientBoostingClassifier(n_estimators=400, random_state=0,learning_rate=0.1)
result=cross_val_score(grad,X,Y,cv=10,scoring='accuracy')
print("The cross validated score for Gradient Boosting is:", result.mean())
1
The cross validated score for Gradient Boosting is: 0.8104119850187266

In [73]:

1
!pip install xgboost
1
2
3
4
5
6
7
Collecting xgboost
  Downloading xgboost-1.6.0-py3-none-win_amd64.whl (126.1 MB)
     ------------------------------------- 126.1/126.1 MB 15.2 MB/s eta 0:00:00
Requirement already satisfied: numpy in c:\users\easyd\miniforge3\envs\easydong2\lib\site-packages (from xgboost) (1.22.3)
Requirement already satisfied: scipy in c:\users\easyd\miniforge3\envs\easydong2\lib\site-packages (from xgboost) (1.8.0)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.0

In [74]:

1
2
3
4
5
# XGBoost
import xgboost as xg
xgboost=xg.XGBClassifier(n_estimators=900,learning_rate=0.1)
result=cross_val_score(xgboost,X,Y,cv=10,scoring='accuracy')
print("The cross validated score for SGBoost is: ", result.mean())
1
The cross validated score for SGBoost is:  0.8160174781523095

In [77]:

1
2
3
4
5
6
7
8
9
# 하이퍼 매개변수 튜닝으로 가장 정확도가 높은 AdBoost를 튜닝

n_estimators=list(range(100,1100,100))
learn_rate=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
hyper={'n_estimators':n_estimators, 'learning_rate':learn_rate}
gd=GridSearchCV(estimator=AdaBoostClassifier(),param_grid=hyper,verbose=True)
gd.fit(X,Y)
print(gd.best_score_)
print(gd.best_estimator_)
1
2
3
Fitting 5 folds for each of 120 candidates, totalling 600 fits
0.8293892411022534
AdaBoostClassifier(learning_rate=0.1, n_estimators=100)

In [79]:

1
2
3
4
5
6
# confusion 행렬로 adaboost확인

ada= AdaBoostClassifier(n_estimators=100,random_state=0,learning_rate=0.1)
result= cross_val_predict(ada,X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,result),cmap="winter",annot=True,fmt="2.0f")
plt.show()

In [84]:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 피처 중요도 확인
f,ax= plt.subplots(2,2,figsize=(15,12))
model=RandomForestClassifier(n_estimators=400, random_state=0)
model.fit(X,Y)
pd.Series(model.feature_importances_,X.columns).sort_values(ascending=True).plot.barh(width=0.8, ax=ax[0,0])
ax[0,0].set_title("Feature Importance in Random Forests")

model=AdaBoostClassifier(n_estimators=100,learning_rate=0.1,random_state=0)
model.fit(X,Y)
pd.Series(model.feature_importances_,X.columns).sort_values(ascending=True).plot.barh(width=0.8,ax=ax[0,1], color='#ddff11')
ax[0,1].set_title('Feature Importance in AdaBoost')

model=GradientBoostingClassifier(n_estimators=500,learning_rate=0.1,random_state=0)
model.fit(X,Y)
pd.Series(model.feature_importances_,X.columns).sort_values(ascending=True).plot.barh(width=0.8,ax=ax[1,0], cmap="RdYlGn_r")
ax[1,0].set_title("Feature Importance in Gradient Boosting")

model=xg.XGBClassifier(n_estimators=900,learning_rate=0.1)
model.fit(X,Y)
pd.Series(model.feature_importances_,X.columns).sort_values(ascending=True).plot.barh(width=0.8,ax=ax[1,1],color='#FD0F00')
ax[1,1].set_title("Feature Importance in XgBoost")
plt.show()

In [85]:

1
2
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))
This post is licensed under CC BY 4.0 by the author.

Comments powered by Disqus.