[Machine Learning] DieTanic EDA 예측 -2-
In [4]:
1
data['Initial'].replace(['Mlle','Mme',"Ms",'Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'], inplace=True)
In [5]:
1
2
3
4
5
data.loc[(data.Age.isnull()) & (data.Initial=='Mr'),'Age'] =33
data.loc[(data.Age.isnull()) & (data.Initial =="Mrs"),'Age'] = 36
data.loc[(data.Age.isnull()) & (data.Initial == "Master"),"Age"] =5
data.loc[(data.Age.isnull()) & (data.Initial =='Miss'),"Age"] =22
data.loc[(data.Age.isnull()) & (data.Initial=='Other'),"Age"] =46
In [6]:
1
data['Embarked'].fillna('S', inplace=True)
In [7]:
1
2
3
4
5
# 데이터 피쳐 간 상관관계 계수 히트맵 출력
sns.heatmap(data.corr(),annot=True, cmap="RdYlGn", linewidths=0.2)
fig=plt.gcf()
fig.set_size_inches(10,8)
plt.show()
In [8]:
1
2
3
4
5
6
7
8
# 나이 분포를 임의로 나누어서 ordinal 값으로 새로운 피쳐를 만들었다. 사실 apply()메서드로 해도 되긴 하다.
data['Age_band'] = 0
data.loc[data['Age']<=16,'Age_band']=0
data.loc[(data['Age']>16)& (data['Age']<=32),'Age_band']=1
data.loc[(data['Age']>32)& (data['Age']<=48),'Age_band']=2
data.loc[(data['Age']>48)& (data['Age']<=60),'Age_band']=3
data.loc[data['Age']>64,'Age_band']=4
data.head(2)
Out[8]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Initial | Age_band | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | Mr | 1 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th… | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | Mrs | 2 |
In [9]:
1
data['Age_band'].value_counts().to_frame().style.background_gradient(cmap='Blues')
Out[9]:
Age_band | |
---|---|
1 | 382 |
2 | 325 |
0 | 115 |
3 | 58 |
4 | 11 |
In [10]:
1
2
sns.factorplot("Age_band",'Survived',data=data, col='Pclass')
plt.show()
In [12]:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# 가족의 규모를 피쳐로 새로 만들어서 대입하고 혼자 온 사람들의 피쳐도 만들었다.
data['Family_Size']=0
data['Family_Size'] = data['Parch'] + data['SibSp'] # 가족 규모
data['Alone']=0
data.loc[data['Family_Size']==0, 'Alone'] =1 # 혼자 온 사람 1 대입
f,ax= plt.subplots(1,2, figsize=(18,6))
sns.factorplot('Family_Size','Survived', data=data, ax=ax[0])
ax[0].set_title("Family_Size vs Survived")
sns.factorplot("Alone","Survived",data= data , ax =ax[1])
ax[1].set_title("Alone vs Survived")
plt.close(1)
plt.show()
In [13]:
1
2
sns.factorplot("Alone",'Survived',data=data, hue='Sex',col='Pclass')
plt.show()
In [15]:
1
2
3
4
# qcut은 숫자 피쳐를 우리가 원하는 개수로 나뉘어준다.
data['Fare_Range']= pd.qcut(data['Fare'],4)
data.groupby(['Fare_Range'])['Survived'].mean().to_frame().style.background_gradient(cmap='Blues')
Out[15]:
Survived | |
---|---|
Fare_Range | |
(-0.001, 7.91] | 0.197309 |
(7.91, 14.454] | 0.303571 |
(14.454, 31.0] | 0.454955 |
(31.0, 512.329] | 0.581081 |
In [16]:
1
2
3
4
5
6
7
# Fare_Range를 다시 Age_Band처럼 나눈다.
data['Fare_cat']=0
data.loc[data['Fare']<=7.91,'Fare_cat'] =0
data.loc[(data['Fare']>7.91) & (data['Fare']<=14.454), 'Fare_cat'] =1
data.loc[(data["Fare"]>14.454)&(data['Fare']<=31.0),'Fare_cat']=2
data.loc[data['Fare']>31.0, 'Fare_cat']=3
In [17]:
1
2
sns.factorplot("Fare_cat",'Survived', data=data, hue='Sex')
plt.show()
In [20]:
1
2
3
data['Sex'].replace(['male','female'],[0,1],inplace=True)
data['Embarked'].replace(['S','C','Q'],[0,1,2],inplace=True)
data["Initial"].replace(['Mr','Mrs','Miss','Master','Other'],[0,1,2,3,4],inplace=True)
In [22]:
1
2
3
4
5
6
7
data.drop(['Name','Age','Ticket','Fare','Cabin','Fare_Range','PassengerId'],axis=1,inplace=True)
sns.heatmap(data.corr(),annot=True,cmap='RdYlGn',linewidths=0.2,annot_kws={'size':20})
fig=plt.gcf()
fig.set_size_inches(18,15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
머신러닝 모델 사용¶
In [24]:
1
2
3
4
5
6
7
8
9
from sklearn.linear_model import LogisticRegression #로지스틱 회귀
from sklearn import svm # 서포트 벡터 머신
from sklearn.ensemble import RandomForestClassifier # 랜덤 포레스트
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.naive_bayes import GaussianNB # 베이즈안
from sklearn.tree import DecisionTreeClassifier # 의사결정트리
from sklearn.model_selection import train_test_split # 훈련데이터와 검증데이터 분리
from sklearn import metrics # 정확도 측정
from sklearn.metrics import confusion_matrix # 행렬
In [25]:
1
2
3
4
5
6
7
8
train,test= train_test_split(data,test_size=0.3, random_state=0, stratify=data['Survived'])
train_X = train[train.columns[1:]]
train_Y = train[train.columns[:1]]
test_X=test[test.columns[1:]]
test_Y=test[test.columns[:1]]
X= data[data.columns[1:]]
Y= data['Survived']
In [31]:
1
2
3
4
5
# SVM 모델 사용하여 검증까지
model=svm.SVC(kernel='rbf', C=1,gamma=0.1)
model.fit(train_X,train_Y)
prediction1=model.predict(test_X)
print('Accuracy for rbf SVM is ',metrics.accuracy_score(prediction1,test_Y))
1
Accuracy for rbf SVM is 0.835820895522388
In [32]:
1
2
3
4
5
# 선형 SVM 모델 사용
model=svm.SVC(kernel='linear', C=0.1,gamma=0.1)
model.fit(train_X,train_Y)
prediction2=model.predict(test_X)
print("Accuracy for linear SVM is ", metrics.accuracy_score(prediction2,test_Y))
1
Accuracy for linear SVM is 0.8134328358208955
In [34]:
1
2
3
4
5
# 로지스틱 회귀 모델 사용
model= LogisticRegression()
model.fit(train_X,train_Y)
prediction3=model.predict(test_X)
print("The accuracy of the Logistic Regression is ", metrics.accuracy_score(prediction3,test_Y))
1
The accuracy of the Logistic Regression is 0.8171641791044776
In [35]:
1
2
3
4
5
# 의사결정 트리 모델 사용
model=DecisionTreeClassifier()
model.fit(train_X,train_Y)
prediction4= model.predict(test_X)
print("The accuracy of the Decission Tree is", metrics.accuracy_score(prediction4,test_Y))
1
The accuracy of the Decission Tree is 0.8022388059701493
In [38]:
1
2
3
4
5
# K 최근접 이웃 모델
model=KNeighborsClassifier()
model.fit(train_X,train_Y)
prediction5= model.predict(test_X)
print("The accuracy of the KNN is", metrics.accuracy_score(prediction5,test_Y))
1
The accuracy of the KNN is 0.8134328358208955
In [39]:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# KNN 의 상수 K에 값에 따른 정확도 알아보기
a_index=list(range(1,11))
a=pd.Series()
x=[0,1,2,3,4,5,6,7,8,9,10]
for i in list(range(1,11)):
model=KNeighborsClassifier(n_neighbors=i)
model.fit(train_X,train_Y)
prediction= model.predict(test_X)
a= a.append(pd.Series(metrics.accuracy_score(prediction,test_Y)))
plt.plot(a_index,a)
plt.xticks(x)
fig=plt.gcf()
fig.set_size_inches(12,6)
plt.show()
In [41]:
1
2
3
4
5
6
# 나이브 베이즈
model=GaussianNB()
model.fit(train_X,train_Y)
prediction6= model.predict(test_X)
print("The accuracy of the NaiveBayes is", metrics.accuracy_score(prediction6,test_Y))
1
The accuracy of the NaiveBayes is 0.8134328358208955
In [46]:
1
2
3
4
5
# 랜덤 포레스트
model =RandomForestClassifier(n_estimators=100)
model.fit(train_X,train_Y)
prediction7=model.predict(test_X)
print("The accuracy of the Random Forests is", metrics.accuracy_score(prediction7,test_Y))
1
The accuracy of the Random Forests is 0.8134328358208955
In [49]:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# KFold로 교차검증
from sklearn.model_selection import KFold # kfold 교차검증
from sklearn.model_selection import cross_val_score # score 평가
from sklearn.model_selection import cross_val_predict # 예측
kfold= KFold(n_splits=10, random_state=22, shuffle=True) # train 데이터를 10개로 나눔.
xyz=[]
accuracy=[]
std=[]
classifiers=['Linear Svm','Radial Svm','Logistic Regression','KNN','Decision Tree','Naive Bayes','Random Forest']
models=[svm.SVC(kernel='linear'),svm.SVC(kernel='rbf'),LogisticRegression(),KNeighborsClassifier(n_neighbors=9),DecisionTreeClassifier(),GaussianNB(),RandomForestClassifier(n_estimators=100)]
for i in models:
model=i
cv_result=cross_val_score(model,X,Y, cv=kfold, scoring='accuracy')
cv_result=cv_result
xyz.append(cv_result.mean())
std.append(cv_result.std())
accuracy.append(cv_result)
new_models_dataframe2= pd.DataFrame({'CV Mean':xyz,'Std': std}, index=classifiers)
new_models_dataframe2
Out[49]:
CV Mean | Std | |
---|---|---|
Linear Svm | 0.782360 | 0.054343 |
Radial Svm | 0.828377 | 0.057096 |
Logistic Regression | 0.796929 | 0.040794 |
KNN | 0.811511 | 0.041301 |
Decision Tree | 0.797978 | 0.051001 |
Naive Bayes | 0.795843 | 0.054861 |
Random Forest | 0.813720 | 0.041955 |
In [52]:
1
accuracy
Out[52]:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
[array([0.7 , 0.76404494, 0.75280899, 0.80898876, 0.73033708,
0.76404494, 0.85393258, 0.75280899, 0.80898876, 0.88764045]),
array([0.74444444, 0.7752809 , 0.80898876, 0.85393258, 0.7752809 ,
0.79775281, 0.92134831, 0.83146067, 0.85393258, 0.92134831]),
array([0.73333333, 0.7752809 , 0.80898876, 0.78651685, 0.76404494,
0.7752809 , 0.86516854, 0.7752809 , 0.82022472, 0.86516854]),
array([0.75555556, 0.79775281, 0.76404494, 0.82022472, 0.78651685,
0.79775281, 0.88764045, 0.83146067, 0.79775281, 0.87640449]),
array([0.8 , 0.83146067, 0.74157303, 0.80898876, 0.79775281,
0.73033708, 0.83146067, 0.78651685, 0.74157303, 0.91011236]),
array([0.7 , 0.78651685, 0.75280899, 0.78651685, 0.76404494,
0.75280899, 0.86516854, 0.82022472, 0.84269663, 0.88764045]),
array([0.78888889, 0.84269663, 0.75280899, 0.78651685, 0.79775281,
0.79775281, 0.87640449, 0.83146067, 0.7752809 , 0.88764045])]
In [56]:
1
box.T
Out[56]:
Linear Svm | Radial Svm | Logistic Regression | KNN | Decision Tree | Naive Bayes | Random Forest | |
---|---|---|---|---|---|---|---|
0 | 0.700000 | 0.744444 | 0.733333 | 0.755556 | 0.800000 | 0.700000 | 0.788889 |
1 | 0.764045 | 0.775281 | 0.775281 | 0.797753 | 0.831461 | 0.786517 | 0.842697 |
2 | 0.752809 | 0.808989 | 0.808989 | 0.764045 | 0.741573 | 0.752809 | 0.752809 |
3 | 0.808989 | 0.853933 | 0.786517 | 0.820225 | 0.808989 | 0.786517 | 0.786517 |
4 | 0.730337 | 0.775281 | 0.764045 | 0.786517 | 0.797753 | 0.764045 | 0.797753 |
5 | 0.764045 | 0.797753 | 0.775281 | 0.797753 | 0.730337 | 0.752809 | 0.797753 |
6 | 0.853933 | 0.921348 | 0.865169 | 0.887640 | 0.831461 | 0.865169 | 0.876404 |
7 | 0.752809 | 0.831461 | 0.775281 | 0.831461 | 0.786517 | 0.820225 | 0.831461 |
8 | 0.808989 | 0.853933 | 0.820225 | 0.797753 | 0.741573 | 0.842697 | 0.775281 |
9 | 0.887640 | 0.921348 | 0.865169 | 0.876404 | 0.910112 | 0.887640 | 0.887640 |
In [54]:
1
2
3
4
5
# boxplot으로 각각의 모델의 정확도 통계값을 확인
plt.subplots(figsize=(12,6))
box= pd.DataFrame(accuracy,index=[classifiers])
box.T.boxplot()
plt.show()
In [57]:
1
2
3
4
5
6
# 각각 cv로 출력된 정확도 평균을 표로 출력
new_models_dataframe2['CV Mean'].plot.barh(width=0.8)
plt.title("Average CV Mean Accuracy")
fig = plt.gcf()
fig.set_size_inches(8,5)
plt.show()
In [60]:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# confusion 행렬로 실제 결과와 모델들이 판단한 결과를 히트맵으로 출력
f,ax= plt.subplots(3,3,figsize=(12,10))
y_pred = cross_val_predict(svm.SVC(kernel='rbf'),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[0,0], annot=True,fmt='2.0f')
ax[0,0].set_title("Matrix for rbf-SVM")
y_pred=cross_val_predict(svm.SVC(kernel='linear'),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred), ax=ax[0,1], annot=True,fmt='2.0f')
ax[0,1].set_title("Matrix for Linear-SVM")
y_pred = cross_val_predict(KNeighborsClassifier(n_neighbors=9),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[0,2],annot=True,fmt='2.0f')
ax[0,2].set_title('Matrix for KNN')
y_pred= cross_val_predict(RandomForestClassifier(n_estimators=100),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[1,0],annot=True,fmt='2.0f')
ax[1,0].set_title("Matrix for Random-Forests")
y_pred= cross_val_predict(LogisticRegression(),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[1,1],annot=True,fmt='2.0f')
ax[1,1].set_title("Matrix for Logistic Regression")
y_pred=cross_val_predict(DecisionTreeClassifier(),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred), ax=ax[1,2],annot=True,fmt='2.0f')
ax[1,2].set_title('Matrix for Decision Tree')
y_pred=cross_val_predict(GaussianNB(),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[2,0],annot=True,fmt='2.0f')
ax[2,0].set_title("Matrix for Naive Bayes")
plt.subplots_adjust(hspace=0.2, wspace=0.2)
plt.show()
In [62]:
1
2
3
4
5
6
7
8
9
10
11
# 하이퍼 매개변수로 모델 튜닝후 검증하여 적절한 매개변수 출력
# SVM
from sklearn.model_selection import GridSearchCV
C = [0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
gamma= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
kernel=['rbf','linear']
hyper={'kernel':kernel,'C':C, 'gamma' :gamma}
gd= GridSearchCV(estimator=svm.SVC(),param_grid=hyper, verbose=True)
gd.fit(X,Y)
print(gd.best_score_)
print(gd.best_estimator_)
1
2
3
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
0.8282593685267716
SVC(C=0.6, gamma=0.1)
In [63]:
1
2
3
4
5
6
7
# 랜덤 포레스트에서 적절한 매개변수
n_estimators=range(100,1000,100)
hyper={'n_estimators': n_estimators}
gd=GridSearchCV(estimator=RandomForestClassifier(random_state=0),param_grid=hyper, verbose=True)
gd.fit(X,Y)
print(gd.best_score_)
print(gd.best_estimator_)
1
2
3
Fitting 5 folds for each of 9 candidates, totalling 45 fits
0.82045069361622
RandomForestClassifier(n_estimators=400, random_state=0)
In [65]:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# 앙상블 voting
from sklearn.ensemble import VotingClassifier
ensemble_lin_rbf=VotingClassifier(estimators=[('KNN',KNeighborsClassifier(n_neighbors=9)),
("RBF",svm.SVC(probability=True,kernel='rbf',C=0.6,gamma=0.1)),
("RFor",RandomForestClassifier(n_estimators=400,random_state=0)),
("LR",LogisticRegression(C=0.05)),
("DT",DecisionTreeClassifier(random_state=0)),
("NB",GaussianNB()),
("svm",svm.SVC(kernel='linear',probability=True))],
voting='soft').fit(train_X,train_Y)
print("The accuracy for ensembled model is: ",ensemble_lin_rbf.score(test_X,test_Y))
cross=cross_val_score(ensemble_lin_rbf,X,Y, cv=10, scoring="accuracy")
print("The cross validated score is", cross.mean())
1
2
The accuracy for ensembled model is: 0.8283582089552238
The cross validated score is 0.8249188514357053
In [67]:
1
2
3
4
5
6
7
8
9
# 앙상블 bagging KNN
from sklearn.ensemble import BaggingClassifier
model=BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=9),random_state=0,n_estimators=400)
model.fit(train_X,train_Y)
prediction=model.predict(test_X)
print("The accuracy for bagged KNN is:", metrics.accuracy_score(prediction,test_Y))
result=cross_val_score(model,X,Y,cv=10,scoring='accuracy')
print("The cross validated score for bagged KNN is:", result.mean())
1
2
The accuracy for bagged KNN is: 0.8507462686567164
The cross validated score for bagged KNN is: 0.8093008739076154
In [69]:
1
2
3
4
5
6
7
# 앙상블 bagging 의사결정트리
model=BaggingClassifier(base_estimator=DecisionTreeClassifier(), random_state=0, n_estimators=100)
model.fit(train_X,train_Y)
prediction=model.predict(test_X)
print("The accuracy for bagged Decision Tree is:", metrics.accuracy_score(prediction,test_Y))
result=cross_val_score(model,X,Y,cv=10,scoring='accuracy')
print("The cross validated score for bagged Decision Tree is: ",result.mean())
1
2
The accuracy for bagged Decision Tree is: 0.8283582089552238
The cross validated score for bagged Decision Tree is: 0.8148938826466917
In [70]:
1
2
3
4
5
# 부스트 AdaBoost
from sklearn.ensemble import AdaBoostClassifier
ada=AdaBoostClassifier(n_estimators=200,random_state=0,learning_rate=0.1)
result= cross_val_score(ada,X,Y,cv=10,scoring='accuracy')
print("the cross validated score for AdaBoost is:",result.mean())
1
the cross validated score for AdaBoost is: 0.8260424469413234
In [71]:
1
2
3
4
5
# 부스트 Gradient
from sklearn.ensemble import GradientBoostingClassifier
grad=GradientBoostingClassifier(n_estimators=400, random_state=0,learning_rate=0.1)
result=cross_val_score(grad,X,Y,cv=10,scoring='accuracy')
print("The cross validated score for Gradient Boosting is:", result.mean())
1
The cross validated score for Gradient Boosting is: 0.8104119850187266
In [73]:
1
!pip install xgboost
1
2
3
4
5
6
7
Collecting xgboost
Downloading xgboost-1.6.0-py3-none-win_amd64.whl (126.1 MB)
------------------------------------- 126.1/126.1 MB 15.2 MB/s eta 0:00:00
Requirement already satisfied: numpy in c:\users\easyd\miniforge3\envs\easydong2\lib\site-packages (from xgboost) (1.22.3)
Requirement already satisfied: scipy in c:\users\easyd\miniforge3\envs\easydong2\lib\site-packages (from xgboost) (1.8.0)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.0
In [74]:
1
2
3
4
5
# XGBoost
import xgboost as xg
xgboost=xg.XGBClassifier(n_estimators=900,learning_rate=0.1)
result=cross_val_score(xgboost,X,Y,cv=10,scoring='accuracy')
print("The cross validated score for SGBoost is: ", result.mean())
1
The cross validated score for SGBoost is: 0.8160174781523095
In [77]:
1
2
3
4
5
6
7
8
9
# 하이퍼 매개변수 튜닝으로 가장 정확도가 높은 AdBoost를 튜닝
n_estimators=list(range(100,1100,100))
learn_rate=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
hyper={'n_estimators':n_estimators, 'learning_rate':learn_rate}
gd=GridSearchCV(estimator=AdaBoostClassifier(),param_grid=hyper,verbose=True)
gd.fit(X,Y)
print(gd.best_score_)
print(gd.best_estimator_)
1
2
3
Fitting 5 folds for each of 120 candidates, totalling 600 fits
0.8293892411022534
AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
In [79]:
1
2
3
4
5
6
# confusion 행렬로 adaboost확인
ada= AdaBoostClassifier(n_estimators=100,random_state=0,learning_rate=0.1)
result= cross_val_predict(ada,X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,result),cmap="winter",annot=True,fmt="2.0f")
plt.show()
In [84]:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 피처 중요도 확인
f,ax= plt.subplots(2,2,figsize=(15,12))
model=RandomForestClassifier(n_estimators=400, random_state=0)
model.fit(X,Y)
pd.Series(model.feature_importances_,X.columns).sort_values(ascending=True).plot.barh(width=0.8, ax=ax[0,0])
ax[0,0].set_title("Feature Importance in Random Forests")
model=AdaBoostClassifier(n_estimators=100,learning_rate=0.1,random_state=0)
model.fit(X,Y)
pd.Series(model.feature_importances_,X.columns).sort_values(ascending=True).plot.barh(width=0.8,ax=ax[0,1], color='#ddff11')
ax[0,1].set_title('Feature Importance in AdaBoost')
model=GradientBoostingClassifier(n_estimators=500,learning_rate=0.1,random_state=0)
model.fit(X,Y)
pd.Series(model.feature_importances_,X.columns).sort_values(ascending=True).plot.barh(width=0.8,ax=ax[1,0], cmap="RdYlGn_r")
ax[1,0].set_title("Feature Importance in Gradient Boosting")
model=xg.XGBClassifier(n_estimators=900,learning_rate=0.1)
model.fit(X,Y)
pd.Series(model.feature_importances_,X.columns).sort_values(ascending=True).plot.barh(width=0.8,ax=ax[1,1],color='#FD0F00')
ax[1,1].set_title("Feature Importance in XgBoost")
plt.show()
In [85]:
1
2
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))
This post is licensed under CC BY 4.0 by the author.
Comments powered by Disqus.