[Machine Learning] Titanic Ensemble Modeling

In [4]:

test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB

In [5]:

# Outlier detection
# Take a DataFrame and return the indices of observations that the Tukey (IQR)
# method flags as outliers in more than n of the given features
def detect_outliers(df, n, features):
    outlier_indices = []

    for col in features:
        # 1st quartile
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile
        Q3 = np.percentile(df[col], 75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1

        # Tukey fences: 1.5 * IQR beyond the quartiles
        # (note: np.percentile returns NaN when the column contains NaN, as Age does,
        # so such columns effectively contribute no outliers here)
        outlier_step = 1.5 * IQR

        # Indices of the outlier observations for this feature
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index

        # Collect the indices found
        outlier_indices.extend(outlier_list_col)

    # Keep only observations flagged as outliers in more than n features
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list(k for k, v in outlier_indices.items() if v > n)

    return multiple_outliers

# Check for outliers in the Age, SibSp, Parch, and Fare features
Outliers_to_drop = detect_outliers(train, 2, ["Age", "SibSp", "Parch", "Fare"])
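
As a quick illustration of the Tukey fences, here is a minimal sketch on made-up values (hypothetical numbers, not Titanic data):

# On [1..8, 100]: Q1 = 3.0, Q3 = 7.0, so the step is 1.5 * 4 = 6.0
vals = np.array([1, 2, 3, 4, 5, 6, 7, 8, 100])
Q1, Q3 = np.percentile(vals, 25), np.percentile(vals, 75)
step = 1.5 * (Q3 - Q1)
print(vals[(vals < Q1 - step) | (vals > Q3 + step)])   # -> [100]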

In [6]:

Outliers_to_drop

Out[6]:

[27, 88, 159, 180, 201, 324, 341, 792, 846, 863]

In [7]:

train.loc[Outliers_to_drop]

Out[7]:

|     | PassengerId | Survived | Pclass | Name                              | Sex    | Age  | SibSp | Parch | Ticket   | Fare   | Cabin       | Embarked |
|-----|-------------|----------|--------|-----------------------------------|--------|------|-------|-------|----------|--------|-------------|----------|
| 27  | 28          | 0        | 1      | Fortune, Mr. Charles Alexander    | male   | 19.0 | 3     | 2     | 19950    | 263.00 | C23 C25 C27 | S        |
| 88  | 89          | 1        | 1      | Fortune, Miss. Mabel Helen        | female | 23.0 | 3     | 2     | 19950    | 263.00 | C23 C25 C27 | S        |
| 159 | 160         | 0        | 3      | Sage, Master. Thomas Henry        | male   | NaN  | 8     | 2     | CA. 2343 | 69.55  | NaN         | S        |
| 180 | 181         | 0        | 3      | Sage, Miss. Constance Gladys      | female | NaN  | 8     | 2     | CA. 2343 | 69.55  | NaN         | S        |
| 201 | 202         | 0        | 3      | Sage, Mr. Frederick               | male   | NaN  | 8     | 2     | CA. 2343 | 69.55  | NaN         | S        |
| 324 | 325         | 0        | 3      | Sage, Mr. George John Jr          | male   | NaN  | 8     | 2     | CA. 2343 | 69.55  | NaN         | S        |
| 341 | 342         | 1        | 1      | Fortune, Miss. Alice Elizabeth    | female | 24.0 | 3     | 2     | 19950    | 263.00 | C23 C25 C27 | S        |
| 792 | 793         | 0        | 3      | Sage, Miss. Stella Anna           | female | NaN  | 8     | 2     | CA. 2343 | 69.55  | NaN         | S        |
| 846 | 847         | 0        | 3      | Sage, Mr. Douglas Bullen          | male   | NaN  | 8     | 2     | CA. 2343 | 69.55  | NaN         | S        |
| 863 | 864         | 0        | 3      | Sage, Miss. Dorothy Edith "Dolly" | female | NaN  | 8     | 2     | CA. 2343 | 69.55  | NaN         | S        |

In [8]:

# Drop the detected outlier rows
train = train.drop(Outliers_to_drop, axis=0).reset_index(drop=True)

In [9]:

# Concatenate train and test so feature engineering is applied to both consistently
train_len = len(train)
dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)

In [10]:

# Normalize missing values (None) to NaN
dataset = dataset.fillna(np.nan)

# Check missing-value counts

dataset.isnull().sum()

Out[10]:

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             256
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1007
Embarked          2
dtype: int64

(The 418 missing Survived values are simply the test rows, which carry no label.)

In [11]:

train.head()

Out[11]:

|   | PassengerId | Survived | Pclass | Name                                             | Sex    | Age  | SibSp | Parch | Ticket           | Fare    | Cabin | Embarked |
|---|-------------|----------|--------|--------------------------------------------------|--------|------|-------|-------|------------------|---------|-------|----------|
| 0 | 1           | 0        | 3      | Braund, Mr. Owen Harris                          | male   | 22.0 | 1     | 0     | A/5 21171        | 7.2500  | NaN   | S        |
| 1 | 2           | 1        | 1      | Cumings, Mrs. John Bradley (Florence Briggs Th…  | female | 38.0 | 1     | 0     | PC 17599         | 71.2833 | C85   | C        |
| 2 | 3           | 1        | 3      | Heikkinen, Miss. Laina                           | female | 26.0 | 0     | 0     | STON/O2. 3101282 | 7.9250  | NaN   | S        |
| 3 | 4           | 1        | 1      | Futrelle, Mrs. Jacques Heath (Lily May Peel)     | female | 35.0 | 1     | 0     | 113803           | 53.1000 | C123  | S        |
| 4 | 5           | 0        | 3      | Allen, Mr. William Henry                         | male   | 35.0 | 0     | 0     | 373450           | 8.0500  | NaN   | S        |

In [12]:

train.dtypes

Out[12]:

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [13]:

train.describe()

Out[13]:

|       | PassengerId | Survived   | Pclass     | Age        | SibSp      | Parch      | Fare       |
|-------|-------------|------------|------------|------------|------------|------------|------------|
| count | 881.000000  | 881.000000 | 881.000000 | 711.000000 | 881.000000 | 881.000000 | 881.000000 |
| mean  | 446.713961  | 0.385925   | 2.307605   | 29.731603  | 0.455165   | 0.363224   | 31.121566  |
| std   | 256.617021  | 0.487090   | 0.835055   | 14.547835  | 0.871571   | 0.791839   | 47.996249  |
| min   | 1.000000    | 0.000000   | 1.000000   | 0.420000   | 0.000000   | 0.000000   | 0.000000   |
| 25%   | 226.000000  | 0.000000   | 2.000000   | 20.250000  | 0.000000   | 0.000000   | 7.895800   |
| 50%   | 448.000000  | 0.000000   | 3.000000   | 28.000000  | 0.000000   | 0.000000   | 14.454200  |
| 75%   | 668.000000  | 1.000000   | 3.000000   | 38.000000  | 1.000000   | 0.000000   | 30.500000  |
| max   | 891.000000  | 1.000000   | 3.000000   | 80.000000  | 5.000000   | 6.000000   | 512.329200 |

In [14]:

g= sns.heatmap(train[['Survived','SibSp','Parch','Age','Fare']].corr(),annot=True, fmt=".2f",cmap='coolwarm')

In [15]:

# Explore the SibSp feature
g = sns.factorplot(x='SibSp', y="Survived", data=train, kind='bar', size=6, palette='muted')
g.despine(left=True)  # remove the left spine
g = g.set_ylabels("survival probability")
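
Note: in newer seaborn releases factorplot was renamed catplot and its size argument became height, so on a current environment (an assumption about the reader's setup, not the original code) the equivalent call would look roughly like this:

# Equivalent call on seaborn >= 0.9
g = sns.catplot(x='SibSp', y='Survived', data=train, kind='bar', height=6, palette='muted')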

In [16]:

# Explore the Parch feature

g = sns.factorplot(x='Parch', y='Survived',data=train,kind='bar',size=6,palette='muted')
g.despine(left=True)
g = g.set_ylabels('survival probability')

In [17]:

# Explore the Age feature
g = sns.FacetGrid(train,col='Survived')
g = g.map(sns.distplot,'Age')

In [18]:

# Age distribution by survival
g = sns.kdeplot(train['Age'][(train['Survived']==0)&(train['Age'].notnull())], color='Red', shade=True)
g = sns.kdeplot(train['Age'][(train['Survived']==1)&(train['Age'].notnull())], ax=g, color="Blue",shade=True)
g.set_xlabel("Age")
g.set_ylabel("Frequency")
g = g.legend(['Not Survived','Survived'])

In [11]:

# Check missing Fare values
dataset['Fare'].isnull().sum()

Out[11]:

1

In [20]:

# Fare distribution and its skewness
g = sns.distplot(dataset['Fare'], color='m', label='Skewness: %.2f'%(dataset['Fare'].skew()))
g= g.legend(loc='best')

In [12]:

# The Fare distribution is heavily right-skewed, so take the log of the positive fares
dataset['Fare'] = dataset['Fare'].map(lambda i: np.log(i) if i > 0 else 0)
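
A common alternative that handles the zero fares without the explicit branch is np.log1p, which computes log(1 + x); this is a sketch for comparison, not the transform used here, and it yields slightly different values:

# log1p maps 0 -> 0 smoothly, so no special case is needed
dataset['Fare'] = np.log1p(dataset['Fare'])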

In [13]:

g= sns.distplot(dataset['Fare'], color='b', label='Skewness: %.2f'%(dataset['Fare'].skew()))
g= g.legend(loc='best')

In [23]:

g = sns.barplot(x='Sex', y='Survived', data=train)
g = g.set_ylabel('Survival Probability')

In [13]:

train[['Sex','Survived']].groupby('Sex').mean()

Out[13]:

| Sex    | Survived |
|--------|----------|
| female | 0.747573 |
| male   | 0.190559 |

In [25]:

# Explore Pclass
g = sns.factorplot(x="Pclass", y='Survived', data=train, kind='bar',size=6, palette='muted')
g.despine(left=True)
g = g.set_ylabels('survival probability')

In [26]:

# Survival by Pclass, split by Sex
g= sns.factorplot(x='Pclass',y='Survived', hue='Sex', data=train, size=6,kind='bar',palette='muted')
g.despine(left=True)
g=g.set_ylabels("survival probability")

In [14]:

# Count missing Embarked values
dataset['Embarked'].isnull().sum()

Out[14]:

2

In [15]:

# Fill missing Embarked values with the mode, 'S'
dataset['Embarked'] = dataset['Embarked'].fillna('S')

In [29]:

# Embarked vs. survival
g = sns.factorplot(x='Embarked', y= 'Survived', data=train, size=6,kind='bar' ,palette="muted")
g.despine(left=True)
g = g.set_ylabels("Survival Probability")

In [30]:

# Pclass counts by Embarked
g= sns.factorplot('Pclass', col='Embarked', data=train,size=6, kind='count', palette='muted')
g.despine(left=True)
g = g.set_ylabels("Count")

In [31]:

# Explore Age against Sex, Pclass, Parch, and SibSp before imputing the missing values
g= sns.factorplot( y='Age', x='Sex', data=dataset,kind='box')
g= sns.factorplot(y='Age',x='Sex', hue="Pclass", data=dataset, kind='box')
g= sns.factorplot(y='Age', x='Parch', data=dataset,kind='box')
g= sns.factorplot(y='Age', x='SibSp', data=dataset, kind='box')

In [16]:

# Encode Sex numerically
dataset['Sex'] = dataset['Sex'].map({'male':0, 'female':1})

In [33]:

# Correlation between Age and the main features
g= sns.heatmap(dataset[['Age','Sex','SibSp','Parch','Pclass']].corr(), cmap='Blues', annot=True)

In [17]:

# Fill missing Age values

index_NaN_age = list(dataset['Age'][dataset['Age'].isnull()].index)

for i in index_NaN_age:
    age_med = dataset['Age'].median()

    # Median Age of passengers with the same SibSp, Parch, and Pclass as this row
    age_pred = dataset['Age'][((dataset['SibSp'] == dataset.iloc[i]['SibSp']) & (dataset['Parch'] == dataset.iloc[i]['Parch']) & (dataset['Pclass'] == dataset.iloc[i]['Pclass']))].median()
    if not np.isnan(age_pred):  # a matching-group median exists
        dataset.loc[i, 'Age'] = age_pred  # .loc avoids pandas chained-assignment warnings
    else:
        dataset.loc[i, 'Age'] = age_med   # fall back to the global median
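
The same imputation can be written without the explicit loop using groupby/transform; a sketch under the same grouping assumption (SibSp, Parch, Pclass), not the notebook's code:

# Median Age per (SibSp, Parch, Pclass) group, aligned back to each row
group_median = dataset.groupby(['SibSp', 'Parch', 'Pclass'])['Age'].transform('median')
# Fill from the group median first, then fall back to the global median
dataset['Age'] = dataset['Age'].fillna(group_median).fillna(dataset['Age'].median())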

In [39]:

g= sns.factorplot(x="Survived", y = "Age", data=train,kind='box')
g= sns.factorplot(x='Survived', y='Age', data=train, kind='violin')

## Feature engineering

In [40]:

dataset['Name'].head()

Out[40]:

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [18]:

# Extract the title from each name
dataset_title = [i.split(",")[1].split(".")[0].strip() for i in dataset['Name']]
dataset["Title"] = pd.Series(dataset_title)
dataset['Title'].head()

Out[18]:

0      Mr
1     Mrs
2    Miss
3     Mrs
4      Mr
Name: Title, dtype: object
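
An equivalent extraction with pandas string methods (a sketch, not the notebook's code; note it captures 'Countess' rather than 'the Countess', and the mapping below handles both):

# Capture the word immediately before a period, e.g. 'Mr', 'Mrs', 'Master'
dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)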

In [42]:

g = sns.countplot(x= "Title", data=dataset)
g = plt.setp(g.get_xticklabels(), rotation=45)

In [19]:

# Group titles into four categories, lumping rare titles together
dataset['Title'] = dataset['Title'].replace(['Lady','the Countess','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'], 'Rare')
dataset['Title']= dataset['Title'].map({"Master":0, 'Miss':1, 'Ms': 1,'Mme':1, 'Mlle':1, "Mrs":1, "Mr":2, "Rare":3})
dataset['Title'] = dataset['Title'].astype(int)

In [44]:

g = sns.countplot(dataset['Title'])
g = g.set_xticklabels(['Master','Miss/Ms/Mme/Mlle/Mrs','Mr',"Rare"])

In [47]:

g = sns.factorplot(x='Title', y='Survived', data=dataset, kind='bar')
g = g.set_xticklabels(['Master','Miss-Mrs','Mr',"Rare"])

In [20]:

# Drop the Name column
dataset = dataset.drop(labels = ['Name'], axis=1)

In [21]:

# Create a family-size feature: the passenger plus SibSp and Parch
dataset['Fsize'] = dataset['SibSp'] + dataset['Parch'] +1

In [50]:

g= sns.factorplot(x ="Fsize", y='Survived', data=dataset)
g = g.set_ylabels("Survival Probability")

In [22]:

# Bin family size into four indicator features (see the pd.cut sketch below)

dataset['Single'] = dataset['Fsize'].map(lambda s : 1 if s==1 else 0 )
dataset['SmallF'] = dataset['Fsize'].map(lambda s : 1 if s==2 else 0)
dataset['MedF'] = dataset['Fsize'].map(lambda s : 1 if 3<= s<=4 else 0 )
dataset['LargeF'] = dataset['Fsize'].map(lambda s : 1 if s>=5 else 0)
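
The same binning could be expressed with pd.cut; this is a sketch for comparison, whereas the explicit lambdas above are what the notebook actually uses:

# (0,1] -> Single, (1,2] -> SmallF, (2,4] -> MedF, (4,inf) -> LargeF
fsize_bins = pd.cut(dataset['Fsize'], bins=[0, 1, 2, 4, np.inf], labels=['Single', 'SmallF', 'MedF', 'LargeF'])
family_dummies = pd.get_dummies(fsize_bins)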

In [25]:

g= sns.factorplot(x='Single', y='Survived', data=dataset, kind='bar')
g= g.set_ylabels("Survival Probability")

g = sns.factorplot(x="SmallF", y ='Survived', data=dataset, kind='bar')
g= g.set_ylabels("Survival Probability")

g= sns.factorplot(x="MedF", y='Survived', data=dataset, kind='bar')
g= g.set_ylabels("Survival Probability")

g= sns.factorplot(x='LargeF', y='Survived', data=dataset, kind='bar')
g= g.set_ylabels("Survival Probability")

In [23]:

# One-hot encode Title and Embarked
dataset= pd.get_dummies(dataset, columns=['Title'])
dataset = pd.get_dummies(dataset, columns = ['Embarked'], prefix='Em')

In [24]:

dataset.head()

Out[24]:

|   | PassengerId | Survived | Pclass | Sex | Age  | SibSp | Parch | Ticket           | Fare     | Cabin | … | SmallF | MedF | LargeF | Title_0 | Title_1 | Title_2 | Title_3 | Em_C | Em_Q | Em_S |
|---|-------------|----------|--------|-----|------|-------|-------|------------------|----------|-------|---|--------|------|--------|---------|---------|---------|---------|------|------|------|
| 0 | 1           | 0.0      | 3      | 0   | 22.0 | 1     | 0     | A/5 21171        | 1.981001 | NaN   | … | 1      | 0    | 0      | 0       | 0       | 1       | 0       | 0    | 0    | 1    |
| 1 | 2           | 1.0      | 1      | 1   | 38.0 | 1     | 0     | PC 17599         | 4.266662 | C85   | … | 1      | 0    | 0      | 0       | 1       | 0       | 0       | 1    | 0    | 0    |
| 2 | 3           | 1.0      | 3      | 1   | 26.0 | 0     | 0     | STON/O2. 3101282 | 2.070022 | NaN   | … | 0      | 0    | 0      | 0       | 1       | 0       | 0       | 0    | 0    | 1    |
| 3 | 4           | 1.0      | 1      | 1   | 35.0 | 1     | 0     | 113803           | 3.972177 | C123  | … | 1      | 0    | 0      | 0       | 1       | 0       | 0       | 0    | 0    | 1    |
| 4 | 5           | 0.0      | 3      | 0   | 35.0 | 0     | 0     | 373450           | 2.085672 | NaN   | … | 0      | 0    | 0      | 0       | 0       | 1       | 0       | 0    | 0    | 1    |

5 rows × 22 columns

In [25]:

dataset['Cabin'].head()

Out[25]:

0     NaN
1     C85
2     NaN
3    C123
4     NaN
Name: Cabin, dtype: object

In [59]:

dataset['Cabin'].describe()

Out[59]:

count     292
unique    186
top        G6
freq        5
Name: Cabin, dtype: object

In [26]:

dataset['Cabin'].isnull().sum()

Out[26]:

1007

In [27]:

dataset['Cabin'][dataset["Cabin"].notnull()].head()

Out[27]:

1      C85
3     C123
6      E46
10      G6
11    C103
Name: Cabin, dtype: object

In [28]:

# Replace missing Cabin values with 'X'; otherwise keep only the first letter (the deck)
dataset['Cabin'] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in dataset['Cabin']])
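
The same mapping can be written with the pandas string accessor (a sketch, not the author's code):

# .str[0] keeps the deck letter and leaves NaN in place, which fillna then replaces
dataset['Cabin'] = dataset['Cabin'].str[0].fillna('X')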

In [29]:

g= sns.countplot(dataset['Cabin'], order=['A','B','C','D','E','F','G','T','X'])

In [71]:

g= sns.factorplot(y='Survived', x='Cabin', data=dataset, kind='bar', order=['A','B','C','D','E','F','G','T','X'])
g= g.set_ylabels("Survival Probability")

In [30]:

dataset= pd.get_dummies(dataset, columns=['Cabin'], prefix='Cabin')

In [31]:

dataset['Ticket'].head()

Out[31]:

0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
Name: Ticket, dtype: object

In [32]:

# Extract the ticket prefix; purely numeric tickets become 'X'
Ticket = []

for i in list(dataset.Ticket):
    if not i.isdigit():
        Ticket.append(i.replace(".", '').replace("/", "").strip().split(' ')[0])  # extract the prefix
    else:
        Ticket.append("X")

dataset['Ticket'] = Ticket
dataset['Ticket'].head()

Out[32]:

0        A5
1        PC
2    STONO2
3         X
4         X
Name: Ticket, dtype: object

In [34]:

dataset= pd.get_dummies(dataset,columns=['Ticket'], prefix='T')
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Input In [34], in <cell line: 1>()
----> 1 dataset= pd.get_dummies(dataset,columns=['Ticket'], prefix='T')

File ~\miniforge3\envs\easydong2\lib\site-packages\pandas\core\reshape\reshape.py:931, in get_dummies(data, prefix, prefix_sep, dummy_na, columns, sparse, drop_first, dtype)
    929     raise TypeError("Input must be a list-like for parameter `columns`")
    930 else:
--> 931     data_to_encode = data[columns]
    933 # validate prefixes and separator to avoid silently dropping cols
    934 def check_len(item, name):

File ~\miniforge3\envs\easydong2\lib\site-packages\pandas\core\frame.py:3511, in DataFrame.__getitem__(self, key)
   3509     if is_iterator(key):
   3510         key = list(key)
-> 3511     indexer = self.columns._get_indexer_strict(key, "columns")[1]
   3513 # take() does not accept boolean indexers
   3514 if getattr(indexer, "dtype", None) == bool:

File ~\miniforge3\envs\easydong2\lib\site-packages\pandas\core\indexes\base.py:5782, in Index._get_indexer_strict(self, key, axis_name)
   5779 else:
   5780     keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 5782 self._raise_if_missing(keyarr, indexer, axis_name)
   5784 keyarr = self.take(indexer)
   5785 if isinstance(key, Index):
   5786     # GH 42790 - Preserve name from an Index

File ~\miniforge3\envs\easydong2\lib\site-packages\pandas\core\indexes\base.py:5842, in Index._raise_if_missing(self, key, indexer, axis_name)
   5840     if use_interval_msg:
   5841         key = list(key)
-> 5842     raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   5844 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
   5845 raise KeyError(f"{not_found} not in index")

KeyError: "None of [Index(['Ticket'], dtype='object')] are in the [columns]"

(This error is an artifact of re-running the cell: the first execution already replaced the Ticket column with its dummy columns, so a second run no longer finds 'Ticket'.)
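
A simple guard makes the cell safe to re-run (a sketch, not part of the original notebook):

if 'Ticket' in dataset.columns:
    dataset = pd.get_dummies(dataset, columns=['Ticket'], prefix='T')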

In [35]:

# One-hot encode Pclass as well
dataset['Pclass']= dataset['Pclass'].astype("category")
dataset= pd.get_dummies(dataset,columns=['Pclass'], prefix='Pc')

In [36]:

# Drop columns that are no longer useful
dataset = dataset.drop(labels = ['PassengerId'], axis=1)

In [37]:

dataset.head()

Out[37]:

|   | Survived | Sex | Age  | SibSp | Parch | Fare     | Fsize | Single | SmallF | MedF | … | T_STONO | T_STONO2 | T_STONOQ | T_SWPP | T_WC | T_WEP | T_X | Pc_1 | Pc_2 | Pc_3 |
|---|----------|-----|------|-------|-------|----------|-------|--------|--------|------|---|---------|----------|----------|--------|------|-------|-----|------|------|------|
| 0 | 0.0      | 0   | 22.0 | 1     | 0     | 1.981001 | 2     | 0      | 1      | 0    | … | 0       | 0        | 0        | 0      | 0    | 0     | 0   | 0    | 0    | 1    |
| 1 | 1.0      | 1   | 38.0 | 1     | 0     | 4.266662 | 2     | 0      | 1      | 0    | … | 0       | 0        | 0        | 0      | 0    | 0     | 0   | 1    | 0    | 0    |
| 2 | 1.0      | 1   | 26.0 | 0     | 0     | 2.070022 | 1     | 1      | 0      | 0    | … | 0       | 1        | 0        | 0      | 0    | 0     | 0   | 0    | 0    | 1    |
| 3 | 1.0      | 1   | 35.0 | 1     | 0     | 3.972177 | 2     | 0      | 1      | 0    | … | 0       | 0        | 0        | 0      | 0    | 0     | 1   | 1    | 0    | 0    |
| 4 | 0.0      | 0   | 35.0 | 0     | 0     | 2.085672 | 1     | 1      | 0      | 0    | … | 0       | 0        | 0        | 0      | 0    | 0     | 1   | 0    | 0    | 1    |

5 rows × 67 columns

## Modeling

In [38]:

# Split the combined dataset back into train and test
# (.copy() avoids SettingWithCopyWarning on the later assignments)
train = dataset[:train_len].copy()
test = dataset[train_len:].copy()
test = test.drop(labels=['Survived'], axis=1)

In [39]:

train['Survived']=train['Survived'].astype(int)

Y_train = train['Survived']

X_train = train.drop(labels=['Survived'], axis=1)

In [40]:

# Stratified k-fold cross-validation keeps the survived/died ratio roughly constant in every fold
kfold = StratifiedKFold(n_splits=10)

In [41]:

# Compare several classifiers with 10-fold cross-validation
random_state=2
classifiers=[]
classifiers.append(SVC(random_state=random_state))
classifiers.append(DecisionTreeClassifier(random_state=random_state))
classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),random_state=random_state, learning_rate=0.1))
classifiers.append(RandomForestClassifier(random_state=random_state))
classifiers.append(ExtraTreesClassifier(random_state=random_state))
classifiers.append(GradientBoostingClassifier(random_state=random_state))
classifiers.append(MLPClassifier(random_state=random_state))
classifiers.append(KNeighborsClassifier())
classifiers.append(LogisticRegression(random_state=random_state))
classifiers.append(LinearDiscriminantAnalysis())

cv_results= []
for classifier in classifiers:
    cv_results.append(cross_val_score(classifier, X_train, y= Y_train,scoring = 'accuracy', cv=kfold, n_jobs=6))
    
cv_means= []
cv_std=[]
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_std.append(cv_result.std())
    
cv_res = pd.DataFrame({'CrossValMeans': cv_means, 'CrossValerrors':cv_std, 'Algorithm':['SVC','DecisionTree','AdaBoost','RandomForest','ExtraTrees','GradientBoosting','MultipleLayerPerceptron','KNeighbors','LogisticRegression',
                                                                                       'LinearDiscriminantAnalysis']})

g = sns.barplot(x='CrossValMeans', y='Algorithm', data=cv_res, palette='Set3', orient='h', **{'xerr': cv_std})
g.set_xlabel("Mean Accuracy")
g = g.set_title("Cross Validation Scores")

In [42]:

# Meta-modeling: grid-search tuning for AdaBoost, RandomForest, ExtraTrees, GradientBoosting (and SVC below)

# Adaboost
DTC = DecisionTreeClassifier()
adaDTC = AdaBoostClassifier(DTC, random_state=7)

ada_param_grid= {'base_estimator__criterion':['gini','entropy'],
                "base_estimator__splitter": ['best','random'],
                'algorithm': ['SAMME','SAMME.R'],
                'n_estimators': [1,2],
                'learning_rate': [0.0001,0.001,0.01,0.1,0.2,0.3,1.5]}

gsadaDTC=GridSearchCV(adaDTC, param_grid=ada_param_grid, cv=kfold, scoring='accuracy', n_jobs=6, verbose=1)

gsadaDTC.fit(X_train,Y_train)

ada_best= gsadaDTC.best_estimator_
Fitting 10 folds for each of 112 candidates, totalling 1120 fits
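
(On scikit-learn 1.2 and later, the AdaBoostClassifier argument base_estimator was renamed estimator; under that assumption about the environment, not the author's version, the grid keys would be written as:)

# Equivalent grid on scikit-learn >= 1.2 (environment assumption)
ada_param_grid = {'estimator__criterion': ['gini', 'entropy'],
                  'estimator__splitter': ['best', 'random'],
                  'algorithm': ['SAMME', 'SAMME.R'],
                  'n_estimators': [1, 2],
                  'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 1.5]}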

In [45]:

gsadaDTC.best_score_

Out[45]:

0.8275536261491316

In [48]:

# ExtraTrees
ExtC= ExtraTreesClassifier()

## Grid search for the best parameters
ex_param_grid= {'max_depth':[None],
               'max_features':[1,3,10],
               'min_samples_split':[2,3,10],
               'min_samples_leaf':[1,3,10],
               'bootstrap':[False],
               "n_estimators":[100,300],
               'criterion':['gini']}

gsExtC = GridSearchCV(ExtC, param_grid = ex_param_grid, cv=kfold, scoring='accuracy', n_jobs=6, verbose=1)

gsExtC.fit(X_train,Y_train)

ExtC_best= gsExtC.best_estimator_

# Best score

gsExtC.best_score_
Fitting 10 folds for each of 54 candidates, totalling 540 fits

Out[48]:

0.8342696629213483

In [49]:

# RandomForest (RFC) parameter tuning
RFC= RandomForestClassifier()

## Grid search for the best parameters
rf_param_grid= {'max_depth':[None],
               'max_features':[1,3,10],
               'min_samples_split':[2,3,10],
               'min_samples_leaf':[1,3,10],
               'bootstrap':[False],
               'n_estimators':[100,300],
               'criterion':['gini']}

gsRFC = GridSearchCV(RFC, param_grid = rf_param_grid, cv=kfold, scoring='accuracy', n_jobs=4, verbose=1)

gsRFC.fit(X_train, Y_train)

RFC_best= gsRFC.best_estimator_

# Best score
gsRFC.best_score_
Fitting 10 folds for each of 54 candidates, totalling 540 fits

Out[49]:

0.836567926455567

In [54]:

# Gradient boosting tuning

GBC = GradientBoostingClassifier()
gb_param_grid = {'loss':['deviance'],
                'n_estimators' : [100,200,300],
                'learning_rate': [0.1,0.05,0.01],
                'max_depth':[4,8],
                'min_samples_leaf':[100,150],
                'max_features':[0.3,0.1]
                }

gsGBC= GridSearchCV(GBC, param_grid= gb_param_grid, cv=kfold, scoring='accuracy', n_jobs=6, verbose=1)

gsGBC.fit(X_train, Y_train)

GBC_best= gsGBC.best_estimator_

# Best score
gsGBC.best_score_
Fitting 10 folds for each of 72 candidates, totalling 720 fits

Out[54]:

0.8388151174668028

In [55]:

# SVC classifier tuning
SVMC = SVC(probability=True)
svc_param_grid = {'kernel': ['rbf'],
                 'gamma': [0.001,0.01,0.1,1],
                 'C':[1,10,50,100,200,300,1000]
                 }
gsSVMC = GridSearchCV(SVMC, param_grid= svc_param_grid, cv=kfold, scoring='accuracy', n_jobs=6, verbose=1)

gsSVMC.fit(X_train,Y_train)

SVMC_best= gsSVMC.best_estimator_

# Best score

gsSVMC.best_score_
Fitting 10 folds for each of 28 candidates, totalling 280 fits

Out[55]:

0.8331332992849847

In [58]:

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=-1, train_sizes=np.linspace(.1,1.0,5)):
    """
    Generate a simple plot of the training and test learning curves
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    train_sizes, train_scores, test_scores= learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes= train_sizes
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std =np.std(train_scores,axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std= np.std(test_scores,axis=1)
    plt.grid()
    plt.fill_between(train_sizes,train_scores_mean - train_scores_std,
                    train_scores_mean+train_scores_std, alpha=0.1,
                    color='r')
    plt.fill_between(train_sizes, test_scores_mean-test_scores_std,
                    test_scores_mean + test_scores_std, alpha=0.1,
                    color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-',color='g', label='Cross-validation score')
    
    plt.legend(loc='best')
    return plt

In [62]:

g= plot_learning_curve(gsRFC.best_estimator_, 'RF learning curve', X_train,Y_train, cv=kfold)
g= plot_learning_curve(gsExtC.best_estimator_, 'ExtraTrees learning curves', X_train, Y_train, cv=kfold)
g=plot_learning_curve(gsSVMC.best_estimator_, 'SVC learning curves',X_train, Y_train, cv=kfold)
g= plot_learning_curve(gsadaDTC.best_estimator_, 'AdaBoost learning curves', X_train,Y_train, cv=kfold)
g= plot_learning_curve(gsGBC.best_estimator_,'GradientBoosting learning curves',X_train,Y_train, cv=kfold)

In [64]:

# Plot the feature importances of the four tree-based models
nrows= ncols=2

fig,axes= plt.subplots(nrows= nrows, ncols=ncols, sharex='all', figsize=(15,15))

names_classifiers= [('AdaBoosting',ada_best), ('ExtraTrees',ExtC_best),('RandomForest',RFC_best),("GradientBoosting",GBC_best)]

nclassifier=0
for row in range(nrows):
    for col in range(ncols):
        name= names_classifiers[nclassifier][0]
        classifier = names_classifiers[nclassifier][1]
        indices= np.argsort(classifier.feature_importances_)[::-1][:40]
        g= sns.barplot(y= X_train.columns[indices][:40],x= classifier.feature_importances_[indices][:40], orient='h',ax= axes[row][col])
        g.set_xlabel("Relative importance",fontsize=12)
        g.set_ylabel("Features",fontsize=12)
        g.tick_params(labelsize=9)
        g.set_title(name + " feature importance")
        nclassifier +=1

In [67]:

test_Survived_RFC= pd.Series(RFC_best.predict(test), name='RFC')
test_Survived_ExtC = pd.Series(ExtC_best.predict(test), name='ExtC')
test_Survived_SVMC = pd.Series(SVMC_best.predict(test), name='SVC')
test_Survived_AdaC=pd.Series(ada_best.predict(test), name='Ada')
test_Survived_GBC = pd.Series(GBC_best.predict(test), name='GBC')

# Combine the predictions of all classifiers to compare them

ensemble_results= pd.concat([test_Survived_RFC, test_Survived_ExtC,test_Survived_AdaC, test_Survived_GBC, test_Survived_SVMC], axis=1)

g= sns.heatmap(ensemble_results.corr(),annot=True)

In [70]:

# Ensemble: soft voting averages the predicted class probabilities of the five
# tuned models (this is why SVC was fit with probability=True)

votingC = VotingClassifier(estimators=[('rfc', RFC_best), ('extc', ExtC_best), ('svc', SVMC_best), ('adac', ada_best), ('gbc', GBC_best)], voting='soft', n_jobs=6)

votingC = votingC.fit(X_train, Y_train)

In [71]:

# Predict with the ensemble (IDtest holds the test PassengerId column,
# saved earlier in the post before PassengerId was dropped)
test_Survived = pd.Series(votingC.predict(test), name='Survived')

results = pd.concat([IDtest, test_Survived], axis=1)

results

Out[71]:

|     | PassengerId | Survived |
|-----|-------------|----------|
| 0   | 892         | 0        |
| 1   | 893         | 0        |
| 2   | 894         | 0        |
| 3   | 895         | 0        |
| 4   | 896         | 1        |
| …   | …           | …        |
| 413 | 1305        | 0        |
| 414 | 1306        | 1        |
| 415 | 1307        | 0        |
| 416 | 1308        | 0        |
| 417 | 1309        | 1        |

418 rows × 2 columns
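
To turn these predictions into a Kaggle submission file, a typical final step (not shown in the original post; the filename is arbitrary) would be:

# Write the PassengerId,Survived pairs without the index column
results.to_csv('submission.csv', index=False)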

In [72]:

from IPython.core.display import display, HTML
display(HTML("<style>.container {width:95% !important;}</style>"))