[Machine Learning] Titanic Ensemble Modeling
In [4]:
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 418 non-null int64
1 Pclass 418 non-null int64
2 Name 418 non-null object
3 Sex 418 non-null object
4 Age 332 non-null float64
5 SibSp 418 non-null int64
6 Parch 418 non-null int64
7 Ticket 418 non-null object
8 Fare 417 non-null float64
9 Cabin 91 non-null object
10 Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
In [5]:
# Outlier detection
# Takes a dataframe and returns the indices of observations that the Tukey
# method flags as outliers on more than n of the given features
def detect_outliers(df, n, features):
    outlier_indices = []
    for col in features:
        # 1st quartile
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile
        Q3 = np.percentile(df[col], 75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        # Tukey fence
        outlier_step = 1.5 * IQR
        # Indices of the outliers for this feature
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index
        # Collect them
        outlier_indices.extend(outlier_list_col)
    # Keep observations flagged as outliers on more than n features
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list(k for k, v in outlier_indices.items() if v > n)
    return multiple_outliers

# Check for outliers in the Age, SibSp, Parch and Fare features
Outliers_to_drop = detect_outliers(train, 2, ["Age", "SibSp", "Parch", "Fare"])
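Note that np.percentile propagates NaN, so a column with missing values (like Age here) produces NaN fences and flags nothing; np.nanpercentile would take Age into account. As a quick sanity check of the fences themselves, a minimal self-contained sketch on toy data:

import numpy as np

values = np.array([1, 2, 2, 3, 3, 3, 4, 100])  # 100 is an obvious outlier
Q1, Q3 = np.percentile(values, [25, 75])       # quartiles of the toy data
step = 1.5 * (Q3 - Q1)                         # Tukey fence
print(values[(values < Q1 - step) | (values > Q3 + step)])  # -> [100]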
In [6]:
Outliers_to_drop
Out[6]:
[27, 88, 159, 180, 201, 324, 341, 792, 846, 863]
In [7]:
train.loc[Outliers_to_drop]
Out[7]:
 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|---|
27 | 28 | 0 | 1 | Fortune, Mr. Charles Alexander | male | 19.0 | 3 | 2 | 19950 | 263.00 | C23 C25 C27 | S |
88 | 89 | 1 | 1 | Fortune, Miss. Mabel Helen | female | 23.0 | 3 | 2 | 19950 | 263.00 | C23 C25 C27 | S |
159 | 160 | 0 | 3 | Sage, Master. Thomas Henry | male | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
180 | 181 | 0 | 3 | Sage, Miss. Constance Gladys | female | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
201 | 202 | 0 | 3 | Sage, Mr. Frederick | male | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
324 | 325 | 0 | 3 | Sage, Mr. George John Jr | male | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
341 | 342 | 1 | 1 | Fortune, Miss. Alice Elizabeth | female | 24.0 | 3 | 2 | 19950 | 263.00 | C23 C25 C27 | S |
792 | 793 | 0 | 3 | Sage, Miss. Stella Anna | female | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
846 | 847 | 0 | 3 | Sage, Mr. Douglas Bullen | male | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
863 | 864 | 0 | 3 | Sage, Miss. Dorothy Edith “Dolly” | female | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
In [8]:
# Drop the outliers
train = train.drop(Outliers_to_drop, axis=0).reset_index(drop=True)
In [9]:
# Concatenate train and test so the same transformations are applied to both
train_len = len(train)
dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)
In [10]:
# Normalize all null values to NaN
dataset = dataset.fillna(np.nan)
# Check missing values
dataset.isnull().sum()
Out[10]:
PassengerId 0
Survived 418
Pclass 0
Name 0
Sex 0
Age 256
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 1007
Embarked 2
dtype: int64
In [11]:
train.head()
Out[11]:
 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th… | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
In [12]:
train.dtypes
Out[12]:
PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
dtype: object
In [13]:
train.describe()
Out[13]:
 | PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare |
---|---|---|---|---|---|---|---|
count | 881.000000 | 881.000000 | 881.000000 | 711.000000 | 881.000000 | 881.000000 | 881.000000 |
mean | 446.713961 | 0.385925 | 2.307605 | 29.731603 | 0.455165 | 0.363224 | 31.121566 |
std | 256.617021 | 0.487090 | 0.835055 | 14.547835 | 0.871571 | 0.791839 | 47.996249 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 226.000000 | 0.000000 | 2.000000 | 20.250000 | 0.000000 | 0.000000 | 7.895800 |
50% | 448.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.000000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 30.500000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 5.000000 | 6.000000 | 512.329200 |
In [14]:
g = sns.heatmap(train[['Survived', 'SibSp', 'Parch', 'Age', 'Fare']].corr(), annot=True, fmt=".2f", cmap='coolwarm')
In [15]:
# Examine the SibSp feature
g = sns.factorplot(x='SibSp', y='Survived', data=train, kind='bar', size=6, palette='muted')
g.despine(left=True)  # remove the left spine
g = g.set_ylabels("survival probability")
In [16]:
# Examine the Parch feature
g = sns.factorplot(x='Parch', y='Survived', data=train, kind='bar', size=6, palette='muted')
g.despine(left=True)
g = g.set_ylabels('survival probability')
In [17]:
# Examine the Age feature
g = sns.FacetGrid(train, col='Survived')
g = g.map(sns.distplot, 'Age')
In [18]:
# Examine the Age distribution by survival
g = sns.kdeplot(train['Age'][(train['Survived'] == 0) & (train['Age'].notnull())], color='Red', shade=True)
g = sns.kdeplot(train['Age'][(train['Survived'] == 1) & (train['Age'].notnull())], ax=g, color='Blue', shade=True)
g.set_xlabel("Age")
g.set_ylabel("Frequency")
g = g.legend(['Not Survived', 'Survived'])
In [11]:
# Check Fare missing values
dataset['Fare'].isnull().sum()
Out[11]:
1
In [20]:
# Examine the Fare distribution
g = sns.distplot(dataset['Fare'], color='m', label='Skewness: %.2f' % (dataset['Fare'].skew()))
g = g.legend(loc='best')
In [12]:
# The distribution is highly skewed, so take the log to reduce it
dataset['Fare'] = dataset['Fare'].map(lambda i: np.log(i) if i > 0 else 0)
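The lambda special-cases zero fares to avoid log(0). A common alternative (a sketch, not what the cell above does) is np.log1p, which computes log(1 + x) and is defined at zero:

import numpy as np
import pandas as pd

fare = pd.Series([0.0, 7.25, 71.2833, 512.3292])
log_fare = np.log1p(fare)  # log(1 + x): handles 0 natively and compresses the right tail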
In [13]:
g = sns.distplot(dataset['Fare'], color='b', label='Skewness: %.2f' % (dataset['Fare'].skew()))
g = g.legend(loc='best')
In [23]:
g = sns.barplot(x='Sex', y='Survived', data=train)
g = g.set_ylabel('Survival Probability')
In [13]:
train[['Sex','Survived']].groupby('Sex').mean()
Out[13]:
Sex | Survived |
---|---|
female | 0.747573 |
male | 0.190559 |
In [25]:
# Explore Pclass
g = sns.factorplot(x='Pclass', y='Survived', data=train, kind='bar', size=6, palette='muted')
g.despine(left=True)
g = g.set_ylabels('survival probability')
In [26]:
# Survival by Pclass, broken down by Sex
g = sns.factorplot(x='Pclass', y='Survived', hue='Sex', data=train, size=6, kind='bar', palette='muted')
g.despine(left=True)
g = g.set_ylabels("survival probability")
In [14]:
# Examine Embarked
dataset['Embarked'].isnull().sum()
Out[14]:
2
In [15]:
# Fill the missing Embarked values with the most frequent value, 'S'
dataset['Embarked'] = dataset['Embarked'].fillna('S')
In [29]:
# Compare Embarked with survival
g = sns.factorplot(x='Embarked', y='Survived', data=train, size=6, kind='bar', palette='muted')
g.despine(left=True)
g = g.set_ylabels("Survival Probability")
In [30]:
# Compare Pclass with Embarked
g = sns.factorplot('Pclass', col='Embarked', data=train, size=6, kind='count', palette='muted')
g.despine(left=True)
g = g.set_ylabels("Count")
In [31]:
# Explore Age against other features before imputing the missing values
g = sns.factorplot(y='Age', x='Sex', data=dataset, kind='box')
g = sns.factorplot(y='Age', x='Sex', hue='Pclass', data=dataset, kind='box')
g = sns.factorplot(y='Age', x='Parch', data=dataset, kind='box')
g = sns.factorplot(y='Age', x='SibSp', data=dataset, kind='box')
In [16]:
# Encode Sex as a number
dataset['Sex'] = dataset['Sex'].map({'male': 0, 'female': 1})
In [33]:
# Correlation between the main features
g = sns.heatmap(dataset[['Age', 'Sex', 'SibSp', 'Parch', 'Pclass']].corr(), cmap='Blues', annot=True)
In [17]:
# Fill the missing Age values
index_NaN_age = list(dataset['Age'][dataset['Age'].isnull()].index)
for i in index_NaN_age:
    age_med = dataset['Age'].median()
    # Median Age of the rows with the same SibSp, Parch and Pclass as the missing row
    age_pred = dataset['Age'][((dataset['SibSp'] == dataset.iloc[i]['SibSp']) &
                               (dataset['Parch'] == dataset.iloc[i]['Parch']) &
                               (dataset['Pclass'] == dataset.iloc[i]['Pclass']))].median()
    if not np.isnan(age_pred):  # a matching group with observed ages exists
        dataset.loc[i, 'Age'] = age_pred
    else:
        dataset.loc[i, 'Age'] = age_med
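The same imputation can be written without the row-by-row loop. A vectorized sketch, assuming the identical (SibSp, Parch, Pclass) grouping with a global-median fallback:

# median Age within each (SibSp, Parch, Pclass) group, aligned back to every row
group_median = dataset.groupby(['SibSp', 'Parch', 'Pclass'])['Age'].transform('median')
# fill from the group median first, then from the global median for groups with no observed ages
dataset['Age'] = dataset['Age'].fillna(group_median).fillna(dataset['Age'].median())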
In [39]:
g = sns.factorplot(x='Survived', y='Age', data=train, kind='box')
g = sns.factorplot(x='Survived', y='Age', data=train, kind='violin')
In [ ]:
## Feature engineering
In [40]:
dataset['Name'].head()
Out[40]:
0 Braund, Mr. Owen Harris
1 Cumings, Mrs. John Bradley (Florence Briggs Th...
2 Heikkinen, Miss. Laina
3 Futrelle, Mrs. Jacques Heath (Lily May Peel)
4 Allen, Mr. William Henry
Name: Name, dtype: object
In [18]:
# Extract each passenger's title from the Name feature
dataset_title = [i.split(",")[1].split(".")[0].strip() for i in dataset['Name']]
dataset["Title"] = pd.Series(dataset_title)
dataset['Title'].head()
Out[18]:
0 Mr
1 Mrs
2 Miss
3 Mrs
4 Mr
Name: Title, dtype: object
In [42]:
g = sns.countplot(x='Title', data=dataset)
g = plt.setp(g.get_xticklabels(), rotation=45)
In [19]:
# Group the titles into categories
dataset['Title'] = dataset['Title'].replace(['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
dataset['Title'] = dataset['Title'].map({'Master': 0, 'Miss': 1, 'Ms': 1, 'Mme': 1, 'Mlle': 1, 'Mrs': 1, 'Mr': 2, 'Rare': 3})
dataset['Title'] = dataset['Title'].astype(int)
In [44]:
g = sns.countplot(dataset['Title'])
g = g.set_xticklabels(['Master','Miss/Ms/Mme/Mlle/Mrs','Mr',"Rare"])
In [47]:
g = sns.factorplot(x='Title', y='Survived', data=dataset, kind='bar')
g = g.set_xticklabels(['Master','Miss-Mrs','Mr',"Rare"])
In [20]:
# Drop the Name column
dataset = dataset.drop(labels=['Name'], axis=1)
In [21]:
# Create a family size feature (the passenger plus siblings/spouses plus parents/children)
dataset['Fsize'] = dataset['SibSp'] + dataset['Parch'] + 1
In [50]:
g = sns.factorplot(x='Fsize', y='Survived', data=dataset)
g = g.set_ylabels("Survival Probability")
In [22]:
# Bucket family size into four indicator categories
dataset['Single'] = dataset['Fsize'].map(lambda s: 1 if s == 1 else 0)
dataset['SmallF'] = dataset['Fsize'].map(lambda s: 1 if s == 2 else 0)
dataset['MedF'] = dataset['Fsize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
dataset['LargeF'] = dataset['Fsize'].map(lambda s: 1 if s >= 5 else 0)
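The four lambdas can equivalently be expressed with pd.cut; a sketch, with bin edges chosen to reproduce the categories above:

import numpy as np
import pandas as pd

fsize_cat = pd.cut(dataset['Fsize'], bins=[0, 1, 2, 4, np.inf],
                   labels=['Single', 'SmallF', 'MedF', 'LargeF'])
size_dummies = pd.get_dummies(fsize_cat)  # the same four 0/1 indicator columns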
In [25]:
g = sns.factorplot(x='Single', y='Survived', data=dataset, kind='bar')
g = g.set_ylabels("Survival Probability")
g = sns.factorplot(x='SmallF', y='Survived', data=dataset, kind='bar')
g = g.set_ylabels("Survival Probability")
g = sns.factorplot(x='MedF', y='Survived', data=dataset, kind='bar')
g = g.set_ylabels("Survival Probability")
g = sns.factorplot(x='LargeF', y='Survived', data=dataset, kind='bar')
g = g.set_ylabels("Survival Probability")
In [23]:
# Create indicator (dummy) columns for Title and Embarked
dataset = pd.get_dummies(dataset, columns=['Title'])
dataset = pd.get_dummies(dataset, columns=['Embarked'], prefix='Em')
In [24]:
dataset.head()
Out[24]:
 | PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | … | SmallF | MedF | LargeF | Title_0 | Title_1 | Title_2 | Title_3 | Em_C | Em_Q | Em_S |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.0 | 3 | 0 | 22.0 | 1 | 0 | A/5 21171 | 1.981001 | NaN | … | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
1 | 2 | 1.0 | 1 | 1 | 38.0 | 1 | 0 | PC 17599 | 4.266662 | C85 | … | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
2 | 3 | 1.0 | 3 | 1 | 26.0 | 0 | 0 | STON/O2. 3101282 | 2.070022 | NaN | … | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
3 | 4 | 1.0 | 1 | 1 | 35.0 | 1 | 0 | 113803 | 3.972177 | C123 | … | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
4 | 5 | 0.0 | 3 | 0 | 35.0 | 0 | 0 | 373450 | 2.085672 | NaN | … | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
5 rows × 22 columns
In [25]:
dataset['Cabin'].head()
Out[25]:
0 NaN
1 C85
2 NaN
3 C123
4 NaN
Name: Cabin, dtype: object
In [59]:
dataset['Cabin'].describe()
Out[59]:
count 292
unique 186
top G6
freq 5
Name: Cabin, dtype: object
In [26]:
dataset['Cabin'].isnull().sum()
Out[26]:
1007
In [27]:
dataset['Cabin'][dataset["Cabin"].notnull()].head()
Out[27]:
1 C85
3 C123
6 E46
10 G6
11 C103
Name: Cabin, dtype: object
In [28]:
# Replace missing Cabin values with 'X'; otherwise keep only the first letter (the deck)
dataset['Cabin'] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in dataset['Cabin']])
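The list comprehension is equivalent to a pandas str-accessor one-liner; a sketch:

# 'X' for missing cabins, otherwise the deck letter (the first character)
dataset['Cabin'] = dataset['Cabin'].fillna('X').str[0]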
In [29]:
g = sns.countplot(dataset['Cabin'], order=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'X'])
In [71]:
g = sns.factorplot(y='Survived', x='Cabin', data=dataset, kind='bar', order=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'X'])
g = g.set_ylabels("Survival Probability")
In [30]:
dataset = pd.get_dummies(dataset, columns=['Cabin'], prefix='Cabin')
In [31]:
dataset['Ticket'].head()
Out[31]:
0 A/5 21171
1 PC 17599
2 STON/O2. 3101282
3 113803
4 373450
Name: Ticket, dtype: object
In [32]:
# Extract the ticket prefix; if the ticket is purely numeric, use 'X' instead
Ticket = []
for i in list(dataset.Ticket):
    if not i.isdigit():
        Ticket.append(i.replace(".", '').replace("/", '').strip().split(' ')[0])  # take the prefix
    else:
        Ticket.append("X")
dataset['Ticket'] = Ticket
dataset['Ticket'].head()
Out[32]:
0 A5
1 PC
2 STONO2
3 X
4 X
Name: Ticket, dtype: object
In [34]:
dataset= pd.get_dummies(dataset,columns=['Ticket'], prefix='T')
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Input In [34], in <cell line: 1>()
----> 1 dataset= pd.get_dummies(dataset,columns=['Ticket'], prefix='T')
File ~\miniforge3\envs\easydong2\lib\site-packages\pandas\core\reshape\reshape.py:931, in get_dummies(data, prefix, prefix_sep, dummy_na, columns, sparse, drop_first, dtype)
929 raise TypeError("Input must be a list-like for parameter `columns`")
930 else:
--> 931 data_to_encode = data[columns]
933 # validate prefixes and separator to avoid silently dropping cols
934 def check_len(item, name):
File ~\miniforge3\envs\easydong2\lib\site-packages\pandas\core\frame.py:3511, in DataFrame.__getitem__(self, key)
3509 if is_iterator(key):
3510 key = list(key)
-> 3511 indexer = self.columns._get_indexer_strict(key, "columns")[1]
3513 # take() does not accept boolean indexers
3514 if getattr(indexer, "dtype", None) == bool:
File ~\miniforge3\envs\easydong2\lib\site-packages\pandas\core\indexes\base.py:5782, in Index._get_indexer_strict(self, key, axis_name)
5779 else:
5780 keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 5782 self._raise_if_missing(keyarr, indexer, axis_name)
5784 keyarr = self.take(indexer)
5785 if isinstance(key, Index):
5786 # GH 42790 - Preserve name from an Index
File ~\miniforge3\envs\easydong2\lib\site-packages\pandas\core\indexes\base.py:5842, in Index._raise_if_missing(self, key, indexer, axis_name)
5840 if use_interval_msg:
5841 key = list(key)
-> 5842 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
5844 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
5845 raise KeyError(f"{not_found} not in index")
KeyError: "None of [Index(['Ticket'], dtype='object')] are in the [columns]"
In [35]:
# Treat Pclass as categorical and create dummy columns for it as well
dataset['Pclass'] = dataset['Pclass'].astype("category")
dataset = pd.get_dummies(dataset, columns=['Pclass'], prefix='Pc')
In [36]:
# Drop columns that are no longer useful
dataset = dataset.drop(labels=['PassengerId'], axis=1)
In [37]:
dataset.head()
Out[37]:
 | Survived | Sex | Age | SibSp | Parch | Fare | Fsize | Single | SmallF | MedF | … | T_STONO | T_STONO2 | T_STONOQ | T_SWPP | T_WC | T_WEP | T_X | Pc_1 | Pc_2 | Pc_3 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0 | 22.0 | 1 | 0 | 1.981001 | 2 | 0 | 1 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 1.0 | 1 | 38.0 | 1 | 0 | 4.266662 | 2 | 0 | 1 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 1.0 | 1 | 26.0 | 0 | 0 | 2.070022 | 1 | 1 | 0 | 0 | … | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
3 | 1.0 | 1 | 35.0 | 1 | 0 | 3.972177 | 2 | 0 | 1 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
4 | 0.0 | 0 | 35.0 | 0 | 0 | 2.085672 | 1 | 1 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
5 rows × 67 columns
In [ ]:
## Modeling
In [38]:
# Split the combined dataset back into train and test
train = dataset[:train_len]
test = dataset[train_len:]
test = test.drop(labels=['Survived'], axis=1)
In [39]:
train['Survived'] = train['Survived'].astype(int)
Y_train = train['Survived']
X_train = train.drop(labels=['Survived'], axis=1)
In [40]:
# Stratified k-fold cross-validation
kfold = StratifiedKFold(n_splits=10)
In [41]:
# Compare several machine learning algorithms with cross-validation
random_state = 2
classifiers = []
classifiers.append(SVC(random_state=random_state))
classifiers.append(DecisionTreeClassifier(random_state=random_state))
classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state), random_state=random_state, learning_rate=0.1))
classifiers.append(RandomForestClassifier(random_state=random_state))
classifiers.append(ExtraTreesClassifier(random_state=random_state))
classifiers.append(GradientBoostingClassifier(random_state=random_state))
classifiers.append(MLPClassifier(random_state=random_state))
classifiers.append(KNeighborsClassifier())
classifiers.append(LogisticRegression(random_state=random_state))
classifiers.append(LinearDiscriminantAnalysis())

cv_results = []
for classifier in classifiers:
    cv_results.append(cross_val_score(classifier, X_train, y=Y_train, scoring='accuracy', cv=kfold, n_jobs=6))

cv_means = []
cv_std = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_std.append(cv_result.std())

cv_res = pd.DataFrame({'CrossValMeans': cv_means, 'CrossValerrors': cv_std,
                       'Algorithm': ['SVC', 'DecisionTree', 'AdaBoost', 'RandomForest', 'ExtraTrees',
                                     'GradientBoosting', 'MultipleLayerPerceptron', 'KNeighbors',
                                     'LogisticRegression', 'LinearDiscriminantAnalysis']})

g = sns.barplot('CrossValMeans', 'Algorithm', data=cv_res, palette='Set3', orient='h', **{'xerr': cv_std})
g.set_xlabel("Mean Accuracy")
g = g.set_title("Cross Validation Scores")
In [42]:
# Meta-modeling with AdaBoost, RandomForest, ExtraTrees and GradientBoosting
# AdaBoost
DTC = DecisionTreeClassifier()
adaDTC = AdaBoostClassifier(DTC, random_state=7)
ada_param_grid = {'base_estimator__criterion': ['gini', 'entropy'],
                  'base_estimator__splitter': ['best', 'random'],
                  'algorithm': ['SAMME', 'SAMME.R'],
                  'n_estimators': [1, 2],
                  'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 1.5]}
gsadaDTC = GridSearchCV(adaDTC, param_grid=ada_param_grid, cv=kfold, scoring='accuracy', n_jobs=6, verbose=1)
gsadaDTC.fit(X_train, Y_train)
ada_best = gsadaDTC.best_estimator_
Fitting 10 folds for each of 112 candidates, totalling 1120 fits
In [45]:
gsadaDTC.best_score_
Out[45]:
0.8275536261491316
In [48]:
# ExtraTrees
ExtC = ExtraTreesClassifier()
# Grid search for the best parameters
ex_param_grid = {'max_depth': [None],
                 'max_features': [1, 3, 10],
                 'min_samples_split': [2, 3, 10],
                 'min_samples_leaf': [1, 3, 10],
                 'bootstrap': [False],
                 'n_estimators': [100, 300],
                 'criterion': ['gini']}
gsExtC = GridSearchCV(ExtC, param_grid=ex_param_grid, cv=kfold, scoring='accuracy', n_jobs=6, verbose=1)
gsExtC.fit(X_train, Y_train)
ExtC_best = gsExtC.best_estimator_
# Best score
gsExtC.best_score_
Fitting 10 folds for each of 54 candidates, totalling 540 fits
Out[48]:
0.8342696629213483
In [49]:
# RandomForest parameter tuning
RFC = RandomForestClassifier()
# Grid search for the best parameters
rf_param_grid = {'max_depth': [None],
                 'max_features': [1, 3, 10],
                 'min_samples_split': [2, 3, 10],
                 'min_samples_leaf': [1, 3, 10],
                 'bootstrap': [False],
                 'n_estimators': [100, 300],
                 'criterion': ['gini']}
gsRFC = GridSearchCV(RFC, param_grid=rf_param_grid, cv=kfold, scoring='accuracy', n_jobs=4, verbose=1)
gsRFC.fit(X_train, Y_train)
RFC_best = gsRFC.best_estimator_
# Best score
gsRFC.best_score_
Fitting 10 folds for each of 54 candidates, totalling 540 fits
Out[49]:
0.836567926455567
In [54]:
# Gradient boosting tuning
GBC = GradientBoostingClassifier()
gb_param_grid = {'loss': ['deviance'],
                 'n_estimators': [100, 200, 300],
                 'learning_rate': [0.1, 0.05, 0.01],
                 'max_depth': [4, 8],
                 'min_samples_leaf': [100, 150],
                 'max_features': [0.3, 0.1]}
gsGBC = GridSearchCV(GBC, param_grid=gb_param_grid, cv=kfold, scoring='accuracy', n_jobs=6, verbose=1)
gsGBC.fit(X_train, Y_train)
GBC_best = gsGBC.best_estimator_
# Best score
gsGBC.best_score_
Fitting 10 folds for each of 72 candidates, totalling 720 fits
Out[54]:
0.8388151174668028
In [55]:
# SVC classifier
SVMC = SVC(probability=True)
svc_param_grid = {'kernel': ['rbf'],
                  'gamma': [0.001, 0.01, 0.1, 1],
                  'C': [1, 10, 50, 100, 200, 300, 1000]}
gsSVMC = GridSearchCV(SVMC, param_grid=svc_param_grid, cv=kfold, scoring='accuracy', n_jobs=6, verbose=1)
gsSVMC.fit(X_train, Y_train)
SVMC_best = gsSVMC.best_estimator_
# Best score
gsSVMC.best_score_
Fitting 10 folds for each of 28 candidates, totalling 280 fits
Out[55]:
0.8331332992849847
In [58]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Generate a simple plot of the test and training learning curve."""
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
    plt.legend(loc='best')
    return plt
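Newer scikit-learn releases (1.2 and later) ship a built-in helper that draws essentially the same curves; a sketch assuming such a version is available:

from sklearn.model_selection import LearningCurveDisplay

# train/validation curves for one estimator, analogous to plot_learning_curve
LearningCurveDisplay.from_estimator(gsRFC.best_estimator_, X_train, Y_train, cv=kfold, n_jobs=-1)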
In [62]:
g = plot_learning_curve(gsRFC.best_estimator_, 'RF learning curve', X_train, Y_train, cv=kfold)
g = plot_learning_curve(gsExtC.best_estimator_, 'ExtraTrees learning curves', X_train, Y_train, cv=kfold)
g = plot_learning_curve(gsSVMC.best_estimator_, 'SVC learning curves', X_train, Y_train, cv=kfold)
g = plot_learning_curve(gsadaDTC.best_estimator_, 'AdaBoost learning curves', X_train, Y_train, cv=kfold)
g = plot_learning_curve(gsGBC.best_estimator_, 'GradientBoosting learning curves', X_train, Y_train, cv=kfold)
In [64]:
# Plot the feature importances of the tree-based models
nrows = ncols = 2
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex='all', figsize=(15, 15))
names_classifiers = [('AdaBoosting', ada_best), ('ExtraTrees', ExtC_best), ('RandomForest', RFC_best), ('GradientBoosting', GBC_best)]
nclassifier = 0
for row in range(nrows):
    for col in range(ncols):
        name = names_classifiers[nclassifier][0]
        classifier = names_classifiers[nclassifier][1]
        indices = np.argsort(classifier.feature_importances_)[::-1][:40]
        g = sns.barplot(y=X_train.columns[indices][:40], x=classifier.feature_importances_[indices][:40], orient='h', ax=axes[row][col])
        g.set_xlabel("Relative importance", fontsize=12)
        g.set_ylabel("Features", fontsize=12)
        g.tick_params(labelsize=9)
        g.set_title(name + ' feature importance')
        nclassifier += 1
In [67]:
test_Survived_RFC = pd.Series(RFC_best.predict(test), name='RFC')
test_Survived_ExtC = pd.Series(ExtC_best.predict(test), name='ExtC')
test_Survived_SVMC = pd.Series(SVMC_best.predict(test), name='SVC')
test_Survived_AdaC = pd.Series(ada_best.predict(test), name='Ada')
test_Survived_GBC = pd.Series(GBC_best.predict(test), name='GBC')
# Combine the predictions of all classifiers
ensemble_results = pd.concat([test_Survived_RFC, test_Survived_ExtC, test_Survived_AdaC, test_Survived_GBC, test_Survived_SVMC], axis=1)
g = sns.heatmap(ensemble_results.corr(), annot=True)
In [70]:
# Ensemble: soft voting over the five tuned classifiers
votingC = VotingClassifier(estimators=[('rfc', RFC_best), ('extc', ExtC_best), ('svc', SVMC_best), ('adac', ada_best), ('gbc', GBC_best)], voting='soft', n_jobs=6)
votingC = votingC.fit(X_train, Y_train)
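Before generating predictions, the voting ensemble itself can be scored with the same folds as its members; a sketch reusing the kfold defined above:

# cross-validated accuracy of the soft-voting ensemble
ensemble_cv = cross_val_score(votingC, X_train, y=Y_train, scoring='accuracy', cv=kfold, n_jobs=6)
print(ensemble_cv.mean(), ensemble_cv.std())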
In [71]:
# Predict with the ensemble
test_Survived = pd.Series(votingC.predict(test), name='Survived')
results = pd.concat([IDtest, test_Survived], axis=1)
results
Out[71]:
 | PassengerId | Survived |
---|---|---|
0 | 892 | 0 |
1 | 893 | 0 |
2 | 894 | 0 |
3 | 895 | 0 |
4 | 896 | 1 |
… | … | … |
413 | 1305 | 0 |
414 | 1306 | 1 |
415 | 1307 | 0 |
416 | 1308 | 0 |
417 | 1309 | 1 |
418 rows × 2 columns
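For a Kaggle submission, the usual final step is to write the two-column frame to CSV without the index; a sketch (the filename is arbitrary):

results.to_csv('titanic_ensemble_voting.csv', index=False)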
In [72]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:95% !important;}</style>"))