[Machine Learning] 타이타닉 생존자 머신러닝 -1-
In [4]:
1
2
df_train=pd.read_csv('Dataset/Titanic/train.csv')
df_test=pd.read_csv('Dataset/Titanic/test.csv')
In [5]:
1
df_train.head()
Out[5]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th… | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
In [6]:
1
df_train.describe()
Out[6]:
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
In [7]:
1
df_test.describe()
Out[7]:
PassengerId | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|
count | 418.000000 | 418.000000 | 332.000000 | 418.000000 | 418.000000 | 417.000000 |
mean | 1100.500000 | 2.265550 | 30.272590 | 0.447368 | 0.392344 | 35.627188 |
std | 120.810458 | 0.841838 | 14.181209 | 0.896760 | 0.981429 | 55.907576 |
min | 892.000000 | 1.000000 | 0.170000 | 0.000000 | 0.000000 | 0.000000 |
25% | 996.250000 | 1.000000 | 21.000000 | 0.000000 | 0.000000 | 7.895800 |
50% | 1100.500000 | 3.000000 | 27.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 1204.750000 | 3.000000 | 39.000000 | 1.000000 | 0.000000 | 31.500000 |
max | 1309.000000 | 3.000000 | 76.000000 | 8.000000 | 9.000000 | 512.329200 |
In [15]:
1
2
3
for col in df_test.columns:
msg = "column: {:>10}\t Percent of NaN value: {:.2f}%".format(col,100*(df_test[col].isnull().sum()/df_test[col].shape[0]))
print(msg)
1
2
3
4
5
6
7
8
9
10
11
column: PassengerId Percent of NaN value: 0.00%
column: Pclass Percent of NaN value: 0.00%
column: Name Percent of NaN value: 0.00%
column: Sex Percent of NaN value: 0.00%
column: Age Percent of NaN value: 20.57%
column: SibSp Percent of NaN value: 0.00%
column: Parch Percent of NaN value: 0.00%
column: Ticket Percent of NaN value: 0.00%
column: Fare Percent of NaN value: 0.24%
column: Cabin Percent of NaN value: 78.23%
column: Embarked Percent of NaN value: 0.00%
In [16]:
1
2
3
for col in df_train.columns:
msg = "column: {:>10}\t Percent of NaN value: {:.2f}%".format(col,100*(df_train[col].isnull().sum()/df_train[col].shape[0]))
print(msg)
1
2
3
4
5
6
7
8
9
10
11
12
column: PassengerId Percent of NaN value: 0.00%
column: Survived Percent of NaN value: 0.00%
column: Pclass Percent of NaN value: 0.00%
column: Name Percent of NaN value: 0.00%
column: Sex Percent of NaN value: 0.00%
column: Age Percent of NaN value: 19.87%
column: SibSp Percent of NaN value: 0.00%
column: Parch Percent of NaN value: 0.00%
column: Ticket Percent of NaN value: 0.00%
column: Fare Percent of NaN value: 0.00%
column: Cabin Percent of NaN value: 77.10%
column: Embarked Percent of NaN value: 0.22%
In [17]:
1
msno.matrix(df=df_train.iloc[:,:], figsize=(8,8), color=(0.8,0.5,0.2))
Out[17]:
1
<AxesSubplot:>
In [18]:
1
msno.bar(df=df_train.iloc[:,:],figsize=(8,8),color=(0.8,0.5,0.2))
Out[18]:
1
<AxesSubplot:>
In [19]:
1
2
3
4
5
6
7
8
9
f,ax=plt.subplots(1,2,figsize=(18,8))
df_train['Survived'].value_counts().plot.pie(explode=[0,0.1], autopct='%1.1f%%',ax=ax[0],shadow=True)
ax[0].set_title('Pie plot - Survived')
ax[0].set_ylabel('')
sns.countplot("Survived",data=df_train,ax=ax[1])
ax[1].set_title("Count plot - Survived")
plt.show()
In [1]:
1
2
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))
1
2
C:\Users\easyd\AppData\Local\Temp\ipykernel_28056\3510566465.py:1: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display
from IPython.core.display import display, HTML
This post is licensed under CC BY 4.0 by the author.
Comments powered by Disqus.