Read data and change its data type

data <- read.csv("titanicTrain.csv")
# 去除NA
# 整理資料格式
data <- data[1:1000,]
sapply(data, function(x) {sum(is.na(x))})
##    pclass  survived      name       sex       age     sibsp     parch 
##         0         0         0         0       139         0         0 
##    ticket      fare     cabin  embarked      boat      body home.dest 
##         0         0         0         0         0       905         0
data <- subset(data, select = -body)
data <- data[is.na(data$age)!=TRUE,]
data$fs <- as.integer(data$parch)+as.integer(data$sibsp)+1

data$boat <- as.character(data$boat)
data$survived <- as.factor(data$survived)
data$pclass <- as.factor(data$pclass)
data$sibsp <- as.factor(data$sibsp)
data$parch <- as.factor(data$parch)
data$fs <- as.factor(data$fs)
library(knitr)
kable(summary(data))
pclass survived name sex age sibsp parch ticket fare cabin embarked boat home.dest fs
1:284 0:480 Connolly, Miss. Kate : 2 : 0 Min. : 0.1667 0:554 0:635 CA 2144 : 8 Min. : 0.000 :599 : 2 Length:861 :179 1 :475
2:261 1:381 Kelly, Mr. James : 2 female:331 1st Qu.:22.0000 1:248 1:125 347077 : 7 1st Qu.: 9.688 C23 C25 C27 : 6 C:189 Class :character New York, NY : 55 2 :185
3:316 NA Abbing, Mr. Anthony : 1 male :530 Median :29.0000 2: 34 2: 85 347082 : 7 Median : 21.000 B57 B59 B63 B66: 5 Q: 41 Mode :character London : 14 3 :120
NA NA Abbott, Master. Eugene Joseph : 1 NA Mean :31.0842 3: 7 3: 8 PC 17608 : 7 Mean : 41.756 B96 B98 : 4 S:629 NA Montreal, PQ : 10 4 : 38
NA NA Abbott, Mr. Rossmore Edward : 1 NA 3rd Qu.:40.0000 4: 12 4: 2 S.O.C. 14879: 7 3rd Qu.: 49.500 C22 C26 : 4 NA NA Cornwall / Akron, OH : 9 7 : 16
NA NA Abbott, Mrs. Stanton (Rosa Hunt): 1 NA Max. :80.0000 5: 6 5: 4 113781 : 6 Max. :512.329 C78 : 4 NA NA Wiltshire, England Niagara Falls, NY: 8 5 : 12
NA NA (Other) :853 NA NA NA 6: 2 (Other) :819 NA (Other) :239 NA NA (Other) :586 (Other): 15

Use EDA to analyze the data.

Summary:

# EDA
library(ggplot2)

library(gridExtra)

ggplot(data, aes(x = survived, fill = survived)) +
  geom_bar(stat='count', position='dodge') +
  labs(x = 'How many people died and survived on the Titanic?') +
  geom_label(stat='count',aes(label=..count..), size=3) +
  theme_grey(base_size = 10)

性別與艙等影響

性別和艙等的EDA分析 發現class1和2女性活下來的比例相當高,而class2和3男性活下來的比例最低

#------- gender/sex-----------
b1 <- ggplot(data, aes(x = pclass)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=3) +
  facet_grid(.~sex) +
  theme_grey(base_size = 10)

b2 <-
  ggplot(data, aes(x = pclass, fill = survived)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=3, position = position_dodge(0.9)) +
  labs(x = 'pclass', y= "count") + facet_grid(.~sex) +
  theme(legend.position="none") + theme_grey()



# -----------pclass and sex-------------
### ----------class1.2女性活下來的比例相當高,class2.3男性活下來的比例最低
b3 <-
  ggplot(data, aes(x = pclass, fill = survived)) +
  geom_bar(stat='count', position='fill') +
  labs(x = 'pclass', y= "Percent") + facet_grid(.~sex) +
  theme(legend.position="none") + theme_grey()



grid.arrange(b1,b2,nrow=1)

b3

艙等

而單就艙等來評估的話,雖然各艙等的人數並無明顯差距,1等艙的旅客存活人數佔多數,而3等艙的旅客死亡人數則佔多數,2等艙則是較無明顯區別

#------ pclass ----------
ggplot(data, aes(x = survived, fill = pclass)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=3, position=position_dodge(0.9)) +
  labs(x = 'class') +
  theme_grey(base_size = 10)

b1 <- ggplot(data, aes(x = pclass, fill = pclass)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=3) +
  theme_grey(base_size = 10)

b2 <-
  ggplot(data, aes(x = pclass, fill = survived)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=3, position=position_dodge(0.9)) +
  theme_grey(base_size = 10)
grid.arrange(b1,b2, nrow=1)

性別與兄弟姊妹的影響

#-----sibsp-----------

length(data$age[data$survived==1 & data$sibsp!=0 & data$parch!=0& data$sex=="female"])/length(data$age[data$survived==1 & data$sibsp!=0 & data$parch!=0])
## [1] 0.7228916
length(data$age[data$survived==0 & data$sibsp!=0 & data$parch!=0& data$sex=="male"])/length(data$age[data$survived==0 & data$sibsp!=0 & data$parch!=0])
## [1] 0.6875
# total statistics
b1 <- ggplot(data, aes(x = sibsp, fill = sibsp)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=1) +
  theme_grey(base_size = 15)
# statistics showing survival with sex difference
b2 <-
  ggplot(data, aes(x = sibsp, fill = survived)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=1, position=position_dodge(0.9)) +
  facet_grid(.~sex) +
  theme_grey(base_size = 15)

# total statistics with sex difference
b3 <-
  ggplot(data, aes(x = sibsp, fill = sex)) +
  geom_bar(stat='count', position = 'dodge') +
  geom_label(stat = 'count', aes(label =..count..), size = 1, 
             position=position_dodge(0.9))+
  theme_grey(base_size = 15)

# percent statistics showing survival with sex difference
b4 <- 
  ggplot(data, aes(x = sibsp, fill = survived)) +
  geom_bar(stat='count', position='fill') +
  labs(x = 'sibsp', y= "Percent") + facet_grid(.~sex) +
  theme(legend.position="none") + theme_grey()

grid.arrange(b1,b2,b3,b4, nrow=2)

左上圖為總旅客兄弟姊妹個數的柱狀圖分布情形 左下圖為總旅客兄弟姊妹個數以性別分類的柱狀圖分布情形 右上圖和右下圖為性別與兄弟姊妹個數和存活與否的關係,顯示男性存活比率都偏低;而女性的中若手足和配偶數量加起來不超過3,存活機率很高,幾乎都有達到80% 活下且有兄弟姊妹親人是女性的比例是

男性生存比例偏低,但若在sibsp為1和2之間的生存率有接近0.5 女性的生存率在sibsp為5以前都很高。

有雙親和小孩的影響

注意: parch為parch為2以前的樣本量較多。

#----- parch-------------

ggplot(data, aes(x = parch, fill = survived)) +
  geom_bar(stat='count', position='fill') +
  labs(x = 'parch', y= "Percent") + facet_grid(.~sex) +
  theme(legend.position="none") + theme_grey()

男性生存比例都偏低,但若有雙親和小孩人數為1至2人的生存率有接近0.5 女性的生存率在sibsp為5以前都很高,但因人數太少所以參考價值不大

b1 <- ggplot(data, aes(x = parch, fill = parch)) +
  geom_bar(stat='count', position='dodge') +
  facet_grid(.~sex) +
  geom_label(stat='count',aes(label=..count..), size=2) +
  theme_grey(base_size = 15)

b2 <-
  ggplot(data, aes(x = parch, fill = survived)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=2, position=position_dodge(0.9)) +
  facet_grid(.~sex) +
  theme_grey(base_size = 15)



grid.arrange(b1,b2, nrow=1)

## 家庭人數的影響

# ------- family size -------

# total statistics
b1 <- ggplot(data, aes(x = fs, fill = fs)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=1) +
  theme_grey(base_size = 15)
# statistics showing survival with sex difference
b2 <-
  ggplot(data, aes(x = fs, fill = survived)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=1, position=position_dodge(0.9)) +
  facet_grid(.~sex) +
  theme_grey(base_size = 15)

# total statistics with sex difference
b3 <-
  ggplot(data, aes(x = fs, fill = sex)) +
  geom_bar(stat='count', position = 'dodge') +
  geom_label(stat = 'count', aes(label =..count..), size = 1, 
             position=position_dodge(0.9))+
  theme_grey(base_size = 15)

# percent statistics showing survival with sex difference
b4 <- 
  ggplot(data, aes(x = fs, fill = survived)) +
  geom_bar(stat='count', position='fill') +
  labs(x = 'sibsp', y= "Percent") + facet_grid(.~sex) +
  theme(legend.position="none") + theme_grey()

grid.arrange(b1,b2,b3,b4, nrow=2)

family size為4的男生存活率最高,大概都是接近0.5 family size為4的女生存活率也最高

年齡

男性存活率最高的在5歲以下 女性則是大部分都存活率極高

# ------------ age----------
ggplot(data, aes(x = age, fill = survived)) +
  geom_histogram(aes(fill=factor(survived))) + labs(title="Survival density, known-ages, and sex") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 12)) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 12)) +
  theme_grey() + facet_grid(.~sex)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

船票票價

#-------fare--------

ggplot(data, aes(x = fare, fill = survived)) +
  geom_histogram(aes(fill=factor(survived))) + labs(title="Survival density, known-ages, and sex") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 12)) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 12)) +
  theme_grey() + facet_grid(.~sex)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#兄弟姊妹對存活率的影響

# sibsp的影響
data_sur <- data[data$survived==1, ]
data_un <- data[data$survived==0, ]

par(mfrow=c(1,3))
boxplot(as.numeric(as.character(data$sibsp)), range=2, main="sibsp")
boxplot(as.numeric(as.character(data_sur$sibsp)), col="red", main="survived sibsp")
boxplot(as.numeric(as.character(data_un$sibsp)), col="blue", main="unsurvived sibsp")

kable(summary(data_sur$sibsp))
x
0 221
1 135
2 16
3 6
4 3
5 0
kable(summary(data_un$sibsp))
x
0 333
1 113
2 18
3 1
4 9
5 6
# 存活比例在各兄弟姊妹數量的分布
kable(table(data_sur$sibsp)/length(data_sur$sibsp)*100)
Var1 Freq
0 58.0052493
1 35.4330709
2 4.1994751
3 1.5748031
4 0.7874016
5 0.0000000
# 死亡比例在各兄弟姊妹數量的分布
kable(table(data_un$sibsp)/length(data_un$sibsp)*100)
Var1 Freq
0 69.3750000
1 23.5416667
2 3.7500000
3 0.2083333
4 1.8750000
5 1.2500000
# 各兄弟姊妹數量的分布
kable(table(data$sibsp)/length(data$sibsp)*100)
Var1 Freq
0 64.3437863
1 28.8037166
2 3.9488966
3 0.8130081
4 1.3937282
5 0.6968641
看似兄弟 姊妹的數量並無明顯對存活率的影響

年齡和票價

par(pch=19)
plot(data$age, data$fare)
points(data$age[data$survived==1], data$fare[data$survived==1], col="red") #活下
points(data$age[data$survived==1 & data$sibsp!=0 & data$parch!=0], 
       data$fare[data$survived==1 & data$sibsp!=0 & data$parch!=0], col="yellow")
points(data$age[data$survived==1 & data$sibsp!=0 & data$parch!=0 & data$sex=="female"], data$fare[data$survived==1 & data$sibsp!=0 & data$parch!=0 & data$sex=="female"], 
       col="green")
legend("topright", legend=c("活下來",paste0("活下來","\n","有sibsp&parch"),paste0("活下來","\n","有sibsp&parch+女性")), col=c("red","yellow","green"),pch=19)

最高票價的2人都有活下來,但式除了最高票價以外,其實票價的多寡對存活率的影響並會太高

SVM

library(e1071)

dataas <- subset(data, select=c(pclass,age,sibsp,parch,survived))
dataa <- subset(data, select=c(pclass,age,sibsp,parch))
model <- svm(survived ~ ., data = dataas)
pred_result <- predict(model, dataa)
cm <- table(pred_result, data$survived)   
# accuracy
kable((cm[1]+cm[4])/sum(cm)*100)  
x
68.64111

準確率68.6%