titanic SVM

Read data and change its data type

data <- read.csv("titanicTrain.csv")
# 去除NA
# 整理資料格式
data <- data[1:1000,]
sapply(data, function(x) {sum(is.na(x))})

##    pclass  survived      name       sex       age     sibsp     parch 
##         0         0         0         0       139         0         0 
##    ticket      fare     cabin  embarked      boat      body home.dest 
##         0         0         0         0         0       905         0

data <- subset(data, select = -body)
data <- data[is.na(data$age)!=TRUE,]
data$fs <- as.integer(data$parch)+as.integer(data$sibsp)+1

data$boat <- as.character(data$boat)
data$survived <- as.factor(data$survived)
data$pclass <- as.factor(data$pclass)
data$sibsp <- as.factor(data$sibsp)
data$parch <- as.factor(data$parch)
data$fs <- as.factor(data$fs)
library(knitr)
kable(summary(data))

pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	home.dest	fs
1:284	0:480	Connolly, Miss. Kate : 2	: 0	Min. : 0.1667	0:554	0:635	CA 2144 : 8	Min. : 0.000	:599	: 2	Length:861	:179	1 :475
2:261	1:381	Kelly, Mr. James : 2	female:331	1st Qu.:22.0000	1:248	1:125	347077 : 7	1st Qu.: 9.688	C23 C25 C27 : 6	C:189	Class :character	New York, NY : 55	2 :185
3:316	NA	Abbing, Mr. Anthony : 1	male :530	Median :29.0000	2: 34	2: 85	347082 : 7	Median : 21.000	B57 B59 B63 B66: 5	Q: 41	Mode :character	London : 14	3 :120
NA	NA	Abbott, Master. Eugene Joseph : 1	NA	Mean :31.0842	3: 7	3: 8	PC 17608 : 7	Mean : 41.756	B96 B98 : 4	S:629	NA	Montreal, PQ : 10	4 : 38
NA	NA	Abbott, Mr. Rossmore Edward : 1	NA	3rd Qu.:40.0000	4: 12	4: 2	S.O.C. 14879: 7	3rd Qu.: 49.500	C22 C26 : 4	NA	NA	Cornwall / Akron, OH : 9	7 : 16
NA	NA	Abbott, Mrs. Stanton (Rosa Hunt): 1	NA	Max. :80.0000	5: 6	5: 4	113781 : 6	Max. :512.329	C78 : 4	NA	NA	Wiltshire, England Niagara Falls, NY: 8	5 : 12
NA	NA	(Other) :853	NA	NA	NA	6: 2	(Other) :819	NA	(Other) :239	NA	NA	(Other) :586	(Other): 15

Use EDA to analyze the data.

Summary:

# EDA
library(ggplot2)

library(gridExtra)

ggplot(data, aes(x = survived, fill = survived)) +
  geom_bar(stat='count', position='dodge') +
  labs(x = 'How many people died and survived on the Titanic?') +
  geom_label(stat='count',aes(label=..count..), size=3) +
  theme_grey(base_size = 10)

性別與艙等影響

性別和艙等的EDA分析發現class1和2女性活下來的比例相當高，而class2和3男性活下來的比例最低

#------- gender/sex-----------
b1 <- ggplot(data, aes(x = pclass)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=3) +
  facet_grid(.~sex) +
  theme_grey(base_size = 10)

b2 <-
  ggplot(data, aes(x = pclass, fill = survived)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=3, position = position_dodge(0.9)) +
  labs(x = 'pclass', y= "count") + facet_grid(.~sex) +
  theme(legend.position="none") + theme_grey()



# -----------pclass and sex-------------
### ----------class1.2女性活下來的比例相當高，class2.3男性活下來的比例最低
b3 <-
  ggplot(data, aes(x = pclass, fill = survived)) +
  geom_bar(stat='count', position='fill') +
  labs(x = 'pclass', y= "Percent") + facet_grid(.~sex) +
  theme(legend.position="none") + theme_grey()



grid.arrange(b1,b2,nrow=1)

b3

艙等

而單就艙等來評估的話，雖然各艙等的人數並無明顯差距，1等艙的旅客存活人數佔多數，而3等艙的旅客死亡人數則佔多數，2等艙則是較無明顯區別

#------ pclass ----------
ggplot(data, aes(x = survived, fill = pclass)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=3, position=position_dodge(0.9)) +
  labs(x = 'class') +
  theme_grey(base_size = 10)

b1 <- ggplot(data, aes(x = pclass, fill = pclass)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=3) +
  theme_grey(base_size = 10)

b2 <-
  ggplot(data, aes(x = pclass, fill = survived)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=3, position=position_dodge(0.9)) +
  theme_grey(base_size = 10)
grid.arrange(b1,b2, nrow=1)

性別與兄弟姊妹的影響

#-----sibsp-----------

length(data$age[data$survived==1 & data$sibsp!=0 & data$parch!=0& data$sex=="female"])/length(data$age[data$survived==1 & data$sibsp!=0 & data$parch!=0])

## [1] 0.7228916

length(data$age[data$survived==0 & data$sibsp!=0 & data$parch!=0& data$sex=="male"])/length(data$age[data$survived==0 & data$sibsp!=0 & data$parch!=0])

## [1] 0.6875

# total statistics
b1 <- ggplot(data, aes(x = sibsp, fill = sibsp)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=1) +
  theme_grey(base_size = 15)
# statistics showing survival with sex difference
b2 <-
  ggplot(data, aes(x = sibsp, fill = survived)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=1, position=position_dodge(0.9)) +
  facet_grid(.~sex) +
  theme_grey(base_size = 15)

# total statistics with sex difference
b3 <-
  ggplot(data, aes(x = sibsp, fill = sex)) +
  geom_bar(stat='count', position = 'dodge') +
  geom_label(stat = 'count', aes(label =..count..), size = 1, 
             position=position_dodge(0.9))+
  theme_grey(base_size = 15)

# percent statistics showing survival with sex difference
b4 <- 
  ggplot(data, aes(x = sibsp, fill = survived)) +
  geom_bar(stat='count', position='fill') +
  labs(x = 'sibsp', y= "Percent") + facet_grid(.~sex) +
  theme(legend.position="none") + theme_grey()

grid.arrange(b1,b2,b3,b4, nrow=2)

左上圖為總旅客兄弟姊妹個數的柱狀圖分布情形左下圖為總旅客兄弟姊妹個數以性別分類的柱狀圖分布情形右上圖和右下圖為性別與兄弟姊妹個數和存活與否的關係，顯示男性存活比率都偏低；而女性的中若手足和配偶數量加起來不超過3，存活機率很高，幾乎都有達到80% 活下且有兄弟姊妹親人是女性的比例是

男性生存比例偏低，但若在sibsp為1和2之間的生存率有接近0.5 女性的生存率在sibsp為5以前都很高。

有雙親和小孩的影響

注意: parch為parch為2以前的樣本量較多。

#----- parch-------------

ggplot(data, aes(x = parch, fill = survived)) +
  geom_bar(stat='count', position='fill') +
  labs(x = 'parch', y= "Percent") + facet_grid(.~sex) +
  theme(legend.position="none") + theme_grey()

男性生存比例都偏低，但若有雙親和小孩人數為1至2人的生存率有接近0.5 女性的生存率在sibsp為5以前都很高，但因人數太少所以參考價值不大

b1 <- ggplot(data, aes(x = parch, fill = parch)) +
  geom_bar(stat='count', position='dodge') +
  facet_grid(.~sex) +
  geom_label(stat='count',aes(label=..count..), size=2) +
  theme_grey(base_size = 15)

b2 <-
  ggplot(data, aes(x = parch, fill = survived)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=2, position=position_dodge(0.9)) +
  facet_grid(.~sex) +
  theme_grey(base_size = 15)



grid.arrange(b1,b2, nrow=1)

## 家庭人數的影響

# ------- family size -------

# total statistics
b1 <- ggplot(data, aes(x = fs, fill = fs)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=1) +
  theme_grey(base_size = 15)
# statistics showing survival with sex difference
b2 <-
  ggplot(data, aes(x = fs, fill = survived)) +
  geom_bar(stat='count', position='dodge') +
  geom_label(stat='count',aes(label=..count..), size=1, position=position_dodge(0.9)) +
  facet_grid(.~sex) +
  theme_grey(base_size = 15)

# total statistics with sex difference
b3 <-
  ggplot(data, aes(x = fs, fill = sex)) +
  geom_bar(stat='count', position = 'dodge') +
  geom_label(stat = 'count', aes(label =..count..), size = 1, 
             position=position_dodge(0.9))+
  theme_grey(base_size = 15)

# percent statistics showing survival with sex difference
b4 <- 
  ggplot(data, aes(x = fs, fill = survived)) +
  geom_bar(stat='count', position='fill') +
  labs(x = 'sibsp', y= "Percent") + facet_grid(.~sex) +
  theme(legend.position="none") + theme_grey()

grid.arrange(b1,b2,b3,b4, nrow=2)

family size為4的男生存活率最高，大概都是接近0.5 family size為4的女生存活率也最高

年齡

男性存活率最高的在5歲以下女性則是大部分都存活率極高

# ------------ age----------
ggplot(data, aes(x = age, fill = survived)) +
  geom_histogram(aes(fill=factor(survived))) + labs(title="Survival density, known-ages, and sex") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 12)) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 12)) +
  theme_grey() + facet_grid(.~sex)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

船票票價

#-------fare--------

ggplot(data, aes(x = fare, fill = survived)) +
  geom_histogram(aes(fill=factor(survived))) + labs(title="Survival density, known-ages, and sex") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 12)) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 12)) +
  theme_grey() + facet_grid(.~sex)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#兄弟姊妹對存活率的影響

# sibsp的影響
data_sur <- data[data$survived==1, ]
data_un <- data[data$survived==0, ]

par(mfrow=c(1,3))
boxplot(as.numeric(as.character(data$sibsp)), range=2, main="sibsp")
boxplot(as.numeric(as.character(data_sur$sibsp)), col="red", main="survived sibsp")
boxplot(as.numeric(as.character(data_un$sibsp)), col="blue", main="unsurvived sibsp")

kable(summary(data_sur$sibsp))

	x
0	221
1	135
2	16
3	6
4	3
5	0

kable(summary(data_un$sibsp))

	x
0	333
1	113
2	18
3	1
4	9
5	6

# 存活比例在各兄弟姊妹數量的分布
kable(table(data_sur$sibsp)/length(data_sur$sibsp)*100)

Var1	Freq
0	58.0052493
1	35.4330709
2	4.1994751
3	1.5748031
4	0.7874016
5	0.0000000

# 死亡比例在各兄弟姊妹數量的分布
kable(table(data_un$sibsp)/length(data_un$sibsp)*100)

Var1	Freq
0	69.3750000
1	23.5416667
2	3.7500000
3	0.2083333
4	1.8750000
5	1.2500000

# 各兄弟姊妹數量的分布
kable(table(data$sibsp)/length(data$sibsp)*100)

Var1	Freq
0	64.3437863
1	28.8037166
2	3.9488966
3	0.8130081
4	1.3937282
5	0.6968641
看似兄弟	姊妹的數量並無明顯對存活率的影響

年齡和票價

par(pch=19)
plot(data$age, data$fare)
points(data$age[data$survived==1], data$fare[data$survived==1], col="red") #活下
points(data$age[data$survived==1 & data$sibsp!=0 & data$parch!=0], 
       data$fare[data$survived==1 & data$sibsp!=0 & data$parch!=0], col="yellow")
points(data$age[data$survived==1 & data$sibsp!=0 & data$parch!=0 & data$sex=="female"], data$fare[data$survived==1 & data$sibsp!=0 & data$parch!=0 & data$sex=="female"], 
       col="green")
legend("topright", legend=c("活下來",paste0("活下來","\n","有sibsp&parch"),paste0("活下來","\n","有sibsp&parch+女性")), col=c("red","yellow","green"),pch=19)

最高票價的2人都有活下來，但式除了最高票價以外，其實票價的多寡對存活率的影響並會太高

SVM

library(e1071)

dataas <- subset(data, select=c(pclass,age,sibsp,parch,survived))
dataa <- subset(data, select=c(pclass,age,sibsp,parch))
model <- svm(survived ~ ., data = dataas)
pred_result <- predict(model, dataa)
cm <- table(pred_result, data$survived)   
# accuracy
kable((cm[1]+cm[4])/sum(cm)*100)

x
68.64111

準確率68.6%