data <- read.csv("titanicTrain.csv")
# 去除NA
# 整理資料格式
data <- data[1:1000,]
sapply(data, function(x) {sum(is.na(x))})
## pclass survived name sex age sibsp parch
## 0 0 0 0 139 0 0
## ticket fare cabin embarked boat body home.dest
## 0 0 0 0 0 905 0
data <- subset(data, select = -body)
data <- data[is.na(data$age)!=TRUE,]
data$fs <- as.integer(data$parch)+as.integer(data$sibsp)+1
data$boat <- as.character(data$boat)
data$survived <- as.factor(data$survived)
data$pclass <- as.factor(data$pclass)
data$sibsp <- as.factor(data$sibsp)
data$parch <- as.factor(data$parch)
data$fs <- as.factor(data$fs)
library(knitr)
kable(summary(data))
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | home.dest | fs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1:284 | 0:480 | Connolly, Miss. Kate : 2 | : 0 | Min. : 0.1667 | 0:554 | 0:635 | CA 2144 : 8 | Min. : 0.000 | :599 | : 2 | Length:861 | :179 | 1 :475 | |
2:261 | 1:381 | Kelly, Mr. James : 2 | female:331 | 1st Qu.:22.0000 | 1:248 | 1:125 | 347077 : 7 | 1st Qu.: 9.688 | C23 C25 C27 : 6 | C:189 | Class :character | New York, NY : 55 | 2 :185 | |
3:316 | NA | Abbing, Mr. Anthony : 1 | male :530 | Median :29.0000 | 2: 34 | 2: 85 | 347082 : 7 | Median : 21.000 | B57 B59 B63 B66: 5 | Q: 41 | Mode :character | London : 14 | 3 :120 | |
NA | NA | Abbott, Master. Eugene Joseph : 1 | NA | Mean :31.0842 | 3: 7 | 3: 8 | PC 17608 : 7 | Mean : 41.756 | B96 B98 : 4 | S:629 | NA | Montreal, PQ : 10 | 4 : 38 | |
NA | NA | Abbott, Mr. Rossmore Edward : 1 | NA | 3rd Qu.:40.0000 | 4: 12 | 4: 2 | S.O.C. 14879: 7 | 3rd Qu.: 49.500 | C22 C26 : 4 | NA | NA | Cornwall / Akron, OH : 9 | 7 : 16 | |
NA | NA | Abbott, Mrs. Stanton (Rosa Hunt): 1 | NA | Max. :80.0000 | 5: 6 | 5: 4 | 113781 : 6 | Max. :512.329 | C78 : 4 | NA | NA | Wiltshire, England Niagara Falls, NY: 8 | 5 : 12 | |
NA | NA | (Other) :853 | NA | NA | NA | 6: 2 | (Other) :819 | NA | (Other) :239 | NA | NA | (Other) :586 | (Other): 15 |
Use EDA to analyze the data.
# EDA
library(ggplot2)
library(gridExtra)
ggplot(data, aes(x = survived, fill = survived)) +
geom_bar(stat='count', position='dodge') +
labs(x = 'How many people died and survived on the Titanic?') +
geom_label(stat='count',aes(label=..count..), size=3) +
theme_grey(base_size = 10)
性別和艙等的EDA分析 發現class1和2女性活下來的比例相當高,而class2和3男性活下來的比例最低
#------- gender/sex-----------
b1 <- ggplot(data, aes(x = pclass)) +
geom_bar(stat='count', position='dodge') +
geom_label(stat='count',aes(label=..count..), size=3) +
facet_grid(.~sex) +
theme_grey(base_size = 10)
b2 <-
ggplot(data, aes(x = pclass, fill = survived)) +
geom_bar(stat='count', position='dodge') +
geom_label(stat='count',aes(label=..count..), size=3, position = position_dodge(0.9)) +
labs(x = 'pclass', y= "count") + facet_grid(.~sex) +
theme(legend.position="none") + theme_grey()
# -----------pclass and sex-------------
### ----------class1.2女性活下來的比例相當高,class2.3男性活下來的比例最低
b3 <-
ggplot(data, aes(x = pclass, fill = survived)) +
geom_bar(stat='count', position='fill') +
labs(x = 'pclass', y= "Percent") + facet_grid(.~sex) +
theme(legend.position="none") + theme_grey()
grid.arrange(b1,b2,nrow=1)
b3
而單就艙等來評估的話,雖然各艙等的人數並無明顯差距,1等艙的旅客存活人數佔多數,而3等艙的旅客死亡人數則佔多數,2等艙則是較無明顯區別
#------ pclass ----------
ggplot(data, aes(x = survived, fill = pclass)) +
geom_bar(stat='count', position='dodge') +
geom_label(stat='count',aes(label=..count..), size=3, position=position_dodge(0.9)) +
labs(x = 'class') +
theme_grey(base_size = 10)
b1 <- ggplot(data, aes(x = pclass, fill = pclass)) +
geom_bar(stat='count', position='dodge') +
geom_label(stat='count',aes(label=..count..), size=3) +
theme_grey(base_size = 10)
b2 <-
ggplot(data, aes(x = pclass, fill = survived)) +
geom_bar(stat='count', position='dodge') +
geom_label(stat='count',aes(label=..count..), size=3, position=position_dodge(0.9)) +
theme_grey(base_size = 10)
grid.arrange(b1,b2, nrow=1)
#-----sibsp-----------
length(data$age[data$survived==1 & data$sibsp!=0 & data$parch!=0& data$sex=="female"])/length(data$age[data$survived==1 & data$sibsp!=0 & data$parch!=0])
## [1] 0.7228916
length(data$age[data$survived==0 & data$sibsp!=0 & data$parch!=0& data$sex=="male"])/length(data$age[data$survived==0 & data$sibsp!=0 & data$parch!=0])
## [1] 0.6875
# total statistics
b1 <- ggplot(data, aes(x = sibsp, fill = sibsp)) +
geom_bar(stat='count', position='dodge') +
geom_label(stat='count',aes(label=..count..), size=1) +
theme_grey(base_size = 15)
# statistics showing survival with sex difference
b2 <-
ggplot(data, aes(x = sibsp, fill = survived)) +
geom_bar(stat='count', position='dodge') +
geom_label(stat='count',aes(label=..count..), size=1, position=position_dodge(0.9)) +
facet_grid(.~sex) +
theme_grey(base_size = 15)
# total statistics with sex difference
b3 <-
ggplot(data, aes(x = sibsp, fill = sex)) +
geom_bar(stat='count', position = 'dodge') +
geom_label(stat = 'count', aes(label =..count..), size = 1,
position=position_dodge(0.9))+
theme_grey(base_size = 15)
# percent statistics showing survival with sex difference
b4 <-
ggplot(data, aes(x = sibsp, fill = survived)) +
geom_bar(stat='count', position='fill') +
labs(x = 'sibsp', y= "Percent") + facet_grid(.~sex) +
theme(legend.position="none") + theme_grey()
grid.arrange(b1,b2,b3,b4, nrow=2)
左上圖為總旅客兄弟姊妹個數的柱狀圖分布情形 左下圖為總旅客兄弟姊妹個數以性別分類的柱狀圖分布情形 右上圖和右下圖為性別與兄弟姊妹個數和存活與否的關係,顯示男性存活比率都偏低;而女性的中若手足和配偶數量加起來不超過3,存活機率很高,幾乎都有達到80% 活下且有兄弟姊妹親人是女性的比例是
男性生存比例偏低,但若在sibsp為1和2之間的生存率有接近0.5 女性的生存率在sibsp為5以前都很高。
注意: parch為parch為2以前的樣本量較多。
#----- parch-------------
ggplot(data, aes(x = parch, fill = survived)) +
geom_bar(stat='count', position='fill') +
labs(x = 'parch', y= "Percent") + facet_grid(.~sex) +
theme(legend.position="none") + theme_grey()
男性生存比例都偏低,但若有雙親和小孩人數為1至2人的生存率有接近0.5 女性的生存率在sibsp為5以前都很高,但因人數太少所以參考價值不大
b1 <- ggplot(data, aes(x = parch, fill = parch)) +
geom_bar(stat='count', position='dodge') +
facet_grid(.~sex) +
geom_label(stat='count',aes(label=..count..), size=2) +
theme_grey(base_size = 15)
b2 <-
ggplot(data, aes(x = parch, fill = survived)) +
geom_bar(stat='count', position='dodge') +
geom_label(stat='count',aes(label=..count..), size=2, position=position_dodge(0.9)) +
facet_grid(.~sex) +
theme_grey(base_size = 15)
grid.arrange(b1,b2, nrow=1)
## 家庭人數的影響
# ------- family size -------
# total statistics
b1 <- ggplot(data, aes(x = fs, fill = fs)) +
geom_bar(stat='count', position='dodge') +
geom_label(stat='count',aes(label=..count..), size=1) +
theme_grey(base_size = 15)
# statistics showing survival with sex difference
b2 <-
ggplot(data, aes(x = fs, fill = survived)) +
geom_bar(stat='count', position='dodge') +
geom_label(stat='count',aes(label=..count..), size=1, position=position_dodge(0.9)) +
facet_grid(.~sex) +
theme_grey(base_size = 15)
# total statistics with sex difference
b3 <-
ggplot(data, aes(x = fs, fill = sex)) +
geom_bar(stat='count', position = 'dodge') +
geom_label(stat = 'count', aes(label =..count..), size = 1,
position=position_dodge(0.9))+
theme_grey(base_size = 15)
# percent statistics showing survival with sex difference
b4 <-
ggplot(data, aes(x = fs, fill = survived)) +
geom_bar(stat='count', position='fill') +
labs(x = 'sibsp', y= "Percent") + facet_grid(.~sex) +
theme(legend.position="none") + theme_grey()
grid.arrange(b1,b2,b3,b4, nrow=2)
family size為4的男生存活率最高,大概都是接近0.5 family size為4的女生存活率也最高
男性存活率最高的在5歲以下 女性則是大部分都存活率極高
# ------------ age----------
ggplot(data, aes(x = age, fill = survived)) +
geom_histogram(aes(fill=factor(survived))) + labs(title="Survival density, known-ages, and sex") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 12)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 12)) +
theme_grey() + facet_grid(.~sex)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#-------fare--------
ggplot(data, aes(x = fare, fill = survived)) +
geom_histogram(aes(fill=factor(survived))) + labs(title="Survival density, known-ages, and sex") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 12)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 12)) +
theme_grey() + facet_grid(.~sex)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#兄弟姊妹對存活率的影響
# sibsp的影響
data_sur <- data[data$survived==1, ]
data_un <- data[data$survived==0, ]
par(mfrow=c(1,3))
boxplot(as.numeric(as.character(data$sibsp)), range=2, main="sibsp")
boxplot(as.numeric(as.character(data_sur$sibsp)), col="red", main="survived sibsp")
boxplot(as.numeric(as.character(data_un$sibsp)), col="blue", main="unsurvived sibsp")
kable(summary(data_sur$sibsp))
x | |
---|---|
0 | 221 |
1 | 135 |
2 | 16 |
3 | 6 |
4 | 3 |
5 | 0 |
kable(summary(data_un$sibsp))
x | |
---|---|
0 | 333 |
1 | 113 |
2 | 18 |
3 | 1 |
4 | 9 |
5 | 6 |
# 存活比例在各兄弟姊妹數量的分布
kable(table(data_sur$sibsp)/length(data_sur$sibsp)*100)
Var1 | Freq |
---|---|
0 | 58.0052493 |
1 | 35.4330709 |
2 | 4.1994751 |
3 | 1.5748031 |
4 | 0.7874016 |
5 | 0.0000000 |
# 死亡比例在各兄弟姊妹數量的分布
kable(table(data_un$sibsp)/length(data_un$sibsp)*100)
Var1 | Freq |
---|---|
0 | 69.3750000 |
1 | 23.5416667 |
2 | 3.7500000 |
3 | 0.2083333 |
4 | 1.8750000 |
5 | 1.2500000 |
# 各兄弟姊妹數量的分布
kable(table(data$sibsp)/length(data$sibsp)*100)
Var1 | Freq |
---|---|
0 | 64.3437863 |
1 | 28.8037166 |
2 | 3.9488966 |
3 | 0.8130081 |
4 | 1.3937282 |
5 | 0.6968641 |
看似兄弟 | 姊妹的數量並無明顯對存活率的影響 |
par(pch=19)
plot(data$age, data$fare)
points(data$age[data$survived==1], data$fare[data$survived==1], col="red") #活下
points(data$age[data$survived==1 & data$sibsp!=0 & data$parch!=0],
data$fare[data$survived==1 & data$sibsp!=0 & data$parch!=0], col="yellow")
points(data$age[data$survived==1 & data$sibsp!=0 & data$parch!=0 & data$sex=="female"], data$fare[data$survived==1 & data$sibsp!=0 & data$parch!=0 & data$sex=="female"],
col="green")
legend("topright", legend=c("活下來",paste0("活下來","\n","有sibsp&parch"),paste0("活下來","\n","有sibsp&parch+女性")), col=c("red","yellow","green"),pch=19)
最高票價的2人都有活下來,但式除了最高票價以外,其實票價的多寡對存活率的影響並會太高
library(e1071)
dataas <- subset(data, select=c(pclass,age,sibsp,parch,survived))
dataa <- subset(data, select=c(pclass,age,sibsp,parch))
model <- svm(survived ~ ., data = dataas)
pred_result <- predict(model, dataa)
cm <- table(pred_result, data$survived)
# accuracy
kable((cm[1]+cm[4])/sum(cm)*100)
x |
---|
68.64111 |
準確率68.6%