各網購網站販賣男裝的特色比較

想要知道各知名網購網站的女裝和男裝的當季流行款式以及其特色比較!

2018/04/10 進度：

針對PCHOME和UNIQULO網站關鍵字女裝和男裝搜尋的結果進行比較

資料蒐集、清洗、斷詞
利用TF-IDF和文字雲進行各網購網站販賣的衣服特色的關鍵字比較

PCHOME:爬取資料

library(httr)
pageno <- 1:10    #搜尋頁數
g_url <- paste0("https://ecshweb.pchome.com.tw/search/v3.3/all/results?q=%E5%A5%B3%E8%A3%9D&page=", pageno, "&sort=rnk/dc")
b_url <- paste0("https://ecshweb.pchome.com.tw/search/v3.3/all/results?q=%E7%94%B7%E8%A3%9D&page=", pageno, "&sort=rnk/dc")
#url = "https://ecshweb.pchome.com.tw/search/v3.3/all/results?q=%E5%A5%B3&page=1&sort=rnk/dc"
g_prods = character()
b_prods = character()
getcontent <- function(url, prods){
  for(i in 1:length(pageno)){
    res = GET(url[i])
    res_json = httr::content(res)
    results <- data.frame(do.call(rbind,res_json$prods))
    prods[((pageno[i]-1)*20+1):(pageno[i]*20)] <- unlist(results$name)
  }
  return(prods)
}
g_prods <- c(g_prods, getcontent(g_url, g_prods))
Sys.sleep(60)
b_prods <- c(b_prods, getcontent(b_url, b_prods))

PCHOME:資料清洗、斷詞

#-------clean data--------
processdata <- function(prods){
  library(NLP)
  library(tm)
  library(jiebaRD)
  library(jiebaR)      #斷詞用
  library(RColorBrewer)
  library(wordcloud)
  library(tmcn)   #segmentCN
  docs <- Corpus(VectorSource(prods))
  
  toSpace <- content_transformer(function(x, pattern) {
    return (gsub(pattern, " ", x))
  }     
  )
  docs <- tm_map(docs,toSpace,"V1")
  docs <- tm_map(docs,toSpace,"\n")
  docs <- tm_map(docs,toSpace, "1")
  # 清除大小寫英文與數字
  docs <- tm_map(docs,toSpace, "[A-Za-z0-9]")
  #移除標點符號 (punctuation)
  #移除數字 (digits)、空白 (white space)
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, stripWhitespace)
  
  
  #-------斷詞--------
  mixseg = worker()
  #new_user_word(mixseg,segment)   #Add user word
  
  
  #斷詞  mixseg[groups]
  jieba_tokenizer=function(d){
    unlist(segment(d[[1]],mixseg))
  }
  #1
  seg = lapply(docs, jieba_tokenizer)
  #2
  seg = lapply(docs, segmentCN)
  #seg = lapply(seg, strsplit, " ")
  return(seg)
}
g_seg <- processdata(g_prods)

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:httr':
## 
##     content

## # tmcn Version: 0.2-12

b_seg <- processdata(b_prods)



g_freqFrame = as.data.frame(table(unlist(g_seg)))
b_freqFrame = as.data.frame(table(unlist(b_seg)))

UNIQLO: 爬取資料

# 
library(rvest)

## Loading required package: xml2

page <- 0:12
p <- 48*(page)

u_getrvest <- function(url){
  res <- read_html(url)
  raw.titles <- res %>% html_nodes("div.unit .info .name")
  u_prods <- raw.titles %>% html_node("a") %>% html_attr('title')
  
  return(u_prods)
}

ug_url <- paste0("http://www.uniqlo.com/tw/store/search?qtext=%E5%A5%B3%E8%A3%9D&qbrand=10&qclv1=&qclv25=&qclv2=&qrange=&qcolor=&qsize=&qnew=&qdiscount=&qlimit=&qmulti=&qonline=&qspsize=&qstart=", p,"&sort=goods_disp_priority")

ug_prods <- mapply(u_getrvest, ug_url)
colnames(ug_prods) <- page+1
#raw.prices <- res %>% html_nodes("dd.price") %>% html_text
#raw.prices <- as.character(raw.prices)
#gsub("NT$", "", raw.prices)

UNIQLO: 資料清洗、斷詞

ug_seg <- processdata(as.character(ug_prods))
ug_freqFrame = as.data.frame(table(unlist(ug_seg)))


ub_url <- paste0("http://www.uniqlo.com/tw/store/search?qtext=%E7%94%B7%E8%A3%9D&qbrand=10&qclv1=&qclv25=&qclv2=&qrange=&qcolor=&qsize=&qnew=&qdiscount=&qlimit=&qmulti=&qonline=&qspsize=&qstart=", p,"&sort=goods_disp_priority")

ub_prods <- mapply(u_getrvest, ub_url)
colnames(ub_prods) <- page+1
#raw.prices <- res %>% html_nodes("dd.price") %>% html_text
#raw.prices <- as.character(raw.prices)
#gsub("NT$", "", raw.prices)

ub_seg <- processdata(as.character(ub_prods))
ub_freqFrame = as.data.frame(table(unlist(ub_seg)))

文字雲和TF-IDF 呈現

library(Matrix)   #nnzero
library(knitr)

PCHOME女裝

library(wordcloud2)
#wordcloud2(g_freqFrame[(g_freqFrame$Freq>tres)&(g_freqFrame$Freq!=max(g_freqFrame$Freq)),], fontFamily = "微軟雅黑",color = "random-light", backgroundColor = "grey", size = 1  )
windowsFonts(TC=windowsFont("Heiti TC Light"))
wordcloud(g_freqFrame$Var1,g_freqFrame$Freq,
          scale=c(10,1),
          min.freq=5,max.words=50,
          random.order=FALSE,random.color=FALSE, 
          rot.per=.2, colors=brewer.pal(11, "Paired")[c(1:25)],
          ordered.colors=FALSE,use.r.layout=FALSE,
          fixed.asp=TRUE,family="TC")

g_seg <- Corpus(VectorSource(g_seg))
g_tdm <- TermDocumentMatrix(g_seg, control = list(wordLengths = c(1,10)))

g_tf <- as.matrix(g_tdm)/apply(g_tdm, 2, sum)   #term frequency: the number of words in every document
g_idf <- log10(ncol(g_tdm)/apply(g_tdm, 1, nnzero))
g_tfidf <- g_tf*g_idf       #TF-IDF
g_tfidf <- apply(g_tfidf, 1, sum)
kable(head(g_tfidf[order(g_tfidf, decreasing = TRUE)], n = 15L))       #TF-IDF

	x
製	3.354518
台灣	3.258635
裝	2.992327
休閒	2.928374
款	2.861541
黑	2.463331
純棉	2.415807
藍	2.404825
標誌	2.399582
圓領	2.357932
涼爽	2.308220
襯衫	2.229045
長褲	2.215525
長袖	2.193486
	2.124789

PCHOME男裝

windowsFonts(TC=windowsFont("Heiti TC Light"))
wordcloud(b_freqFrame$Var1,b_freqFrame$Freq,
          scale=c(10,1),
          min.freq=5,max.words=50,
          random.order=FALSE,random.color=FALSE, 
          rot.per=.2, colors=brewer.pal(11, "Paired")[c(1:25)],
          ordered.colors=FALSE,use.r.layout=FALSE,
          fixed.asp=TRUE,family="TC")

b_seg <- Corpus(VectorSource(b_seg))
b_tdm <- TermDocumentMatrix(b_seg, control = list(wordLengths = c(1,10)))

b_tf <- as.matrix(b_tdm)/apply(b_tdm, 2, sum)   #term frequency: the number of words in every document
b_idf <- log10(ncol(b_tdm)/apply(b_tdm, 1, nnzero))
b_tfidf <- b_tf*b_idf       #TF-IDF
b_tfidf <- apply(b_tfidf, 1, sum)
kable(head(b_tfidf[order(b_tfidf, decreasing = TRUE)], n = 15L))       #TF-IDF

	x
恤	3.016743
標誌	3.004626
純棉	2.910211
短袖	2.868356
修身	2.767071
長袖	2.746028
休閒	2.562938
製	2.517071
台灣	2.497683
印花	2.492593
彈力	2.455270
黑	2.447407
素色	2.401527
簡約	2.372243
襯衫	2.357384

UNIQULO女裝

windowsFonts(TC=windowsFont("Heiti TC Light"))
wordcloud(ug_freqFrame$Var1,ug_freqFrame$Freq,
          scale=c(10,1),
          min.freq=5,max.words=50,
          random.order=FALSE,random.color=FALSE, 
          rot.per=.2, colors=brewer.pal(11, "Paired")[c(1:25)],
          ordered.colors=FALSE,use.r.layout=FALSE,
          fixed.asp=TRUE,family="TC")

ug_seg <- Corpus(VectorSource(ug_seg))
ug_tdm <- TermDocumentMatrix(ug_seg, control = list(wordLengths = c(1,10)))


ug_tf <- as.matrix(ug_tdm)/apply(ug_tdm, 2, sum)   #term frequency: the number of words in every document
ug_idf <- log10(ncol(ug_tdm)/apply(ug_tdm, 1, nnzero))
ug_tfidf <- ug_tf*ug_idf       #TF-IDF
ug_tfidf <- apply(ug_tfidf, 1, sum)
kable(head(ug_tfidf[order(ug_tfidf, decreasing = TRUE)], n = 15L))

	x
短袖	18.434666
恤	16.725048
長袖	15.855175
印花	15.334294
洋裝	11.094104
	10.473293
套	9.922262
襯衫	9.358149
棉質	8.879950
條紋	8.685286
內褲	8.533906
彈性	8.495799
背心	8.379082
上衣	8.343760
高腰	7.806251

UNIQULO男裝

windowsFonts(TC=windowsFont("Heiti TC Light"))
wordcloud(ub_freqFrame$Var1,ub_freqFrame$Freq,
          scale=c(10,1),
          min.freq=5,max.words=50,
          random.order=FALSE,random.color=FALSE, 
          rot.per=.2, colors=brewer.pal(11, "Paired")[c(1:25)],
          ordered.colors=FALSE,use.r.layout=FALSE,
          fixed.asp=TRUE,family="TC")

ub_seg <- Corpus(VectorSource(ub_seg))
ub_tdm <- TermDocumentMatrix(ub_seg, control = list(wordLengths = c(1,10)))


ub_tf <- as.matrix(ub_tdm)/apply(ub_tdm, 2, sum)   #term frequency: the number of words in every document
ub_idf <- log10(ncol(ub_tdm)/apply(ub_tdm, 1, nnzero))
ub_tfidf <- ub_tf*ub_idf       #TF-IDF
ub_tfidf <- apply(ub_tfidf, 1, sum)
kable(head(ub_tfidf[order(ub_tfidf, decreasing = TRUE)], n = 15L))

	x
短袖	19.097569
恤	18.071917
印花	18.044291
長袖	17.232719
襯衫	17.159204
彈性	11.973716
精紡	11.599073
	10.966874
平口	9.444476
格紋	9.422264
套	9.314474
條紋	9.266362
內褲	9.255005
圓領	8.959242
褲	8.505058

結果討論

PCHOME 女裝特色較著重休閒、純棉、圓領、涼爽、鬆緊

UNIQULO 女裝特色較著重印花、棉質、彈性、條紋、亞麻、緊身褲、高腰、特級

PCHOME 男裝特色較著重修身、休閒、簡約、印花、彈力、素色、簡約、純棉

UNIQULO 男裝特色較著重印花、彈性、精紡、格紋、條紋、平口、圓領、特級

文字雲表現最常出現的商品特徵
TF-IDF 的結果可以提供網站中較具有特色(該名超描述並非出現在許多商品中)的商品

之後預計完成

加入其他的網購網站資料，例如淘寶還有韓式風格的網購網站，比較其特色差異。

project_1

各網購網站販賣女裝的特色比較