Problem found 2018-03-30: for posts that span different years, the while loop throws an error, so collecting posts across year boundaries needs separate handling (see the sketch below).
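One possible workaround, given only as a sketch (it is not part of the original script): issue one request per calendar year so that a single pagination run never crosses a year boundary. The URL pieces mirror the script below; the token placeholder, the year range, and the limit of 25 are assumptions.
library(httr)
# Sketch: collect posts one calendar year at a time, so the paging loop
# below never has to cross a year boundary.
prefix <- "https://graph.facebook.com/v2.10/"
token  <- "<ACCESS_TOKEN>"  # placeholder; substitute a valid token
for (y in 2014:2016) {      # assumed year range, adjust as needed
  control <- paste0("&until=", y, "-12-31&since=", y, "-01-01")
  url <- paste0(prefix, "175549807657/posts?limit=25", control,
                "&access_token=", token)
  res  <- httr::GET(url)
  data <- httr::content(res)
  # ...then run the same paging loop as below, prefixing the saved
  # file names with the year, e.g. paste0(y, "_", count, ".txt")
}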
Problem found 2018-03-30: the redirect URI of the Rfacebook package's fbOAuth fails because the registered site URL is HTTP, while facebook for developers enforces HTTPS.
#-------- Collecting posts within a fixed time window --------
library(httr)
prefix <- "https://graph.facebook.com/v2.10/"
token <- "EAACEdEose0cBAJUmuOyrZA9Hh61tIf5AHbSGwsckigyX9zrEAxbDZCWJPmqOpj3fEqajGblhqMwj1BfBwDwUgE2jvDOF2lqy5WoVhZC73g8ZCHsRZBmOfqkCRLnkzMh2kT5J5gLQT5fhZCtcejK08fv2NNAXkXUGQ3v9ZBVyPIJXKHae1vfvL1Ow5MC7LcGgukZD" # Graph API access token; replace with your own
number <- 1 # fetch only one post per request
# 175549807657 is the page ID of the 低碳生活部落格 (Low-Carbon Life blog)
# restrict to the most recent post between 2015-11-01 and 2015-12-31
target <- "175549807657/posts?limit="
control <- "&until=2015-12-31&since=2015-11-01"
attrs <- paste0(target, number, control,"&access_token=")
url <- paste0(prefix, attrs, token)
res <- httr::GET(url)
data <- httr::content(res)
# unlist the nested list structure of data and store it as a matrix
groups <- matrix(unlist(data$data)) # each element of groups becomes one document (doc) for the word cloud
# save to a file (posts are fetched in batches; the count variable names each batch's file)
filename <- paste0(1, ".txt")
write.table(groups, filename)
# move on to the next page
nextflg <- data$paging$`next` # equivalently: data$paging[[2]]
# nextflg tells whether we have reached the last page of data
count <- 1
while (!is.null(nextflg)) {
  count <- count + 1
  nurl <- nextflg
  nres <- httr::GET(nurl)
  ndata <- httr::content(nres)
  ngroups <- matrix(unlist(ndata$data))
  #p1 <- ndata[["data"]][[1]]$message
  #p1 <- ndata$data[[1]]$message
  ## tryCatch can be used to record which page the while loop stops at (see the sketch after the loop)
  nextflg <- ndata$paging$`next` # ndata$paging[[2]]
  filename <- paste0(count, ".txt") # file name
  write.table(ngroups, filename)
}
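A sketch of the tryCatch idea mentioned in the loop comment (an assumption, not the original code): wrap each page request so a failing request is logged with its page count instead of aborting the run. safe_fetch is a hypothetical helper name.
safe_fetch <- function(nurl, count) {
  tryCatch({
    res <- httr::GET(nurl)
    httr::content(res)
  }, error = function(e) {
    # record which page the loop stopped at, then return NULL
    message("stopped at page ", count, ": ", conditionMessage(e))
    NULL  # a NULL result would end the while loop above
  })
}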
Word segmentation and text cleaning
library(NLP)
library(tm)
library(jiebaRD)
library(jiebaR) # for word segmentation
library(RColorBrewer)
library(wordcloud)
# Text cleaning
# Read every *.txt file in the working directory
# into docs, the corpus to be analysed
filenames <- list.files(getwd(), pattern = "\\.txt$") # pattern: an optional regular expression; only matching file names are returned
files <- lapply(filenames, readLines) # read the text lines of each file
docs <- Corpus(VectorSource(files)) # build a corpus (語料庫) for tm
# VectorSource interprets each element of the vector as one document;
# a Corpus is tm's representation of a collection of text documents.
# Remove potentially problematic symbols.
## Stop words are words that occur frequently in a text but carry no
## information, e.g. 有些, 以及, 因此 and the like.
### Note (2018-04-01): stop words must be removed before punctuation;
### otherwise some text turns into mojibake, e.g. 「節能標竿」 becomes
### 「葛鉏衧\xf1 」. The exact cause is unclear; possibly removePunctuation
### operating byte-wise on multi-byte UTF-8 characters.
# content_transformer creates a content transformer, i.e. a function that
# modifies the content of an R object; gsub replaces all matches of the
# pattern with a space, which is what "cleaning" means here.
toSpace <- content_transformer(function(x, pattern) {
  return(gsub(pattern, " ", x))
})
# tm_map(x, FUN, ...) applies a transformation function (mapping) to a corpus:
#   x: a corpus, i.e. a collection of text documents
#   FUN: a transformation taking a text document (a character vector for a
#        SimpleCorpus) and returning a text document of the same length;
#        content_transformer wraps a plain function into this interface.
# Replace each unwanted pattern with a space (one tm_map call per pattern).
# "V1" and "1" are artifacts of write.table (the column header and row
# names), not real words; the rest are Chinese stop words.
# docs <- tm_map(docs, toSpace, "paste a symbol you want removed here")
stop_patterns <- c("V1", "\n", "1",
                   "的", "及", "為", "是", "在", "我", "沒有", "不要", "只",
                   "正要", "尤其", "做", "活動", "停止", "一下", "進行", "要",
                   "們", "有", "就", "也", "會", "台灣", "台北", "生活", "更",
                   "不", "讓", "都", "與", "小編", "喔", "新", "年", "可以",
                   "啊", "文章", "到", "將", "演講", "格", "一個", "以", "中",
                   "下午", "但", "吧", "還", "如何", "﹍")
for (p in stop_patterns) {
  docs <- tm_map(docs, toSpace, p)
}
# remove upper- and lower-case English letters and digits
docs <- tm_map(docs, toSpace, "[A-Za-z0-9]")
# remove punctuation, remaining digits, and extra white space
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
# worker() accepts different segmentation models and external dictionaries;
# here we use the default mixed-model segmenter with jieba's built-in dictionary.
mixseg <- worker()
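For reference, a hypothetical non-default worker; the dictionary file names below are placeholders, not files from this project.
# custom_seg <- worker(type = "mix",
#                      user = "userdict.txt",       # external user dictionary
#                      stop_word = "stopwords.txt") # external stop-word list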
# extend the dictionary when needed
# user_words <- c("陳菊","布里斯本","高雄","重劃區","合作會","後勁溪")
user_words <- c("低碳生活部落格","綠建築","節能","氣候變遷","電動車","台達電子","台達電子文教基金會","綠色能源")
new_user_word(mixseg, user_words) # add user-defined words
## [1] TRUE
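A quick sanity check, given as a sketch (not in the original): segment a sentence containing the added words; the segmenter is expected to keep them as whole tokens rather than splitting them into fragments.
segment("低碳生活部落格關注節能與氣候變遷", mixseg)
# expected to keep "低碳生活部落格", "節能", and "氣候變遷" intact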
# tokenize each document with the jieba segmenter (equivalent to mixseg[text])
jieba_tokenizer <- function(d) {
  unlist(segment(d[[1]], mixseg))
}
seg <- lapply(docs, jieba_tokenizer)
# term-frequency table of the segmented documents:
freqFrame <- as.data.frame(table(unlist(seg)))
# inspect terms from most to least frequent
library(knitr)
kable(head(freqFrame[order(freqFrame$Freq, decreasing = TRUE),]))
 | Var1 | Freq
---|---|---
50 | 了 | 92
737 | 低碳 | 84
1825 | 部落 | 84
2350 | 碳 | 56
2369 | 綠色 | 45
735 | 低 | 43
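The head of the table is dominated by single-character fragments (了, 碳, 低), so one optional extra step (an assumption, not in the original) is to drop one-character tokens before plotting:
freqFrame$Var1 <- as.character(freqFrame$Var1) # factor -> character so nchar() works
freqFrame <- freqFrame[nchar(freqFrame$Var1) > 1, ] # keep multi-character terms only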
Drawing the word cloud
# with the term frequencies in hand we can draw the word cloud
library(extrafont)
## Registering fonts with R
windowsFonts(TC = windowsFont("Heiti TC Light")) # map the alias "TC" to a CJK-capable font; substitute one installed on your system
wordcloud(freqFrame$Var1, freqFrame$Freq,
          scale = c(5, 0.5),
          min.freq = 5, max.words = 50,
          random.order = FALSE, random.color = FALSE,
          rot.per = .2,
          colors = brewer.pal(11, "Paired"), # "Paired" has at most 12 colors; indexing past that yields NA
          ordered.colors = FALSE, use.r.layout = FALSE,
          fixed.asp = TRUE, family = "TC")
library(wordcloud2)
wordcloud2(freqFrame[freqFrame$Freq > 10, ], size = 0.8, fontFamily = "微軟雅黑", # Microsoft YaHei
           color = "random-light", backgroundColor = "grey",
           shape = "star")