Problem found 2018-03-30: for posts that span different years, the while loop throws an error, so collecting posts across year boundaries needs separate handling (see the sketch below).
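One possible workaround, given only as a sketch (it is not part of the original script): issue one request per calendar year so that a single pagination run never crosses a year boundary. The URL pieces mirror the script below; the token placeholder, the year range, and the limit of 25 are assumptions.
library(httr)
# Sketch: collect posts one calendar year at a time, so the paging loop
# below never has to cross a year boundary.
prefix <- "https://graph.facebook.com/v2.10/"
token  <- "<ACCESS_TOKEN>"  # placeholder; substitute a valid token
for (y in 2014:2016) {      # assumed year range, adjust as needed
  control <- paste0("&until=", y, "-12-31&since=", y, "-01-01")
  url <- paste0(prefix, "175549807657/posts?limit=25", control,
                "&access_token=", token)
  res  <- httr::GET(url)
  data <- httr::content(res)
  # ...then run the same paging loop as below, prefixing the saved
  # file names with the year, e.g. paste0(y, "_", count, ".txt")
}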
Problem found 2018-03-30: the redirect URI of the Rfacebook package's fbOAuth fails because the registered site URL is HTTP, while facebook for developers enforces HTTPS.
#-------- Collecting posts within a fixed time window --------
library(httr)
prefix <- "https://graph.facebook.com/v2.10/"
token <- "EAACEdEose0cBAJUmuOyrZA9Hh61tIf5AHbSGwsckigyX9zrEAxbDZCWJPmqOpj3fEqajGblhqMwj1BfBwDwUgE2jvDOF2lqy5WoVhZC73g8ZCHsRZBmOfqkCRLnkzMh2kT5J5gLQT5fhZCtcejK08fv2NNAXkXUGQ3v9ZBVyPIJXKHae1vfvL1Ow5MC7LcGgukZD" # Graph API access token; replace with your own
number <- 1 # fetch only one post per request
# 175549807657 is the page ID of the 低碳生活部落格 (Low-Carbon Life blog)
# restrict to the most recent post between 2015-11-01 and 2015-12-31
target <- "175549807657/posts?limit="
control <- "&until=2015-12-31&since=2015-11-01"
attrs <- paste0(target, number, control,"&access_token=")
url <- paste0(prefix, attrs, token)
res <- httr::GET(url)
data <- httr::content(res)
# unlist the nested list structure of data and store it as a matrix
groups <- matrix(unlist(data$data)) # each element of groups becomes one document (doc) for the word cloud
# save to a file (posts are fetched in batches; the count variable names each batch's file)
filename <- paste0(1, ".txt")
write.table(groups, filename)
# move on to the next page
nextflg <- data$paging$`next` # equivalently: data$paging[[2]]
# nextflg tells whether we have reached the last page of data
count <- 1
while (!is.null(nextflg)) {
  count <- count + 1
  nurl <- nextflg
  nres <- httr::GET(nurl)
  ndata <- httr::content(nres)
  ngroups <- matrix(unlist(ndata$data))
  #p1 <- ndata[["data"]][[1]]$message
  #p1 <- ndata$data[[1]]$message
  ## tryCatch can be used to record which page the while loop stops at (see the sketch after the loop)
  nextflg <- ndata$paging$`next` # ndata$paging[[2]]
  filename <- paste0(count, ".txt") # file name
  write.table(ngroups, filename)
}
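A sketch of the tryCatch idea mentioned in the loop comment (an assumption, not the original code): wrap each page request so a failing request is logged with its page count instead of aborting the run. safe_fetch is a hypothetical helper name.
safe_fetch <- function(nurl, count) {
  tryCatch({
    res <- httr::GET(nurl)
    httr::content(res)
  }, error = function(e) {
    # record which page the loop stopped at, then return NULL
    message("stopped at page ", count, ": ", conditionMessage(e))
    NULL  # a NULL result would end the while loop above
  })
}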
Word segmentation and text cleaning
library(NLP)
library(tm)
library(jiebaRD)
library(jiebaR) # for word segmentation
library(RColorBrewer)
library(wordcloud)
# Text cleaning
# Read every *.txt file in the working directory
# into docs, the corpus to be analysed
filenames <- list.files(getwd(), pattern = "\\.txt$") # pattern: an optional regular expression; only matching file names are returned
files <- lapply(filenames, readLines) # read the text lines of each file
docs <- Corpus(VectorSource(files)) # build a corpus (語料庫) for tm
# VectorSource interprets each element of the vector as one document;
# a Corpus is tm's representation of a collection of text documents.
# Remove potentially problematic symbols.
## Stop words are words that occur frequently in a text but carry no
## information, e.g. 有些, 以及, 因此 and the like.
### Note (2018-04-01): stop words must be removed before punctuation;
### otherwise some text turns into mojibake, e.g. 「節能標竿」 becomes
### 「葛鉏衧\xf1 」. The exact cause is unclear; possibly removePunctuation
### operating byte-wise on multi-byte UTF-8 characters.
# content_transformer creates a content transformer, i.e. a function that
# modifies the content of an R object; gsub replaces all matches of the
# pattern with a space, which is what "cleaning" means here.
toSpace <- content_transformer(function(x, pattern) {
  return(gsub(pattern, " ", x))
})
# tm_map(x, FUN, ...) applies a transformation function (mapping) to a corpus:
#   x: a corpus, i.e. a collection of text documents
#   FUN: a transformation taking a text document (a character vector for a
#        SimpleCorpus) and returning a text document of the same length;
#        content_transformer wraps a plain function into this interface.
# Replace each unwanted pattern with a space (one tm_map call per pattern).
# "V1" and "1" are artifacts of write.table (the column header and row
# names), not real words; the rest are Chinese stop words.
# docs <- tm_map(docs, toSpace, "paste a symbol you want removed here")
stop_patterns <- c("V1", "\n", "1",
                   "的", "及", "為", "是", "在", "我", "沒有", "不要", "只",
                   "正要", "尤其", "做", "活動", "停止", "一下", "進行", "要",
                   "們", "有", "就", "也", "會", "台灣", "台北", "生活", "更",
                   "不", "讓", "都", "與", "小編", "喔", "新", "年", "可以",
                   "啊", "文章", "到", "將", "演講", "格", "一個", "以", "中",
                   "下午", "但", "吧", "還", "如何", "﹍")
for (p in stop_patterns) {
  docs <- tm_map(docs, toSpace, p)
}
# remove upper- and lower-case English letters and digits
docs <- tm_map(docs, toSpace, "[A-Za-z0-9]")
# remove punctuation, remaining digits, and extra white space
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
# worker() accepts different segmentation models and external dictionaries;
# here we use the default mixed-model segmenter with jieba's built-in dictionary.
mixseg <- worker()
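For reference, a hypothetical non-default worker; the dictionary file names below are placeholders, not files from this project.
# custom_seg <- worker(type = "mix",
#                      user = "userdict.txt",       # external user dictionary
#                      stop_word = "stopwords.txt") # external stop-word list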
# extend the dictionary when needed
# user_words <- c("陳菊","布里斯本","高雄","重劃區","合作會","後勁溪")
user_words <- c("低碳生活部落格","綠建築","節能","氣候變遷","電動車","台達電子","台達電子文教基金會","綠色能源")
new_user_word(mixseg, user_words) # add user-defined words
## [1] TRUE
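A quick sanity check, given as a sketch (not in the original): segment a sentence containing the added words; the segmenter is expected to keep them as whole tokens rather than splitting them into fragments.
segment("低碳生活部落格關注節能與氣候變遷", mixseg)
# expected to keep "低碳生活部落格", "節能", and "氣候變遷" intact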
# tokenize each document with the jieba segmenter (equivalent to mixseg[text])
jieba_tokenizer <- function(d) {
  unlist(segment(d[[1]], mixseg))
}
seg <- lapply(docs, jieba_tokenizer)
# term-frequency table of the segmented documents:
freqFrame <- as.data.frame(table(unlist(seg)))
# inspect terms from most to least frequent
library(knitr)
kable(head(freqFrame[order(freqFrame$Freq, decreasing = TRUE),]))
 | Var1 | Freq
---|---|---
50 | 了 | 92
737 | 低碳 | 84
1825 | 部落 | 84
2350 | 碳 | 56
2369 | 綠色 | 45
735 | 低 | 43
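The head of the table is dominated by single-character fragments (了, 碳, 低), so one optional extra step (an assumption, not in the original) is to drop one-character tokens before plotting:
freqFrame$Var1 <- as.character(freqFrame$Var1) # factor -> character so nchar() works
freqFrame <- freqFrame[nchar(freqFrame$Var1) > 1, ] # keep multi-character terms only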
Drawing the word cloud
# with the term frequencies in hand we can draw the word cloud
library(extrafont)
## Registering fonts with R
windowsFonts(TC = windowsFont("Heiti TC Light")) # map the alias "TC" to a CJK-capable font; substitute one installed on your system
wordcloud(freqFrame$Var1, freqFrame$Freq,
          scale = c(5, 0.5),
          min.freq = 5, max.words = 50,
          random.order = FALSE, random.color = FALSE,
          rot.per = .2,
          colors = brewer.pal(11, "Paired"), # "Paired" has at most 12 colors; indexing past that yields NA
          ordered.colors = FALSE, use.r.layout = FALSE,
          fixed.asp = TRUE, family = "TC")
library(wordcloud2)
wordcloud2(freqFrame[freqFrame$Freq > 10, ], size = 0.8, fontFamily = "微軟雅黑", # Microsoft YaHei
           color = "random-light", backgroundColor = "grey",
           shape = "star")