## Web scraping
# Presumably defines pttTestFunction(URL, filename), which is called below.
source("pttTestFunction.R")
## # tmcn Version: 0.2-12

# Index pages 1 through 10 of the PTT Food board, each paired with a
# numbered .txt file name ("1.txt" ... "10.txt").
id <- 1:10
URL <- paste0("https://www.ptt.cc/bbs/Food/index", id, ".html")
filename <- paste0(id, ".txt")

# Single call on the first page; the mapply() below processes this same
# URL/file pair again together with the remaining nine.
pttTestFunction(URL[1], filename[1])

# One call per URL/file pair. The value returned by mapply() is printed
# (see the recorded output that follows).
mapply(
  FUN = pttTestFunction,
  URL = URL,
  filename = filename
)
## $`https://www.ptt.cc/bbs/Food/index1.html`
## NULL
##
## $`https://www.ptt.cc/bbs/Food/index2.html`
## NULL
##
## $`https://www.ptt.cc/bbs/Food/index3.html`
## NULL
##
## $`https://www.ptt.cc/bbs/Food/index4.html`
## NULL
##
## $`https://www.ptt.cc/bbs/Food/index5.html`
## NULL
##
## $`https://www.ptt.cc/bbs/Food/index6.html`
## NULL
##
## $`https://www.ptt.cc/bbs/Food/index7.html`
## NULL
##
## $`https://www.ptt.cc/bbs/Food/index8.html`
## NULL
##
## $`https://www.ptt.cc/bbs/Food/index9.html`
## NULL
##
## $`https://www.ptt.cc/bbs/Food/index10.html`
## NULL
## Text cleaning
# The workspace is intentionally NOT wiped here. Calling
# rm(list = ls(all.names = TRUE)) in the middle of a script also deletes
# hidden objects (for example .Random.seed) and anything the caller had
# defined; nothing below depends on a clean workspace.
library(NLP)
library(tm)
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(wordcloud)

# Every .txt file in the working directory. `pattern` is a regular
# expression, not a shell glob, so the dot is escaped and the extension is
# anchored to the end of the file name.
filenames <- list.files(getwd(), pattern = "\\.txt$")

# VectorSource() treats each element of a vector as one document, so the
# lines of each file are joined into a single string (one document per file)
# instead of passing a list of per-line character vectors.
files <- vapply(
  filenames,
  function(path) paste(readLines(path), collapse = "\n"),
  character(1),
  USE.NAMES = FALSE
)
docs <- Corpus(VectorSource(files))
# Content transformer for tm_map(): replaces every match of the regular
# expression `pattern` in `x` with a single space.
toSpace <- content_transformer(
  function(x, pattern) gsub(pattern, " ", x)
)
# Blank out, one pattern at a time and in this order: decorative symbols,
# single-character function words, PTT header labels, and finally every
# ASCII letter.
# In the recorded run each tm_map() call on the SimpleCorpus emitted the
# warning "transformation drops documents"; the corpus still held all ten
# documents afterwards (see the printed summary below).
docs <- Reduce(
  function(corpus, pattern) tm_map(corpus, toSpace, pattern),
  c(
    "※", "◆", "‧",
    "的", "了", "有", "就", "也", "很", "是", "我",
    "看板", "作者", "發信站", "批踢踢實業坊",
    "[a-zA-Z]"
  ),
  docs
)

# Built-in tm transformations, applied in sequence: drop punctuation, drop
# digits, then collapse runs of whitespace.
docs <- Reduce(
  function(corpus, transformation) tm_map(corpus, transformation),
  list(removePunctuation, removeNumbers, stripWhitespace),
  docs
)

# Print the corpus summary.
docs
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 10
## Term-frequency matrix
# jiebaR segmentation worker created with its default settings.
mixseg <- worker()

# Segment the first element of `d` with the global `mixseg` worker and
# flatten the result into a single vector of tokens.
jieba_tokenizer <- function(d) {
  tokens <- segment(d[[1]], mixseg)
  unlist(tokens)
}
# Tokenise every document, then count how often each token occurs across
# the whole corpus (columns Var1 = token, Freq = count).
seg <- lapply(docs, jieba_tokenizer)
freqFrame <- as.data.frame(table(unlist(seg)))

# Most frequent tokens first.
freqFrame <- freqFrame[with(freqFrame, order(Freq, decreasing = TRUE)), ]

library(knitr)
# Render the leading rows (head() default) as a markdown table.
kable(head(freqFrame), format = "markdown")
## |      |Var1 | Freq|
## |:-----|:----|----:|
## |1334  |吃   |  930|
## |11792 |在   |  678|
## |2333  |都   |  596|
## |709   |不   |  593|
## |9444  |推   |  525|
## |3562  |還   |  453|
## Word cloud
# Font family used for the plotted text.
par(family = "Heiti TC Light")

# Plot tokens that occur at least 50 times, capped at 150 words, in random
# order, coloured from the 8-colour "Dark2" palette.
wordcloud(
  words = freqFrame$Var1,
  freq = freqFrame$Freq,
  scale = c(5, 0.1),
  min.freq = 50,
  max.words = 150,
  random.order = TRUE,
  random.color = FALSE,
  rot.per = 0.1,
  colors = brewer.pal(8, "Dark2"),
  ordered.colors = FALSE,
  use.r.layout = FALSE,
  fixed.asp = TRUE
)