某所における声優統計の評判
あけましておめでとうございます。
某書が話題になっているそうですが、直接確かめる度胸はないので形態素解析で把握します。
形態素の処理は適当です。
# Reference: https://gist.github.com/r-linux/4958fd92355dbae01c7b
library(RMeCab)
library(XML)
library(dplyr)

# Page whose <dd> nodes are analysed.
u <- "http://fox.2ch.net/test/read.cgi/poverty/1420023769/"

# Alternation regex: a term is treated as noise when it contains an ASCII
# letter, a digit, or any of the listed symbols.
# The backslash entry is written "\\\\" so that the regex receives an escaped
# literal backslash; a bare "\" would escape the closing quote and make the
# script unparseable.
noise_pattern <- paste(
  c(
    LETTERS, letters, 0:9,
    "/", ":", ";", ">", "<", "-", "\\.", " ", " ", "_", "_", "\\\\",
    "、", "Д", "'", "´", "\\*"
  ),
  collapse = "|"
)

# Extract the text of every <dd> node, join the pieces with "。", and pass the
# combined text to RMeCabC(). After unlist(), the element names are used as the
# part-of-speech label (POS1) and the values as the terms.
morphemes <- htmlParse(u) %>%
  xpathSApply("//dd", xmlValue) %>%
  paste(collapse = "。") %>%
  RMeCabC() %>%
  unlist()

# Keep nouns, adjectives and verbs that do not match the noise pattern, then
# count occurrences per term.
dat_freq <- tibble(POS1 = names(morphemes), TERM = unname(morphemes)) %>%
  filter(
    POS1 %in% c("名詞", "形容詞", "動詞"),
    !grepl(noise_pattern, TERM)
  ) %>%
  group_by(TERM) %>%
  summarise(Freq = n()) %>%
  filter(Freq <= 50) # drop high-frequency terms; they were mostly junk

# Build the word cloud
library(wordcloud)
# NOTE(review): presumably a macOS font chosen so Japanese glyphs render in
# the plot - confirm on the target machine.
par(family = "HiraKakuProN-W6")
wordcloud(dat_freq$TERM, dat_freq$Freq)