ランダムフォレストの実行速度

ランダムフォレストは便利だけど時間もかかるという話。
ということでデータの大きさによってどのくらい実行速度が変わるか測定してみた。
今回はデータ数を100、1000、10000と変化させて計測している。
左が経過時間（秒）のグラフ、右がデータ数100のときの経過時間を1とした場合の倍率のグラフ。

データ数が100倍になると処理時間は1600倍まで増加している。
この辺考慮してやっていきたいもんですね。

library(randomForest)
library(ggplot2)
# Measure randomForest training time for sample sizes from 100 to 10000 rows.
sizes <- c(100, 1000, 10000)
# Collect one timing row per size and bind them once at the end, instead of
# growing a data frame inside the loop.
timings <- vector("list", length(sizes))
for (i in seq_along(sizes)) {
  size <- sizes[i]
  # Sample row indices, not price values: indexing rows by sampled prices
  # restricts the draw to rows whose position equals some price and can
  # select the same row more than once.
  rows <- sample(nrow(diamonds), size)
  train <- diamonds[rows, ]
  timing <- system.time(randomForest(price ~ ., data = train))
  # One row with columns: size, user.self, sys.self, elapsed, user.child,
  # sys.child.
  timings[[i]] <- data.frame(cbind(size = size, t(timing)))
}
result <- do.call(rbind, timings)
# Express each elapsed time as a multiple of the elapsed time measured at the
# smallest sample size (100), rather than of whichever run happened to be
# fastest.
result$times <- result$elapsed / result$elapsed[which.min(result$size)]

# Build the two bar charts.
# p1: elapsed time per sample size, labelled with the value rounded to one
# decimal place.
p1 <- ggplot(
  data = result,
  aes(x = factor(size), y = elapsed, label = round(elapsed, 1))
) +
  geom_bar(stat = "identity") +
  labs(x = "size", y = "elapsed") +
  geom_text(vjust = -0.5)

# p2: elapsed time relative to the baseline run, labelled with the ratio
# rounded to a whole number.
p2 <- ggplot(
  data = result,
  aes(x = factor(size), y = times, label = round(times, 0))
) +
  geom_bar(stat = "identity") +
  labs(x = "size", y = "times") +
  geom_text(vjust = -0.5)

# Write both plots side by side (1 row x 2 columns) into time.png.
# The grid functions are namespace-qualified because ggplot2 only imports
# grid and does not attach it; unqualified calls fail with
# "could not find function" unless library(grid) was run first.
png("time.png")
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(1, 2)))
# Left cell: absolute elapsed time. Right cell: ratio to the baseline run.
print(p1, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 1))
print(p2, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 2))
dev.off()