##########################################################
# how well do randomForest, gbm handle multicollinearity?
# J Wheatley Feb 2014
##########################################################
#
# Simulates n observations of p standard-normal predictors where x3 is
# correlated (rho) with x1, and the response depends only on x1 and x2:
#   y = x1 + x2 + sigma * noise
# Fits a gradient boosting model and a random forest, converts both
# variable-importance measures to percentages, and plots them side by side.

# library() (not require()) so a missing package fails immediately.
library(ggplot2)
library(randomForest)
library(gbm)

# Fixed seed so the simulated data and both stochastic fits are reproducible;
# change or remove it to obtain a fresh draw.
set.seed(2014)

# params ----
n <- 500      # observations
p <- 10       # predictors (x1..x3 are constructed explicitly, x4..xp are noise)
rho <- 0.8    # correlation between x1 and x3
sigma <- 0.7  # noise standard deviation

# The construction below needs at least one pure-noise predictor (x4).
stopifnot(p >= 4, n >= 1)

# simulation data ----
x1 <- rnorm(n)
x2 <- rnorm(n)
# x3 has unit variance and correlation rho with x1, but no direct effect on y.
x3 <- rho * x1 + rnorm(n) * sqrt(1 - rho^2)
y <- x1 + x2 + sigma * rnorm(n)

# Irrelevant predictors x4..xp, one column each.
df <- as.data.frame(matrix(rnorm(n * (p - 3)), nrow = n))
names(df) <- paste0("x", seq(4, p))
df1 <- data.frame(y = y, x1 = x1, x2 = x2, x3 = x3)
df <- cbind(df1, df)

# rf and gbm ----
# The full data frame is passed so the response is resolved from `data`
# rather than from a global `y`; `.` still expands to x1..xp only.
# NOTE(review): cv.folds = 100 is kept from the original; it is expensive and
# the CV results are not used below -- confirm whether 10 was intended.
gb <- gbm(
  y ~ .,
  data = df,
  n.trees = 1000,
  interaction.depth = 3,
  shrinkage = 0.01,
  distribution = "gaussian",
  bag.fraction = 0.5,
  cv.folds = 100
)
rf <- randomForest(y ~ ., data = df, ntree = 1000)

# Random forest importance: node-purity increase rescaled to percent so it is
# on the same scale as gbm's relative influence.
rf.imp <- as.data.frame(importance(rf))
rf.imp$var <- row.names(rf.imp)
rf.imp$rel.influence <- rf.imp$IncNodePurity / sum(rf.imp$IncNodePurity) * 100
rf.imp$IncNodePurity <- NULL

# plotit = FALSE: only the influence table is wanted, not gbm's own bar plot.
gb.imp <- summary(gb, plotit = FALSE)

# After the merge the columns are: var, gbm rel.inf, rf rel.influence.
df.imp <- merge(gb.imp, rf.imp, by = "var")
df.imp <- df.imp[order(-df.imp$rel.influence), ]
names(df.imp) <- c("var", "GB", "RF")

# relative importance plot ----
# Long format (var, variable, value) built in base R; equivalent to
# melt(df.imp, id.vars = "var") without needing reshape2 to be attached.
df.imp.m <- data.frame(
  var = rep(as.character(df.imp$var), times = 2),
  variable = factor(rep(c("GB", "RF"), each = nrow(df.imp)),
                    levels = c("GB", "RF")),
  value = c(df.imp$GB, df.imp$RF)
)
# Order predictors x1..xp on the axis instead of alphabetically (x1, x10, x2).
df.imp.m$var <- factor(df.imp.m$var, levels = paste0("x", seq(1, p)))

g <- ggplot(df.imp.m, aes(var, value, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_grid(. ~ variable)
g <- g +
  coord_flip() +
  ylab("relative importance (%)") +
  xlab("predictor")
# NOTE: scale limits drop (not clip) any bar whose value exceeds 50%.
g <- g +
  scale_y_continuous(limits = c(0, 50)) +
  theme_bw() +
  theme(legend.position = "none", text = element_text(size = 15)) +
  ggtitle("variable importance")

# Explicit print so the plot is also drawn when the file is run via source().
print(g)