Setup

We load the necessary libraries first

We load the data and take and look if the data was imported correctly

spruceData <- read.table('spruceData.tsv', header=T, row.names = 1, stringsAsFactors = F)
head(spruceData[,1:5])
##           Gene1   Gene10  Gene100 Gene1000  Gene101
## W19-C1 7.310242 7.088565 7.073521 6.641622 6.872650
## W19-C2 7.167100 6.998046 7.214589 6.784180 6.806009
## W19-C3 7.159872 7.290680 7.138781 6.903825 6.447067
## W19-F1 7.196368 6.985777 7.417678 6.813525 6.534758
## W19-F2 7.247463 7.053132 7.243150 6.674302 6.776412
## W19-F3 7.427390 7.032896 7.050686 6.903508 6.563667

We use X for data notation and y for class vector. The class vector information is extracted from the row names. We can use 2 different prediction classes. Week or Treatment. The prediction classes are a factor because we want a classification.

X <- spruceData
treatment <- as.factor(substr(rownames(spruceData), 5,5))
week <- as.factor(substr(rownames(spruceData), 2,3))

We will use week as a prediction factor

y <- week

Base example

tic()
baseRF <- randomForest(x=X, y=y, ntree=100, importance=TRUE)
toc()
## 0.37 sec elapsed

We can explore the results using the importance function to give back the most important genes and extract the oobError.

baseImportance <- randomForest::importance(baseRF)
baseImportance <- baseImportance[order(baseImportance[,'MeanDecreaseAccuracy'], decreasing = TRUE),]
head(baseImportance)
##                19       20       21        22        23        24       25
## Gene135 1.7733173 0.200040 2.197231 2.2367584  1.653739  1.353881 0.000000
## Gene93  1.4002801 2.002970 2.015382 0.0000000  1.005038  0.000000 1.428571
## Gene489 0.0000000 0.000000 0.000000 0.0000000  0.000000  1.005038 1.005038
## Gene138 0.2627035 1.162476 2.188249 0.8491993 -1.005038  1.005038 1.005038
## Gene417 0.0000000 0.000000 1.005038 0.0000000  0.000000  0.000000 1.005038
## Gene537 1.4002801 1.005038 1.005038 1.4139250  1.005038 -1.005038 0.000000
##               26       28       31       32       34        35       36
## Gene135 0.000000 0.000000 1.005038 1.275153 1.005038 0.4937979 1.874178
## Gene93  1.005038 1.005038 1.353881 1.524986 1.005038 1.0050378 0.000000
## Gene489 0.000000 1.353881 0.000000 1.428571 0.000000 1.0050378 0.000000
## Gene138 1.428571 1.005038 0.000000 1.005038 1.005038 1.0050378 0.000000
## Gene417 1.695099 0.000000 1.005038 1.005038 1.005038 0.0000000 0.000000
## Gene537 0.000000 1.005038 1.005038 0.000000 1.005038 0.0000000 1.005038
##                37       38       39       40       41 MeanDecreaseAccuracy
## Gene135  1.005038 1.005038 1.005038 1.730969 1.275153             3.074120
## Gene93  -1.005038 0.000000 0.000000 1.005038 1.428571             2.931708
## Gene489  0.000000 0.000000 1.005038 1.005038 1.428571             2.546365
## Gene138  0.000000 1.005038 1.005038 1.005038 1.005038             2.495185
## Gene417  0.000000 0.000000 0.200040 1.005038 0.000000             2.476499
## Gene537  1.005038 0.000000 1.005038 1.005038 0.000000             2.464263
##         MeanDecreaseGini
## Gene135        1.1103009
## Gene93         0.8682773
## Gene489        0.3797226
## Gene138        0.5947100
## Gene417        0.3950070
## Gene537        0.5114833
oobError <- mean(baseRF$confusion[,'class.error'])
print(oobError)
## [1] 0.422807

Let’s create a function to simplify this. getResults accepts the randomForest object as a parameter and just prints a preview of the importance and the OOB error

getResults <- function(rf) {
  require(randomForest)
  imp <- randomForest::importance(rf)
  imp <- imp[order(imp[,'MeanDecreaseAccuracy'], decreasing = TRUE),]
  head(imp)
  oobError <- mean(rf$confusion[,'class.error'])
  print(paste("OOB error",oobError))
  
}

And another function to execute a single core random forest run.

singleCoreRF <- function(X, y, ntree) {
  baseRF <- randomForest(x=X, y=y, ntree=ntree, importance=TRUE)
  return(baseRF)
}

Single core tests

SINGLE CORE, 100 trees

tic()
single_100 <- singleCoreRF(X, y, 100)
toc()
## 0.36 sec elapsed
getResults(single_100)
## [1] "OOB error 0.43859649122807"

SINGLE CORE, 1000 trees

tic()
single_1000 <- singleCoreRF(X, y, 1000)
toc()
## 3.47 sec elapsed
getResults(single_1000)
## [1] "OOB error 0.250877192982456"

SINGLE CORE, 10000 trees

tic()
single_10000 <- singleCoreRF(X, y, 10000)
toc()
## 33.97 sec elapsed
getResults(single_10000)
## [1] "OOB error 0.226315789473684"

SINGLE CORE, 100000 trees

tic()
single_100000 <- singleCoreRF(X, y, 100000)
toc()
## 337 sec elapsed
getResults(single_100000)
## [1] "OOB error 0.217543859649123"

Multiple core tests

Basic test case, 2 cores, 100 trees

cores <- 2
registerDoParallel(cores=cores)
cl <- makeCluster(cores) 
tic()
baseRF2 <- foreach(ntree=rep(50, 2), 
                  .combine=randomForest::combine,
                  .multicombine=TRUE,
                  .packages='randomForest') %dopar% {
                    randomForest(X, as.factor(y), ntree=ntree, 
                                 importance=TRUE)
                  }

stopCluster(cl) 
toc()
## 0.23 sec elapsed

Let’s make another function to simplify

multiCoreRF <- function(X, y, ntree, cores) {

  treesPerCore <- ceiling(ntree/cores)
  registerDoParallel(cores=cores)
  cl <- makeCluster(cores) 

  baseRF2 <- foreach(ntree=rep(treesPerCore, cores), 
                     .combine=randomForest::combine,
                     .multicombine=TRUE,
                     .packages='randomForest') %dopar% {
                       randomForest(X, as.factor(y), ntree=ntree, 
                                    importance=TRUE)
                     }
  
  stopCluster(cl) 
}

2 cores, 10000 trees

tic()
multiCoreRF(X, y, 10000, 2)
toc()
## 17.71 sec elapsed

4 cores, 10000 trees

tic()
multiCoreRF(X, y, 10000, 4)
toc()
## 10.11 sec elapsed

6 cores, 10000 trees

tic()
multiCoreRF(X, y, 10000, 6)
toc()
## 8.76 sec elapsed

8 cores, 10000 trees

tic()
multiCoreRF(X, y, 10000, 8)
toc()
## 8.94 sec elapsed

Bonus test

8 cores, 100000 trees

tic()
x<- multiCoreRF(X, y, 100000, 8)
toc()
## 68.48 sec elapsed