We load the necessary libraries first
We load the data and take and look if the data was imported correctly
spruceData <- read.table('spruceData.tsv', header=T, row.names = 1, stringsAsFactors = F)
head(spruceData[,1:5])
## Gene1 Gene10 Gene100 Gene1000 Gene101
## W19-C1 7.310242 7.088565 7.073521 6.641622 6.872650
## W19-C2 7.167100 6.998046 7.214589 6.784180 6.806009
## W19-C3 7.159872 7.290680 7.138781 6.903825 6.447067
## W19-F1 7.196368 6.985777 7.417678 6.813525 6.534758
## W19-F2 7.247463 7.053132 7.243150 6.674302 6.776412
## W19-F3 7.427390 7.032896 7.050686 6.903508 6.563667
We use X for data notation and y for class vector. The class vector information is extracted from the row names. We can use 2 different prediction classes. Week or Treatment. The prediction classes are a factor because we want a classification.
X <- spruceData
treatment <- as.factor(substr(rownames(spruceData), 5,5))
week <- as.factor(substr(rownames(spruceData), 2,3))
We will use week as a prediction factor
y <- week
tic()
baseRF <- randomForest(x=X, y=y, ntree=100, importance=TRUE)
toc()
## 0.37 sec elapsed
We can explore the results using the importance function to give back the most important genes and extract the oobError.
baseImportance <- randomForest::importance(baseRF)
baseImportance <- baseImportance[order(baseImportance[,'MeanDecreaseAccuracy'], decreasing = TRUE),]
head(baseImportance)
## 19 20 21 22 23 24 25
## Gene135 1.7733173 0.200040 2.197231 2.2367584 1.653739 1.353881 0.000000
## Gene93 1.4002801 2.002970 2.015382 0.0000000 1.005038 0.000000 1.428571
## Gene489 0.0000000 0.000000 0.000000 0.0000000 0.000000 1.005038 1.005038
## Gene138 0.2627035 1.162476 2.188249 0.8491993 -1.005038 1.005038 1.005038
## Gene417 0.0000000 0.000000 1.005038 0.0000000 0.000000 0.000000 1.005038
## Gene537 1.4002801 1.005038 1.005038 1.4139250 1.005038 -1.005038 0.000000
## 26 28 31 32 34 35 36
## Gene135 0.000000 0.000000 1.005038 1.275153 1.005038 0.4937979 1.874178
## Gene93 1.005038 1.005038 1.353881 1.524986 1.005038 1.0050378 0.000000
## Gene489 0.000000 1.353881 0.000000 1.428571 0.000000 1.0050378 0.000000
## Gene138 1.428571 1.005038 0.000000 1.005038 1.005038 1.0050378 0.000000
## Gene417 1.695099 0.000000 1.005038 1.005038 1.005038 0.0000000 0.000000
## Gene537 0.000000 1.005038 1.005038 0.000000 1.005038 0.0000000 1.005038
## 37 38 39 40 41 MeanDecreaseAccuracy
## Gene135 1.005038 1.005038 1.005038 1.730969 1.275153 3.074120
## Gene93 -1.005038 0.000000 0.000000 1.005038 1.428571 2.931708
## Gene489 0.000000 0.000000 1.005038 1.005038 1.428571 2.546365
## Gene138 0.000000 1.005038 1.005038 1.005038 1.005038 2.495185
## Gene417 0.000000 0.000000 0.200040 1.005038 0.000000 2.476499
## Gene537 1.005038 0.000000 1.005038 1.005038 0.000000 2.464263
## MeanDecreaseGini
## Gene135 1.1103009
## Gene93 0.8682773
## Gene489 0.3797226
## Gene138 0.5947100
## Gene417 0.3950070
## Gene537 0.5114833
oobError <- mean(baseRF$confusion[,'class.error'])
print(oobError)
## [1] 0.422807
Let’s create a function to simplify this. getResults accepts the randomForest object as a parameter and just prints a preview of the importance and the OOB error
getResults <- function(rf) {
require(randomForest)
imp <- randomForest::importance(rf)
imp <- imp[order(imp[,'MeanDecreaseAccuracy'], decreasing = TRUE),]
head(imp)
oobError <- mean(rf$confusion[,'class.error'])
print(paste("OOB error",oobError))
}
And another function to execute a single core random forest run.
singleCoreRF <- function(X, y, ntree) {
baseRF <- randomForest(x=X, y=y, ntree=ntree, importance=TRUE)
return(baseRF)
}
tic()
single_100 <- singleCoreRF(X, y, 100)
toc()
## 0.36 sec elapsed
getResults(single_100)
## [1] "OOB error 0.43859649122807"
tic()
single_1000 <- singleCoreRF(X, y, 1000)
toc()
## 3.47 sec elapsed
getResults(single_1000)
## [1] "OOB error 0.250877192982456"
tic()
single_10000 <- singleCoreRF(X, y, 10000)
toc()
## 33.97 sec elapsed
getResults(single_10000)
## [1] "OOB error 0.226315789473684"
tic()
single_100000 <- singleCoreRF(X, y, 100000)
toc()
## 337 sec elapsed
getResults(single_100000)
## [1] "OOB error 0.217543859649123"
cores <- 2
registerDoParallel(cores=cores)
cl <- makeCluster(cores)
tic()
baseRF2 <- foreach(ntree=rep(50, 2),
.combine=randomForest::combine,
.multicombine=TRUE,
.packages='randomForest') %dopar% {
randomForest(X, as.factor(y), ntree=ntree,
importance=TRUE)
}
stopCluster(cl)
toc()
## 0.23 sec elapsed
Let’s make another function to simplify
multiCoreRF <- function(X, y, ntree, cores) {
treesPerCore <- ceiling(ntree/cores)
registerDoParallel(cores=cores)
cl <- makeCluster(cores)
baseRF2 <- foreach(ntree=rep(treesPerCore, cores),
.combine=randomForest::combine,
.multicombine=TRUE,
.packages='randomForest') %dopar% {
randomForest(X, as.factor(y), ntree=ntree,
importance=TRUE)
}
stopCluster(cl)
}
tic()
multiCoreRF(X, y, 10000, 2)
toc()
## 17.71 sec elapsed
tic()
multiCoreRF(X, y, 10000, 4)
toc()
## 10.11 sec elapsed
tic()
multiCoreRF(X, y, 10000, 6)
toc()
## 8.76 sec elapsed
tic()
multiCoreRF(X, y, 10000, 8)
toc()
## 8.94 sec elapsed
tic()
x<- multiCoreRF(X, y, 100000, 8)
toc()
## 68.48 sec elapsed