1 change: 1 addition & 0 deletions R/random_forest/CTG.csv

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions R/random_forest/Parallel/README.md
@@ -0,0 +1,33 @@
# Compiling the parallel execution of Random Forest in R example :
___

This example shows the parallel execution of random forest in R. The random forest itself works the same way as in the serial version, but the individual decision trees are built in parallel across multiple cores, which reduces execution time.

Additional packages used are randomForest, foreach, caret, and doParallel, which together provide the parallelization.

| Package | Purpose |
| ------ | ------ |
| foreach | The workhorse of parallel processing in R. Uses %dopar% to parallelize tasks and returns the results as a list. |
| doParallel | Provides the parallel backend for the %dopar% operator. |
| randomForest | Builds the individual decision trees; the forests built by foreach are merged with its combine function. |
| caret | Provides a framework for finding an optimal model by trying multiple models with resampling. |

The random forest approach itself is the same for the parallel execution of the code.

By using registerDoParallel(cores = n) we set the number of cores, and getDoParWorkers() confirms how many workers are registered; we run the script multiple times to collect execution times, as done in the serial version of the code.

Sys.time() is called before and after training, and the difference gives the execution time of the program; we therefore ran the script multiple times and compared the execution times of the serial and parallel versions.

We observe that the parallel execution produces the output in less time than the serial execution, since the tree-building work is divided across the cores.
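
A minimal sketch of the pattern used in rf_parallel.R — registering a backend, building sub-forests with %dopar%, combining them, and timing the run — is shown below. The core count of 4 is illustrative only, and `train` is assumed to be the training data frame created by the partitioning step in the script.

```r
library(randomForest)
library(foreach)
library(doParallel)

registerDoParallel(cores = 4)   # illustrative core count
getDoParWorkers()               # confirm the number of registered workers

start_time <- Sys.time()

# Build 5 sub-forests of 200 trees each in parallel and merge them into one forest
rf <- foreach(ntree = rep(200, 5), .combine = randomForest::combine,
              .multicombine = TRUE, .packages = "randomForest") %dopar% {
  randomForest(NSP ~ ., data = train, ntree = ntree, mtry = 8)
}

end_time <- Sys.time()
print(end_time - start_time)    # elapsed training time
```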

# Submitting the Parallel R example :
___

We submit the R script rf_parallel.R using a shell script named submit.sh, which runs the parallel execution.

The command used to run the R script directly is as follows :
• First go to the directory where the parallel R file is located.
• Then run : Rscript rf_parallel.R
The command to run the shell script that launches the parallel executions is as follows :
• First go to the directory where the parallel shell script is located.
• Then submit the batch job as : LLsub submit.sh
102 changes: 102 additions & 0 deletions R/random_forest/Parallel/rf_parallel.R
@@ -0,0 +1,102 @@
# Read Data
data<-read.csv("../CTG.csv")
str(data)
data$NSP <- as.factor(data$NSP)
table(data$NSP)

# Data Partition. 70% data for training and 30% data for testing.
set.seed(123)
ind <- sample(2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[ind==1,]
test <- data[ind==2,]

# Random Forest
library(randomForest)
library(e1071)
library(ggplot2)
library(doParallel)
library(foreach)
set.seed(222)

# Loop over different core counts. In our batch script submit.sh, Parallel_output0.txt
# collects the execution times for core counts from 2 to 16. We used these times to
# produce the scaling plot and observe that execution time decreases as the number of
# cores increases.

for(i in 2:16)
{
  registerDoParallel(cores = i)
  print(getDoParWorkers())   # print() is needed inside the loop so the worker count appears in the output

  start_time <- Sys.time()

  # Build 5 sub-forests of 200 trees each in parallel and merge them with randomForest::combine
  rf <- foreach(ntree=rep(200, 5), .combine=randomForest::combine,
                .multicombine=TRUE, .packages='randomForest') %dopar% {
    randomForest(NSP~., data=train, ntree=ntree, mtry=8)
  }
  end_time <- Sys.time()

  time <- print(end_time - start_time)

}
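
# (Optional sketch) To produce the scaling plot mentioned above, the elapsed times can be
# collected inside the loop and plotted against the core count, e.g.:
#   elapsed <- c(elapsed, as.numeric(end_time - start_time, units = "secs"))
#   plot(2:16, elapsed, type = "b", xlab = "Number of cores", ylab = "Elapsed time (s)")
# (elapsed would need to be initialised as an empty vector before the loop.)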

#Optional part : to run with a single core count instead of the loop, uncomment one of the
#lines below with the desired number of cores. In this example we tried both the loop (which
#prints one timing per core count) and individual runs with different core counts; the
#corresponding commented lines can be seen in the submit.sh batch script.

#registerDoParallel(cores = 2) #Parallel_output1.txt in submit.sh
#registerDoParallel(cores = 4) #Parallel_output2.txt in submit.sh
#registerDoParallel(cores = 8) #Parallel_output3.txt in submit.sh
#registerDoParallel(cores = 16) #Parallel_output4.txt in submit.sh
#registerDoParallel(cores = 32) #Parallel_output5.txt in submit.sh
#registerDoParallel(cores = 64) #Parallel_output6.txt in submit.sh

#getDoParWorkers()

#start_time <- Sys.time()

#rf <- foreach(ntree=rep(200, 5), .combine=randomForest::combine,
# .multicombine=TRUE, .packages='randomForest') %dopar% {
# randomForest(NSP~., data=train, ntree=ntree, mtry=8)
# }
#end_time <- Sys.time()

#time <- print(end_time - start_time)

print(rf)
attributes(rf)

#Prediction & Confusion Matrix - train data
library(caret)
p1 <- predict(rf, train)
confusionMatrix(p1, train$NSP)

#Prediction & Confusion Matrix - test data
p2 <- predict(rf, test)
confusionMatrix(p2, test$NSP)

#Error rate of Random Forest
#Note: randomForest::combine drops the err.rate component, so this plot may not be
#available for the combined forest built with foreach.
plot(rf)

#Tune mtry
t <- tuneRF(train[,-22], train[,22],
stepFactor = 0.5,
plot = TRUE,
ntreeTry = 300,
trace = TRUE,
improve = 0.05)

#No. of nodes for the trees
hist(treesize(rf),
main = "No. of Nodes for the Trees",
col = "green")

#Variable Importance
varImpPlot(rf,
sort = T,
n.var = 10,
main = "Top 10 - Variable Importance")
importance(rf)
varUsed(rf)

#Partial Dependence Plot
partialPlot(rf, train, ASTV, "2")

#Extract Single Tree
getTree(rf, 1, labelVar = TRUE)

10 changes: 10 additions & 0 deletions R/random_forest/Parallel/submit.sh
@@ -0,0 +1,10 @@
#!/bin/bash

echo "Run parallel code"
#Rscript rf_parallel.R >> Parallel_output1.txt #cores=2
#Rscript rf_parallel.R >> Parallel_output2.txt #cores=4
#Rscript rf_parallel.R >> Parallel_output3.txt #cores=8
#Rscript rf_parallel.R >> Parallel_output4.txt #cores=16
#Rscript rf_parallel.R >> Parallel_output5.txt #cores=32
#Rscript rf_parallel.R >> Parallel_output6.txt #cores=64
Rscript rf_parallel.R >> Parallel_output0.txt #cores = 2..16 (loop inside rf_parallel.R)
33 changes: 33 additions & 0 deletions R/random_forest/Serial/README.md
@@ -0,0 +1,33 @@
# Compiling the serial execution of Random Forest in R example :
___

The random forest algorithm aggregates the predictions of multiple decision trees of varying depth. Each decision tree is trained on its own random sample of the dataset.

Random forest builds a number of decision trees, each using a random subset of the features. We train the decision tree classifiers on different random subsets of the training dataset. For prediction, we obtain the predictions of all decision trees and then predict the class that receives the maximum number of votes.

The samples left out of each tree's training sample are used to compute the out-of-bag (OOB) error.
We use the randomForest package to build the decision trees and caret to find an optimal model by trying multiple models with resampling.
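
As a small illustration of where the OOB error shows up, the fitted randomForest object reports it directly. This is a sketch only; `train` is assumed to be the training data frame created by the partitioning step in rf_serial.R, and the tree count is illustrative.

```r
library(randomForest)

# Fit a forest on the training data (train as created in rf_serial.R)
rf <- randomForest(NSP ~ ., data = train, ntree = 500)

print(rf)          # the printed summary includes the "OOB estimate of error rate"
head(rf$err.rate)  # per-tree OOB error, overall and per class
```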

Random forest approach :
Step 1) Import the data
Step 2) Data partition for training and testing
Step 3) Train the model
Step 4) Construct accuracy function
Step 5) Visualize the model
Step 6) Evaluate the model
Step 7) Visualize result for training and testing
Step 8) Visualize results

Sys.time() is called before and after training, and the difference gives the execution time of the program.
(We observe that the parallel execution produces the output in less time than the serial execution, since the work is divided across the cores.)
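
A minimal sketch of how the listed steps map to R calls, abridged from rf_serial.R (the path to CTG.csv and the hyperparameters are taken from the script; the evaluation calls shown are a subset of what the full script does):

```r
library(randomForest)
library(caret)

# Step 1: import the data
data <- read.csv("../CTG.csv")
data$NSP <- as.factor(data$NSP)

# Step 2: partition into ~70% training and ~30% test data
set.seed(123)
ind <- sample(2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[ind == 1, ]
test  <- data[ind == 2, ]

# Step 3: train the model (timed with Sys.time())
start_time <- Sys.time()
rf <- randomForest(NSP ~ ., data = train, ntree = 1000, mtry = 8)
print(Sys.time() - start_time)

# Steps 4-8: evaluate and visualize
confusionMatrix(predict(rf, test), test$NSP)  # accuracy on the test data
plot(rf)                                      # OOB error rate vs. number of trees
varImpPlot(rf)                                # variable importance
```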

# Submitting the Serial R example :
___

We submit the serial execution using a batch script named submit.sh.
The command used to run the R script directly is as follows :
• First go to the directory where the serial R file is located.
• Then run : Rscript rf_serial.R
The command to run the shell script that launches the serial execution is as follows :
• First go to the directory where the serial batch script is located.
• Then submit the batch job as : LLsub submit.sh
71 changes: 71 additions & 0 deletions R/random_forest/Serial/rf_serial.R
@@ -0,0 +1,71 @@
# Read Data
data<-read.csv("../CTG.csv")
str(data)
data$NSP <- as.factor(data$NSP)
table(data$NSP)

# Data Partition
set.seed(123)
ind <- sample(2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[ind==1,]
test <- data[ind==2,]

# Random Forest
library(randomForest)
set.seed(222)

start_time <- Sys.time()

rf <- randomForest(NSP~., data=train,
ntree = 1000,
mtry = 8,
importance = TRUE,
proximity = TRUE)
end_time <- Sys.time()

print(end_time - start_time)

print(rf)
attributes(rf)

#Prediction & Confusion Matrix - train data
library(caret)
p1 <- predict(rf, train)
confusionMatrix(p1, train$NSP)

#Prediction & Confusion Matrix - test data
p2 <- predict(rf, test)
confusionMatrix(p2, test$NSP)

# Error rate of Random Forest
plot(rf)

# Tune mtry
t <- tuneRF(train[,-22], train[,22],
stepFactor = 0.5,
plot = TRUE,
ntreeTry = 300,
trace = TRUE,
improve = 0.05)

# No. of nodes for the trees
hist(treesize(rf),
main = "No. of Nodes for the Trees",
col = "green")

# Variable Importance
varImpPlot(rf,
sort = T,
n.var = 10,
main = "Top 10 - Variable Importance")
importance(rf)
varUsed(rf)

# Partial Dependence Plot
partialPlot(rf, train, ASTV, "2")

# Extract Single Tree
getTree(rf, 1, labelVar = TRUE)

# Multi-dimensional Scaling Plot of Proximity Matrix
MDSplot(rf, train$NSP)
12 changes: 12 additions & 0 deletions R/random_forest/Serial/submit.sh
@@ -0,0 +1,12 @@
#!/bin/bash

echo "Run serial code"
#Rscript rf_serial.R >> Serial_output1.txt
#Rscript rf_serial.R >> Serial_output2.txt
#Rscript rf_serial.R >> Serial_output3.txt
#Rscript rf_serial.R >> Serial_output4.txt
#Rscript rf_serial.R >> Serial_output5.txt
#Rscript rf_serial.R >> Serial_output6.txt
Rscript rf_serial.R >> Serial_output0.txt