-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparallel_programming.R
More file actions
163 lines (125 loc) · 3.9 KB
/
parallel_programming.R
File metadata and controls
163 lines (125 loc) · 3.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# Title: Parallel Programming
# Author: Anna Zink
# Date: 11/12/2020
# Description: Sample code for parallel programming
# References:
# https://bookdown.org/rdpeng/rprogdatascience/parallel-computation.html
# http://adv-r.had.co.nz/Profiling.html
# lineprof not on cran must download directly
#devtools::install_github("hadley/lineprof")
library(lineprof)
library(microbenchmark)
library(doParallel)
library(parallel)
library(foreach)
##################################
## PROFILING ##
##################################
# PROFILING - finding bottlenecks in time/memory
# sample_code.R is expected to define ols(), xval_ols(), boot() and the
# `data` data frame used below -- TODO confirm against that file
source('sample_code.R')
# Run the full serial pipeline: fit the OLS model, cross-validate it,
# then bootstrap a confidence interval around the CV predictions.
run_analysis <- function() {
  ols()
  boot(xval_ols())
}
# Profile the whole analysis line-by-line with lineprof
l<-lineprof(run_analysis())
l
# Output column interpretation:
# time - time to run (seconds)
# alloc - memory allocation (megabytes)
# release - memory released (megabytes)
# dups - number of vector duplications that occurred
# Explore the profile interactively in the browser via shiny:
# shine(l)
##################################
## foreach & doParallel ##
##################################
# foreach & doParallel - a way to parallelize loops in R.
# doParallel provides the parallel backend for the %dopar% operator
# used inside the foreach() call.
# Parallel rewrite of xval_ols(): k-fold cross-validated OLS.
#
# nfolds: number of CV folds (default 5, as before).
# ncores: number of PSOCK workers to register (default 4, as before).
# Uses the global `data` data frame (from sample_code.R) and returns a
# data.frame with one predicted value per row plus its fold id.
xval_ols_updt <- function(nfolds = 5, ncores = 4) {
  # Register the workers and guarantee cleanup even on error.
  # (BUG FIX: the original called stopCluster(cl) AFTER return(), so it
  # never executed and the cluster leaked.)
  cl <- makePSOCKcluster(ncores)
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)
  # Assign each row to a fold (modifies a function-local copy of `data`,
  # as in the original)
  data$folds <- cut(seq_len(nrow(data)), breaks = nfolds, labels = FALSE)
  # Fit OLS on each training split and predict the held-out fold
  results <- foreach(i = seq_len(nfolds), .combine = rbind) %dopar% {
    # split into train/test
    index <- which(data$folds == i)
    test_i <- data[index, ]
    train_i <- data[-index, ]
    # fit on train and predict on test.
    # BUG FIX: predict.lm takes `newdata`, not `data`; the original
    # silently returned in-sample fitted values for the training rows.
    ra_i <- lm(totpay ~ ., data = train_i)
    fit_i <- predict(ra_i, newdata = test_i)
    # predictions for this fold, tagged with the fold id
    data.frame(pred = fit_i, fold = i)
  }
  results
}
# Benchmark the serial vs parallel cross-validation with microbenchmark.
# BUG FIX: the original precomputed a <- xval_ols(); b <- xval_ols_updt()
# and then ran microbenchmark(a, b), which only times looking up the two
# variables -- microbenchmark must be handed the calls themselves.
speed_test_xval <- microbenchmark(
  serial = xval_ols(),
  parallel = xval_ols_updt()
)
speed_test_xval
# plot results
#ggplot2::autoplot(speed_test_xval)
#############################################
## the parallel package ##
#############################################
# ~~~ mclapply FORKS, SO IT ONLY PARALLELIZES ON MAC/LINUX ~~~
# mclapply and mcmapply are parallel versions of
# lapply and mapply respectively.
# Parallel bootstrap of the mean of predlist$pred.
#
# predlist: data frame (or list) with a numeric `pred` element.
# reps:     number of bootstrap resamples (default 500, as before).
# cores:    number of forked workers (default 4, as before).
# Returns the 2.5% / 97.5% quantiles of the bootstrap means.
boot_updt <- function(predlist, reps = 500, cores = 4) {
  mean.boot.updt <- mclapply(seq_len(reps), function(i) {
    # resample with replacement and record the mean
    xnew <- sample(predlist$pred, replace = TRUE)
    mean(xnew)
  }, mc.cores = cores)
  quantile(unlist(mean.boot.updt), c(.025, .975), na.rm = TRUE)
}
# Benchmark the serial vs parallel bootstrap on the same predictions.
# BUG FIX: as with the CV benchmark, the original timed the variables
# a and b (already-computed results), not the bootstrap work -- pass the
# calls to microbenchmark instead.
output <- xval_ols_updt()
speed_test_boot <- microbenchmark(
  serial = boot(output),
  parallel = boot_updt(output)
)
speed_test_boot
# The full pipeline again, with the parallelized helpers swapped in
# for cross-validation and bootstrapping.
run_analysis_updt <- function() {
  ols()
  boot_updt(xval_ols_updt())
}
# Benchmark the whole serial pipeline against the parallel one.
# BUG FIX: as above, benchmark the calls themselves -- timing the
# precomputed variables a and b measures nothing.
speed_test_all <- microbenchmark(
  serial = run_analysis(),
  parallel = run_analysis_updt()
)
speed_test_all
ggplot2::autoplot(speed_test_all)
# ~~~ NO MAC? ~~~
### Try using parLapply and other similar cluster functions listed here:
?clusterApply
# On Windows you must set up the worker environment explicitly
# (register the cluster, load packages on the new workers, etc.).
# mclapply instead forks the current process when it is called, so the
# workers inherit the session state and no per-worker setup is needed.
#############################################
## generating random #s in parallel ##
#############################################
# We want the random numbers drawn in parallel to be DIFFERENT across
# workers but also reproducible... how do we ensure every run of the
# parallel process gives the same distinct numbers per worker?
# Use the L'Ecuyer-CMRG generator, which supports independent parallel
# streams, then set the seed as usual.
RNGkind("L'Ecuyer-CMRG")
set.seed(1248)
# Draw `len` N(0, 1) values on each of `n` forked workers.
#
# n:     number of parallel tasks / result vectors (default 4, as before).
# len:   draws per task (default 3, as before).
# cores: forked workers for mclapply (default 4, as before).
# With RNGkind("L'Ecuyer-CMRG") and set.seed() called beforehand, each
# worker gets a distinct yet reproducible RNG stream (see ?mcparallel).
# Returns a list of `n` numeric vectors of length `len`.
gen_randonums <- function(n = 4, len = 3, cores = 4) {
  mclapply(seq_len(n), function(i) {
    rnorm(len)
  }, mc.cores = cores)
}
gen_randonums()