-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparallel_programming.R
More file actions
163 lines (125 loc) · 3.9 KB
/
parallel_programming.R
File metadata and controls
163 lines (125 loc) · 3.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# Title: Parallel Programming
# Author: Anna Zink
# Date: 11/12/2020
# Description: Sample code for parallel programming
# References:
# https://bookdown.org/rdpeng/rprogdatascience/parallel-computation.html
# http://adv-r.had.co.nz/Profiling.html
# lineprof not on cran must download directly
#devtools::install_github("hadley/lineprof")
library(lineprof)
library(microbenchmark)
library(doParallel)
library(parallel)
library(foreach)
##################################
## PROFILING ##
##################################
# PROFILING - finding bottlenecks in time/memory
# sample_code.R is expected to define ols(), xval_ols(), boot() and the
# `data` data frame used below -- TODO confirm against that file
source('sample_code.R')
# Run the full serial pipeline: fit the OLS model, cross-validate it,
# then bootstrap a confidence interval around the CV predictions.
run_analysis <- function() {
  ols()
  boot(xval_ols())
}
# Profile the whole analysis line-by-line with lineprof
l<-lineprof(run_analysis())
l
# Output column interpretation:
# time - time to run (seconds)
# alloc - memory allocation (megabytes)
# release - memory released (megabytes)
# dups - number of vector duplications that occurred
# Explore the profile interactively in the browser via shiny:
# shine(l)
##################################
## foreach & doParallel ##
##################################
# foreach & doParallel - a way to parallelize loops in R.
# doParallel provides the parallel backend for the %dopar% operator
# used inside the foreach() call.
# Parallel rewrite of xval_ols(): k-fold cross-validated OLS.
#
# nfolds: number of CV folds (default 5, as before).
# ncores: number of PSOCK workers to register (default 4, as before).
# Uses the global `data` data frame (from sample_code.R) and returns a
# data.frame with one predicted value per row plus its fold id.
xval_ols_updt <- function(nfolds = 5, ncores = 4) {
  # Register the workers and guarantee cleanup even on error.
  # (BUG FIX: the original called stopCluster(cl) AFTER return(), so it
  # never executed and the cluster leaked.)
  cl <- makePSOCKcluster(ncores)
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)
  # Assign each row to a fold (modifies a function-local copy of `data`,
  # as in the original)
  data$folds <- cut(seq_len(nrow(data)), breaks = nfolds, labels = FALSE)
  # Fit OLS on each training split and predict the held-out fold
  results <- foreach(i = seq_len(nfolds), .combine = rbind) %dopar% {
    # split into train/test
    index <- which(data$folds == i)
    test_i <- data[index, ]
    train_i <- data[-index, ]
    # fit on train and predict on test.
    # BUG FIX: predict.lm takes `newdata`, not `data`; the original
    # silently returned in-sample fitted values for the training rows.
    ra_i <- lm(totpay ~ ., data = train_i)
    fit_i <- predict(ra_i, newdata = test_i)
    # predictions for this fold, tagged with the fold id
    data.frame(pred = fit_i, fold = i)
  }
  results
}
# Benchmark the serial vs parallel cross-validation with microbenchmark.
# BUG FIX: the original precomputed a <- xval_ols(); b <- xval_ols_updt()
# and then ran microbenchmark(a, b), which only times looking up the two
# variables -- microbenchmark must be handed the calls themselves.
speed_test_xval <- microbenchmark(
  serial = xval_ols(),
  parallel = xval_ols_updt()
)
speed_test_xval
# plot results
#ggplot2::autoplot(speed_test_xval)
#############################################
## the parallel package ##
#############################################
# ~~~ mclapply FORKS, SO IT ONLY PARALLELIZES ON MAC/LINUX ~~~
# mclapply and mcmapply are parallel versions of
# lapply and mapply respectively.
# Parallel bootstrap of the mean of predlist$pred.
#
# predlist: data frame (or list) with a numeric `pred` element.
# reps:     number of bootstrap resamples (default 500, as before).
# cores:    number of forked workers (default 4, as before).
# Returns the 2.5% / 97.5% quantiles of the bootstrap means.
boot_updt <- function(predlist, reps = 500, cores = 4) {
  mean.boot.updt <- mclapply(seq_len(reps), function(i) {
    # resample with replacement and record the mean
    xnew <- sample(predlist$pred, replace = TRUE)
    mean(xnew)
  }, mc.cores = cores)
  quantile(unlist(mean.boot.updt), c(.025, .975), na.rm = TRUE)
}
# Benchmark the serial vs parallel bootstrap on the same predictions.
# BUG FIX: as with the CV benchmark, the original timed the variables
# a and b (already-computed results), not the bootstrap work -- pass the
# calls to microbenchmark instead.
output <- xval_ols_updt()
speed_test_boot <- microbenchmark(
  serial = boot(output),
  parallel = boot_updt(output)
)
speed_test_boot
# The full pipeline again, with the parallelized helpers swapped in
# for cross-validation and bootstrapping.
run_analysis_updt <- function() {
  ols()
  boot_updt(xval_ols_updt())
}
# Benchmark the whole serial pipeline against the parallel one.
# BUG FIX: as above, benchmark the calls themselves -- timing the
# precomputed variables a and b measures nothing.
speed_test_all <- microbenchmark(
  serial = run_analysis(),
  parallel = run_analysis_updt()
)
speed_test_all
ggplot2::autoplot(speed_test_all)
# ~~~ NO MAC? ~~~
### Try using parLapply and other similar cluster functions listed here:
?clusterApply
# On Windows you must set up the worker environment explicitly
# (register the cluster, load packages on the new workers, etc.).
# mclapply instead forks the current process when it is called, so the
# workers inherit the session state and no per-worker setup is needed.
#############################################
## generating random #s in parallel ##
#############################################
# We want the random numbers drawn in parallel to be DIFFERENT across
# workers but also reproducible... how do we ensure every run of the
# parallel process gives the same distinct numbers per worker?
# Use the L'Ecuyer-CMRG generator, which supports independent parallel
# streams, then set the seed as usual.
RNGkind("L'Ecuyer-CMRG")
set.seed(1248)
# Draw `len` N(0, 1) values on each of `n` forked workers.
#
# n:     number of parallel tasks / result vectors (default 4, as before).
# len:   draws per task (default 3, as before).
# cores: forked workers for mclapply (default 4, as before).
# With RNGkind("L'Ecuyer-CMRG") and set.seed() called beforehand, each
# worker gets a distinct yet reproducible RNG stream (see ?mcparallel).
# Returns a list of `n` numeric vectors of length `len`.
gen_randonums <- function(n = 4, len = 3, cores = 4) {
  mclapply(seq_len(n), function(i) {
    rnorm(len)
  }, mc.cores = cores)
}
gen_randonums()