-
Notifications
You must be signed in to change notification settings - Fork 62
Expand file tree
/
Copy path07_partII_cloudcomputing.R
More file actions
119 lines (75 loc) · 2.71 KB
/
07_partII_cloudcomputing.R
File metadata and controls
119 lines (75 loc) · 2.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# Install the add-on packages used in this script.
# The package names must be passed as ONE character vector: in
# install.packages(pkgs, lib, repos, ...) any further positional strings
# are taken as the `lib` and `repos` arguments, not as more packages.
# "parallel" is not listed because it ships with R itself.
install.packages(c("doSNOW", "stringr"))
# load packages
library(parallel)
library(doSNOW)
# verify no. of cores available
n_cores <- detectCores()
n_cores
# PREPARATION -----------------------------
# packages
library(stringr)
# import data
marketing <- read.csv("data/marketing_data.csv")
# clean/prepare data
# Income: remove every punctuation character, then convert to numeric.
# NOTE(review): "[[:punct:]]" also removes a decimal point if the raw values
# contain one, which would scale the result -- confirm against the CSV.
marketing$Income <- as.numeric(gsub("[[:punct:]]", "", marketing$Income))
# time elapsed between the customer's sign-up date and today
marketing$days_customer <- Sys.Date() -
  as.Date(marketing$Dt_Customer, "%m/%d/%y")
marketing$Dt_Customer <- NULL
# all sets of independent vars
# (column positions refer to the data frame AFTER Dt_Customer was dropped)
indep <- names(marketing)[c(2:19, 27, 28)]
# seq_along() rather than 1:length(): yields an empty sequence instead of
# c(1, 0) should `indep` ever be empty
combinations_list <- lapply(
  seq_along(indep),
  function(k) combn(indep, k, simplify = FALSE)
)
# flatten one level: a single list with one character vector per subset
combinations_list <- unlist(combinations_list, recursive = FALSE)
# one model formula (as a string) per subset of independent vars
models <- lapply(
  combinations_list,
  function(vars) paste("Response ~", paste(vars, collapse = "+"))
)
# set cores for parallel processing
# The backend has to be registered BEFORE the %dopar% loop; otherwise
# foreach falls back to sequential execution. The core count is stored in
# `n_cores` (see detectCores() above).
ctemp <- makeCluster(n_cores)
registerDoSNOW(ctemp)
# prepare loop
N <- 10 # just for illustration, the actual code is N <- length(models)
# run loop in parallel
pseudo_Rsq <- foreach(i = seq_len(N), .combine = c) %dopar% {
  # fit the logit model via maximum likelihood
  fit <- glm(models[[i]], data = marketing, family = binomial())
  # proportion of deviance explained by the independent vars (~R^2);
  # the value of the last expression is what .combine collects
  1 - (fit$deviance / fit$null.deviance)
}
# shut down the worker processes once the results are collected
stopCluster(ctemp)
library(knitr)
# Keep a handle on the currently registered output hook so that the
# replacement below can delegate to it.
hook_output <- knit_hooks$get("output")
# Replacement output hook: when the chunk option `linewidth` is set, wrap
# output lines wider than that value before passing the text on.
knit_hooks$set(output = function(x, options) {
  max_width <- options$linewidth
  # no linewidth option: hand the text on unchanged
  if (is.null(max_width)) {
    return(hook_output(x, options))
  }
  pieces <- xfun::split_lines(x)
  # wrap only if at least one line exceeds the requested width
  if (any(nchar(pieces) > max_width)) {
    pieces <- strwrap(pieces, width = max_width)
  }
  hook_output(paste(pieces, collapse = "\n"), options)
})
# Shell command -- run it in a terminal, NOT in R. It is kept as comments so
# that this file remains valid R code. It requests an EMR cluster named
# "EMR 6.1 RStudio + sparklyr" with an "Install RStudio" bootstrap action.
# Notes on the command syntax:
#   * list items (applications, instance groups) are separated by spaces,
#     the fields within one item by commas;
#   * the S3 path of the bootstrap script must not contain a line break;
#   * every line except the last ends with a backslash.
#
# aws emr create-cluster \
#   --release-label emr-6.1.0 \
#   --applications Name=Hadoop Name=Spark Name=Hive Name=Pig \
#     Name=Tez Name=Ganglia \
#   --name "EMR 6.1 RStudio + sparklyr" \
#   --service-role EMR_DefaultRole \
#   --instance-groups \
#     InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m3.2xlarge \
#     InstanceGroupType=CORE,InstanceCount=2,InstanceType=m3.2xlarge \
#   --bootstrap-action \
#     Path='s3://aws-bigdata-blog/artifacts/aws-blog-emr-rstudio-sparklyr/rstudio_sparklyr_emr6.sh',Name="Install RStudio" \
#   --ec2-attributes InstanceProfile=EMR_EC2_DefaultRole,KeyName="sparklyr" \
#   --configurations '[{"Classification":"spark","Properties":{"maximizeResourceAllocation":"true"}}]' \
#   --region us-east-1
# attach the Spark interface package
library(sparklyr)
# open a Spark connection with "yarn" as the master and keep the
# connection object in `sc`
sc <- spark_connect(master = "yarn")