From 028ae82b4970a98318c6bea1d706953288d705b4 Mon Sep 17 00:00:00 2001 From: Chuheng Yu Date: Mon, 16 Dec 2019 15:26:30 -0500 Subject: [PATCH 1/2] Finish --- Assignment6.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Assignment6.Rmd b/Assignment6.Rmd index 8e65135..d9f1919 100644 --- a/Assignment6.Rmd +++ b/Assignment6.Rmd @@ -25,7 +25,7 @@ library(rpart) #Upload the data sets MOOC1.csv and MOOC2.csv M1 <- read.csv("MOOC1.csv", header = TRUE) -M2 <- +M2 <- read.csv("MOOC2.csv", header = TRUE) ``` @@ -33,10 +33,10 @@ M2 <- ```{r} #Using the rpart package generate a classification tree predicting certified from the other variables in the M1 data frame. Which variables should you use? -c.tree1 <- +c.tree1 <- rpart(certified~forum.posts+grade+assignment, method = "class", data = M1) #Check the results from the classifcation tree using the printcp() command - +printcp(c.tree1) #Plot your tree From abf5f9bd42afdb998ae5ffc3faf2ad8701c243ed Mon Sep 17 00:00:00 2001 From: Chuheng Yu Date: Mon, 16 Dec 2019 15:29:42 -0500 Subject: [PATCH 2/2] FInish --- .gitignore | 4 ++++ Assignment6.Rmd | 28 +++++++++++++++++++++++++++- assignment6.Rproj | 13 +++++++++++++ tree1.ps | Bin 0 -> 4728 bytes 4 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 .gitignore create mode 100644 assignment6.Rproj create mode 100644 tree1.ps diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/Assignment6.Rmd b/Assignment6.Rmd index d9f1919..1eaa4b9 100644 --- a/Assignment6.Rmd +++ b/Assignment6.Rmd @@ -42,6 +42,8 @@ printcp(c.tree1) #Plot your tree post(c.tree1, file = "tree1.ps", title = "MOOC") #This creates a pdf image of the tree +rpart.plot::rpart.plot(c.tree1,type=3,box.palette = c("red", "green"), fallen.leaves = TRUE) +rpart.plot::rpart.plot(c.tree1) ``` @@ -53,10 +55,16 @@ post(c.tree1, file = "tree1.ps", title = "MOOC") #This creates a pdf image of th ```{r} c.tree2 <- prune(c.tree1, cp = )#Set cp to the level at which you want the tree to end +plotcp(c.tree1) +printcp(c.tree1) +rpart.plot::rpart.plot(c.tree1) +c.tree2 <- prune(c.tree1, cp =0.058182) #Visualize this tree and compare it to the one you generated earlier post(c.tree2, file = "tree2.ps", title = "MOOC") #This creates a pdf image of the tree +rpart.plot::rpart.plot(c.tree2) +printcp(c.tree2) ``` #Now use both the original tree and the pruned tree to make predictions about the the students in the second data set. Which tree has a lower error rate? @@ -69,7 +77,8 @@ M2$predict2 <- predict(c.tree2, M2, type = "class") table(M2$certified, M2$predict1) table(M2$certified, M2$predict2) - +mean(M2$certified==M2$predict1) +mean(M2$certified==M2$predict2) ``` ##Part III @@ -77,6 +86,23 @@ table(M2$certified, M2$predict2) Choose a data file from the (University of Michigan Open Data Set)[https://github.com/bkoester/PLA/tree/master/data]. Choose an outcome variable that you would like to predict. Build two models that predict that outcome from the other variables. The first model should use raw variables, the second should feature select or feature extract variables from the data. Which model is better according to the cross validation metrics? ```{r} +D3<- read.csv("student.record.csv",header = TRUE) +D3<- D3[,c(4:13)] +D3<- na.omit(D3) +c.tree3 <- rpart(as.factor(SEX) ~., method = "class", data = D3) +printcp(c.tree3) +HSGPA <- D3[,1] +SEX <- D3[, 10] +ACT <- D3[, 2:6] +ACT$ACTscore <- rowSums(ACT) +SAT <- D3[, 7:9] +SAT$SATscore <- rowSums(SAT) +D4 <- cbind(HSGPA, ACT, SAT, SEX) +c.tree4 <- rpart(as.factor(SEX) ~ ACTscore + SATscore + HSGPA, method = "class", data = D4) +printcp(c.tree4) +D4$predict1 <- predict(c.tree3, D4 , type = "class") +table(D4$SEX, D4$predict1) +sum(diag(table(D4$SEX, D4$predict1)))/sum(table(D4$SEX, D4$predict1)) ``` diff --git a/assignment6.Rproj b/assignment6.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/assignment6.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/tree1.ps b/tree1.ps new file mode 100644 index 0000000000000000000000000000000000000000..1a36db213ca45bef6f87aad75fa66477964dbad7 GIT binary patch literal 4728 zcmd5=-*4MS41U*N!94_UFyyQ|NtP6J1)4P70yIs~6xc)22U)r}+GI(STFA`$l6~u!%PjqCqxZ2`v~yaBVR6bl19fAx#Wc& z1+FBUW{3Dd=0nKT_ zgm<{CQs9l6w8)R?ZImZ<9F;72Qf%@h%~vPI-7(34Kc4SXpjwS@UH{w53%|_4{8>*kxhc@VKd>yv|^jLYt}r7BvgqQ=)II16uMHB+6Qp-9BKN zeWK4AQp+??jOct-ne|H7KsRM+W4gXAzLbDTki=)KyC&9M7sz5UVPw2sWof?4$jz3y z#*WSHL?t$yI4f%Xh<9V`+@x7HQ7O!F-BiV&Q^#Y&=&jh~ENqIZc%ls0@X;lO=waUu zgrvAAzmh{ovR&7A@h!q_9z7rgyn_a*^(Ncp?*~yzHn_*zQZZvyb7qz+GB8(tK6()2 zu$Rs=3i}K!Ci1%Y1{40XDtnAHCY~N0O#|A(Y75kY2>+3U>eF`|3mk|qX@XKHi?U~P zM034DRI4s4T!N^*Jtf*i?5F#jRWjP+_%%ybQbUe+42?x&Q^|wr7EPZMr^C@dhZ-{Q zdN^>dtEH}J+G6U1&@Hx)Ugx2L;w&u@jmPJuS}nPpbc2@|tzvpIaoCv~N0~OQ7INk} z!gfGtSr8jH3DaY?^pWPL=VvETt&>+Lr@d2zqV3|P&rCcl77B_S`1rx9z{t)(!4>MlyDKjlAcgYZ**4gvD_nvgK%=! z-oT?q+0k61C*iTxKo`09A0eUc)sT^@Zw8PdI~R6hB8#o z>Bzu#9kyfpO8OpNLeF+=x$E5Tw`rrViWFBUyn~F#;o*NlXitP+zI?FviHpTlN?k_r zv|-jmH9ab&gM;%t&NhkeZ|_H!_k6g=(Yx}5>&)Uh$aKEK$xhO^Y40QWvhB-${^tBO z>YBE=ukLA!`k~-C-wAI#{f!`wmVYRi z+{7?id}DcbQ|!Eb{vSY|I7QwQZen8XCe(Ha#!O4nT9%d!ctCBx1LcSyfiEz3&TbCb$;djwrNDGi9a6J|(`ffJ+pI-d z)J^?@{7`tz!JuP#t|MHZln;aITeo=m%Xo`9M#(KF1%FL@Z+{v99U>TtK;hvEZvlAl zOu-lv858gegknMpNAZp8w?u$>SvVmRcru6Yc_#b-gyi7hr=L=|j6x%Z95aC}4JX78 zAF+^3xI%>>T*EP*NFj{}uD}l?&TX+U9w6*|r?xsCu2c}x;oU&k1mAlKa-jOn zu?47h53@U{etVz>%-%A50@dm{0`7up{WVBH^-us3REyH!M-AK1fj88&YV&2wEJAQ+XDGu&s2wT#JJd{p|7Xb^@l?1E0DpC zOVh-s3;wv6`zq&v_T|r!#?R2w;T}M8;bTT6jBP@OI3*}A$Akpch69A|#FdL5m)+un zrjOHWqqhH>rYj9Z*LOLtv#$?rL!lgJy6&69&