diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5b6a065
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
diff --git a/Assignment6.Rmd b/Assignment6.Rmd
index 8e65135..1eaa4b9 100644
--- a/Assignment6.Rmd
+++ b/Assignment6.Rmd
@@ -25,7 +25,7 @@ library(rpart)
 #Upload the data sets MOOC1.csv and MOOC2.csv
 M1 <- read.csv("MOOC1.csv", header = TRUE)
 
-M2 <-
+M2 <- read.csv("MOOC2.csv", header = TRUE)
 
 ```
 
@@ -33,15 +33,17 @@ M2 <-
 ```{r}
 #Using the rpart package generate a classification tree predicting certified from the other variables in the M1 data frame. Which variables should you use?
 
-c.tree1 <-
+c.tree1 <- rpart(certified ~ forum.posts + grade + assignment, method = "class", data = M1)
 
 #Check the results from the classifcation tree using the printcp() command
-
+printcp(c.tree1)
 
 
 #Plot your tree
 
 post(c.tree1, file = "tree1.ps", title = "MOOC") #This creates a pdf image of the tree
+rpart.plot::rpart.plot(c.tree1, type = 3, box.palette = c("red", "green"), fallen.leaves = TRUE) #Styled plot of the full tree
+rpart.plot::rpart.plot(c.tree1) #Same tree with the default rpart.plot layout
 
 ```
 
@@ -53,10 +55,16 @@ post(c.tree1, file = "tree1.ps", title = "MOOC") #This creates a pdf image of th
 
 ```{r}
-c.tree2 <- prune(c.tree1, cp = )#Set cp to the level at which you want the tree to end
+#Set cp to the level at which you want the tree to end
+plotcp(c.tree1) #Visualise the cross-validation results for each cp value
+printcp(c.tree1)
+rpart.plot::rpart.plot(c.tree1)
+c.tree2 <- prune(c.tree1, cp = 0.058182) #cp presumably read off the printcp() table above - re-check if the data change
 
 #Visualize this tree and compare it to the one you generated earlier
 
 post(c.tree2, file = "tree2.ps", title = "MOOC") #This creates a pdf image of the tree
+rpart.plot::rpart.plot(c.tree2)
+printcp(c.tree2)
 ```
 
 #Now use both the original tree and the pruned tree to make predictions about the the students in the second data set. Which tree has a lower error rate?
@@ -69,7 +77,9 @@ M2$predict2 <- predict(c.tree2, M2, type = "class")
 table(M2$certified, M2$predict1)
 
 table(M2$certified, M2$predict2)
-
+#Share of M2 students each tree classifies correctly (1 - error rate)
+mean(M2$certified == M2$predict1)
+mean(M2$certified == M2$predict2)
 ```
 
 ##Part III
@@ -77,6 +87,30 @@ table(M2$certified, M2$predict2)
 Choose a data file from the (University of Michigan Open Data Set)[https://github.com/bkoester/PLA/tree/master/data]. Choose an outcome variable that you would like to predict. Build two models that predict that outcome from the other variables. The first model should use raw variables, the second should feature select or feature extract variables from the data. Which model is better according to the cross validation metrics?
 
 ```{r}
+#Model 1 (raw variables): keep columns 4-13 and drop rows with missing values
+D3 <- read.csv("student.record.csv", header = TRUE)
+D3 <- D3[, 4:13]
+D3 <- na.omit(D3)
+c.tree3 <- rpart(as.factor(SEX) ~ ., method = "class", data = D3)
+printcp(c.tree3)
+#Model 2 (extracted features): row sums of the ACT and SAT column groups
+HSGPA <- D3[, 1]
+SEX <- D3[, 10]
+ACT <- D3[, 2:6]
+ACT$ACTscore <- rowSums(ACT)
+SAT <- D3[, 7:9]
+SAT$SATscore <- rowSums(SAT)
+D4 <- cbind(HSGPA, ACT, SAT, SEX)
+c.tree4 <- rpart(as.factor(SEX) ~ ACTscore + SATscore + HSGPA, method = "class", data = D4)
+printcp(c.tree4)
+#Predict with both trees so the two models are compared on the same rows
+D4$predict1 <- predict(c.tree3, D4, type = "class")
+D4$predict2 <- predict(c.tree4, D4, type = "class")
+table(D4$SEX, D4$predict1)
+table(D4$SEX, D4$predict2)
+#Accuracy on the training rows; the xerror column of the printcp() tables is the cross-validated error
+mean(D4$SEX == D4$predict1)
+mean(D4$SEX == D4$predict2)
 
 ```
 
diff --git a/assignment6.Rproj b/assignment6.Rproj
new file mode 100644
index 0000000..8e3c2eb
--- /dev/null
+++ b/assignment6.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/tree1.ps b/tree1.ps
new file mode 100644
index 0000000..1a36db2
Binary files /dev/null and b/tree1.ps differ