diff --git a/Assignment 4.Rmd b/Assignment 4.Rmd index 54b0e66..d840d16 100644 --- a/Assignment 4.Rmd +++ b/Assignment 4.Rmd @@ -1,5 +1,5 @@ --- -title: "Assignment 4: K Means Clustering" +title: 'Assignment 4: K Means Clustering' --- In this assignment we will be applying the K-means clustering algorithm we looked at in class. At the following link you can find a description of K-means: @@ -8,13 +8,17 @@ https://www.cs.uic.edu/~wilkinson/Applets/cluster.html ```{r} -library() +library(dplyr) +library(tidyr) +library(ggplot2) ``` Now, upload the file "Class_Motivation.csv" from the Assignment 4 Repository as a data frame called "K1"" ```{r} -K1 <- read.csv(...) +K1 <- read.csv("Class_Motivation.csv", header = TRUE) +K1b <- gather(K1, week, measure, 2:6) +plot(as.factor(K1b$week), K1b$measure) ``` @@ -26,7 +30,7 @@ The algorithm will treat each row as a value belonging to a person, so we need t ```{r} -K2 <- +K2 <- select(K1, 2:6) ``` @@ -41,13 +45,17 @@ We will remove people with missing values for this assignment, but keep in mind K3 <- na.omit(K2) #This command create a data frame with only those people with no missing values. It "omits" all rows with missing values, also known as a "listwise deletion". EG - It runs down the list deleting rows as it goes. +K3 <- K2 + +K3[is.na(K3)] <- 0 + ``` Another pre-processing step used in K-means is to standardize the values so that they have the same range. We do this because we want to treat each week as equally important - if we do not standardise then the week with the largest range will have the greatest impact on which clusters are formed. We standardise the values by using the "scale()" command. ```{r} -K3 <- +K3 <- scale(K3) ``` @@ -66,7 +74,13 @@ Also, we need to choose the number of clusters we think are in the data. We will ```{r} -fit <- + +fit1a <- kmeans(K3, 2) +fit1b <- kmeans(K3, 2) +fit1c <- kmeans(K3, 2) + +fit1a$cluster + #We have created an object called "fit" that contains all the details of our clustering including which observations belong to each cluster. @@ -76,10 +90,25 @@ fit <- #We can also attach these clusters to the original dataframe by using the "data.frame" command to create a new data frame called K4. -K4 +K4 <- data.frame(K3, fit1a$cluster, fit1b$cluster, fit1c$cluster) + +fit1a$withinss +fit1b$withinss +fit1c$withinss + +fit1a$tot.withinss +fit1b$tot.withinss +fit1c$tot.withinss + +fit1a$betweenss +fit1b$betweenss +fit1c$betweenss + +K4 <- data.frame(K3, fit1c$cluster) #Have a look at the K4 dataframe. Lets change the names of the variables to make it more convenient with the names() command. +names(K4) <- c("1", "2", "3", "4", "5", "cluster") ``` @@ -113,9 +142,9 @@ Likewise, since "cluster" is not numeric but rather a categorical label we want ```{r} -K6$week <- +K6$week <- as.numeric(K6(week)) -K6$cluster <- +K6$cluster <- as.factor(K6$cluster) ``` @@ -134,28 +163,93 @@ ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab What patterns do you see in the plot? - +#my code kept returning errors...however, looking at the plot in Charles' video, I can see the trend increasing after week 3. Before week 3 was a tremendous decline. It would be useful to determine how many people are in each cluster. We can do this easily with dplyr. ```{r} -K7 <- count(K4, cluster) +K7 <- dplyr::count(K4, cluster) ``` Look at the number of people in each cluster, now repeat this process for 3 rather than 2 clusters. Which cluster grouping do you think is more informative? Write your answer below: +##my code was running errors, but 3 clusters is probably more informative due to the increase in data and categories. + ##Part II Using the data collected in the HUDK4050 entrance survey (HUDK4050-cluster.csv) use K-means to cluster the students first according location (lat/long) and then according to their answers to the questions, each student should belong to two clusters. +```{r} +library(tidyverse) +M1 <- read.csv("HUDK405020-cluster.csv", header = TRUE) +M2 <- select(M1, 4:9) +M2[M2==""] <- NA +M2 <- M@ %>% mutate_all(funs(as.numeric(levels(.))[.])) +M2 <- na.omit(M2) + +fit2a <- kmeans(M2, 1) +fit2b <- kmeans(M2, 2) +fit2c <- kmeans(M2, 3) +fit2d <- kmeans(M2, 4) +fit2e <- kmeans(M2, 5) +fit2f <- kmeans(M2, 6) +fit2g <- kmeans(M2, 7) + +mss <- c(fit2a$tot.withinss, fit2b$tot.withinss, fit2c$tot.withinss, fit2d$tot.withinss, fit2e$tot.withinss, fit2f$tot.withinss, fit2g$tot.withinss, fit2a$betweenss, fit2b$betweenss, fit2c$betweenss, fit2d$betweenss, fit2e$betweenss, fit2f$betweenss, fit2g$betweenss) + +cluster <- c(seq(1,7,1),seq(1,7,1)) +col <- c(rep("blue",7), rep("rep",7)) + +plot(clusters, mss, col= col) + +L1 <- select(M1, 2:3) + +plot(L1$long, L1$lat) +fit3a <- kmeans(L1, 2) +fit3b <- kmeans(L1, 2) +fit3c <- kmeans(L1, 2) + +fit3a$tot.withinss +fit3b$tot.withinss +fit3c$tot.withinss ##Part III Create a visualization that shows the overlap between the two clusters each student belongs to in Part II. IE - Are there geographical patterns that correspond to the answers? ```{r} - +DF <- data.frame(table(ML$fit2c.cluster,ML$fit3a.cluster)) +ML2 <- ML %>% group_by(fit2c.cluster, fit3a.cluster, size = count)) + geom_point() +library(vcd) +P1 <- structable(fit2c$cluster ~ fit3a$cluster) +mosaic(P1, shade=TRUE, legend=TRUE) ``` +```{r} +D1$compare.features <- ifelse(D1$compare.features == "100% of the time", 100, + ifelse(D1$compare.features == "75% of the time", 75, + ifelse(D1$compare.features == "50% of the time", 50, + ifelse(D1$compare.features == "25% of the time", 25,0)))) + +D1$planner.use <- ifelse(D1$planner.use == "100% of the time", 100 + ifelse(D1$planner.use == "75% of the time", 75, + ifelse(D1$planner.use == "50% of the time", 50, + ifelse(D1$planner.use == "25% of the time", 25,0)))) + +D1$enjoy.discuss <- ifelse(D1$enjoy.discuss == "Yes, all the time", 100, + ifelse(D1$enjoy.discuss == "Most of the time", 75, + ifelse(D1$enjoy.discuss == "About half the time", 50, + ifelse(D1$enjoy.discuss == "Rarely", 25,0)))) + +D1$enjoy.group <- ifelse(D1$enjoy.group == "Yes, all the time", 100, + ifelse(D1$enjoy.group == "Most of the time", 75, + ifelse(D1$enjoy.group == "About half the time", 50, + ifelse(D1$enjoy.group == "Rarely", 25,0)))) + +D1$meet.deadline <- ifelse(D1$meet.deadline == "Yes, all the time", 100, + ifelse(D1$meet.deadline == "Most of the time", 75, + ifelse(D1$meet.deadline == "About half the time", 50, + ifelse(D1$meet.deadline == "Rarely", 25,0)))) + ## Please render your code as an .html file using knitr and Pull Resquest both your .Rmd file and .html files to the Assignment 3 repository.