diff --git a/Assignment 4.Rmd b/Assignment 4.Rmd
index 54b0e66..12e902a 100644
--- a/Assignment 4.Rmd
+++ b/Assignment 4.Rmd
@@ -1,5 +1,8 @@
 ---
-title: "Assignment 4: K Means Clustering"
+title: 'Assignment 4: K Means Clustering'
+output:
+  html_document:
+    df_print: paged
 ---
 In this assignment we will be applying the K-means clustering algorithm we looked at in class. At the following link you can find a description of K-means:
@@ -8,14 +11,19 @@ https://www.cs.uic.edu/~wilkinson/Applets/cluster.html
 ```{r}
-library()
+library(dplyr)
+library(tidyr)
+library(ggplot2)
 ```
 Now, upload the file "Class_Motivation.csv" from the Assignment 4 Repository as a data frame called "K1""
 ```{r}
-K1 <- read.csv(...)
+K1 <- read.csv("Class_Motivation.csv", header = TRUE)
+K1b <- pivot_longer(K1, 2:6, names_to = "week", values_to = "measure")  # long format: one row per original row and week column
+# Box plot of the scores reported in each week
+plot(as.factor(K1b$week), K1b$measure)
 ```
 This file contains the self-reported motivation scores for a class over five weeks. We are going to look for patterns in motivation over this time and sort people into clusters based on those patterns.
@@ -26,7 +34,7 @@ The algorithm will treat each row as a value belonging to a person, so we need t
 ```{r}
-K2 <-
+K2 <- select(K1, 2:6)  # keep columns 2-6 only (column 1 is dropped)
 ```
@@ -41,13 +49,16 @@ We will remove people with missing values for this assignment, but keep in mind
 K3 <- na.omit(K2) #This command create a data frame with only those people with no missing values. It "omits" all rows with missing values, also known as a "listwise deletion". EG - It runs down the list deleting rows as it goes.
+K3 <- K2  # NOTE(review): this discards the na.omit() result from the line above
+# Rows with missing scores are kept and their NAs set to 0 instead of being dropped - confirm this is intended
+K3[is.na(K3)] <- 0
 ```
 Another pre-processing step used in K-means is to standardize the values so that they have the same range. We do this because we want to treat each week as equally important - if we do not standardise then the week with the largest range will have the greatest impact on which clusters are formed. We standardise the values by using the "scale()" command.
```{r}
-K3 <-
+K3 <- scale(K3)  # standardise every column; scale() returns a numeric matrix
 ```
@@ -66,21 +77,37 @@ Also, we need to choose the number of clusters we think are in the data. We will
 ```{r}
-fit <-
+set.seed(123)  # kmeans() picks random starting centres; fix the RNG so every knit gives the same clusters
+fit1a <- kmeans(K3, 2)
+fit1b <- kmeans(K3, 2)
+fit1c <- kmeans(K3, 2)
 #We have created an object called "fit" that contains all the details of our clustering including which observations belong to each cluster.
 #We can access the list of clusters by typing "fit$cluster", the top row corresponds to the original order the rows were in. Notice we have deleted some rows.
-
+fit1a$cluster
 #We can also attach these clusters to the original dataframe by using the "data.frame" command to create a new data frame called K4.
-K4
+K4 <- data.frame(K3, fit1a$cluster, fit1b$cluster, fit1c$cluster)
-#Have a look at the K4 dataframe. Lets change the names of the variables to make it more convenient with the names() command.
+# Compare the three runs: per-cluster, total within-cluster and between-cluster sums of squares
+fit1a$withinss
+fit1b$withinss
+fit1c$withinss
+fit1a$tot.withinss
+fit1b$tot.withinss
+fit1c$tot.withinss
+fit1a$betweenss
+fit1b$betweenss
+fit1c$betweenss
+
+K4 <- data.frame(K3, fit1c$cluster)
+#Have a look at the K4 dataframe. Lets change the names of the variables to make it more convenient with the names() command.
+names(K4) <- c("1", "2", "3", "4", "5", "cluster")
 ```
 Now we need to visualize the clusters we have created. To do so we want to play with the structure of our data. What would be most useful would be if we could visualize average motivation by cluster, by week. To do this we will need to convert our data from wide to long format. Remember your old friends tidyr and dplyr!
@@ -95,7 +122,7 @@ Now lets use dplyr to average our motivation values by week and by cluster.
```{r}
-K6 <- K5 %>% group_by(week, cluster) %>% summarise(K6, avg = mean(motivation))
+K6 <- K5 %>% group_by(week, cluster) %>% summarise(avg = mean(motivation)) %>% ungroup()  # drop leftover grouping
 ```
@@ -113,9 +140,9 @@ Likewise, since "cluster" is not numeric but rather a categorical label we want
 ```{r}
-K6$week <-
+K6$week <- as.numeric(K6$week)
-K6$cluster <-
+K6$cluster <- as.factor(K6$cluster)
 ```
@@ -148,12 +175,61 @@ Look at the number of people in each cluster, now repeat this process for 3 rath
 Using the data collected in the HUDK4050 entrance survey (HUDK4050-cluster.csv) use K-means to cluster the students first according location (lat/long) and then according to their answers to the questions, each student should belong to two clusters.
+```{r}
+library(tidyverse)
+# NOTE(review): the prompt above names "HUDK4050-cluster.csv"; confirm the file name read here
+M1 <- read.csv("HUDK405020-cluster.csv", header = TRUE)
+# Columns 4-9: presumably the six survey answers collected into ML below - TODO confirm
+M2 <- select(M1, 4:9)
+set.seed(123)  # kmeans() picks random starting centres; fix the RNG so every knit gives the same clusters
+fit2a <- kmeans(M2, 1)
+fit2b <- kmeans(M2, 2)
+fit2c <- kmeans(M2, 3)
+fit2d <- kmeans(M2, 4)
+fit2e <- kmeans(M2, 5)
+fit2f <- kmeans(M2, 6)
+fit2g <- kmeans(M2, 7)
+fits2 <- list(fit2a, fit2b, fit2c, fit2d, fit2e, fit2f, fit2g)
+mss <- c(vapply(fits2, function(f) f$tot.withinss, numeric(1)),
+         vapply(fits2, function(f) f$betweenss, numeric(1)))
+clusters <- rep(seq_len(7), times = 2)
+col <- rep(c("blue", "red"), each = 7)
+# Blue points: total within-cluster SS; red points: between-cluster SS (k = 1 to 7)
+plot(clusters, mss, col = col)
+
+L1 <- select(M1, 2:3)
+
+plot(L1$long, L1$lat)
+
+fit3a <- kmeans(L1, 2)
+fit3b <- kmeans(L1, 2)
+fit3c <- kmeans(L1, 2)
+
+fit3a$tot.withinss
+fit3b$tot.withinss
+fit3c$tot.withinss
+ML <- data.frame(M1$compare.features, M1$math.accuracy, M1$planner.use,
+                 M1$enjoy.discuss, M1$enjoy.group, M1$meet.deadline,
+                 fit2c$cluster, M1$lat, M1$long, fit3a$cluster)
+pairs(ML)
+```
+
 ##Part III
 Create a visualization that shows the overlap between the two clusters each student belongs to in Part II. IE - Are there geographical patterns that correspond to the answers?
```{r}
+table(ML$fit2c.cluster, ML$fit3a.cluster)
+# Number of students in each (answer cluster, location cluster) pair; count() returns ungrouped data
+ML2 <- ML %>% count(fit2c.cluster, fit3a.cluster, name = "count")
+
+ggplot(ML2, aes(x = fit2c.cluster, y = fit3a.cluster, size = count)) +
+  geom_point()
+if (!requireNamespace("vcd", quietly = TRUE)) install.packages("vcd")  # install only when missing, not on every knit
+library(vcd)
+P1 <- structable(fit2c$cluster ~ fit3a$cluster)
+mosaic(P1, shade = TRUE, legend = TRUE)
 ```
diff --git a/Assignment-4_files/figure-html/unnamed-chunk-10-1.png b/Assignment-4_files/figure-html/unnamed-chunk-10-1.png
new file mode 100644
index 0000000..5013f08
Binary files /dev/null and b/Assignment-4_files/figure-html/unnamed-chunk-10-1.png differ
diff --git a/Assignment-4_files/figure-html/unnamed-chunk-12-1.png b/Assignment-4_files/figure-html/unnamed-chunk-12-1.png
new file mode 100644
index 0000000..d53b57c
Binary files /dev/null and b/Assignment-4_files/figure-html/unnamed-chunk-12-1.png differ
diff --git a/Assignment-4_files/figure-html/unnamed-chunk-12-2.png b/Assignment-4_files/figure-html/unnamed-chunk-12-2.png
new file mode 100644
index 0000000..4beb0c4
Binary files /dev/null and b/Assignment-4_files/figure-html/unnamed-chunk-12-2.png differ
diff --git a/Assignment-4_files/figure-html/unnamed-chunk-12-3.png b/Assignment-4_files/figure-html/unnamed-chunk-12-3.png
new file mode 100644
index 0000000..b803e55
Binary files /dev/null and b/Assignment-4_files/figure-html/unnamed-chunk-12-3.png differ
diff --git a/Assignment-4_files/figure-html/unnamed-chunk-2-1.png b/Assignment-4_files/figure-html/unnamed-chunk-2-1.png
new file mode 100644
index 0000000..ad6c8d1
Binary files /dev/null and b/Assignment-4_files/figure-html/unnamed-chunk-2-1.png differ