From bab570975ce422dc2f8a1f09503595c64d54b1a7 Mon Sep 17 00:00:00 2001 From: berjakian Date: Sat, 14 Nov 2020 17:40:25 -0500 Subject: [PATCH 1/2] commit --- .gitignore | 4 + Assignment 4.Rmd | 324 ++++++++++++++++++++++++++++++++++++++++++++-- assignment4.Rproj | 13 ++ desktop.ini | Bin 0 -> 244 bytes 4 files changed, 329 insertions(+), 12 deletions(-) create mode 100644 .gitignore create mode 100644 assignment4.Rproj create mode 100644 desktop.ini diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/Assignment 4.Rmd b/Assignment 4.Rmd index 54b0e66..60c12ad 100644 --- a/Assignment 4.Rmd +++ b/Assignment 4.Rmd @@ -1,5 +1,7 @@ --- title: "Assignment 4: K Means Clustering" +author: "Berj Akian" +date: "11/1/2020" --- In this assignment we will be applying the K-means clustering algorithm we looked at in class. At the following link you can find a description of K-means: @@ -8,13 +10,30 @@ https://www.cs.uic.edu/~wilkinson/Applets/cluster.html ```{r} -library() +library(tidyverse) ``` Now, upload the file "Class_Motivation.csv" from the Assignment 4 Repository as a data frame called "K1"" ```{r} -K1 <- read.csv(...) +#read in the data +K1 <- read.csv("Class_Motivation.csv") + +#check out the data +class(K1) +dim(K1) +colnames(K1) +rownames(K1) +head(K1) +summary(K1) + +#finding duplicate id's and adding up the number of dups... BAM! +K1idtest <- count(group_by(K1,id)) +K1idtest +#show just the ids that are dupped +K1idtest <- subset(K1idtest,n!=1) +K1idtest + ``` @@ -26,28 +45,59 @@ The algorithm will treat each row as a value belonging to a person, so we need t ```{r} -K2 <- +#super sloppy way of dealing with the duplicate id's... i left them in the dataset but added a unique suffix to all id's (i would have preferred to add suffixes to only the dup id's, e.g. 01, 02, 03..., but i dont know how to elegantly do that) +K1b <- K1 +K1b$id2 <- sample(1:38, replace=FALSE) +K1c <- K1b %>% unite(newid, id, id2, sep = "-") + + +#create a new copy of the data.frame with the id column as the rowname +K2 <- K1c[,-1] +rownames(K2) <- K1c[,1] + + +#count rows that have NA... found this https://sebastiansauer.github.io/sum-isna/ +#still deciding which i like best... would have +K2.is.na1 <- sapply(K2, function(x) sum(is.na(x))) +K2.is.na1 + +K2.is.na2 <- map(K2, ~sum(is.na(.))) +K2.is.na2 + + +K2.is.na3 <- K2 %>% + summarise_all(funs(sum(is.na(.)))) +K2.is.na3 + +K2.is.na4 <- apply(K2, MARGIN = 1, function(x) sum(is.na(x))) +K2.is.na4 + ``` It is important to think about the meaning of missing values when clustering. We could treat them as having meaning or we could remove those people who have them. Neither option is ideal. What problems do you foresee if we recode or remove these values? Write your answers below: +#i would have preferred to ask the person who gave me the data to help understand the reason for the dup id's and maybe try to identify which was the most recent if i was to delete them... for now, i sloppily left them in the dataframe and just appended a suffix to each of them so they are unique. also, i would have preferred to understand the na's... i simply ended up deleting any row in the data that had an na. resulted in deleting 15 rows. + + + We will remove people with missing values for this assignment, but keep in mind the issues that you have identified. 
```{r} - K3 <- na.omit(K2) #This command create a data frame with only those people with no missing values. It "omits" all rows with missing values, also known as a "listwise deletion". EG - It runs down the list deleting rows as it goes. + ``` Another pre-processing step used in K-means is to standardize the values so that they have the same range. We do this because we want to treat each week as equally important - if we do not standardise then the week with the largest range will have the greatest impact on which clusters are formed. We standardise the values by using the "scale()" command. ```{r} +#scale the data -K3 <- +K3.scale <- scale(K3) ``` @@ -66,19 +116,20 @@ Also, we need to choose the number of clusters we think are in the data. We will ```{r} -fit <- +Kfit <- kmeans(K3.scale,centers = 2) #We have created an object called "fit" that contains all the details of our clustering including which observations belong to each cluster. #We can access the list of clusters by typing "fit$cluster", the top row corresponds to the original order the rows were in. Notice we have deleted some rows. - +Kfit$cluster #We can also attach these clusters to the original dataframe by using the "data.frame" command to create a new data frame called K4. -K4 +K4 <- data.frame(K3,Kfit$cluster) #Have a look at the K4 dataframe. Lets change the names of the variables to make it more convenient with the names() command. +names(K4) <- c('1','2','3','4','5','cluster') ``` @@ -95,7 +146,7 @@ Now lets use dplyr to average our motivation values by week and by cluster. ```{r} -K6 <- K5 %>% group_by(week, cluster) %>% summarise(K6, avg = mean(motivation)) +K6 <- K5 %>% group_by(week, cluster) %>% summarise(avg = mean(motivation)) ``` @@ -113,9 +164,9 @@ Likewise, since "cluster" is not numeric but rather a categorical label we want ```{r} -K6$week <- +K6$week <- as.numeric(K6$week) -K6$cluster <- +K6$cluster <- as.factor(K6$cluster) ``` @@ -128,32 +179,281 @@ Now we can plot our line plot using the ggplot command, "ggplot()". ```{r} -ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation") +ggplot(K6, aes(x = week, y = avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation") ``` What patterns do you see in the plot? +## I see the following patterns in the top and bottom lines (i reference them 'top' and 'bottom' because each time i run kmeans it randomly assigns them labels '1' and '2')... +#1. Both cluster groups eventually trend upwards, the bottom line after week 3, the top line after week 4 (when i ran kmeans several times, i noticed the upward pitch in latter weeks varied in it's upward slope) +#2. The bottom line cluster had more variance in their motivation from week to week as compared to top one (again this changed each time i ran kmeans) +#Note: Remember that these interpretations are skewed because the original dataset contained multiple observations where the id was the same and these observations were not removed... also, 8 rows of data that had missing motivation survey values were removed. + It would be useful to determine how many people are in each cluster. We can do this easily with dplyr. ```{r} K7 <- count(K4, cluster) +K7 ``` +##Noteworthy... In a two cluster grouping... cluster 1 has almost twice the population of cluster 2 + + Look at the number of people in each cluster, now repeat this process for 3 rather than 2 clusters. Which cluster grouping do you think is more informative? 
Write your answer below: +```{r} +#and then there were 3 clusters... +Kfit <- kmeans(K3.scale,centers = 3) +str(Kfit$cluster) + +K4 <- data.frame(K3,Kfit$cluster) + +names(K4) <- c('1','2','3','4','5','cluster') + +K5 <- gather(K4, "week", "motivation", 1:5) + +K6 <- K5 %>% group_by(week, cluster) %>% summarise(avg = mean(motivation)) + +K6$week <- as.numeric(K6$week) + +K6$cluster <- as.factor(K6$cluster) + +K7 <- count(K4, cluster) +K7 + +ggplot(K6, aes(x = week, y = avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation") + +``` + +# I feel 3 clusters is better... the populations are more balanced in this three cluster grouping with 7 to 9 observations each. Also, it's clear that cluster 1 had essentially no change in motivation... i'm glad to identify them separately. +#Note: Remember that these interpretations are skewed because the original dataset contained multiple observations where the id was the same and these observations were not removed... also, 8 rows of data that had missing motivation survey values were removed. + + + +```{r} +#hmmmm.... why not... +maxclusters <- unique(K3.scale) %>% nrow +halfthemaxclusters <- maxclusters/2 + +Kfit <- kmeans(K3.scale,centers = halfthemaxclusters) +str(Kfit$cluster) + +Kfit$cluster + +K4 <- data.frame(K3,Kfit$cluster) + +names(K4) <- c('1','2','3','4','5','cluster') + +K5 <- gather(K4, "week", "motivation", 1:5) + +K6 <- K5 %>% group_by(week, cluster) %>% summarise(avg = mean(motivation)) + +K6$week <- as.numeric(K6$week) + +K6$cluster <- as.factor(K6$cluster) + +K7 <- count(K4, cluster) +K7 + +ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation") + +``` + +## Is MaxClusters/2 clusters better? +# I found this to be helpful in identifying clusters where there was no change... if i were better at r, i would have put some interactivity in this graph to filter out certain clusters, e.g. the ones with no change... and what's with the cluster of survey respondents that voted '1' every week, how unhappy are they?!?. +#Note: Remember that these interpretations are skewed because the original dataset contained multiple observations where the id was the same and these observations were not removed... also, 8 rows of data that had missing motivation survey values were removed. + + + + + ##Part II Using the data collected in the HUDK4050 entrance survey (HUDK4050-cluster.csv) use K-means to cluster the students first according location (lat/long) and then according to their answers to the questions, each student should belong to two clusters. + +```{r} + +#load +HD1 <- read.csv("HUDK405020-cluster.csv") + +#check out the data +class(HD1) +dim(HD1) +colnames(HD1) +rownames(HD1) +head(HD1) +summary(HD1) + + +#look for duplicate id's +HD1idtest <- count(group_by(HD1,id)) +HD1idtest <- subset(HD1idtest,n!=1) +HD1dupids <- sum(HD1idtest$n) - nrow(HD1idtest) +HD1dupids +#yay, none found + + +#move id column to rowname +HD2 <- HD1 +rownames(HD2) <- HD1[,1] +HD2 <- HD1[,2:9] +nrow(HD2) + +#look for na's +HD2.is.na <- HD1 %>% + summarise_all(funs(sum(is.na(.)))) +HD2.is.na +#yay, none found + +#remove rows with na's +#HD3 <- na.omit(HD2) +#nrow(HD3) +HD3 <- HD2 + +#create table of lat/lon +HD3LatLon <-HD3[,1:2] +HD3LatLon <- as.matrix(HD3LatLon) +class(HD3LatLon) + +#create table of survey answers +HD3Survey <-HD3[,3:8] +HD3Survey <- as.matrix(HD3Survey) +class(HD3Survey) + +#scaling: this is not appropriate on lat/lon data (i think?) 
+#scale(HD3LatLon) + +#scaling: in looking at the summary of survey, scaling does not seem necessary as all the data is within the same scale already... 0-100 +summary(HD3Survey) +#scale(HD3Survey) + + + + + +``` + + +```{r} + +##HD3LatLon...explore how many clusters are needed using elbow plot + +#first determine how many unique row values in the data... this is max number of k's +unique(HD3LatLon) +#42 + +# Use map_dbl to run many models with varying value of k (centers) +tot_withinss <- map_dbl(1:42, function(k){ + model <- kmeans(x = HD3LatLon, centers = k) + model$tot.withinss +}) + +# Generate a data frame containing both k and tot_withinss +elbow_df <- data.frame( + k = 1:42, + tot_withinss = tot_withinss +) + +# Plot the elbow plot +ggplot(elbow_df, aes(x = k, y = tot_withinss)) + + geom_line() + + scale_x_continuous(breaks = 1:42) + +#the elbow graph shows 2 distinct clusters for k... i'll use 2 clusters for LatLon table + +#occurs to me after watching the workout video again, i could have skipped the elbow plot and just done a pairs plot and visually observe that 2 clusters would be fine +pairs(HD3LatLon) + +``` +```{r} + + +##HD3Survey...explore how many clusters are needed using elbow plot + +#first determine how many unique row values in the data... this is max number of k's +unique(HD3Survey) +#73 + +# Use map_dbl to run many models with varying value of k (centers) +tot_withinss <- map_dbl(1:20, function(k){ + model <- kmeans(x = HD3Survey, centers = k) + model$tot.withinss +}) + +# Generate a data frame containing both k and tot_withinss +elbow_df <- data.frame( + k = 1:20, + tot_withinss = tot_withinss +) + +# Plot the elbow plot +ggplot(elbow_df, aes(x = k, y = tot_withinss)) + + geom_line() + + scale_x_continuous(breaks = 1:20) + +#yuk, at 73 k's, the elbow graph shows no distinct clusters for k + +#i reran at 20 k's a bunch of times, the elbow graph shows a turn at 5 distinct clusters for k... i'll use 5 clusters for Survey table + + +#occurs to me after watching the workout video again, i could done a pairs plot and visually observe the data... likely because there are 5 potential values in the data... the pairs visual leans towards 5 clusters +pairs(HD3Survey) + +``` + + +```{r} + +#kmeans clusters for both LatLon and Survey matrixes +HD3LatLonFit <- kmeans(HD3LatLon,centers = 2) +HD3LatLonFit$cluster + +HD3SurveyFit <- kmeans(HD3Survey,centers = 5) +HD3SurveyFit$cluster + +#add the cluster information back to the original table +HD4 <-mutate(HD3, LatLonCluster = HD3LatLonFit$cluster, SurveyCluster = HD3SurveyFit$cluster) + + +``` + + ##Part III Create a visualization that shows the overlap between the two clusters each student belongs to in Part II. IE - Are there geographical patterns that correspond to the answers? ```{r} +pairs(HD4) + +#table view... got this from the workout, thank you!... i like it... interesting how that cluster 3 is an outlier... i might go back and do just 4 survey clusters +table(HD3LatLonFit$cluster,HD3SurveyFit$cluster) + + +#scatter plot... got this from the workout, thank you again!... +HD5 <- HD4 %>% group_by(LatLonCluster, SurveyCluster) %>% summarize(count = n()) +ggplot(HD5, aes(x = LatLonCluster, y = SurveyCluster, size = count)) + geom_point() + + + +``` + + + +```{r} +#installing vcd... got this from the workout, yay workouts!... +library(vcd) + +HD6 <- structable(HD3LatLonFit$cluster~HD3SurveyFit$cluster) +mosaic(HD6, shade = TRUE, legend = TRUE) +#that's cool! +#that cluster 3 has got to go! 
+ + ``` diff --git a/assignment4.Rproj b/assignment4.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/assignment4.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/desktop.ini b/desktop.ini new file mode 100644 index 0000000000000000000000000000000000000000..7e2afc92dc9897151144be5543ed5b34819f64d3 GIT binary patch literal 244 zcmY+9I|{;35JgWdxQ8r2LQo44tdvM9MN4B5Lezj7F)A+KlUFD*%$v{sFmFf9mJ=0c zV{R6-M#hx`4SV|Z+&5vaxpN_At8z1hgpw1aH^6tc(g!=FCb4b<2s@~>+q0#;319^feUH||9 literal 0 HcmV?d00001 From afe04cde85ea52ea50bf46b38c6f3fed13a1ddb4 Mon Sep 17 00:00:00 2001 From: berjakian Date: Sat, 14 Nov 2020 17:44:27 -0500 Subject: [PATCH 2/2] commit --- Assignment-4.html | 1122 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1122 insertions(+) create mode 100644 Assignment-4.html diff --git a/Assignment-4.html b/Assignment-4.html new file mode 100644 index 0000000..8673097 --- /dev/null +++ b/Assignment-4.html @@ -0,0 +1,1122 @@ + + + + + + + + + + + + + + +Assignment 4: K Means Clustering + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

In this assignment we will be applying the K-means clustering algorithm we looked at in class. At the following link you can find a description of K-means:

+

https://www.cs.uic.edu/~wilkinson/Applets/cluster.html

+
library(tidyverse)
+
## -- Attaching packages ---------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
+
## v ggplot2 3.3.2     v purrr   0.3.4
+## v tibble  3.0.3     v dplyr   1.0.2
+## v tidyr   1.1.2     v stringr 1.4.0
+## v readr   1.3.1     v forcats 0.5.0
+
## Warning: package 'tidyr' was built under R version 4.0.3
+
## -- Conflicts ------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
+## x dplyr::filter() masks stats::filter()
+## x dplyr::lag()    masks stats::lag()
+

Now, upload the file “Class_Motivation.csv” from the Assignment 4 Repository as a data frame called “K1”

+
#read in the data
+K1 <- read.csv("Class_Motivation.csv")
+
+#check out the data
+class(K1)
+
## [1] "data.frame"
+
dim(K1)
+
## [1] 38  6
+
colnames(K1)
+
## [1] "id"          "motivation1" "motivation2" "motivation3" "motivation4"
+## [6] "motivation5"
+
rownames(K1)
+
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
+## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
+## [31] "31" "32" "33" "34" "35" "36" "37" "38"
+
head(K1)
+
##         id motivation1 motivation2 motivation3 motivation4 motivation5
+## 1 10005216           2           2           2           2           2
+## 2 10033216           3          NA           3          NA          NA
+## 3 10004216           1           2           1           2           2
+## 4 10008216           1           2           1           2          NA
+## 5 10026216           3          NA           3          NA          NA
+## 6 10014216           2          NA           2          NA           2
+
summary(K1)
+
##        id            motivation1     motivation2   motivation3     motivation4 
+##  Min.   :10002216   Min.   :1.000   Min.   :1.0   Min.   :1.000   Min.   :1.0  
+##  1st Qu.:10011716   1st Qu.:1.000   1st Qu.:1.0   1st Qu.:1.000   1st Qu.:1.0  
+##  Median :10018216   Median :1.000   Median :2.0   Median :1.000   Median :2.0  
+##  Mean   :10018663   Mean   :1.605   Mean   :1.8   Mean   :1.605   Mean   :1.8  
+##  3rd Qu.:10025966   3rd Qu.:2.000   3rd Qu.:2.0   3rd Qu.:2.000   3rd Qu.:2.0  
+##  Max.   :10035216   Max.   :3.000   Max.   :3.0   Max.   :3.000   Max.   :3.0  
+##                                     NA's   :8                     NA's   :8    
+##   motivation5   
+##  Min.   :1.000  
+##  1st Qu.:2.000  
+##  Median :2.000  
+##  Mean   :2.538  
+##  3rd Qu.:3.000  
+##  Max.   :5.000  
+##  NA's   :12
+
#finding duplicate id's and adding up the number of dups... BAM!
+K1idtest <- count(group_by(K1,id)) 
+K1idtest
+
## # A tibble: 30 x 2
+## # Groups:   id [30]
+##          id     n
+##       <int> <int>
+##  1 10002216     1
+##  2 10003216     1
+##  3 10004216     1
+##  4 10005216     1
+##  5 10006216     1
+##  6 10007216     1
+##  7 10008216     1
+##  8 10009216     1
+##  9 10010216     1
+## 10 10011216     1
+## # ... with 20 more rows
+
#show just the ids that are dupped
+K1idtest <- subset(K1idtest,n!=1)
+K1idtest
+
## # A tibble: 2 x 2
+## # Groups:   id [2]
+##         id     n
+##      <int> <int>
+## 1 10018216     8
+## 2 10028216     2
+
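A more compact way to get at the same duplicates, as a small base-R sketch (same K1 data frame; not part of the original workflow):

```{r}
# a minimal alternative check: list the ids that appear more than once,
# and how many rows each duplicated id accounts for
dup_ids <- unique(K1$id[duplicated(K1$id)])
dup_ids
table(K1$id[K1$id %in% dup_ids])
```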

This file contains the self-reported motivation scores for a class over five weeks. We are going to look for patterns in motivation over this time and sort people into clusters based on those patterns.

+

But before we do that, we will need to manipulate the data frame into a structure that can be analyzed by our clustering algorithm.

+

The algorithm will treat each row as a value belonging to a person, so we need to remove the id variable.

+
#super sloppy way of dealing with the duplicate id's... i left them in the dataset but added a unique suffix to all id's (i would have preferred to add suffixes only to the dup id's, e.g. 01, 02, 03..., but i don't know how to do that elegantly - there's a sketch of one way further down)
+K1b <- K1
+K1b$id2 <- sample(1:38,  replace=FALSE)
+K1c <- K1b %>% unite(newid, id, id2, sep = "-")
+
+
+#create a new copy of the data.frame with the id column as the rowname
+K2 <- K1c[,-1]
+rownames(K2) <- K1c[,1]
+
+
+#count rows that have NA... found this https://sebastiansauer.github.io/sum-isna/
+#still deciding which of these approaches i like best
+K2.is.na1 <- sapply(K2, function(x) sum(is.na(x)))
+K2.is.na1
+
## motivation1 motivation2 motivation3 motivation4 motivation5 
+##           0           8           0           8          12
+
K2.is.na2 <- map(K2, ~sum(is.na(.)))
+K2.is.na2
+
## $motivation1
+## [1] 0
+## 
+## $motivation2
+## [1] 8
+## 
+## $motivation3
+## [1] 0
+## 
+## $motivation4
+## [1] 8
+## 
+## $motivation5
+## [1] 12
+
K2.is.na3 <- K2 %>%
+  summarise_all(funs(sum(is.na(.)))) 
+
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
+## Please use a list of either functions or lambdas: 
+## 
+##   # Simple named list: 
+##   list(mean = mean, median = median)
+## 
+##   # Auto named with `tibble::lst()`: 
+##   tibble::lst(mean, median)
+## 
+##   # Using lambdas
+##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
+## This warning is displayed once every 8 hours.
+## Call `lifecycle::last_warnings()` to see where this warning was generated.
+
K2.is.na3
+
##   motivation1 motivation2 motivation3 motivation4 motivation5
+## 1           0           8           0           8          12
+
K2.is.na4 <- apply(K2, MARGIN = 1, function(x) sum(is.na(x)))
+K2.is.na4
+
## 10005216-11  10033216-8 10004216-15  10008216-3 10026216-23 10014216-13 
+##           0           3           0           1           3           2 
+##  10021216-2 10013216-19 10035216-14 10015216-25 10031216-16 10007216-31 
+##           0           2           1           0           1           0 
+## 10010216-32 10020216-28 10002216-37  10011216-1 10023216-35 10028216-27 
+##           1           0           0           1           0           0 
+## 10028216-12 10034216-29 10003216-24  10029216-9 10022216-33 10006216-10 
+##           0           0           3           0           1           1 
+## 10009216-34 10018216-38  10018216-5 10018216-18 10018216-22  10018216-6 
+##           0           0           0           0           0           0 
+## 10018216-20  10018216-7 10018216-17 10027216-36  10025216-4 10019216-21 
+##           0           0           0           0           0           3 
+## 10032216-26 10017216-30 
+##           3           2
+

It is important to think about the meaning of missing values when clustering. We could treat them as having meaning or we could remove those people who have them. Neither option is ideal. What problems do you foresee if we recode or remove these values? Write your answers below:

+

#i would have preferred to ask the person who gave me the data about the reason for the dup id’s, and to identify which record was the most recent if i were going to delete them… for now, i sloppily left them in the dataframe and just appended a suffix to each of them so they are unique. also, i would have preferred to understand the na’s… i simply deleted any row that had an na, which removed 15 of the 38 rows. the problems i foresee: removing rows shrinks the sample and probably biases the clusters toward students who answered every week, while recoding the na’s (e.g., imputing values) would invent motivation scores that were never reported.
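For reference, here is a hedged sketch of the approach I said I didn't know how to do - suffixing only the duplicated ids - using base R's make.unique(). This is not what was run above; the analysis keeps the random suffixes.

```{r}
# a sketch of the approach i wanted (not actually used above):
# make.unique() appends a suffix only to repeated values, leaving the
# first occurrence and all non-duplicated ids unchanged
K1_alt <- K1
K1_alt$newid <- make.unique(as.character(K1_alt$id), sep = "-")
head(K1_alt[order(K1_alt$id), c("id", "newid")], 12)
```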

+

We will remove people with missing values for this assignment, but keep in mind the issues that you have identified.

+
K3 <- na.omit(K2) #This command creates a data frame with only those people with no missing values. It "omits" all rows with missing values, also known as "listwise deletion". E.g., it runs down the list deleting rows as it goes.
+
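For contrast, here is a hedged sketch of the "recode" option the question above asks about - filling each student's missing weeks with that student's own mean. It keeps all 38 rows but invents values, and it is not used anywhere below; the assignment proceeds with the listwise deletion.

```{r}
# a sketch of recoding instead of deleting (NOT used for the analysis below):
# replace each student's missing weeks with that student's mean motivation
K3_imputed <- t(apply(K2, 1, function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
}))
sum(is.na(K3_imputed))  # should now be 0
```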

Another pre-processing step used in K-means is to standardize the values so that they have the same range. We do this because we want to treat each week as equally important - if we do not standardise then the week with the largest range will have the greatest impact on which clusters are formed. We standardise the values by using the “scale()” command.

+
#scale the data
+
+K3.scale <- scale(K3)
+
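A quick sanity check of what scale() just did (a small sketch assuming the K3 and K3.scale objects above): each column is centered on its mean and divided by its standard deviation.

```{r}
# scale() standardizes column-by-column: (x - mean(x)) / sd(x)
# verify by hand for the first column of K3
all.equal(as.numeric(K3.scale[, 1]),
          (K3[, 1] - mean(K3[, 1])) / sd(K3[, 1]))
```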

Now we will run the K-means clustering algorithm we talked about in class. 1) The algorithm starts by randomly choosing some starting values 2) Associates all observations near to those values with them 3) Calculates the mean of those clusters of values 4) Selects the observation closest to the mean of the cluster 5) Re-associates all observations closest to this observation 6) Continues this process until the clusters are no longer changing
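To make steps 1-6 concrete, here is a minimal, hand-rolled sketch of the assign/update loop on the scaled data. It is a simplified Lloyd-style iteration, it skips the "closest observation" refinement in step 4, and the k, seed, and iteration count are arbitrary - the actual analysis below uses R's kmeans().

```{r}
# a toy, hand-rolled version of the k-means loop, for illustration only
set.seed(1)                                   # arbitrary seed so the demo repeats
k <- 2
centers <- K3.scale[sample(nrow(K3.scale), k), , drop = FALSE]   # 1) random starting values
for (i in 1:10) {                             # 6) iterate a fixed number of times
  # 2) assign every student to the nearest center (squared Euclidean distance)
  d <- sapply(1:k, function(j) colSums((t(K3.scale) - centers[j, ])^2))
  assignment <- max.col(-d)
  # 3) recompute each center as the mean of its assigned rows
  # (ignores the empty-cluster edge case; fine for a toy demo)
  centers <- t(sapply(1:k, function(j) colMeans(K3.scale[assignment == j, , drop = FALSE])))
}
table(assignment)
```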

+

Notice that in this case we have 5 variables and in class we only had 2. It is impossible to visualize this process directly with 5 variables.

+

Also, we need to choose the number of clusters we think are in the data. We will start with 2.

+
Kfit <- kmeans(K3.scale,centers = 2)
+
+#We have created an object called "fit" that contains all the details of our clustering including which observations belong to each cluster.
+
+#We can access the list of clusters by typing "fit$cluster", the top row corresponds to the original order the rows were in. Notice we have deleted some rows.
+
+Kfit$cluster
+
## 10005216-11 10004216-15  10021216-2 10015216-25 10007216-31 10020216-28 
+##           2           2           2           2           1           2 
+## 10002216-37 10023216-35 10028216-27 10028216-12 10034216-29  10029216-9 
+##           1           1           1           1           2           2 
+## 10009216-34 10018216-38  10018216-5 10018216-18 10018216-22  10018216-6 
+##           1           2           2           2           2           2 
+## 10018216-20  10018216-7 10018216-17 10027216-36  10025216-4 
+##           2           2           2           1           2
+
#We can also attach these clusters to the original dataframe by using the "data.frame" command to create a new data frame called K4.
+
+K4 <- data.frame(K3,Kfit$cluster)
+
+#Have a look at the K4 dataframe. Let's change the names of the variables to something more convenient with the names() command.
+names(K4) <- c('1','2','3','4','5','cluster')
+
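Beyond the cluster labels, the fit object carries a few other useful pieces; a quick hedged look, assuming the Kfit object created above:

```{r}
# cluster centers are reported on the scaled data, one row per cluster
Kfit$centers
# number of students in each cluster, and the total within-cluster sum of squares
Kfit$size
Kfit$tot.withinss
```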

Now we need to visualize the clusters we have created. To do so we want to play with the structure of our data. What would be most useful would be if we could visualize average motivation by cluster, by week. To do this we will need to convert our data from wide to long format. Remember your old friends tidyr and dplyr!

+

First let's use tidyr to convert from wide to long format.

+
K5 <- gather(K4, "week", "motivation", 1:5)
+
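gather() works, but it has been superseded in current tidyr; an equivalent sketch with pivot_longer() gives the same long table (not what was run above):

```{r}
# equivalent reshape with the newer tidyr verb; gather() above already did this
K5_alt <- pivot_longer(K4, cols = 1:5, names_to = "week", values_to = "motivation")
head(K5_alt)
```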

Now let's use dplyr to average our motivation values by week and by cluster.

+
K6 <- K5 %>% group_by(week, cluster) %>% summarise(avg = mean(motivation))
+
## `summarise()` regrouping output by 'week' (override with `.groups` argument)
+

Now it’s time to do some visualization:

+

https://www.cs.uic.edu/~wilkinson/TheGrammarOfGraphics/GOG.html

+

And you can see the range of available graphics in ggplot here:

+

http://ggplot2.tidyverse.org/reference/index.html

+

We are going to create a line plot similar to the one created in this paper about school dropout Bowers, 2010. It will have motivation on the Y-axis and weeks on the X-axis. To do this we will want our weeks variables to be treated as a number, but because it was created from a variable name it is currently being treated as a character variable. You can see this if you click on the arrow on the left of K6 in the Data pane. Week is designated by “chr”. To convert it to numeric, we use the as.numeric command.

+

Likewise, since “cluster” is not numeric but rather a categorical label we want to convert it from an “integer” format to a “factor” format so that ggplot does not treat it as a number. We can do this with the as.factor() command.

+
K6$week <- as.numeric(K6$week)
+
+K6$cluster <- as.factor(K6$cluster)
+

Now we can plot our line plot using the ggplot command, “ggplot()”.

+
    +
  • The first argument in a ggplot is the dataframe we are using: K6
  • +
  • Next is what is called an aesthetic (aes), the aesthetic tells ggplot which variables to use and how to use them. Here we are using the variables “week” and “avg” on the x and y axes and we are going color these variables using the “cluster” variable
  • +
  • Then we are going to tell ggplot which type of plot we want to use by specifiying a “geom()”, in this case a line plot: geom_line()
  • +
  • Finally we are going to clean up our axes labels: xlab(“Week”) & ylab(“Average Motivation”)
  • +
+
ggplot(K6, aes(x = week, y = avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation")
+

+

What patterns do you see in the plot?

+
+

I see the following patterns in the top and bottom lines (I refer to them as ‘top’ and ‘bottom’ because each time I run kmeans it randomly assigns the labels ‘1’ and ‘2’)…

+

#1. Both cluster groups eventually trend upwards, the bottom line after week 3 and the top line after week 4 (when I ran kmeans several times, the upward pitch in the later weeks varied in its slope)
#2. The bottom-line cluster had more variance in its motivation from week to week compared to the top one (again, this changed each time I ran kmeans)
#Note: Remember that these interpretations are skewed because the original dataset contained multiple observations with the same id and these were not removed… also, the 15 rows with missing motivation survey values were removed.

+

It would be useful to determine how many people are in each cluster. We can do this easily with dplyr.

+
K7 <- count(K4, cluster)
+K7
+
##   cluster  n
+## 1       1  7
+## 2       2 16
+

##Noteworthy… In a two cluster grouping… cluster 2 has more than twice the population of cluster 1 (16 vs 7)

+

Look at the number of people in each cluster, now repeat this process for 3 rather than 2 clusters. Which cluster grouping do you think is more informative? Write your answer below:

+
#and then there were 3 clusters...
+Kfit <- kmeans(K3.scale,centers = 3)
+str(Kfit$cluster)
+
##  Named int [1:23] 1 2 1 1 3 1 3 3 3 3 ...
+##  - attr(*, "names")= chr [1:23] "10005216-11" "10004216-15" "10021216-2" "10015216-25" ...
+
K4 <- data.frame(K3,Kfit$cluster)
+
+names(K4) <- c('1','2','3','4','5','cluster')
+
+K5 <- gather(K4, "week", "motivation", 1:5)
+
+K6 <- K5 %>% group_by(week, cluster) %>% summarise(avg = mean(motivation))
+
## `summarise()` regrouping output by 'week' (override with `.groups` argument)
+
K6$week <- as.numeric(K6$week)
+
+K6$cluster <- as.factor(K6$cluster)
+
+K7 <- count(K4, cluster)
+K7
+
##   cluster n
+## 1       1 7
+## 2       2 9
+## 3       3 7
+
ggplot(K6, aes(x = week, y = avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation") 
+

+
+
+

I feel 3 clusters is better… the populations are more balanced in this three cluster grouping with 7 to 9 observations each. Also, it’s clear that cluster 1 had essentially no change in motivation… i’m glad to identify them separately.

+

#Note: Remember that these interpretations are skewed because the original dataset contained multiple observations with the same id and these were not removed… also, the 15 rows with missing motivation survey values were removed.

+
#hmmmm.... why not...
+maxclusters <- unique(K3.scale) %>% nrow
+halfthemaxclusters <- maxclusters/2
+
+Kfit <- kmeans(K3.scale,centers = halfthemaxclusters)
+str(Kfit$cluster)
+
##  Named int [1:23] 2 4 2 2 5 2 3 3 6 6 ...
+##  - attr(*, "names")= chr [1:23] "10005216-11" "10004216-15" "10021216-2" "10015216-25" ...
+
Kfit$cluster
+
## 10005216-11 10004216-15  10021216-2 10015216-25 10007216-31 10020216-28 
+##           2           4           2           2           5           2 
+## 10002216-37 10023216-35 10028216-27 10028216-12 10034216-29  10029216-9 
+##           3           3           6           6           2           1 
+## 10009216-34 10018216-38  10018216-5 10018216-18 10018216-22  10018216-6 
+##           6           4           4           4           4           4 
+## 10018216-20  10018216-7 10018216-17 10027216-36  10025216-4 
+##           4           4           4           6           2
+
K4 <- data.frame(K3,Kfit$cluster)
+
+names(K4) <- c('1','2','3','4','5','cluster')
+
+K5 <- gather(K4, "week", "motivation", 1:5)
+
+K6 <- K5 %>% group_by(week, cluster) %>% summarise(avg = mean(motivation))
+
## `summarise()` regrouping output by 'week' (override with `.groups` argument)
+
K6$week <- as.numeric(K6$week)
+
+K6$cluster <- as.factor(K6$cluster)
+
+K7 <- count(K4, cluster)
+K7
+
##   cluster n
+## 1       1 1
+## 2       2 6
+## 3       3 2
+## 4       4 9
+## 5       5 1
+## 6       6 4
+
ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation") 
+

+
+

Is MaxClusters/2 clusters better?

+
+
+
+

I found this to be helpful in identifying clusters where there was no change… if I were better at R, I would have added some interactivity to this graph to filter out certain clusters, e.g. the ones with no change (a rough, non-interactive version of that idea is sketched just below)… and what's with the cluster of survey respondents that answered ‘1’ every week - how unhappy are they?!

+

#Note: Remember that these interpretations are skewed because the original dataset contained multiple observations with the same id and these were not removed… also, the 15 rows with missing motivation survey values were removed.
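Here is the rough, non-interactive version of the filtering idea mentioned above: drop clusters whose average trajectory is essentially flat before plotting. A sketch assuming the K6 table from this chunk; the 0.25 cutoff is arbitrary.

```{r}
# drop clusters whose weekly averages barely move, then replot the rest
flat_cutoff <- 0.25   # arbitrary threshold on the range of a cluster's weekly averages
K6_active <- K6 %>%
  group_by(cluster) %>%
  filter(diff(range(avg)) > flat_cutoff) %>%
  ungroup()

ggplot(K6_active, aes(x = week, y = avg, colour = cluster)) +
  geom_line() + xlab("Week") + ylab("Average Motivation")
```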

+

##Part II

+

Using the data collected in the HUDK4050 entrance survey (HUDK4050-cluster.csv), use K-means to cluster the students first according to location (lat/long) and then according to their answers to the questions, so that each student belongs to two clusters.

+
#load 
+HD1 <- read.csv("HUDK405020-cluster.csv")
+
+#check out the data
+class(HD1)
+
## [1] "data.frame"
+
dim(HD1)
+
## [1] 84  9
+
colnames(HD1)   
+
## [1] "id"               "lat"              "long"             "compare.features"
+## [5] "math.accuracy"    "planner.use"      "enjoy.discuss"    "enjoy.group"     
+## [9] "meet.deadline"
+
rownames(HD1)   
+
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
+## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
+## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
+## [46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
+## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75"
+## [76] "76" "77" "78" "79" "80" "81" "82" "83" "84"
+
head(HD1)
+
##   id      lat      long compare.features math.accuracy planner.use
+## 1  1 39.93797 116.41759              100           100         100
+## 2  2 25.05846 121.54951              100            75         100
+## 3  3 30.25723 120.15718               75            75          75
+## 4  4 31.24603 121.48338               75            50          75
+## 5  5 39.93797 116.41759               75            50          75
+## 6  6 40.94486 -73.87174              100            75          50
+##   enjoy.discuss enjoy.group meet.deadline
+## 1            75          75            25
+## 2            75          75           100
+## 3            75          75           100
+## 4            75         100           100
+## 5           100          50           100
+## 6            50          50            75
+
summary(HD1)
+
##        id             lat              long         compare.features
+##  Min.   : 1.00   Min.   :-12.04   Min.   :-122.33   Min.   :  0.00  
+##  1st Qu.:21.75   1st Qu.: 31.25   1st Qu.: -73.99   1st Qu.: 75.00  
+##  Median :42.50   Median : 39.30   Median : 115.17   Median : 75.00  
+##  Mean   :42.50   Mean   : 35.52   Mean   :  29.21   Mean   : 80.06  
+##  3rd Qu.:63.25   3rd Qu.: 40.51   3rd Qu.: 118.82   3rd Qu.:100.00  
+##  Max.   :84.00   Max.   : 47.67   Max.   : 135.51   Max.   :100.00  
+##  math.accuracy     planner.use     enjoy.discuss     enjoy.group    
+##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
+##  1st Qu.: 50.00   1st Qu.: 50.00   1st Qu.: 50.00   1st Qu.: 50.00  
+##  Median : 75.00   Median : 75.00   Median : 50.00   Median : 75.00  
+##  Mean   : 67.56   Mean   : 70.54   Mean   : 59.82   Mean   : 65.48  
+##  3rd Qu.: 75.00   3rd Qu.:100.00   3rd Qu.: 75.00   3rd Qu.: 75.00  
+##  Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :100.00  
+##  meet.deadline   
+##  Min.   :  0.00  
+##  1st Qu.: 75.00  
+##  Median :100.00  
+##  Mean   : 89.58  
+##  3rd Qu.:100.00  
+##  Max.   :100.00
+
#look for duplicate id's
+HD1idtest <- count(group_by(HD1,id)) 
+HD1idtest <- subset(HD1idtest,n!=1)
+HD1dupids <- sum(HD1idtest$n) - nrow(HD1idtest)
+HD1dupids
+
## [1] 0
+
#yay, none found
+
+
+#move id column to rowname
+HD2 <- HD1
+rownames(HD2) <- HD1[,1]
+HD2 <- HD1[,2:9]
+nrow(HD2)
+
## [1] 84
+
#look for na's
+HD2.is.na <- HD1 %>%
+  summarise_all(funs(sum(is.na(.)))) 
+HD2.is.na
+
##   id lat long compare.features math.accuracy planner.use enjoy.discuss
+## 1  0   0    0                0             0           0             0
+##   enjoy.group meet.deadline
+## 1           0             0
+
#yay, none found
+
+#remove rows with na's
+#HD3 <- na.omit(HD2)
+#nrow(HD3)
+HD3 <- HD2
+
+#create table of lat/lon
+HD3LatLon <-HD3[,1:2]
+HD3LatLon <- as.matrix(HD3LatLon)
+class(HD3LatLon)
+
## [1] "matrix" "array"
+
#create table of survey answers
+HD3Survey <-HD3[,3:8]
+HD3Survey <- as.matrix(HD3Survey)
+class(HD3Survey)
+
## [1] "matrix" "array"
+
#scaling: this is not appropriate on lat/lon data (i think?)
+#scale(HD3LatLon)
+
+#scaling: in looking at the summary of survey, scaling does not seem necessary as all the data is within the same scale already... 0-100
+summary(HD3Survey)
+
##  compare.features math.accuracy     planner.use     enjoy.discuss   
+##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
+##  1st Qu.: 75.00   1st Qu.: 50.00   1st Qu.: 50.00   1st Qu.: 50.00  
+##  Median : 75.00   Median : 75.00   Median : 75.00   Median : 50.00  
+##  Mean   : 80.06   Mean   : 67.56   Mean   : 70.54   Mean   : 59.82  
+##  3rd Qu.:100.00   3rd Qu.: 75.00   3rd Qu.:100.00   3rd Qu.: 75.00  
+##  Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :100.00  
+##   enjoy.group     meet.deadline   
+##  Min.   :  0.00   Min.   :  0.00  
+##  1st Qu.: 50.00   1st Qu.: 75.00  
+##  Median : 75.00   Median :100.00  
+##  Mean   : 65.48   Mean   : 89.58  
+##  3rd Qu.: 75.00   3rd Qu.:100.00  
+##  Max.   :100.00   Max.   :100.00
+
#scale(HD3Survey)
+
##HD3LatLon...explore how many clusters are needed using elbow plot 
+
+#first determine how many unique row values in the data... this is max number of k's
+unique(HD3LatLon)
+
##             lat       long
+##  [1,]  39.93797  116.41759
+##  [2,]  25.05846  121.54951
+##  [3,]  30.25723  120.15718
+##  [4,]  31.24603  121.48338
+##  [5,]  40.94486  -73.87174
+##  [6,]  42.27187  -70.99956
+##  [7,]  38.98944  -77.11879
+##  [8,]  36.77747  121.17098
+##  [9,]  37.14840 -113.48841
+## [10,]  31.55043  120.28207
+## [11,]  42.35549  -71.04861
+## [12,]  32.72793 -117.15529
+## [13,]  33.76980  -84.41458
+## [14,]  22.55122  113.93054
+## [15,] -12.04318  -77.02824
+## [16,]  47.67345 -122.33154
+## [17,]  43.40878  -74.87162
+## [18,]  39.09900 -111.87882
+## [19,]  29.86608  121.55357
+## [20,]  40.75345  -73.93640
+## [21,]  28.01274  120.68044
+## [22,]  37.35456 -121.88384
+## [23,]  31.33441  118.36877
+## [24,]  40.68295  -73.97080
+## [25,]  36.08025  120.36352
+## [26,]  40.78128  -74.06928
+## [27,]  29.33908  116.41677
+## [28,]  40.74516  -74.02797
+## [29,]  40.71045  -74.07211
+## [30,]  13.03011   80.20305
+## [31,]  37.66944 -121.88589
+## [32,]  38.63596  -90.23986
+## [33,]  39.50370  -84.74947
+## [34,]  37.53962  127.00968
+## [35,]  39.99082  -75.16843
+## [36,]  33.67242 -117.76439
+## [37,]  33.72148   73.04329
+## [38,]  34.69180  135.50506
+## [39,]  40.44649  -74.48969
+## [40,]  30.65275  104.06811
+## [41,]  13.88159  100.64453
+## [42,]  34.05513 -118.25703
+
#42
+
+# Use map_dbl to run many models with varying value of k (centers)
+tot_withinss <- map_dbl(1:42,  function(k){
+  model <- kmeans(x = HD3LatLon, centers = k)
+  model$tot.withinss
+})
+
+# Generate a data frame containing both k and tot_withinss
+elbow_df <- data.frame(
+  k = 1:42,
+  tot_withinss = tot_withinss
+)
+
+# Plot the elbow plot
+ggplot(elbow_df, aes(x = k, y = tot_withinss)) +
+  geom_line() +
+  scale_x_continuous(breaks = 1:42)
+

+
#the elbow graph shows a clear bend at k = 2... i'll use 2 clusters for the LatLon table
+
+#occurs to me after watching the workout video again, i could have skipped the elbow plot and just done a pairs plot to visually confirm that 2 clusters would be fine
+pairs(HD3LatLon)
+
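An alternative, hedged way to sanity-check the choice of k is the average silhouette width from the cluster package (which ships with R); larger values indicate better-separated clusters. This was not part of the original workflow.

```{r}
# average silhouette width for k = 2..10 on the lat/long data (a sketch, not run above)
library(cluster)
set.seed(1)                     # arbitrary, just for repeatability
d <- dist(HD3LatLon)
avg_sil <- map_dbl(2:10, function(k) {
  km <- kmeans(HD3LatLon, centers = k)
  mean(silhouette(km$cluster, d)[, 3])   # third column holds the per-point widths
})
data.frame(k = 2:10, avg_sil)
```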

+
##HD3Survey...explore how many clusters are needed using elbow plot  
+
+#first determine how many unique row values in the data... this is max number of k's
+unique(HD3Survey)
+
##       compare.features math.accuracy planner.use enjoy.discuss enjoy.group
+##  [1,]              100           100         100            75          75
+##  [2,]              100            75         100            75          75
+##  [3,]               75            75          75            75          75
+##  [4,]               75            50          75            75         100
+##  [5,]               75            50          75           100          50
+##  [6,]              100            75          50            50          50
+##  [7,]               75            50          25            50          50
+##  [8,]               75            75         100            50         100
+##  [9,]               50           100          75            75          75
+## [10,]              100            75          75            50          50
+## [11,]               75            75         100             0          50
+## [12,]               75            25          75            50          75
+## [13,]              100            75          50            25          25
+## [14,]               75            75         100            75          75
+## [15,]               75           100         100            75          50
+## [16,]               75            50         100            50          50
+## [17,]               75            75          25            75          50
+## [18,]               75            75         100            50          50
+## [19,]              100            75          50            50          50
+## [20,]              100            50          75            25          25
+## [21,]               75            75         100           100          75
+## [22,]               75            50          50            50         100
+## [23,]              100            75          75           100         100
+## [24,]               50            75          75            50          50
+## [25,]               75            50          75            50          75
+## [26,]               75           100          75           100         100
+## [27,]               75           100          75            25          25
+## [28,]               75            75         100            75         100
+## [29,]               75           100          75            75          50
+## [30,]               75            25          75            75         100
+## [31,]               75            75          75            50          50
+## [32,]              100            25         100            75          75
+## [33,]              100            50          25            50          50
+## [34,]               75            75          25            50          75
+## [35,]               75            75          75            75          50
+## [36,]               50            75          50            50          50
+## [37,]              100           100         100            25          50
+## [38,]               25            50         100           100          75
+## [39,]               25            75         100            25          25
+## [40,]               50            75          50            75          50
+## [41,]              100            75         100            75          50
+## [42,]               75            25          50            50          75
+## [43,]               50            75           0            25          25
+## [44,]              100            75          75            75          75
+## [45,]              100            50         100            75         100
+## [46,]              100            75         100            50          50
+## [47,]               75            50          50            75          75
+## [48,]              100            75          75            50          75
+## [49,]              100            75          75            75          50
+## [50,]              100            75          50            50         100
+## [51,]               75            50          75            50         100
+## [52,]               75            75          25            75          75
+## [53,]               75            75          75            25          50
+## [54,]               75            75          25            50          50
+## [55,]              100            75         100            50          75
+## [56,]               75           100          25            75          75
+## [57,]               75            75         100            75          75
+## [58,]              100            50          75            50          75
+## [59,]               75            50          50            75          75
+## [60,]              100            75         100            50         100
+## [61,]               25            50         100           100         100
+## [62,]                0             0           0             0           0
+## [63,]              100            75         100           100         100
+## [64,]              100            75           0           100         100
+## [65,]               75            50          50            50          75
+## [66,]              100           100         100            75          50
+## [67,]              100            25          50            75         100
+## [68,]               50            50           0            25          50
+## [69,]               75            50          50            50          50
+## [70,]               75           100          50            50          75
+## [71,]               75            75          75            75          75
+## [72,]              100             0          25            25          50
+## [73,]              100            75          25            25          50
+##       meet.deadline
+##  [1,]            25
+##  [2,]           100
+##  [3,]           100
+##  [4,]           100
+##  [5,]           100
+##  [6,]            75
+##  [7,]           100
+##  [8,]           100
+##  [9,]           100
+## [10,]           100
+## [11,]           100
+## [12,]            75
+## [13,]           100
+## [14,]           100
+## [15,]           100
+## [16,]            75
+## [17,]            75
+## [18,]           100
+## [19,]           100
+## [20,]           100
+## [21,]           100
+## [22,]           100
+## [23,]            50
+## [24,]           100
+## [25,]           100
+## [26,]           100
+## [27,]           100
+## [28,]           100
+## [29,]           100
+## [30,]            25
+## [31,]           100
+## [32,]            75
+## [33,]           100
+## [34,]           100
+## [35,]            50
+## [36,]           100
+## [37,]            75
+## [38,]            75
+## [39,]           100
+## [40,]           100
+## [41,]            50
+## [42,]           100
+## [43,]           100
+## [44,]           100
+## [45,]           100
+## [46,]           100
+## [47,]           100
+## [48,]           100
+## [49,]           100
+## [50,]            50
+## [51,]            75
+## [52,]            75
+## [53,]           100
+## [54,]           100
+## [55,]           100
+## [56,]           100
+## [57,]            75
+## [58,]           100
+## [59,]            50
+## [60,]           100
+## [61,]           100
+## [62,]             0
+## [63,]           100
+## [64,]           100
+## [65,]           100
+## [66,]           100
+## [67,]           100
+## [68,]            75
+## [69,]            75
+## [70,]           100
+## [71,]            75
+## [72,]            75
+## [73,]           100
+
#73
+
+# Use map_dbl to run many models with varying value of k (centers)
+tot_withinss <- map_dbl(1:20,  function(k){
+  model <- kmeans(x = HD3Survey, centers = k)
+  model$tot.withinss
+})
+
+# Generate a data frame containing both k and tot_withinss
+elbow_df <- data.frame(
+  k = 1:20,
+  tot_withinss = tot_withinss
+)
+
+# Plot the elbow plot
+ggplot(elbow_df, aes(x = k, y = tot_withinss)) +
+  geom_line() +
+  scale_x_continuous(breaks = 1:20)
+

+
#yuk, with all 73 possible k's the elbow graph shows no clear bend at all
+
+#i reran it at 20 k's a bunch of times; the elbow graph shows a turn at 5 clusters... i'll use 5 clusters for the Survey table
+
+
+#occurs to me after watching the workout video again, i could have done a pairs plot and visually observed the data... likely because there are only 5 possible answer values in the data, the pairs visual also leans towards 5 clusters
+pairs(HD3Survey)
+
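Because kmeans() starts from random centers, the elbow wiggles from run to run, which is why the rerunning above was needed. A hedged sketch that stabilizes it with a fixed seed and multiple random starts per k (nstart is a standard kmeans() argument; the seed value is arbitrary):

```{r}
# same elbow computation, but reproducible: fix the seed and let kmeans try
# 25 random starts per k, keeping the best solution each time
set.seed(4050)   # any fixed number works; 4050 is arbitrary
tot_withinss_stable <- map_dbl(1:20, function(k) {
  kmeans(HD3Survey, centers = k, nstart = 25)$tot.withinss
})
elbow_df_stable <- data.frame(k = 1:20, tot_withinss = tot_withinss_stable)
ggplot(elbow_df_stable, aes(x = k, y = tot_withinss)) +
  geom_line() +
  scale_x_continuous(breaks = 1:20)
```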

+
#kmeans clusters for both LatLon and Survey matrixes
+HD3LatLonFit <- kmeans(HD3LatLon,centers = 2)
+HD3LatLonFit$cluster
+
##  [1] 2 2 2 2 2 1 1 1 2 1 2 2 1 1 1 2 1 1 1 1 2 2 1 2 1 2 2 1 2 2 1 2 2 2 2 1 2 1
+## [39] 1 1 1 2 1 2 2 2 1 1 1 1 2 1 2 1 1 2 1 1 2 1 2 2 2 2 1 2 2 2 1 2 2 1 2 2 2 1
+## [77] 1 2 2 2 2 2 2 2
+
HD3SurveyFit <- kmeans(HD3Survey,centers = 5)
+HD3SurveyFit$cluster
+
##  [1] 4 3 3 3 3 1 1 3 3 3 2 2 4 2 3 3 3 3 2 1 2 1 2 3 1 4 2 3 3 2 3 3 4 2 2 4 1 2
+## [39] 1 4 1 2 5 2 1 4 1 1 1 3 3 2 2 1 3 3 4 4 3 1 2 1 3 1 3 3 2 4 3 5 1 3 1 1 3 4
+## [77] 2 1 1 1 3 1 3 1
+
#add the cluster information back to the original table
+HD4 <-mutate(HD3, LatLonCluster = HD3LatLonFit$cluster, SurveyCluster = HD3SurveyFit$cluster)
+

##Part III

+

Create a visualization that shows the overlap between the two clusters each student belongs to in Part II. I.e., are there geographical patterns that correspond to the answers?

+
pairs(HD4)
+

+
#table view... got this from the workout, thank you!... i like it... interesting that cluster 3 is an outlier... i might go back and do just 4 survey clusters
+table(HD3LatLonFit$cluster,HD3SurveyFit$cluster)
+
##    
+##      1  2  3  4  5
+##   1 11  6 12  6  1
+##   2 13 12 17  5  1
+
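One hedged way to put a number on that overlap is a chi-squared test of independence on the same table; with only 84 students spread over 10 cells the approximation is rough, so treat it as indicative only.

```{r}
# are location cluster and survey cluster associated? expected counts are small,
# so this is only a rough indication (R will warn about the approximation)
chisq.test(table(HD3LatLonFit$cluster, HD3SurveyFit$cluster))
```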
#scatter plot... got this from the workout, thank you again!...
+HD5 <- HD4 %>% group_by(LatLonCluster, SurveyCluster) %>% summarize(count = n())
+
## `summarise()` regrouping output by 'LatLonCluster' (override with `.groups` argument)
+
ggplot(HD5, aes(x = LatLonCluster, y = SurveyCluster, size = count)) + geom_point()
+

+
#installing vcd... got this from the workout, yay workouts!...
+library(vcd)
+
## Warning: package 'vcd' was built under R version 4.0.3
+
## Loading required package: grid
+
HD6 <- structable(HD3LatLonFit$cluster~HD3SurveyFit$cluster)
+mosaic(HD6, shade = TRUE, legend = TRUE)
+

+
#that's cool!
+#that cluster 3 has got to go!
+
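To answer the geographical part of the question more directly, here is a hedged sketch that plots each student at their long/lat, coloured by survey cluster and shaped by location cluster, using the HD4 data frame built above:

```{r}
# plot students on a rough long/lat plane; colour = survey cluster, shape = location cluster
ggplot(HD4, aes(x = long, y = lat,
                colour = as.factor(SurveyCluster),
                shape  = as.factor(LatLonCluster))) +
  geom_point(size = 2) +
  labs(colour = "Survey cluster", shape = "Lat/Long cluster",
       x = "Longitude", y = "Latitude")
```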
+

Please render your code as an .html file using knitr and submit a Pull Request with both your .Rmd and .html files to the Assignment 4 repository.

+
+