From 59231f0afc3ff2e8ab6d6eeb900082806b103e79 Mon Sep 17 00:00:00 2001 From: Xie <394637464@qq.com> Date: Wed, 9 Dec 2020 23:21:51 +0800 Subject: [PATCH 1/5] Xing Yixie Assignment 4 file --- Assignment 4.Rmd | 151 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 134 insertions(+), 17 deletions(-) diff --git a/Assignment 4.Rmd b/Assignment 4.Rmd index 54b0e66..f2556f3 100644 --- a/Assignment 4.Rmd +++ b/Assignment 4.Rmd @@ -8,13 +8,20 @@ https://www.cs.uic.edu/~wilkinson/Applets/cluster.html ```{r} -library() +library(dplyr) +library(tidyr) +library(ggplot2) ``` Now, upload the file "Class_Motivation.csv" from the Assignment 4 Repository as a data frame called "K1"" ```{r} -K1 <- read.csv(...) + +K1 <- read.csv("Class_Motivation.csv", header = TRUE) + +K1b <- gather(K1, week, measure, 2:6) + +plot(as.factor(K1b$week), K1b$measure) ``` @@ -26,12 +33,15 @@ The algorithm will treat each row as a value belonging to a person, so we need t ```{r} -K2 <- +K2 <- select(K1, 2:6) ``` It is important to think about the meaning of missing values when clustering. We could treat them as having meaning or we could remove those people who have them. Neither option is ideal. What problems do you foresee if we recode or remove these values? Write your answers below: +Answer: In my opinion, when we remove those people who have them, the sample size will decrease a lot because not every individual has all meaningful data. +Secondly, if we recode those missing values, it will also affect our statistical assumption, because we don't know where these missing values come and for what reasons. In this case, there are some solutions by R to deal with missing values, like multiple imputation by package "mice" (I saw this in ). +Therefore, I agree with that neither solution of them is ideal. We will remove people with missing values for this assignment, but keep in mind the issues that you have identified. 
@@ -39,7 +49,11 @@ We will remove people with missing values for this assignment, but keep in mind ```{r} -K3 <- na.omit(K2) #This command create a data frame with only those people with no missing values. It "omits" all rows with missing values, also known as a "listwise deletion". EG - It runs down the list deleting rows as it goes. +K3 <- na.omit(K2) +K3 <- K2 + +K3[is.na(K3)] <- 0 + ``` @@ -47,7 +61,7 @@ Another pre-processing step used in K-means is to standardize the values so that ```{r} -K3 <- +K3 <- scale(K3) ``` @@ -66,21 +80,30 @@ Also, we need to choose the number of clusters we think are in the data. We will ```{r} -fit <- - -#We have created an object called "fit" that contains all the details of our clustering including which observations belong to each cluster. +fit1a <- kmeans(K3, 2) +fit1b <- kmeans(K3, 2) +fit1c <- kmeans(K3, 2) -#We can access the list of clusters by typing "fit$cluster", the top row corresponds to the original order the rows were in. Notice we have deleted some rows. +fit1b$cluster +K4 <- data.frame(K3, fit1a$cluster, fit1b$cluster, fit1c$cluster) +fit1a$withinss +fit1b$withinss +fit1c$withinss -#We can also attach these clusters to the original dataframe by using the "data.frame" command to create a new data frame called K4. +fit1a$tot.withinss +fit1b$tot.withinss +fit1c$tot.withinss -K4 +fit1a$betweenss +fit1b$betweenss +fit1c$betweenss -#Have a look at the K4 dataframe. Lets change the names of the variables to make it more convenient with the names() command. +K4 <- data.frame(K3, fit1a$cluster) +names(K4) <- c("1", "2", "3", "4", "5", "cluster") ``` Now we need to visualize the clusters we have created. To do so we want to play with the structure of our data. What would be most useful would be if we could visualize average motivation by cluster, by week. To do this we will need to convert our data from wide to long format. Remember your old friends tidyr and dplyr! 
@@ -93,9 +116,11 @@ K5 <- gather(K4, "week", "motivation", 1:5) Now lets use dplyr to average our motivation values by week and by cluster. -```{r} +```{r message=FALSE, warning=FALSE} +K6 <- K5 %>% + group_by(week, cluster) %>% + summarise(avg = mean(motivation)) -K6 <- K5 %>% group_by(week, cluster) %>% summarise(K6, avg = mean(motivation)) ``` @@ -113,9 +138,9 @@ Likewise, since "cluster" is not numeric but rather a categorical label we want ```{r} -K6$week <- +K6$week <- as.numeric(K6$week) -K6$cluster <- +K6$cluster <- as.factor(K6$cluster) ``` @@ -134,7 +159,7 @@ ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab What patterns do you see in the plot? - +## Answer: People of Cluter 1 and Cluster 2 have different pattern through the week. Cluster2 people has stronger average motivation than Cluster1 people and the motivation of cluster 2 achieves highest in Week 1 and week 3. However, compared to cluster 2, cluster 1 people has lowest average motivation value in week 1 and 3. It would be useful to determine how many people are in each cluster. We can do this easily with dplyr. @@ -144,18 +169,110 @@ K7 <- count(K4, cluster) Look at the number of people in each cluster, now repeat this process for 3 rather than 2 clusters. Which cluster grouping do you think is more informative? 
Write your answer below: + +```{r Part 1 cluster 3, message=FALSE, warning=FALSE} + +fit1a <- kmeans(K3, 3) +fit1b <- kmeans(K3, 3) +fit1c <- kmeans(K3, 3) + +fit1b$cluster + +K4 <- data.frame(K3, fit1a$cluster, fit1b$cluster, fit1c$cluster) + +fit1a$withinss +fit1b$withinss +fit1c$withinss + +fit1a$tot.withinss +fit1b$tot.withinss +fit1c$tot.withinss + +fit1a$betweenss +fit1b$betweenss +fit1c$betweenss + +K4 <- data.frame(K3, fit1a$cluster) + +names(K4) <- c("1", "2", "3", "4", "5", "cluster") + +K5 <- gather(K4, "week", "motivation", 1:5) + +K6 <- K5 %>% + group_by(week, cluster) %>% + summarise(avg = mean(motivation)) + +K6$week <- as.numeric(K6$week) + +K6$cluster <- as.factor(K6$cluster) +ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation") +K7 <- count(K4, cluster) +``` + +### In my opinion, clustering data into 2 clusters is more informative compared to 3 clusters. Because we can see the different pattern according to this graph obviously. ##Part II Using the data collected in the HUDK4050 entrance survey (HUDK4050-cluster.csv) use K-means to cluster the students first according location (lat/long) and then according to their answers to the questions, each student should belong to two clusters. 
+```{r} +p2 <- read.csv("HUDK405020-cluster.csv") +L1 <- p2[,c(2,3)] +set.seed(121) +fit1a <- kmeans(L1, 2) +fit1b <- kmeans(L1, 2) + +fit1a$cluster + +L2 <- data.frame(L1, fit1a$cluster) + + +names(L2) <- c("Lat", "Long", "clusterLoc") + +ggplot(L2, aes(Lat,Long, fill = factor(clusterLoc))) + geom_boxplot() + xlab("Lat") + ylab("Lon")+theme_bw() +``` + + +According to answers to questions +```{r} +k3 <- p2[,c(4:9)] +set.seed(121) +fit1a <- kmeans(k3, 2) +fit1b <- kmeans(k3, 2) + +fit1a$cluster + +K4 <- data.frame(k3, fit1a$cluster) + +names(K4)[[7]] <- "cluster" + +K5 <- gather(K4, "Questions","Answers", 1:6) + +K6 <- K5 %>% group_by(Questions, cluster) %>% summarise(avg = mean(Answers)) + +K6$cluster <- as.factor(K6$cluster) +ggplot(K6, aes(Questions, avg, colour = cluster)) + geom_point() + xlab("Questions") + ylab("Average score") +K7 <- count(K4, cluster) +``` + ##Part III Create a visualization that shows the overlap between the two clusters each student belongs to in Part II. IE - Are there geographical patterns that correspond to the answers? + ```{r} +D1 <- cbind(K4,L2) +pairs(D1) + +chisq.test(D1$cluster,D1$clusterLoc) +tableone::CreateCatTable("cluster","clusterLoc",data = D1) + +library(vcd) +P1 <- structable(D1$cluster ~ D1$clusterLoc) +mosaic(P1, shade=TRUE, legend=TRUE) ``` + ## Please render your code as an .html file using knitr and Pull Resquest both your .Rmd file and .html files to the Assignment 3 repository. 
From 89592b3d4a322b5b0690de99667fc85370c3a7b9 Mon Sep 17 00:00:00 2001 From: Xie <394637464@qq.com> Date: Wed, 9 Dec 2020 23:22:34 +0800 Subject: [PATCH 2/5] Xing yixie Assignment 4 cluster output --- Assignment-4.html | 624 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 624 insertions(+) create mode 100644 Assignment-4.html diff --git a/Assignment-4.html b/Assignment-4.html new file mode 100644 index 0000000..b72942b --- /dev/null +++ b/Assignment-4.html @@ -0,0 +1,624 @@ + + + + + + + + + + + + + +Assignment 4: K Means Clustering + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

In this assignment we will be applying the K-means clustering algorithm we looked at in class. At the following link you can find a description of K-means:

+

https://www.cs.uic.edu/~wilkinson/Applets/cluster.html

+
library(dplyr)
+
## Warning: package 'dplyr' was built under R version 3.6.3
+
## 
+## Attaching package: 'dplyr'
+
## The following objects are masked from 'package:stats':
+## 
+##     filter, lag
+
## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+
library(tidyr)
+
## Warning: package 'tidyr' was built under R version 3.6.3
+
library(ggplot2)
+

Now, upload the file “Class_Motivation.csv” from the Assignment 4 Repository as a data frame called “K1”"

+
K1 <- read.csv("Class_Motivation.csv", header = TRUE)
+
+K1b <- gather(K1, week, measure, 2:6)
+
+plot(as.factor(K1b$week), K1b$measure)
+

+

This file contains the self-reported motivation scores for a class over five weeks. We are going to look for patterns in motivation over this time and sort people into clusters based on those patterns.

+

But before we do that, we will need to manipulate the data frame into a structure that can be analyzed by our clustering algorithm.

+

The algorithm will treat each row as a value belonging to a person, so we need to remove the id variable.

+
K2 <- select(K1, 2:6)
+

It is important to think about the meaning of missing values when clustering. We could treat them as having meaning or we could remove those people who have them. Neither option is ideal. What problems do you foresee if we recode or remove these values? Write your answers below:

+

Answer: In my opinion, when we remove those people who have them, the sample size will decrease a lot because not every individual has all meaningful data. Secondly, if we recode those missing values, it will also affect our statistical assumption, because we don’t know where these missing values come and for what reasons. In this case, there are some solutions by R to deal with missing values, like multiple imputation by package “mice” (I saw this in ). Therefore, I agree with that neither solution of them is ideal.

+

We will remove people with missing values for this assignment, but keep in mind the issues that you have identified.

+
K3 <- na.omit(K2) 
+K3 <- K2
+
+K3[is.na(K3)] <- 0
+

Another pre-processing step used in K-means is to standardize the values so that they have the same range. We do this because we want to treat each week as equally important - if we do not standardise then the week with the largest range will have the greatest impact on which clusters are formed. We standardise the values by using the “scale()” command.

+
K3 <- scale(K3) 
+

Now we will run the K-means clustering algorithm we talked about in class. 1) The algorithm starts by randomly choosing some starting values 2) Associates all observations near to those values with them 3) Calculates the mean of those clusters of values 4) Selects the observation closest to the mean of the cluster 5) Re-associates all observations closest to this observation 6) Continues this process until the clusters are no longer changing

+

Notice that in this case we have 5 variables and in class we only had 2. It is impossible to vizualise this process with 5 variables.

+

Also, we need to choose the number of clusters we think are in the data. We will start with 2.

+
fit1a <- kmeans(K3, 2)
+fit1b <- kmeans(K3, 2)
+fit1c <- kmeans(K3, 2) 
+
+fit1b$cluster
+
##  [1] 1 2 1 1 2 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2
+
K4 <- data.frame(K3, fit1a$cluster, fit1b$cluster, fit1c$cluster)
+
+fit1a$withinss
+
## [1]  22.77197 125.75675
+
fit1b$withinss
+
## [1] 76.21636 31.97163
+
fit1c$withinss
+
## [1] 31.97163 76.21636
+
fit1a$tot.withinss
+
## [1] 148.5287
+
fit1b$tot.withinss
+
## [1] 108.188
+
fit1c$tot.withinss
+
## [1] 108.188
+
fit1a$betweenss
+
## [1] 36.47128
+
fit1b$betweenss
+
## [1] 76.81201
+
fit1c$betweenss
+
## [1] 76.81201
+
K4 <- data.frame(K3, fit1a$cluster)
+
+
+names(K4) <- c("1", "2", "3", "4", "5", "cluster") 
+

Now we need to visualize the clusters we have created. To do so we want to play with the structure of our data. What would be most useful would be if we could visualize average motivation by cluster, by week. To do this we will need to convert our data from wide to long format. Remember your old friends tidyr and dplyr!

+

First lets use tidyr to convert from wide to long format.

+
K5 <- gather(K4, "week", "motivation", 1:5)
+

Now lets use dplyr to average our motivation values by week and by cluster.

+
K6 <- K5 %>% 
+  group_by(week, cluster) %>% 
+  summarise(avg = mean(motivation))
+

Now it’s time to do some visualization:

+

https://www.cs.uic.edu/~wilkinson/TheGrammarOfGraphics/GOG.html

+

And you can see the range of available graphics in ggplot here:

+

http://ggplot2.tidyverse.org/reference/index.html

+

We are going to create a line plot similar to the one created in this paper about school dropout Bowers, 2010. It will have motivation on the Y-axis and weeks on the X-axis. To do this we will want our weeks variables to be treated as a number, but because it was created from a variable name it is currently being treated as a character variable. You can see this if you click on the arrow on the left of K6 in the Data pane. Week is designated by “chr”. To convert it to numeric, we use the as.numeric command.

+

Likewise, since “cluster” is not numeric but rather a categorical label we want to convert it from an “integer” format to a “factor” format so that ggplot does not treat it as a number. We can do this with the as.factor() command.

+
K6$week <- as.numeric(K6$week)
+
+K6$cluster <- as.factor(K6$cluster)
+

Now we can plot our line plot using the ggplot command, “ggplot()”.

+ +
ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation")
+

+

What patterns do you see in the plot?

+
+

Answer: People of Cluter 1 and Cluster 2 have different pattern through the week. Cluster2 people has stronger average motivation than Cluster1 people and the motivation of cluster 2 achieves highest in Week 1 and week 3. However, compared to cluster 2, cluster 1 people has lowest average motivation value in week 1 and 3.

+

It would be useful to determine how many people are in each cluster. We can do this easily with dplyr.

+
K7 <- count(K4, cluster)
+

Look at the number of people in each cluster, now repeat this process for 3 rather than 2 clusters. Which cluster grouping do you think is more informative? Write your answer below:

+
fit1a <- kmeans(K3, 3)
+fit1b <- kmeans(K3, 3)
+fit1c <- kmeans(K3, 3) 
+
+fit1b$cluster
+
##  [1] 1 3 2 2 3 3 1 3 1 1 2 1 1 1 2 1 2 2 2 1 3 1 1 2 2 2 2 2 2 2 2 2 2 2 1 2 3 3
+
K4 <- data.frame(K3, fit1a$cluster, fit1b$cluster, fit1c$cluster)
+
+fit1a$withinss
+
## [1] 46.559075 22.806584  9.038652
+
fit1b$withinss
+
## [1] 26.40481 34.65143 10.48789
+
fit1c$withinss
+
## [1] 10.48789 26.40481 34.65143
+
fit1a$tot.withinss
+
## [1] 78.40431
+
fit1b$tot.withinss
+
## [1] 71.54413
+
fit1c$tot.withinss
+
## [1] 71.54413
+
fit1a$betweenss
+
## [1] 106.5957
+
fit1b$betweenss
+
## [1] 113.4559
+
fit1c$betweenss
+
## [1] 113.4559
+
K4 <- data.frame(K3, fit1a$cluster)
+
+names(K4) <- c("1", "2", "3", "4", "5", "cluster") 
+
+K5 <- gather(K4, "week", "motivation", 1:5)
+
+K6 <- K5 %>% 
+  group_by(week, cluster) %>% 
+  summarise(avg = mean(motivation))
+
+K6$week <- as.numeric(K6$week)
+
+K6$cluster <- as.factor(K6$cluster)
+ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation")
+

+
K7 <- count(K4, cluster)
+

##Part II

+

Using the data collected in the HUDK4050 entrance survey (HUDK4050-cluster.csv) use K-means to cluster the students first according location (lat/long) and then according to their answers to the questions, each student should belong to two clusters.

+
p2 <- read.csv("HUDK405020-cluster.csv")
+L1 <- p2[,c(2,3)]
+set.seed(121)
+fit1a <- kmeans(L1, 2)
+fit1b <- kmeans(L1, 2)
+
+fit1a$cluster
+
##  [1] 2 2 2 2 2 1 1 1 2 1 2 2 1 1 1 2 1 1 1 1 2 2 1 2 1 2 2 1 2 2 1 2 2 2 2 1 2 1
+## [39] 1 1 1 2 1 2 2 2 1 1 1 1 2 1 2 1 1 2 1 1 2 1 2 2 2 2 1 2 2 2 1 2 2 1 2 2 2 1
+## [77] 1 2 2 2 2 2 2 2
+
L2 <- data.frame(L1, fit1a$cluster)
+
+
+names(L2) <- c("Lat", "Long", "clusterLoc") 
+
+ggplot(L2, aes(Lat,Long, fill = factor(clusterLoc))) + geom_boxplot() + xlab("Lat") + ylab("Lon")+theme_bw()
+

+

According to answers to questions

+
k3 <- p2[,c(4:9)]
+set.seed(121)
+fit1a <- kmeans(k3, 2)
+fit1b <- kmeans(k3, 2)
+
+fit1a$cluster
+
##  [1] 1 1 1 1 1 2 2 1 1 1 1 1 1 2 1 1 1 1 1 2 1 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1
+## [39] 2 1 2 1 1 1 2 1 2 2 2 1 1 1 1 2 1 1 2 1 1 2 1 2 1 2 1 1 1 2 1 1 2 1 2 2 1 2
+## [77] 1 2 2 2 1 2 1 2
+
K4 <- data.frame(k3, fit1a$cluster)
+
+names(K4)[[7]] <- "cluster"
+
+K5 <- gather(K4, "Questions","Answers", 1:6)
+
+K6 <- K5 %>% group_by(Questions, cluster) %>% summarise(avg = mean(Answers))
+
## `summarise()` regrouping output by 'Questions' (override with `.groups` argument)
+
K6$cluster <- as.factor(K6$cluster)
+ggplot(K6, aes(Questions, avg, colour = cluster)) + geom_point() + xlab("Questions") + ylab("Average score")
+

+
K7 <- count(K4, cluster)
+

##Part III

+

Create a visualization that shows the overlap between the two clusters each student belongs to in Part II. IE - Are there geographical patterns that correspond to the answers?

+
D1 <- cbind(K4,L2)
+pairs(D1)
+

+
chisq.test(D1$cluster,D1$clusterLoc)
+
## 
+##  Pearson's Chi-squared test with Yates' continuity correction
+## 
+## data:  D1$cluster and D1$clusterLoc
+## X-squared = 0.92273, df = 1, p-value = 0.3368
+
tableone::CreateCatTable("cluster","clusterLoc",data = D1)
+
##                  Stratified by clusterLoc
+##                   1          2          p      test
+##   n               36         48                    
+##   cluster = 2 (%) 15 (41.7)  14 (29.2)   0.337
+
library(vcd)
+
## Warning: package 'vcd' was built under R version 3.6.3
+
## Loading required package: grid
+
P1 <- structable(D1$cluster ~ D1$clusterLoc)
+mosaic(P1, shade=TRUE, legend=TRUE) 
+

+
+
+

Please render your code as an .html file using knitr and Pull Resquest both your .Rmd file and .html files to the Assignment 3 repository.

+
+ + + + +
+ + + + + + + + + + + + + + + From a3cd3448190fa71ca94cf4c2d5542eb74524cfbd Mon Sep 17 00:00:00 2001 From: Xie <394637464@qq.com> Date: Thu, 10 Dec 2020 20:29:18 +0800 Subject: [PATCH 3/5] Xing Yixie Assignment 4 files --- Assignment 4.Rmd | 18 +++++++++++--- Assignment-4.html | 62 +++++++++++++++++++++++++++++------------------ 2 files changed, 52 insertions(+), 28 deletions(-) diff --git a/Assignment 4.Rmd b/Assignment 4.Rmd index f2556f3..c65eb1a 100644 --- a/Assignment 4.Rmd +++ b/Assignment 4.Rmd @@ -1,5 +1,6 @@ --- title: "Assignment 4: K Means Clustering" +Author: Xing Yixie --- In this assignment we will be applying the K-means clustering algorithm we looked at in class. At the following link you can find a description of K-means: @@ -159,7 +160,7 @@ ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab What patterns do you see in the plot? -## Answer: People of Cluter 1 and Cluster 2 have different pattern through the week. Cluster2 people has stronger average motivation than Cluster1 people and the motivation of cluster 2 achieves highest in Week 1 and week 3. However, compared to cluster 2, cluster 1 people has lowest average motivation value in week 1 and 3. +### Answer: People of Cluter 1 and Cluster 2 have different pattern through the week. Cluster2 people has stronger average motivation than Cluster1 people and the motivation of cluster 2 achieves highest in Week 1 and week 3. However, compared to cluster 2, cluster 1 people has lowest average motivation value in week 1 and 3. It would be useful to determine how many people are in each cluster. We can do this easily with dplyr. 
@@ -172,6 +173,12 @@ Look at the number of people in each cluster, now repeat this process for 3 rath ```{r Part 1 cluster 3, message=FALSE, warning=FALSE} +K3 <- na.omit(K2) +K3 <- K2 + +K3[is.na(K3)] <- 0 + + fit1a <- kmeans(K3, 3) fit1b <- kmeans(K3, 3) fit1c <- kmeans(K3, 3) @@ -209,10 +216,13 @@ ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab K7 <- count(K4, cluster) ``` -### In my opinion, clustering data into 2 clusters is more informative compared to 3 clusters. Because we can see the different pattern according to this graph obviously. -##Part II +### Answer: In my opinion, clustering data into 2 clusters is more informative compared to 3 clusters. Because we can see the different pattern according to this graph obviously. + + +## Part II Using the data collected in the HUDK4050 entrance survey (HUDK4050-cluster.csv) use K-means to cluster the students first according location (lat/long) and then according to their answers to the questions, each student should belong to two clusters. + ```{r} p2 <- read.csv("HUDK405020-cluster.csv") L1 <- p2[,c(2,3)] @@ -269,9 +279,9 @@ tableone::CreateCatTable("cluster","clusterLoc",data = D1) library(vcd) P1 <- structable(D1$cluster ~ D1$clusterLoc) mosaic(P1, shade=TRUE, legend=TRUE) - ``` +### Answer: We didn't observe any geographical patterns that correspond to the answers. For Chi-square test (independence test), there is no significant relationships between geographical pattern and students' answers. Moreover, we can testify this conclusion from picture as belows. ## Please render your code as an .html file using knitr and Pull Resquest both your .Rmd file and .html files to the Assignment 3 repository. diff --git a/Assignment-4.html b/Assignment-4.html index b72942b..6afd9be 100644 --- a/Assignment-4.html +++ b/Assignment-4.html @@ -409,27 +409,27 @@

Assignment 4: K Means Clustering

fit1c <- kmeans(K3, 2) fit1b$cluster -
##  [1] 1 2 1 1 2 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2
+
##  [1] 2 1 2 2 1 1 2 1 2 2 2 1 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1
K4 <- data.frame(K3, fit1a$cluster, fit1b$cluster, fit1c$cluster)
 
 fit1a$withinss
-
## [1]  22.77197 125.75675
+
## [1] 31.97163 76.21636
fit1b$withinss
-
## [1] 76.21636 31.97163
-
fit1c$withinss
## [1] 31.97163 76.21636
+
fit1c$withinss
+
## [1] 125.75675  22.77197
fit1a$tot.withinss
-
## [1] 148.5287
+
## [1] 108.188
fit1b$tot.withinss
## [1] 108.188
fit1c$tot.withinss
-
## [1] 108.188
+
## [1] 148.5287
fit1a$betweenss
-
## [1] 36.47128
+
## [1] 76.81201
fit1b$betweenss
## [1] 76.81201
fit1c$betweenss
-
## [1] 76.81201
+
## [1] 36.47128
K4 <- data.frame(K3, fit1a$cluster)
 
 
@@ -458,39 +458,45 @@ 

Assignment 4: K Means Clustering

  • Finally we are going to clean up our axes labels: xlab(“Week”) & ylab(“Average Motivation”)
  • ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation")
    -

    +

    What patterns do you see in the plot?

    -
    -

    Answer: People of Cluter 1 and Cluster 2 have different pattern through the week. Cluster2 people has stronger average motivation than Cluster1 people and the motivation of cluster 2 achieves highest in Week 1 and week 3. However, compared to cluster 2, cluster 1 people has lowest average motivation value in week 1 and 3.

    +
    +

    Answer: People of Cluter 1 and Cluster 2 have different pattern through the week. Cluster2 people has stronger average motivation than Cluster1 people and the motivation of cluster 2 achieves highest in Week 1 and week 3. However, compared to cluster 2, cluster 1 people has lowest average motivation value in week 1 and 3.

    It would be useful to determine how many people are in each cluster. We can do this easily with dplyr.

    K7 <- count(K4, cluster)

    Look at the number of people in each cluster, now repeat this process for 3 rather than 2 clusters. Which cluster grouping do you think is more informative? Write your answer below:

    -
    fit1a <- kmeans(K3, 3)
    +
    K3 <- na.omit(K2) 
    +K3 <- K2
    +
    +K3[is.na(K3)] <- 0
    +
    +
    +fit1a <- kmeans(K3, 3)
     fit1b <- kmeans(K3, 3)
     fit1c <- kmeans(K3, 3) 
     
     fit1b$cluster
    -
    ##  [1] 1 3 2 2 3 3 1 3 1 1 2 1 1 1 2 1 2 2 2 1 3 1 1 2 2 2 2 2 2 2 2 2 2 2 1 2 3 3
    +
    ##  [1] 1 2 3 1 2 2 1 2 1 1 2 2 1 1 3 1 3 2 2 1 2 3 1 2 2 3 3 3 3 3 3 3 3 2 1 2 2 2
    K4 <- data.frame(K3, fit1a$cluster, fit1b$cluster, fit1c$cluster)
     
     fit1a$withinss
    -
    ## [1] 46.559075 22.806584  9.038652
    +
    ## [1] 20.18182 42.00000 12.16667
    fit1b$withinss
    -
    ## [1] 26.40481 34.65143 10.48789
    +
    ## [1] 20.18182 42.00000 12.16667
    fit1c$withinss
    -
    ## [1] 10.48789 26.40481 34.65143
    +
    ## [1] 46.526316  9.636364 17.250000
    fit1a$tot.withinss
    -
    ## [1] 78.40431
    +
    ## [1] 74.34848
    fit1b$tot.withinss
    -
    ## [1] 71.54413
    +
    ## [1] 74.34848
    fit1c$tot.withinss
    -
    ## [1] 71.54413
    +
    ## [1] 73.41268
    fit1a$betweenss
    -
    ## [1] 106.5957
    +
    ## [1] 111.7041
    fit1b$betweenss
    -
    ## [1] 113.4559
    +
    ## [1] 111.7041
    fit1c$betweenss
    -
    ## [1] 113.4559
    +
    ## [1] 112.64
    K4 <- data.frame(K3, fit1a$cluster)
     
     names(K4) <- c("1", "2", "3", "4", "5", "cluster") 
    @@ -505,9 +511,14 @@ 

    Answer: People of Cluter 1 and Cluster 2 have different pattern through the K6$cluster <- as.factor(K6$cluster) ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation")

    -

    +

    K7 <- count(K4, cluster)
    -

    ##Part II

    +
    +
    +

    Answer: In my opinion, clustering data into 2 clusters is more informative compared to 3 clusters. Because we can see the different pattern according to this graph obviously.

    +
    +
    +

    Part II

    Using the data collected in the HUDK4050 entrance survey (HUDK4050-cluster.csv) use K-means to cluster the students first according location (lat/long) and then according to their answers to the questions, each student should belong to two clusters.

    p2 <- read.csv("HUDK405020-cluster.csv")
     L1 <- p2[,c(2,3)]
    @@ -570,6 +581,9 @@ 

    Answer: People of Cluter 1 and Cluster 2 have different pattern through the
    P1 <- structable(D1$cluster ~ D1$clusterLoc)
     mosaic(P1, shade=TRUE, legend=TRUE) 

    +
    +

    Answer: We didn’t observe any geographical patterns that correspond to the answers. For Chi-square test (independence test), there is no significant relationships between geographical pattern and students’ answers. Moreover, we can testify this conclusion from picture as belows.

    +

    Please render your code as an .html file using knitr and Pull Resquest both your .Rmd file and .html files to the Assignment 3 repository.

    From 479b24502a3082f2e1280928a6de5e5386ee3936 Mon Sep 17 00:00:00 2001 From: Xie <394637464@qq.com> Date: Thu, 10 Dec 2020 20:29:18 +0800 Subject: [PATCH 4/5] Xingyi Xie Assignment 4 files --- Assignment 4.Rmd | 18 +++++++++++---- Assignment-4.html | 58 +++++++++++++++++++++++++++++------------------ 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/Assignment 4.Rmd b/Assignment 4.Rmd index f2556f3..1c3317a 100644 --- a/Assignment 4.Rmd +++ b/Assignment 4.Rmd @@ -1,5 +1,6 @@ --- title: "Assignment 4: K Means Clustering" +Author: Xingyi Xie --- In this assignment we will be applying the K-means clustering algorithm we looked at in class. At the following link you can find a description of K-means: @@ -159,7 +160,7 @@ ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab What patterns do you see in the plot? -## Answer: People of Cluter 1 and Cluster 2 have different pattern through the week. Cluster2 people has stronger average motivation than Cluster1 people and the motivation of cluster 2 achieves highest in Week 1 and week 3. However, compared to cluster 2, cluster 1 people has lowest average motivation value in week 1 and 3. +### Answer: People of Cluter 1 and Cluster 2 have different pattern through the week. Cluster2 people has stronger average motivation than Cluster1 people and the motivation of cluster 2 achieves highest in Week 1 and week 3. However, compared to cluster 2, cluster 1 people has lowest average motivation value in week 1 and 3. It would be useful to determine how many people are in each cluster. We can do this easily with dplyr. 
@@ -172,6 +173,12 @@ Look at the number of people in each cluster, now repeat this process for 3 rath ```{r Part 1 cluster 3, message=FALSE, warning=FALSE} +K3 <- na.omit(K2) +K3 <- K2 + +K3[is.na(K3)] <- 0 + + fit1a <- kmeans(K3, 3) fit1b <- kmeans(K3, 3) fit1c <- kmeans(K3, 3) @@ -209,10 +216,13 @@ ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab K7 <- count(K4, cluster) ``` -### In my opinion, clustering data into 2 clusters is more informative compared to 3 clusters. Because we can see the different pattern according to this graph obviously. -##Part II +### Answer: In my opinion, clustering data into 2 clusters is more informative compared to 3 clusters. Because we can see the different pattern according to this graph obviously. + + +## Part II Using the data collected in the HUDK4050 entrance survey (HUDK4050-cluster.csv) use K-means to cluster the students first according location (lat/long) and then according to their answers to the questions, each student should belong to two clusters. + ```{r} p2 <- read.csv("HUDK405020-cluster.csv") L1 <- p2[,c(2,3)] @@ -269,9 +279,9 @@ tableone::CreateCatTable("cluster","clusterLoc",data = D1) library(vcd) P1 <- structable(D1$cluster ~ D1$clusterLoc) mosaic(P1, shade=TRUE, legend=TRUE) - ``` +### Answer: We didn't observe any geographical patterns that correspond to the answers. For Chi-square test (independence test), there is no significant relationships between geographical pattern and students' answers. Moreover, we can testify this conclusion from picture as belows. ## Please render your code as an .html file using knitr and Pull Resquest both your .Rmd file and .html files to the Assignment 3 repository. diff --git a/Assignment-4.html b/Assignment-4.html index b72942b..1280880 100644 --- a/Assignment-4.html +++ b/Assignment-4.html @@ -409,27 +409,27 @@

    Assignment 4: K Means Clustering

    fit1c <- kmeans(K3, 2) fit1b$cluster
    -
    ##  [1] 1 2 1 1 2 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2
    +
    ##  [1] 1 1 2 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1
    K4 <- data.frame(K3, fit1a$cluster, fit1b$cluster, fit1c$cluster)
     
     fit1a$withinss
    ## [1]  22.77197 125.75675
    fit1b$withinss
    -
    ## [1] 76.21636 31.97163
    +
    ## [1] 71.23506 49.52381
    fit1c$withinss
    -
    ## [1] 31.97163 76.21636
    +
    ## [1] 71.23506 49.52381
    fit1a$tot.withinss
    ## [1] 148.5287
    fit1b$tot.withinss
    -
    ## [1] 108.188
    +
    ## [1] 120.7589
    fit1c$tot.withinss
    -
    ## [1] 108.188
    +
    ## [1] 120.7589
    fit1a$betweenss
    ## [1] 36.47128
    fit1b$betweenss
    -
    ## [1] 76.81201
    +
    ## [1] 64.24113
    fit1c$betweenss
    -
    ## [1] 76.81201
    +
    ## [1] 64.24113
    K4 <- data.frame(K3, fit1a$cluster)
     
     
    @@ -460,37 +460,43 @@ 

    Assignment 4: K Means Clustering

    ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation")

    What patterns do you see in the plot?

    -
    -

    Answer: People of Cluter 1 and Cluster 2 have different pattern through the week. Cluster2 people has stronger average motivation than Cluster1 people and the motivation of cluster 2 achieves highest in Week 1 and week 3. However, compared to cluster 2, cluster 1 people has lowest average motivation value in week 1 and 3.

    +
    +

    Answer: People in Cluster 1 and Cluster 2 show different patterns across the weeks. Cluster 2 has higher average motivation than Cluster 1, and its motivation peaks in Week 1 and Week 3. In contrast, Cluster 1 has its lowest average motivation in Week 1 and Week 3.

    It would be useful to determine how many people are in each cluster. We can do this easily with dplyr.

    K7 <- count(K4, cluster)

    Look at the number of people in each cluster, now repeat this process for 3 rather than 2 clusters. Which cluster grouping do you think is more informative? Write your answer below:

    -
    fit1a <- kmeans(K3, 3)
    +
    K3 <- na.omit(K2) 
    +K3 <- K2
    +
    +K3[is.na(K3)] <- 0
    +
    +
    +fit1a <- kmeans(K3, 3)
     fit1b <- kmeans(K3, 3)
     fit1c <- kmeans(K3, 3) 
     
     fit1b$cluster
    -
    ##  [1] 1 3 2 2 3 3 1 3 1 1 2 1 1 1 2 1 2 2 2 1 3 1 1 2 2 2 2 2 2 2 2 2 2 2 1 2 3 3
    +
    ##  [1] 1 2 3 1 2 2 1 2 1 1 2 2 1 1 3 1 3 2 2 1 2 3 1 2 2 3 3 3 3 3 3 3 3 2 1 2 2 2
    K4 <- data.frame(K3, fit1a$cluster, fit1b$cluster, fit1c$cluster)
     
     fit1a$withinss
    -
    ## [1] 46.559075 22.806584  9.038652
    +
    ## [1] 20.18182 42.00000 12.16667
    fit1b$withinss
    -
    ## [1] 26.40481 34.65143 10.48789
    +
    ## [1] 20.18182 42.00000 12.16667
    fit1c$withinss
    -
    ## [1] 10.48789 26.40481 34.65143
    +
    ## [1] 12.16667 42.00000 20.18182
    fit1a$tot.withinss
    -
    ## [1] 78.40431
    +
    ## [1] 74.34848
    fit1b$tot.withinss
    -
    ## [1] 71.54413
    +
    ## [1] 74.34848
    fit1c$tot.withinss
    -
    ## [1] 71.54413
    +
    ## [1] 74.34848
    fit1a$betweenss
    -
    ## [1] 106.5957
    +
    ## [1] 111.7041
    fit1b$betweenss
    -
    ## [1] 113.4559
    +
    ## [1] 111.7041
    fit1c$betweenss
    -
    ## [1] 113.4559
    +
    ## [1] 111.7041
    K4 <- data.frame(K3, fit1a$cluster)
     
     names(K4) <- c("1", "2", "3", "4", "5", "cluster") 
    @@ -505,9 +511,14 @@ 

    Answer: People of Cluter 1 and Cluster 2 have different pattern through the K6$cluster <- as.factor(K6$cluster) ggplot(K6, aes(week, avg, colour = cluster)) + geom_line() + xlab("Week") + ylab("Average Motivation")

    -

    +

    K7 <- count(K4, cluster)
    -

    ##Part II

    +
    +
    +

    Answer: In my opinion, clustering the data into 2 clusters is more informative than clustering it into 3 clusters, because the graph clearly shows a distinct pattern for each cluster.

    +
    +
    +

    Part II

    Using the data collected in the HUDK4050 entrance survey (HUDK4050-cluster.csv), use K-means to cluster the students first according to location (lat/long) and then according to their answers to the questions. Each student should belong to two clusters.

    p2 <- read.csv("HUDK405020-cluster.csv")
     L1 <- p2[,c(2,3)]
    @@ -570,6 +581,9 @@ 

    Answer: People of Cluter 1 and Cluster 2 have different pattern through the
    P1 <- structable(D1$cluster ~ D1$clusterLoc)
     mosaic(P1, shade=TRUE, legend=TRUE) 

    +
    +

    Answer: We didn’t observe any geographical patterns that correspond to the answers. According to the Chi-square test of independence, there is no significant relationship between geographical cluster and students’ answers. Moreover, we can confirm this conclusion from the picture below.

    +

    Please render your code as an .html file using knitr and submit a Pull Request with both your .Rmd and .html files to the Assignment 3 repository.

    From b96d9e2f5ebb041da2dd5310931c4897b49f1d85 Mon Sep 17 00:00:00 2001 From: Xingyi Xie <70902969+Xingyixie@users.noreply.github.com> Date: Wed, 16 Jun 2021 00:25:54 -0700 Subject: [PATCH 5/5] Delete README.md --- README.md | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index b6a298d..0000000 --- a/README.md +++ /dev/null @@ -1,14 +0,0 @@ -# Assignment 4 -### Cluster Analysis - -In the attached files you will find instructions for assignment 3. Please **fork** this repository to your own Github account and then clone it in RStudio. - -In Assignment 4 we will be looking at some class motivation data collected from this class two years ago. You will be expected to cluster and visualize the clusters. - -The instructions to Assignment 4 are in the Assignment 4.rmd file. Assignments are structured in three parts, in the first part you can just follow along with the code, in the second part you will need to apply the code and in the third part is completely freestyle, apply your new knowledge in a new way. - -**Please complete as much as you can by 5:00pm, 11/05/20** - -Once you have finished, commit, push and pull your assignment back to the main branch. - -Good luck! \ No newline at end of file