# RStudio
#
# Gvs
# Quiz5.R

# K-means clustering of the grades data set ----
#
# Workflow: load and scale the data, fit k-means for several values of k,
# use an elbow plot to help choose k, visualise the clusters, and verify the
# sum-of-squares decomposition reported by kmeans().

# Packages ----

# Install only the packages that are missing instead of reinstalling
# everything on every run.
required_pkgs <- c("tidyverse", "cluster", "factoextra", "fpc")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

library(tidyverse)  # data manipulation
library(cluster)    # clustering algorithms, clusplot()
library(factoextra) # clustering algorithms & visualization
library(fpc)        # plotcluster()

# Input ----

# The original course directory is used when it exists; otherwise the file is
# looked up in the current working directory. The working directory itself is
# never changed, so running the script has no lasting effect on the session.
course_dir <- file.path(
  "C:/Users/ialsmadi/Desktop", "University_of_Cumberlands",
  "Lectures", "Week3", "RScripts"
)
data_dir <- if (dir.exists(course_dir)) course_dir else getwd()
input_path <- file.path(data_dir, "grades_km_input.csv")
if (!file.exists(input_path)) {
  stop("Input file not found: ", input_path, call. = FALSE)
}
message("Reading ", input_path)

# Import test data
data <- read.csv(input_path)
print(summary(data))

# Drop incomplete rows. scale() and kmeans() need numeric input, so any
# non-numeric columns are dropped as well.
data1 <- na.omit(data)
data1 <- data1[vapply(data1, is.numeric, logical(1))]
stopifnot(ncol(data1) > 0, nrow(data1) > 1)
print(summary(data1))

# As we don't want the clustering algorithm to depend on an arbitrary
# variable unit, we start by scaling the data using the R function scale().
data1 <- scale(data1)
print(head(data1))

distance <- get_dist(data1)
print(distance)

# K-means cluster analysis ----

# kmeans() picks random starting centres; fix the seed so that the cluster
# assignments and plots are reproducible from run to run.
set.seed(123)

# Fit k-means with `k` clusters on `x`, print the per-cluster means, and draw
# the cluster plot against the first two principal components.
# Returns the fitted kmeans object.
fit_and_plot <- function(x, k) {
  km_fit <- kmeans(x, centers = k)
  # get cluster means
  print(aggregate(x, by = list(cluster = km_fit$cluster), FUN = mean))
  clusplot(
    x, km_fit$cluster,
    color = TRUE, shade = TRUE, labels = 2, lines = 0
  )
  km_fit
}

# Simplest example, just the dataset and number of clusters
fit <- fit_and_plot(data1, 5) # 5 cluster solution
fit <- fit_and_plot(data1, 8) # 8 cluster solution

# K-means clustering with 5 clusters. The fit uses the scaled features only;
# clustering on a frame that already contains a previous cluster-assignment
# column would feed the labels back in as a feature.
fit <- kmeans(data1, 5)

# append cluster assignment (kept for inspection, not used as model input)
mydata <- data.frame(data1, fit$cluster)

# Determine number of clusters ----

# A plot of the within-groups sum of squares by number of clusters extracted
# can help determine the appropriate number of clusters. The analyst looks for
# a bend in the plot, similar to a scree test in factor analysis.
# We want the total within-cluster variation to be low.
# kmeans() cannot use more centres than there are distinct rows, so the
# largest k tried is capped accordingly.
max_k <- min(15L, nrow(unique(data1)))
wss <- vapply(
  seq_len(max_k),
  function(k) kmeans(data1, centers = k)$tot.withinss,
  numeric(1)
)
plot(
  seq_len(max_k), wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)

# Cluster visualisation ----

# Cluster plot against the first two principal components;
# vary parameters for the most readable graph.
clusplot(data1, fit$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0)

# Centroid plot against the first two discriminant functions
plotcluster(data1, fit$cluster)

# Heat map of the pairwise distance matrix
print(
  fviz_dist(
    distance,
    gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
  )
)

# Two clusters with 25 random starts ----

km <- kmeans(data1, centers = 2, nstart = 25)
str(km)
# The output of kmeans is a list with several bits of information.
# The most important being:
#   cluster:      A vector of integers (from 1:k) indicating the cluster to
#                 which each point is allocated.
#   centers:      A matrix of cluster centers.
#   totss:        The total sum of squares.
#   withinss:     Vector of within-cluster sum of squares, one component per
#                 cluster.
#   tot.withinss: Total within-cluster sum of squares, i.e. sum(withinss).
#   betweenss:    The between-cluster sum of squares,
#                 i.e. totss - tot.withinss.
#   size:         The number of points in each cluster.

# print the clusters
print(km)

# Plot clusters
print(fviz_cluster(km, data = data1))

# Sum-of-squares decomposition ----

cl <- kmeans(data1, 8)
print(cl)
# plot() on a matrix draws its first two columns; the centres are overlaid
# with one colour per cluster so they match the colours of the points.
plot(data1, col = cl$cluster)
points(cl$centers, col = seq_len(nrow(cl$centers)), pch = 8, cex = 2)

# Sum of squares of `x` after centring each column
ss <- function(x) sum(scale(x, scale = FALSE)^2)

## cluster centers "fitted" to each obs.:
fitted.data1 <- fitted(cl)
print(head(fitted.data1))
resid.data1 <- data1 - fitted(cl)

## Equalities : ----------------------------------
print(
  cbind(
    cl[c("betweenss", "tot.withinss", "totss")], # the same two columns
    c(ss(fitted.data1), ss(resid.data1), ss(data1))
  )
)
stopifnot(
  all.equal(cl$totss, ss(data1)),
  all.equal(cl$tot.withinss, ss(resid.data1)),
  ## these three are the same:
  all.equal(cl$betweenss, ss(fitted.data1)),
  all.equal(cl$betweenss, cl$totss - cl$tot.withinss),
  ## and hence also
  all.equal(ss(data1), ss(fitted.data1) + ss(resid.data1))
)

print(kmeans(data1, 1)$withinss) # trivial one-cluster, (its W.SS == ss(x))

## random starts do help here with too many clusters
## (and are often recommended anyway!):
cl <- kmeans(data1, 5, nstart = 25)
print(cl)
plot(data1, col = cl$cluster)
points(cl$centers, col = seq_len(nrow(cl$centers)), pch = 8)