# Data Analytics - R (programming)
# CO2 emissions modelling on the NRCan "MY2021 Fuel Consumption Ratings" CSV.
#
# Pipeline: download and clean the data, split it 600/335 into train/test,
# then compare test-set MSE of
#   1. a full linear regression,
#   2. the best-subset model chosen by adjusted R^2,
#   3. a regression tree (unpruned and pruned),
#   4. k-nearest-neighbour regression for a range of k.

library(leaps)
library(tree)
# Install FNN only when it is missing instead of reinstalling on every run.
if (!requireNamespace("FNN", quietly = TRUE)) {
  install.packages("FNN")
}
library(FNN)

# Load data ----
fileUrl <- "https://www.nrcan.gc.ca/sites/nrcan/files/oee/files/csv/MY2021%20Fuel%20Consumption%20Ratings.csv"
download.file(fileUrl, destfile = "project.csv")
CO2data <- read.csv(file = "project.csv")

# Keep rows 2-936 and columns 2-15.
# NOTE(review): row 1 and the rows after 936 are presumably non-data
# (sub-header / footer) lines in the CSV -- confirm against the file.
data <- CO2data[2:936, 2:15]
sum(is.na(data)) # check missing values

colnames(data) <- c(
  "Make", "Model", "Vehicle_Class", "Engine_Size", "Cylinders",
  "Transmission", "Fuel_Type", "Consumption_City", "Consumption_Hwy",
  "Consumption_Comb1", "Consumption_Comb2", "CO2_Emissions",
  "CO2_Rating", "Smog_Rating"
)
summary(data)
sapply(data, class)

# Clean data ----
# Going through as.character() first makes the conversion correct whether
# the columns were read as character or as factor.
numeric_cols <- c(
  "Engine_Size", "Cylinders", "Consumption_City", "Consumption_Hwy",
  "Consumption_Comb1", "Consumption_Comb2", "CO2_Emissions",
  "CO2_Rating", "Smog_Rating"
)
data[numeric_cols] <- lapply(
  data[numeric_cols],
  function(column) as.numeric(as.character(column))
)

# Recode the fuel-type letter to a number through a lookup table; any code
# not listed here becomes NA. A lookup (rather than assigning numbers into
# the column in place) also works when the column is a factor.
# NOTE(review): the numeric values look like fuel ratings -- confirm their
# meaning, and whether a factor would model this variable better.
fuel_codes <- c(Z = 91, X = 87, D = 47.5, E = 113)
data$Fuel_Type <- unname(fuel_codes[as.character(data$Fuel_Type)])

sapply(data, class)
summary(data)
dim(data)
names(data)

# Train/test split (600 train / remaining 335 test) ----
predictors <- c(
  "Engine_Size", "Cylinders", "Fuel_Type",
  "Consumption_City", "Consumption_Hwy", "Consumption_Comb1"
)
response <- "CO2_Emissions"
co2_formula <- CO2_Emissions ~ Engine_Size + Cylinders + Fuel_Type +
  Consumption_City + Consumption_Hwy + Consumption_Comb1

set.seed(1)
train <- sample(seq_len(nrow(data)), 600)

traindata <- data[train, c(predictors, response)]
testdata <- data[-train, c(predictors, response)]
train.X <- data[train, predictors]
train.Y <- data[train, response]
test.X <- data[-train, predictors]
test.Y <- data[-train, response]

# Linear regression ----
lm.full <- lm(co2_formula, data = data, subset = train)
summary(lm.full)
# Test MSE. The original stored the per-row squared errors in `mse.lm`;
# the mean is what the name promises.
mse.lm <- mean((predict(lm.full, data[-train, ]) - test.Y)^2)
mse.lm

# Best-subset selection ----
regfit.full <- regsubsets(co2_formula, data = data, subset = train)
reg.summary <- summary(regfit.full)
reg.summary
reg.summary$adjr2
best_size <- which.max(reg.summary$adjr2)
best_size
plot(
  seq_along(reg.summary$adjr2), reg.summary$adjr2,
  xlab = "Number of predictors", ylab = "Adjusted R^2"
)

coefi <- coef(regfit.full, best_size)
coefi

# regsubsets has no predict method, so multiply the test design matrix by
# the selected coefficients. Columns are picked by coefficient NAME, so the
# prediction stays correct whichever subset wins on adjusted R^2.
test_design <- cbind("(Intercept)" = 1, as.matrix(test.X))
pred <- drop(test_design[, names(coefi), drop = FALSE] %*% coefi)
mean((pred - test.Y)^2)

# Decision tree ----
tree.data <- tree(co2_formula, data = data, subset = train)
summary(tree.data)
plot(tree.data)
text(tree.data)

yhat <- predict(tree.data, newdata = test.X)
plot(yhat, test.Y)
tree_mse <- mean((yhat - test.Y)^2)
tree_mse
sqrt(tree_mse)

# Pruned tree ----
cv.data <- cv.tree(tree.data)
names(cv.data)
summary(cv.data)
plot(cv.data$size, cv.data$dev, type = "b")
cv.data$size
cv.data$dev

# NOTE(review): best = 8 is hard-coded, presumably read off the CV plot
# above -- re-check it if the data or the seed changes.
prune.data <- prune.tree(tree.data, best = 8)
plot(prune.data)
text(prune.data)

yhat <- predict(prune.data, newdata = test.X)
pruned_mse <- mean((yhat - test.Y)^2)
pruned_mse
sqrt(pruned_mse)

# KNN regression ----
# Odd k from 1 to 27, plus k = 101 as a deliberately large neighbourhood.
knn_ks <- c(seq(1, 27, by = 2), 101)
knn.pred <- lapply(
  knn_ks,
  function(k) knn.reg(train.X, test.X, train.Y, k = k)
)
names(knn.pred) <- paste0("knn.pred.k", knn_ks)
# Also expose each fit under its original variable name (knn.pred.k1,
# knn.pred.k3, ...) so code written against those names keeps working.
invisible(list2env(knn.pred, envir = environment()))

# Test MSE for every k, as one named vector.
knn.mse <- vapply(
  knn.pred,
  function(fit) mean((fit$pred - test.Y)^2),
  numeric(1)
)
knn.mse