# R Statistics
# Credit-approval decision tree lab.
# Fits a conditional inference tree (party::ctree) on CreditApproval.csv,
# evaluates it on a hold-out split, and predicts the class for the rows in
# newcredit.csv.

# Set the working directory to the lab data directory.
# Change this path to the folder that holds the CSV files on your machine.
setwd("C:/Users/Family/Google Drive/LAB4")

# Install party only when it is missing, then load it.
if (!requireNamespace("party", quietly = TRUE)) {
  install.packages("party")
}
library(party)

# Read the CSV files ----
# stringsAsFactors = TRUE is explicit because since R 4.0 read.csv() keeps
# text columns as character by default, while ctree() works on factor (not
# character) columns.
credit <- read.csv(
  file = "CreditApproval.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
newcredit <- read.csv(
  file = "newcredit.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
str(credit)
str(newcredit)

# Pre-processing ----
# Count missing values per column, replace the missing entries of A2 and A14
# with the column mean, then recount to confirm those columns are complete.
# NOTE(review): the means are computed on the full data set before the
# train/test split, so test rows influence the imputed values.
colSums(is.na(credit))
credit$A2[is.na(credit$A2)] <- mean(credit$A2, na.rm = TRUE)
credit$A14[is.na(credit$A14)] <- mean(credit$A14, na.rm = TRUE)
colSums(is.na(credit))

# Train/test split ----
# Each row is independently assigned to group 1 (probability 0.7) or group 2
# (probability 0.3), so the split is approximately, not exactly, 70/30.
# The seed makes the assignment reproducible.
set.seed(1234)
ind <- sample(2, nrow(credit), replace = TRUE, prob = c(0.7, 0.3))
train.data <- credit[ind == 1, ]
test.data <- credit[ind == 2, ]

# Build the ctree model ----
# Predict `class` from every other column of the training data.
myFormula <- class ~ .
credit_ctree <- ctree(myFormula, data = train.data)

# Output the model as text.
print(credit_ctree)

# Visualize the fitted tree.
plot(credit_ctree, main = "Credit Approval", type = "simple")

# Confusion matrix for the training data (counts, then proportions) ----
# Rows are predicted classes, columns are actual classes.
trainPred <- predict(credit_ctree)
table(trainPred, train.data$class)
prop.table(table(trainPred, train.data$class))

# Confusion matrix for the test data (counts, then proportions) ----
testPred <- predict(credit_ctree, newdata = test.data)
table(testPred, test.data$class)
prop.table(table(testPred, test.data$class))

# Predict the unknown dependent variable on newcredit ----
# Force the first column name to "A1".
# NOTE(review): presumably the first header is read under a different name
# (e.g. a byte-order mark prefix) -- confirm against newcredit.csv.
colnames(newcredit)[1] <- "A1"
newcredit$class <- factor(newcredit$class)
# Convert A14 to double. Going through as.character() yields the actual
# values rather than level codes if the column was read as a factor.
# NOTE(review): presumably needed so A14 matches the type of the
# mean-imputed training column -- confirm.
newcredit$A14 <- as.numeric(as.character(newcredit$A14))
colSums(is.na(newcredit))
newcreditPred <- predict(credit_ctree, newdata = newcredit)
table(newcreditPred, newcredit$class)
newcreditPred