ANLY600 data mining
Week 3 Assignment Instructions and Sample R codes
This assignment gives you hands-on experience using R to conduct logistic regression on a real-world data set. Please refer to Examples 10.3 and 10.5 in Chapter 10 of the reference textbook for details on how to build logistic regression models and then evaluate model performance. Then open this website (see Reference below), go over the Satisfaction example, and use the same R code to reproduce the results; study how the model is explained and how the results are evaluated.
Now open this file Satisfaction2.csv (slightly different from the sample dataset) and repeat the same analysis as in the website to conduct a logistic regression analysis.
Hints:
Use getwd(), setwd() and dir() to conveniently locate, save and retrieve the dataset in R.
Don’t forget to run install.packages("prediction") or install.packages("ROCR") if those packages are not installed.
Reference
https://www.kaggle.com/arpina/logistic-regression-analysis/notebook
# Load the passenger-satisfaction data from the working directory.
# stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 read.csv() keeps
# text columns as character, but caret::confusionMatrix() (used further down
# in this analysis) requires the outcome satisfaction_v2 to be a factor.
Satisfaction <- read.csv("Satisfaction2.csv", stringsAsFactors = TRUE)
summary(Satisfaction)
str(Satisfaction)

# Making appropriate corrections regarding variable classes.
# The columns below are recoded as ordered factors so that they are treated
# as ordinal categories (presumably they are read in as integer ratings --
# confirm against the str() output above).
rating_columns <- c(
  "Seat.comfort",
  "Departure.Arrival.time.convenient",
  "Food.and.drink",
  "Inflight.wifi.service",
  "Inflight.entertainment",
  "Leg.room.service",
  "Baggage.handling",
  "Checkin.service",
  "Cleanliness",
  "Online.boarding"
)
Satisfaction[rating_columns] <- lapply(
  Satisfaction[rating_columns],
  factor,
  ordered = TRUE
)
library(prediction) #install.packages("prediction") if not installed
library(ggplot2)
library(lattice)
library(caret)
library(gplots)
library(ROCR) # need to use install.packages("ROCR") if ROCR is not installed
# Plotting graphs ----
# Satisfaction versus travel class. prop.table(..., margin = 1) turns the
# counts for each satisfaction level into shares across classes, and
# as.data.frame() flattens the table into Var1 (satisfaction), Var2 (class)
# and Freq columns for ggplot.
# NOTE(review): position = "fill" rescales the bars within each class again,
# so the heights are not exactly the satisfied share per class unless the two
# satisfaction groups are the same size -- margin = 2 may be what was
# intended; confirm.
Probs_1 <- as.data.frame(
  prop.table(
    table(Satisfaction$satisfaction_v2, Satisfaction$Class),
    margin = 1
  )
)
ggplot(data = Probs_1, mapping = aes(x = Var2, y = Freq, fill = Var1)) +
  geom_bar(stat = "identity", position = "fill", color = "black") +
  theme_bw() +
  scale_fill_brewer(palette = "Dark2") +
  labs(
    x = "Class",
    y = "Satisfaction",
    fill = "Satisfaction",
    title = "Relationship between satisfaction and customer class"
  )
# Reading of the plot (author's observation): Business class has the largest
# share of satisfied passengers, followed by Eco Plus and then Eco.
# This makes sense, since a more expensive ticket should come with better
# service and therefore higher satisfaction.
# Satisfaction versus the Food.and.drink rating. Row-wise proportions
# (margin = 1) are flattened into Var1 (satisfaction), Var2 (rating) and Freq.
# NOTE(review): position = "fill" re-normalises within each rating, so
# margin = 2 may be what was intended for a per-rating satisfied share;
# confirm.
Probs_2 <- as.data.frame(
  prop.table(
    table(Satisfaction$satisfaction_v2, Satisfaction$Food.and.drink),
    margin = 1
  )
)
ggplot(data = Probs_2, mapping = aes(x = Var2, y = Freq, fill = Var1)) +
  geom_bar(stat = "identity", position = "fill", color = "black") +
  theme_bw() +
  scale_fill_brewer(palette = "Dark2") +
  labs(
    x = "Food and drinks",
    y = "Satisfaction",
    fill = "Satisfaction", # legend title previously misspelled "Satisfcation"
    title = "Relationship between satisfaction and Food and drink"
  )
# Reading of the plot (author's observation): the food-and-drink rating has
# little relationship with overall satisfaction, because the satisfied /
# not-satisfied split is almost equal at both rating 0 and rating 5.
# Satisfaction versus type of travel. Row-wise proportions (margin = 1) are
# flattened into Var1 (satisfaction), Var2 (travel type) and Freq.
# NOTE(review): position = "fill" re-normalises within each travel type, so
# margin = 2 may be what was intended for a per-type satisfied share; confirm.
Probs_3 <- as.data.frame(
  prop.table(
    table(Satisfaction$satisfaction_v2, Satisfaction$Type.of.Travel),
    margin = 1
  )
)
ggplot(data = Probs_3, mapping = aes(x = Var2, y = Freq, fill = Var1)) +
  geom_bar(stat = "identity", position = "fill", color = "black") +
  theme_bw() +
  scale_fill_brewer(palette = "Dark2") +
  labs(
    x = "Type of travel",
    y = "Satisfaction",
    fill = "Satisfaction", # legend title previously misspelled "Satisfcation"
    title = "Relationship between satisfaction and type of travel"
  )
# Reading of the plot (author's observation): business-travel passengers had,
# on average, higher satisfaction rates.
# Splitting the data into Test and Train ----
# The seed is fixed so the random partition is reproducible.
# createDataPartition() selects 80% of the rows (p = 0.8) based on the
# outcome column; list = FALSE returns the row indices as a matrix rather
# than a list. Rows not selected form the test set.
set.seed(1)
trainingIndex <- createDataPartition(
  y = Satisfaction$satisfaction_v2,
  p = 0.8,
  list = FALSE
)
Train <- Satisfaction[trainingIndex, ]
Test <- Satisfaction[-trainingIndex, ]
# Building the initial model using logistic regression ----
# Baseline fit: satisfaction_v2 regressed on every other column of Train
# (the `.` in the formula) with a binomial family.
model1 <- glm(
  formula = satisfaction_v2 ~ .,
  family = "binomial",
  data = Train
)
summary(model1)
# Removing the insignificant variables from the model ----
# The reduced model is fitted on Train only (it was previously fitted on the
# full Satisfaction data), so the held-out Test rows stay unseen and the
# evaluation on Test further down is a genuine out-of-sample check.
model2 <- glm(
  satisfaction_v2 ~ . - Seat.comfort - Gate.location - Online.boarding -
    Checkin.service - Cleanliness,
  data = Train,
  family = "binomial"
)
summary(model2)
# Seat comfort, gate location, online boarding, check-in service and
# cleanliness were shown to be insignificant in the summary of the first
# model, so they are left out of the second model to improve its reliability.
# NOTE(review): the original write-up also named online support, on-board
# service and id as insignificant, but the formula does not remove them --
# confirm which list is intended.
# Coefficients on the log-odds scale.
coefficients(model2)
# Interpretation (figures are those reported by the author; re-read them from
# your own output, since they change with the fitted data):
# - Age: one additional year decreases the log odds of satisfaction by
#   5.280300e-03.
# - Arrival delay: one extra minute decreases the log odds of satisfaction by
#   9.209865e-03.
# - Departure delay: one extra minute was reported as decreasing the log odds
#   of satisfaction by 3.696267e-03.
#   NOTE(review): the odds-scale reading below reports an *increase* of 0.4%,
#   which corresponds to a positive coefficient -- confirm the sign.
# - Gender: the log odds of satisfaction for a male passenger are lower than
#   for a female passenger (originally written as "0.9% less").
#   NOTE(review): a log-odds difference is not a percentage; an odds ratio of
#   0.39 (see below) corresponds to a coefficient of about -0.94 -- confirm.
# - Type of travel: the log odds of satisfaction for personal travel are
#   8.617104e-01 lower than for business travel.
# - Customer type: the log odds of satisfaction for a loyal passenger are
#   1.994355 higher than for a disloyal passenger.

# The same coefficients exponentiated, i.e. expressed as odds ratios.
exp(coefficients(model2))
# Interpretation (author's reported figures):
# - Age: a one-unit increase is associated with about a 1% decrease in the
#   odds of satisfaction.
# - Arrival delay: one extra minute decreases the odds of satisfaction by
#   about 1%.
# - Departure delay: one extra minute increases the odds of satisfaction by
#   0.4% (this does not make intuitive sense).
# - Gender: the odds of satisfaction for males are 39% of the odds for
#   females.
# - Customer type: the odds of satisfaction for loyal passengers were
#   reported as 14 times those of disloyal passengers.
#   NOTE(review): exp(1.994355) is about 7.3, not 14 -- one of the two
#   figures is inconsistent; confirm against the output.
# - Type of travel: the odds of satisfaction for personal travel are about
#   2.36 times lower than for business travel.
# Building a confusion matrix based on predictions of the second model ----
# Both outcome labels are pinned explicitly so the predicted and reference
# factors always share the same levels, even if every prediction falls in one
# class or the outcome column was read in as character.
outcome_levels <- c("neutral or dissatisfied", "satisfied")

# Predicted probabilities on the held-out rows. The result is stored under
# its own name instead of `predict`, so the predict() function is not
# shadowed.
predicted_prob <- predict(model2, newdata = Test, type = "response")

# Classify with a 0.5 probability cut-off.
predict_class <- factor(
  ifelse(predicted_prob > 0.5, "satisfied", "neutral or dissatisfied"),
  levels = outcome_levels
)
actual_class <- factor(Test$satisfaction_v2, levels = outcome_levels)
confusionMatrix(predict_class, actual_class, positive = "satisfied")
# Interpretation (figures as reported by the author; re-read them from your
# own output):
# - The model is fairly reliable at making predictions: accuracy is high and
#   the reported p-value is very small.
# - Sensitivity: when the customer was satisfied, the model predicted it
#   correctly in 88% of cases.
# - Specificity: when the customer was not satisfied, the model predicted it
#   correctly in 85% of cases.
# - Positive predictive value: of all "satisfied" predictions, 88% were
#   correct.
# - Negative predictive value: of all "not satisfied" predictions, 85% were
#   correct.
# - Taken together, the ratios show the model predicts well and is useful.
# NOTE(review): these are out-of-sample estimates only if model2 was fitted
# without the Test rows.

# ROC curve and AUC. ROCR:: is spelled out because the `prediction` package
# loaded by this script also exports a function called prediction().
P_Test <- ROCR::prediction(predicted_prob, actual_class)
perf <- ROCR::performance(P_Test, "tpr", "fpr")
plot(perf)
auc_value <- ROCR::performance(P_Test, measure = "auc")@y.values[[1]]
auc_value
# The ROC curve shows the trade-off between sensitivity and specificity; the
# author reported an AUC of 94%, again a good indicator of a reliable model.
# To sum up, the logistic regression model is well suited to the dataset: the
# model built had above-average accuracy, a large AUC, and fairly good
# sensitivity, specificity, NPV and PPV.
Week 3 Assignment Instructions and Sample R codes
This assignment gives you hands-on experience using R to conduct logistic regression on a real-world
data set. Please refer to Examples 10.3 and 10.5 in Chapter 10 of the reference textbook for
details on how to build logistic regression models and then evaluate model performance.
Then open this website (see Reference below), go over the Satisfaction example, and use the same R
code to reproduce the results; study how the model is explained and how the results are evaluated.
Now open this file Satisfaction2.csv (slightly different from the sample dataset) and repeat the same
analysis as in the website to conduct a logistic regression analysis.
Hints:
Use getwd(), setwd() and dir() to conveniently locate, save and retrieve the dataset in R.
Don’t forget to run install.packages("prediction") or install.packages("ROCR") if those packages are not installed.
Reference
https://www.kaggle.com/arpina/logistic-regression-analysis/notebook
# Load the passenger-satisfaction data from the working directory.
# (Reconstructed from a copy whose line breaks had split `<-` and several
# identifiers.) stringsAsFactors = TRUE is passed explicitly: since R 4.0.0
# read.csv() keeps text columns as character, but caret::confusionMatrix()
# requires the outcome satisfaction_v2 to be a factor.
Satisfaction <- read.csv("Satisfaction2.csv", stringsAsFactors = TRUE)
summary(Satisfaction)
str(Satisfaction)

# Making appropriate corrections regarding variable classes.
# The columns below are recoded as ordered factors so that they are treated
# as ordinal categories (presumably they are read in as integer ratings --
# confirm against the str() output above).
rating_columns <- c(
  "Seat.comfort",
  "Departure.Arrival.time.convenient",
  "Food.and.drink",
  "Inflight.wifi.service",
  "Inflight.entertainment",
  "Leg.room.service",
  "Baggage.handling",
  "Checkin.service",
  "Cleanliness",
  "Online.boarding"
)
Satisfaction[rating_columns] <- lapply(
  Satisfaction[rating_columns],
  factor,
  ordered = TRUE
)
library(prediction) #install.packages("prediction") if not installed
library(ggplot2)
library(lattice)
library(caret)
library(gplots)
library(ROCR) # need to use install.packages("ROCR") if ROCR is not installed
# Plotting graphs ----
# (Reconstructed from a copy whose line breaks had split tokens, string
# literals and comments.)
# Satisfaction versus travel class. prop.table(..., margin = 1) turns the
# counts for each satisfaction level into shares across classes, and
# as.data.frame() flattens the table into Var1 (satisfaction), Var2 (class)
# and Freq columns for ggplot.
# NOTE(review): position = "fill" rescales the bars within each class again,
# so margin = 2 may be what was intended for a per-class satisfied share;
# confirm.
Probs_1 <- as.data.frame(
  prop.table(
    table(Satisfaction$satisfaction_v2, Satisfaction$Class),
    margin = 1
  )
)
ggplot(data = Probs_1, mapping = aes(x = Var2, y = Freq, fill = Var1)) +
  geom_bar(stat = "identity", position = "fill", color = "black") +
  theme_bw() +
  scale_fill_brewer(palette = "Dark2") +
  labs(
    x = "Class",
    y = "Satisfaction",
    fill = "Satisfaction",
    title = "Relationship between satisfaction and customer class"
  )
# Reading of the plot (author's observation): Business class has the largest
# share of satisfied passengers, followed by Eco Plus and then Eco.
# This makes sense, since a more expensive ticket should come with better
# service and therefore higher satisfaction.
# Share of each Food.and.drink rating within every satisfaction level
# (row-wise proportions, margin = 1), flattened to a data frame.
# The assignment operator is restored here: the line breaks had split `<-`
# into `<` and `-`, which R would read as a comparison, not an assignment.
Probs_2 <- as.data.frame(
  prop.table(
    table(Satisfaction$satisfaction_v2, Satisfaction$Food.and.drink),
    margin = 1
  )
)
Week 3 Assignment Instructions and Sample R codes
This assignment gives you hands-on experience using R to conduct logistic regression on a real-world
data set. Please refer to Examples 10.3 and 10.5 in Chapter 10 of the reference textbook for
details on how to build logistic regression models and then evaluate model performance.
Then open this website (see Reference below), go over the Satisfaction example, and use the same R
code to reproduce the results; study how the model is explained and how the results are evaluated.
Now open this file Satisfaction2.csv (slightly different from the sample dataset) and repeat the same
analysis as in the website to conduct a logistic regression analysis.
Hints:
Use getwd(), setwd() and dir() to conveniently locate, save and retrieve the dataset in R.
Don’t forget to run install.packages("prediction") or install.packages("ROCR") if those packages are not installed.
Reference
https://www.kaggle.com/arpina/logistic-regression-analysis/notebook
# Load the passenger-satisfaction data from the working directory.
# stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 read.csv() keeps
# text columns as character, but caret::confusionMatrix() requires the
# outcome satisfaction_v2 to be a factor.
Satisfaction <- read.csv("Satisfaction2.csv", stringsAsFactors = TRUE)
summary(Satisfaction)
str(Satisfaction)

# Making appropriate corrections regarding variable classes.
# The columns below are recoded as ordered factors so that they are treated
# as ordinal categories (presumably they are read in as integer ratings --
# confirm against the str() output above).
rating_columns <- c(
  "Seat.comfort",
  "Departure.Arrival.time.convenient",
  "Food.and.drink",
  "Inflight.wifi.service",
  "Inflight.entertainment",
  "Leg.room.service",
  "Baggage.handling",
  "Checkin.service",
  "Cleanliness",
  "Online.boarding"
)
Satisfaction[rating_columns] <- lapply(
  Satisfaction[rating_columns],
  factor,
  ordered = TRUE
)
library(prediction) #install.packages("prediction") if not installed
library(ggplot2)
library(lattice)
library(caret)
library(gplots)
library(ROCR) # need to use install.packages("ROCR") if ROCR is not installed
# Plotting graphs ----
# Satisfaction versus travel class. prop.table(..., margin = 1) turns the
# counts for each satisfaction level into shares across classes, and
# as.data.frame() flattens the table into Var1 (satisfaction), Var2 (class)
# and Freq columns for ggplot.
# NOTE(review): position = "fill" rescales the bars within each class again,
# so margin = 2 may be what was intended for a per-class satisfied share;
# confirm.
Probs_1 <- as.data.frame(
  prop.table(
    table(Satisfaction$satisfaction_v2, Satisfaction$Class),
    margin = 1
  )
)
ggplot(data = Probs_1, mapping = aes(x = Var2, y = Freq, fill = Var1)) +
  geom_bar(stat = "identity", position = "fill", color = "black") +
  theme_bw() +
  scale_fill_brewer(palette = "Dark2") +
  labs(
    x = "Class",
    y = "Satisfaction",
    fill = "Satisfaction",
    title = "Relationship between satisfaction and customer class"
  )
# Reading of the plot (author's observation): Business class has the largest
# share of satisfied passengers, followed by Eco Plus and then Eco.
# (This comment had wrapped onto a line without a leading `#`, leaving bare
# prose in the code.)
# This makes sense, since a more expensive ticket should come with better
# service and therefore higher satisfaction.
# Share of each Food.and.drink rating within every satisfaction level
# (row-wise proportions, margin = 1), flattened to a data frame with
# Var1 (satisfaction), Var2 (rating) and Freq columns.
Probs_2 <- as.data.frame(
  prop.table(
    table(Satisfaction$satisfaction_v2, Satisfaction$Food.and.drink),
    margin = 1
  )
)