R studio ( statistics )
#In class lab 03/12/2018 GIS 470 #For this class we will be reviewing descriptive statistics, using R instead of SPSS ############## Prep: Set Working Directory setwd("~/Desktop/R_GIS470") #setting your working directory (the folder you will pull data from and save results to) #install.packages('dplyr') #We will be using the package dplyr, which is a favorite of data managers and will be a good skill for job market library('dplyr') library('ggplot2') library('e1071') #for skewness # %>% -> just think of as a comma, it connects "strings" of code together, tells the computer there is more coming #basic format: data.frame %>% command #advanced format: data.frame %>% # command 1 %>% # command 2 %>% # command n ############## Prep: Loading in Data #save "coded data" tab to a csv in the folder you selected for you directory lab_data=read.csv("05022018_GIS470Survey.csv", header=T) #READ IN CSV: read.csv("file_name.csv",header=T) ############## Prep: Looking at your Data head(lab_data) #view the first few lines of your data summary(lab_data) #gives summary (min, quartiles, median, mean and NAs of your data) #uhm.... 969 NAS???, we will deal with this in a second names(lab_data) str(lab_data) ############## Prep: Cleaning and Subsetting Data # We will be using the same 3 variables as the SPSS in class assigment: Q19_FastFood, Q20_SpicyFood,Q24_Countrvisi Spicy_Subset = lab_data %>% select(Q19_FastFood, Q20_SpicyFood,Q24_Countrvisi) Spicy_Subset #looking at subset we just selected, see all the NA's at the bottom? Spicy_Subset = Spicy_Subset %>% na.omit # removing the NA's that don't belong Spicy_Subset = Spicy_Subset %>% replace(Spicy_Subset<0, NA) Spicy_Subset #looking at what we just did again ############## Generate Descriptive Statistics or Frequency Table and Graph for each Variable #Categorical Variable (FastFood Preference): Frequency Table #Option 1: Dplyr Q19_Frequency_NA = Spicy_Subset %>% #dataframe count(Q19_FastFood) %>% # count of each variable mutate(prop = prop.table(n)) # proportion of each variable out of 100 Q19_Frequency_NA #Option 1: Dplyr , Removing NA values first Q19_Frequency_1 = Spicy_Subset %>% #dataframe select(Q19_FastFood) %>% # Selecting Column filter(!is.na(Q19_FastFood)) %>% # dropping NA values count(Q19_FastFood) %>% # count of each value mutate(prop = prop.table(n)) %>% # proportion of each variable out of 100 mutate(prop = prop*100) %>% # calculate % mutate(prop = round(prop,2)) # rounding decimal places Q19_Frequency_1 #Option 2: "table" Q19_Frequency = table(Spicy_Subset$Q19_FastFood) # Selecting Column and count of each value Q19_Frequency = data.frame(Q19_Frequency) #getting into dataframe format Q19_Frequency$prop = prop.table(Q19_Frequency$Freq) # proportion of each variable out of 100 Q19_Frequency$prop = Q19_Frequency$prop*100 # calculate % Q19_Frequency$prop = round(Q19_Frequency$prop,2) # rounding decimal places Q19_Frequency #Categorical Variable (FastFood Preference): Bar Chart barplot(Q19_Frequency_1$n,names.arg=c("Taco Bell","Chipotle"), col="blue",ylim=c(0,21), ylab="Number of Students",xlab="Fast Food Preference") #basic plot ggplot(Q19_Frequency, aes(x=n,fill=Q19_FastFood)) + geom_bar() + labs(x="Q19 Fast Food Preference", y="Number of Students") ggplot(Q19_Frequency_1, aes(x=c("Taco Bell","Chipotle"), y=n)) + geom_bar(stat="identity",fill="blue") + ylim(0, 20) + xlab("Q19 Fast Food Preference") + ylab("Number of Students") #Ordinal Variable (Spiciness): Frequency Table Q20_Frequency = Spicy_Subset %>% #dataframe count(Q20_SpicyFood) %>% # Selecting Column rbind(c(2,0)) %>% # Adding in value of 2 not in data (no responses) arrange(Q20_SpicyFood) #sorting from 1-6 spiciness Q20_Frequency #Ordinal Variable (Spiciness): Bar Chart Spice_Label=factor(c("Avoid", "Minimum", "Mild","Medium","Hot","Fire"), levels=c("Avoid", "Minimum", "Mild","Medium","Hot","Fire")) ggplot(Q20_Frequency, aes(x=Spice_Label, y=n)) + geom_bar(stat="identity",fill="blue") + ylim(0, 20) + xlab("Q20 Spiciness Preference") + ylab("Number of Students") #Continuous Variable (Countries): Descriptive Statistics #mean, median, standard deviation, skewness, and interquartile range Spicy_Subset %>% select (Q24_Countrvisi) %>% summary() Q24_DescriptiveStats = Spicy_Subset %>% summarize(mean=mean(Q24_Countrvisi), #mean median=median(Q24_Countrvisi), #median sd=sd(Q24_Countrvisi), #Standard deviation min=min(Q24_Countrvisi), #min max=max(Q24_Countrvisi), #max skewness=skewness(Q24_Countrvisi,type=2),#help (skewness) iqr=IQR(Q24_Countrvisi), #Inter Quartile Range total=length(Q24_Countrvisi))%>% mutate(above_1 = mean+sd) %>% #one sd above mean mutate(above_2 = mean+(2*sd)) %>% #two sd above mean mutate(below_1 = mean-sd) %>% #one sd above mean mutate(below_2 = mean-(2*sd)) %>% #one sd above mean t() %>% round(2) Q24_DescriptiveStats #Continuous Variable (Countries): Histogram ggplot(Spicy_Subset, aes(x=Q24_Countrvisi)) + geom_histogram(col="black",fill="orange",binwidth = 1) + ylim(0, 10) + xlab("Q24 Number of Countries Visited") + ylab("Number of Students") + scale_x_continuous(breaks=c(0:15)) ############## Properties of Continous Variable Q24_DescriptiveStats ############## Cluster by Categorical Variable #Continous Clustered by Categorical Q24_Cluster = Spicy_Subset %>% filter(!is.na(Q19_FastFood)) %>% # dropping NA values group_by(Q19_FastFood) %>% #cluster by Q19_FastFood #now we can use the same code to pull statistics summarize(mean=mean(Q24_Countrvisi), #mean median=median(Q24_Countrvisi), #median sd=sd(Q24_Countrvisi), #Standard deviation min=min(Q24_Countrvisi), #min max=max(Q24_Countrvisi), #max skewness=skewness(Q24_Countrvisi,type=2),#help (skewness) iqr=IQR(Q24_Countrvisi), #Inter Quartile Range total=length(Q24_Countrvisi)) %>% t() %>% round(2) colnames(Q24_Cluster) = c("Taco_Bell","Chipotle") ; Q24_Cluster #Ordinal Clustered by Categorical Q20_Cluster = Spicy_Subset %>% #dataframe filter(!is.na(Q19_FastFood)) %>% #frequency table count(Q19_FastFood,Q20_SpicyFood)%>% # Selecting Cluster then Variablecomplete(y, fill = list(count = 0)) rbind(c(6,1,0),c(6,2,0),c(6,5,0),c(7,1,0),c(7,2,0)) %>% # Adding in value of 2 not in data (no responses) arrange(Q19_FastFood,Q20_SpicyFood) Q20_Cluster ############## Plot Clusters by Categorical Variable #split histogram Spicy_Subset %>% filter(!is.na(Q19_FastFood)) %>% ggplot(aes(x=Q24_Countrvisi)) + geom_histogram(col="black",fill="orange",binwidth = 1) + ylim(0, 10) + xlab("Q24 Number of Countries Visited") + ylab("Number of Students") + facet_grid(~Q19_FastFood) #split bar chart ggplot(Q20_Cluster, aes(x=Q20_SpicyFood, y=n)) + geom_bar(stat="identity",fill="blue") + ylim(0, 10) + xlab("Q20 Spiciness Preference") + ylab("Number of Students") + facet_grid(~Q19_FastFood) #----------------------------------------------------- #Introduction to Loops and Probability Distributions #One of the probability distributions we explored in class was the binomial distribution #The binomial distribution is applicable to variables with only two possible outcomes #The distribution helps us understand the probability of x "successes" in n trials #Let's revisit our example from class related to multiple choice tests #Recall that we had a 10 question multiple choice test, with 5 choices for each question #Assuming that a student must randomly guess, what is the probability that they pass (50% or better)? #We need to calculate the probability of getting exactly 5 right, plus exactly 6, plus... all the way to 10. #Recall our formula: P(x) = C(n,x) * m^x * (1-m)^(n-x) here we have substituted m for "pi" from class #probability for getting 5 right: n=10 #number of trials x=5 #number of successes m=0.2 #probability of success on each trial P_5 = choose(n,x)*m^x*(1-m)^(n-x) #choose returns the number of combinations of 5 successes from 10 trials #the equation uses the variables we have assigned previously print(P_5) #the probability of getting 5 questions right is 0.0264 (2.64%) #now we need to calculate the probability of getting 6 through 10 right. #we could simply type out the equation 5 more times, but that doesn't seem very efficient. #when we are repeating a task, we often use a LOOP to have the computer repeat the task for us #loops rely on "counter" variables to keep track of how many times the loop has repeated #in our case, we want to run the loop 6 times...from 5 to 10 (5,6,7,8,9,10) #here is what a loop looks like in R for is the syntax that specifies a loop, counting from 1 to 6 for (i in 1:6){ #everything inside the brackets will be repeated each loop x=i+4 #x needs to change each time we run the loop, m and n stay the same P = choose(n,x)*m^x*(1-m)^(n-x) print(paste("probability of ", x, "successes:")) #the paste command combines different objects together print(P) } #select all lines of the loop and run them all together #you should see six different probabilities appear on your screen after the loop is run #that's great, but it sure would be nice to be able to put that information somewhere #let's INITIALIZE an empty matrix where we can store the values that we've created prob_table=matrix(NA,6,1) #here we have made a matrix of NAs, with 6 rows and 1 column #let's modify the code for our loop to dump the values from each iteration into our new table for (i in 1:6){ x=i+4 P = choose(n,x)*m^x*(1-m)^(n-x) prob_table[i]=P #each calculation of P will get added to row i in the table } #re-run the loop #have a look at prob_table print(prob_table) #now it's easy to get the answer to our original question - we just need to sum all of the rows of prob_table print(sum(prob_table)) #the sum should be 0.03279....in other words, less than a 3.3% chance that a student gets at least 5 questions right #on a multiple choice test of 10 questions where each question has 5 choices. studying is encouraged! #Let's now shift our attention to the Poisson distribution, which you hopefully recall is the distribution # that typically defines occurrences of events over intervals of time and/or space ("count" data) #here is a sample of some data collected by one of your classmates during the count activity before spring break count.data=read.csv("CarCount_SampleData.csv",header=T) #remember to ensure that the file is in your working directory! #these data are "Vehicles passing under "Don't Drink and Drive" sign on Mill Ave." collected at 30 second intervals #let's have a look at the distribution of the data with a histogram: ggplot(count.data, aes(x=CarCount)) + geom_histogram(col="black",fill="green",binwidth = 1) + ylim(0, 5) + xlab("Number of Cars") + ylab("Number of Intervals") + scale_x_continuous(breaks=c(0:15)) #convert the histogram to a probability distribution by using the "density" argument ggplot(count.data, aes(x=CarCount,..density..)) + geom_histogram(col="black",fill="green",binwidth = 1) + ylim(0, 0.25) + xlab("Number of Cars") + ylab("Probability of Occurrence") + scale_x_continuous(breaks=c(0:15)) #recall for a Poisson distribution we only need one variable to specify the shape of the distribution: lambda #for a true Poisson distribution, E(x)=V(x)=lambda (the mean and variance are equal) #what are the mean and variance of these data? print("Mean:") print(mean(count.data$CarCount)) print("Variance:") print(var(count.data$CarCount)) #for these data the variance is much higher than the mean...maybe Poisson is not a good representation? #we can generate a "true" Poisson probability distribution and see how well it compares to our data #the distribution function for Poisson is calculated using the "dpois" function #arguments are (x range, lambda) poi_dist=dpois(0:12,6) #calculate the Poisson probabilities over the range 0-12 for a distribution with lambda=6 print(poi_dist) #have a quick look at the values #let's look at what the histogram would be for Poisson data with lambda = 6 ggplot(transform(data.frame(x=c(0:12)), y=dpois(x, 6)), aes(x, y)) + geom_bar(stat="identity",col="black",fill="green") + xlab("Number of Cars [Modeled Poisson]") + ylab("Probability of Occurrence") #how well do our observational data match what would be expected in a true Poisson process?