Course Paper/Project
QUiz 3
setwd("C:/Users/ranjith/Desktop/Data Science")
getwd()
x <- 3
print(x)
v1 <- c(2,4,6,8,10)
print(v1)
print(v1[3])
v <- c(1:10)
print(v)
print(v[6])
# Import test data
test<-read.csv("C:/Users/ranjith/Desktop/Data Science/top million/top-1m.csv")
test1<-read.csv("C:/Users/ranjith/Desktop/Data Science/top million/top-1m.csv", sep=",")
test2<-read.table("C:/Users/ranjith/Desktop/Data Science/top million/top-1m.csv", sep=",")
write.csv(test2, file="C:/Users/ranjith/Desktop/Data Science/top million/quiz3out.csv")
# Write CSV in R
write.table(test1, file = "quiz3out1.csv",row.names=TRUE, na="",col.names=TRUE, sep=",")
head(test)
tail(test)
summary(test)
head <- head(test)
tail <- tail(test)
sd(test$Number)
var(test$Number)
plot(test$Number)
hist(test$Number)
str(test$Number)
quit()
Quiz 4
setwd("C:/Users/ranjith/Desktop/Data Science")
getwd()
# Import test data
data<-read.csv("C:/Users/ranjith/Desktop/Data Science/top million/top-1m.csv")
print(summary(data))
anscombe<-read.csv("C:/Users/ranjith/Desktop/Data Science/anscombe.csv")
print(summary(anscombe))
sd(anscombe$X)
var(anscombe$X)
sd(anscombe$x1)
var(anscombe$x1)
sd(anscombe$x2)
var(anscombe$x2)
sd(anscombe$x3)
var(anscombe$x3)
sd(anscombe$x4)
var(anscombe$x4)
sd(anscombe$y1)
var(anscombe$y1)
sd(anscombe$y2)
var(anscombe$y2)
sd(anscombe$y3)
var(anscombe$y3)
ff <- y ~ x
mods <- setNames(as.list(1:4), paste0("lm", 1:4))
for(i in 1:4) {
ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
mods[[i]] <- lmi <- lm(ff, data = anscombe)
print(anova(lmi))
}
sapply(mods, coef)
lapply(mods, function(fm) coef(summary(fm)))
op <- par(mfrow = c(2, 2), mar = 0.1+c(4,4,1,1), oma = c(0, 0, 2, 0))
for(i in 1:4) {
ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
plot(ff, data = anscombe, col = "red", pch = 21, bg = "orange", cex = 1.2,
xlim = c(3, 19), ylim = c(3, 13))
abline(mods[[i]], col = "blue")
}
mtext("Anscombe's 4 Regression data sets", outer = TRUE, cex = 1.5)
par(op)
plot(sort(data$Number))
hist(sort(data$Number))
plot(density(sort(data$Number)))
library(lattice)
densityplot(data$Number)
densityplot(log(data$Number))
densityplot(data$Number)
densityplot(log(data$Number))
hist(data$Number, breaks=100, main="Website",
xlab="com", col="gray")
abline(v = median(data$Number), col = "magenta", lwd = 4)
rug(data$Website)
boxplot(data$Number,data=data, main="Ranking based on Alexa",
xlab="Sales", ylab="Total")
# Boxplot of MPG by Car Cylinders, using one of R built-in datasets
boxplot(mpg~cyl,data=mtcars, main="Car Milage Data",
xlab="Number of Cylinders", ylab="Miles Per Gallon")
#in our boxplot above, we might want to draw a horizontal line at 12 where the national standard is.
abline(h = 12)
boxplot(data$Number,data=data, main="Ranking based on Alexa",
xlab="Sales", ylab="Total")
# Dot chart of a single numeric vector
dotchart(mtcars$mpg, labels = row.names(mtcars),
cex = 0.6, xlab = "mpg")
library(ROCR)
# Simple Scatterplot
attach(mtcars)
plot(wt, mpg, main="Scatterplot Example",
xlab="Car Weight ", ylab="Miles Per Gallon ", pch=19)
plot(data$Number, data$Website)
lines(lowess(data$Website, data$Number), col="blue")
# Basic Scatterplot Matrix
pairs(data)
pairs(data[0:2])
# Scatterplot Matrices from the car Package
#install.packages("car")
library(car)
#install.packages("ggplot2")
library(ggplot2)
quit()
Quiz 5
install.packages("tidyverse")
library(tidyverse) # data manipulation
install.packages("cluster")
library(cluster) # clustering algorithms
install.packages("factoextra")
library(factoextra) # clustering algorithms & visualization
setwd("C:/Users/ranjith/Desktop/Data Science/top million/")
getwd()
# Import test data
data<-read.csv("C:/Users/ranjith/Desktop/Data Science/top million/majestic_million.csv")
print(summary(data))
data1 <- na.omit(data)
columns <- data[1,]
print(summary(data))
#As we don't want the clustering algorithm to depend to an arbitrary
#variable unit, we start by scaling data using the R function scale:
data1 <- scale(data1)
head(data1)
distance <- get_dist(data1)
print(distance)
# plot cluster library
library(cluster)
# K-Means Cluster Analysis
# simplest example, just the dataset and number of clusters
fit <- kmeans(data1, 2) # 5 cluster solution
# get cluster means
aggregate(data1,by=list(fit$cluster),FUN=mean)
# append cluster assignment
mydata <- data.frame(data1, fit$cluster)
clusplot(mydata, fit$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0)
fit <- kmeans(data1, 8) # 8 cluster solution
# get cluster means
aggregate(data1,by=list(fit$cluster),FUN=mean)
# append cluster assignment
mydata <- data.frame(data1, fit$cluster)
clusplot(mydata, fit$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0)
# K-Means Clustering with 5 clusters
fit <- kmeans(mydata, 5)
# Determine number of clusters
wss <- (nrow(data1)-1)*sum(apply(data1,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(data1,
centers=i)$withinss)
#A plot of the within groups sum of squares by number of clusters extracted can help determine the appropriate number of clusters.
#The analyst looks for a bend in the plot similar to a scree test in factor analysis
# We want (total within-cluster variation) to be the lowest
plot(1:15, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")
# Determine number of clusters
wss <- (nrow(data1)-1)*sum(apply(data1,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(data1,
centers=i)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")
# Cluster Plot against 1st 2 principal components
# vary parameters for most readable graph
library(cluster)
clusplot(mydata, fit$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0)
# Centroid Plot against 1st 2 discriminant functions
library(fpc)
plotcluster(mydata, fit$cluster)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
# try with 25 attempts, 2 clusters
km <- kmeans(data1, centers = 2, nstart = 25)
str(km)
#The output of kmeans is a list with several bits of information. The most important being:
# cluster: A vector of integers (from 1:k) indicating the cluster to which each point is allocated.
#centers: A matrix of cluster centers.
#totss: The total sum of squares.
#withinss: Vector of within-cluster sum of squares, one component per cluster.
#tot.withinss: Total within-cluster sum of squares, i.e. sum(withinss).
#betweenss: The between-cluster sum of squares, i.e. $totss-tot.withinss$.
#size: The number of points in each cluster.
# print the clusters
print(km)
# Plot clusters
fviz_cluster(km, data = data1)
(cl <- kmeans(data1, 8))
plot(data1, col = cl$cluster)
points(cl$centers, col = 1:3, pch = 8, cex = 2)
# sum of squares
ss <- function(x) sum(scale(x, scale = FALSE)^2)
## cluster centers "fitted" to each obs.:
fitted.data1 <- fitted(cl); head(fitted.data1)
resid.data1 <- data1 - fitted(cl)
## Equalities : ----------------------------------
cbind(cl[c("betweenss", "tot.withinss", "totss")], # the same two columns
c(ss(fitted.data1), ss(resid.data1), ss(data1)))
stopifnot(all.equal(cl$ totss, ss(data1)),
all.equal(cl$ tot.withinss, ss(resid.data1)),
## these three are the same:
all.equal(cl$ betweenss, ss(fitted.data1)),
all.equal(cl$ betweenss, cl$totss - cl$tot.withinss),
## and hence also
all.equal(ss(data1), ss(fitted.data1) + ss(resid.data1))
)
kmeans(data1,1)$withinss # trivial one-cluster, (its W.SS == ss(x))
## random starts do help here with too many clusters
## (and are often recommended anyway!):
(cl <- kmeans(x, 5, nstart = 25))
plot(x, col = cl$cluster)
points(cl$centers, col = 1:5, pch = 8)
if(!require(arules)) install.packages("arules")
if(!require(arulesViz)) install.packages("arulesViz")
if(!require(dplyr)) install.packages("dplyr")
if(!require(lubridate)) install.packages("lubridate")
if(!require(ggplot2)) install.packages("ggplot2")
if(!require(knitr)) install.packages("knitr")
if(!require(RColorBrewer)) install.packages("RColorBrewer")
library(arules)
library(arulesViz)
library(dplyr)
library(plyr)
library(lubridate)
library(ggplot2)
library(knitr)
library(RColorBrewer)
library(tidyverse)
setwd("C:/Users/ranjith/Desktop/Data Science/")
getwd()
grocery=read.transactions("groceries.csv",sep=",")
summary(grocery)
grocery[1:5,]%>%inspect
grocery[,1:5]%>%itemFrequency
itemFrequencyPlot(grocery, support=0.08)
itemFrequencyPlot(grocery, topN=10)
image(grocery[1:5,])
sample(grocery,10)%>%image
g_rule=apriori(data=grocery,parameter = list(supp=0.005,conf=0.2,target="rules"))
inspect(head(sort(g_rule, by = "lift")))
plot(g_rule)
head(quality(g_rule))
plot(g_rule, method = "grouped")
plot(g_rule,method = "scatterplot")
plot(g_rule,method = "graph")
inspect(g_rule[1:5])
(g_rule%>%sort(by="lift"))[1:10]%>%inspect
berryrules <- subset(g_rule, items %in% "berries")
inspect(berryrules)
b_rule=as(berryrules,"data.frame")
str(b_rule)
b_rule
# Import my dataset
df <- read.csv("majestic_million.csv")
head(df)
df <- df[complete.cases(df), ] # Drop missing values
# Change Domain and IDN_Domain columns to factors
# Factors are the data objects which are used to categorize the data and store it as levels.
df %>% mutate(Domain = as.factor(Domain),
IDN_Domain = as.factor(IDN_Domain))
# MBA analysis
# From package arules
tr <- read.transactions('majestic_million.csv', format = 'basket', sep=',')
summary(tr)
# plot the frequency of items
itemFrequencyPlot(tr)
itemFrequencyPlot(tr,topN=20,type="absolute",col=brewer.pal(8,'Pastel2'), main="Absolute Item Frequency Plot")
arules::itemFrequencyPlot(tr,
topN=20,
col=brewer.pal(8,'Pastel2'),
main='Relative Item Frequency Plot',
type="relative",
ylab="Item Frequency (Relative)")
# Generate the a priori rules
association.rules <- apriori(tr, parameter = list(supp=0.001, conf=0.8,maxlen=10))
summary(association.rules)
inspect(association.rules[1:10]) # Top 10 association rules
# Select rules which are subsets of larger rules -> Remove rows where the sums of the subsets are > 1
subset.rules <- which(colSums(is.subset(association.rules, association.rules)) > 1) # get subset rules in vector
# What did customers buy before buying "METAL"
metal.association.rules <- apriori(tr, parameter = list(supp=0.001, conf=0.8),appearance = list(default="lhs",rhs="METAL"))
inspect(head(metal.association.rules))
# What did customers buy after buying "METAL"
metal.association.rules2 <- apriori(tr, parameter = list(supp=0.001, conf=0.8),appearance = list(lhs="METAL",default="rhs"))
inspect(head(metal.association.rules2))
# Plotting
# Filter rules with confidence greater than 0.4 or 40%
subRules<-association.rules[quality(association.rules)$confidence>0.4]
#Plot SubRules
plot(subRules)
# Top 10 rules viz
top10subRules <- head(subRules, n = 10, by = "confidence")
plot(top10subRules, method = "graph", engine = "htmlwidget")
# Filter top 20 rules with highest lift
# Paralell Coordinates plot - visualize which products along with which items cause what kind of sales.
# Closer arrows re bought together
subRules2<-head(subRules, n=20, by="lift")
plot(subRules2, method="paracoord")
Quiz 7
setwd("C:/Users/ranjith/Desktop/Data Science/")
getwd()
# Import test data
data=read.csv("majestic_million.csv")
head(data) # display the first 6 observations
print(summary(data))
scatter.smooth(x=data$GlobalRank, y=data$TldRank, main="GlobalRank ~ TldRank")# scatterplot
linearMod <- lm(TldRank ~ GlobalRank, data=data) # build linear regression model on full data
print(linearMod)
print(summary(linearMod))
plot(linearMod$residuals, pch = 16, col = "red")
modelSummary <- summary(linearMod) # capture model summary as an object
modelCoeffs <- modelSummary$coefficients # model coefficients
beta.estimate <- modelCoeffs["GlobalRank", "Estimate"] # get beta estimate for speed
std.error <- modelCoeffs["GlobalRank", "Std. Error"] # get std.error for speed
t_value <- beta.estimate/std.error # calc t statistic
p_value <- 2*pt(-abs(t_value), df=nrow(cars)-ncol(cars)) # calc p Value
f_statistic <- linearMod$fstatistic[1] # fstatistic
f <- summary(linearMod)$fstatistic # parameters for model p-value calc
model_p <- pf(f[1], f[2], f[3], lower=FALSE)
# For model comparison, the model with the lowest AIC and BIC score is preferred.
AIC(linearMod)
BIC(linearMod)
# Build the model on training data -
trainingData=read.csv("majestic_million.csv")
testData=read.csv("majestic_million.csv")
lmMod <- lm(TldRank ~ GlobalRank, data=trainingData) # build the model
distPred <- predict(lmMod, testData) # predict distance
summary (lmMod)
summary (distPred)
# calculate prediction accuracy and error rates
#actuals_preds <- data.frame(cbind(actuals=testData$dist, predicteds=distPred)) # make actuals_predicteds dataframe.
#correlation_accuracy <- cor(actuals_preds) #
#print(head(actuals_preds))
# second dataset, many variables, but all numeric
Top1Million=read.csv("majestic_million.csv")
lmTop1Million = lm(TldRank~., data = Top1Million) #Create the linear regression
summary(lmTop1Million) #Review the results
plot(lmTop1Million$residuals, pch = 16, col = "red")
modelSummary <- summary(lmTop1Million) # capture model summary as an object
modelCoeffs <- modelSummary$coefficients # model coefficients
beta.estimate <- modelCoeffs["GlobalRank", "Estimate"] # get beta estimate for speed
std.error <- modelCoeffs["GlobalRank", "Std. Error"] # get std.error for speed
t_value <- beta.estimate/std.error # calc t statistic
p_value <- 2*pt(-abs(t_value), df=nrow(cars)-ncol(cars)) # calc p Value
f_statistic <- lmTop1Million$fstatistic[1] # fstatistic
f <- summary(lmTop1Million)$fstatistic # parameters for model p-value calc
model_p <- pf(f[1], f[2], f[3], lower=FALSE)
# For model comparison, the model with the lowest AIC and BIC score is preferred.
AIC(lmTop1Million)
BIC(lmTop1Million)
# Build the model on training data -
Top1Million=read.csv("majestic_million.csv")
library(dplyr)
train<-sample_frac(Top1Million, 0.5)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-Top1Million[-sid,]
lmMod <- lm(GlobalRank ~ ., data=train) # build the model
plot(lmMod$residuals, pch = 16, col = "red")
distPred <- predict(lmMod, test) # predict distance
summary (lmMod)
# Import test data
data=read.csv("majestic_million.csv")
head(data) # display the first 6 observations
print(summary(data))
reg <- lm(GlobalRank ~ RefSubNets, data=data)
reg1 <- lm(GlobalRank ~ RefIPs + RefSubNets, data=data)
reg2 <- lm(GlobalRank ~ . , data=data)
print(reg)
print(summary(reg))
print(summary(reg$coefficients))
plot(reg$residuals, pch = 16, col = "red")
library(ggplot2)
qplot(x = GlobalRank, data = data)
qplot(x = TldRank, data = data)
attributes(reg)
print(summary(reg$model))
print(summary(reg$coefficients))
modelSummary <- summary(reg)
modelCoeffs <- modelSummary$coefficients # model coefficients
beta.estimate <- modelCoeffs["GlobalRank", "Estimate"] # get beta estimate for speed
std.error <- modelCoeffs["GlobalRank", "Std. Error"] # get std.error for speed
t_value <- beta.estimate/std.error # calc t statistic
p_value <- 2*pt(-abs(t_value), df=nrow(cars)-ncol(cars)) # calc p Value
f_statistic <- reg$fstatistic[1] # fstatistic
f <- summary(reg)$fstatistic # parameters for model p-value calc
model_p <- pf(f[1], f[2], f[3], lower=FALSE)
# For model comparison, the model with the lowest AIC and BIC score is preferred.
AIC(reg)
BIC(reg)
library(ggplot2)
qplot(x = GlobalRank, data = data)
qplot(x = TldRank, data = data)
Quiz 8
setwd("C:/Users/ranjith/Desktop/Data Science/")
getwd()
# Import test data
data=read.csv("majestic_million.csv")
head(data) # display the first 6 observations
print(summary(data))
# Check distribution between the 2 binary classes
xtabs(~DomainInfo + GlobalRank, data = data)
xtabs(~DomainInfo + TldRank, data = data)
#First, we convert GlobalRank to a factor to indicate that GlobalRank should be treated as a categorical variable.
data$GlobalRank <- factor(data$GlobalRank)
logit <- glm(DomainInfo ~ RefSubNets + TldRank + GlobalRank, data = data, family = "binomial")
summary(logit)
# We can use the confint function to obtain confidence intervals for the coefficient estimates.
## CIs using profiled log-likelihood
confint(logit)
## CIs using standard errors
confint.default(logit)
#To get the exponentiated coefficients, you tell R that you want to exponentiate (exp)
## odds ratios only
exp(coef(logit))
## odds ratios and 95% CI
exp(cbind(OR = coef(logit), confint(logit)))
#Now for example, we can say that for a one unit increase in TldRank, the odds of
#being Domainted to graduate school (versus not being Domainted) increase by a factor of 2.17.
with(logit, null.deviance - deviance)
data=read.csv("majestic_million.csv")
head(data) # display the first 6 observations
print(summary(data))
names(data)
# Histogram of variables
par(mfrow=c(1,13))
for(i in 1:13) {
hist(data[,i], main=names(data)[i])
}
install.packages("Amelia")
library(Amelia)
library(mlbench)
# Check missing data
missmap(data, col=c("blue", "red"), legend=FALSE)
#featutes/target distribution
pairs(data, col=data$DomainInfo)
# Logistics ReRefSubNetsssion
glm.fit <- glm(DomainInfo ~ RefSubNets + TldRank + GlobalRank, data = data, family = "binomial")
summary(glm.fit)
glm.probs <- predict(glm.fit,type = "response")
glm.probs[1:10]
train.raw <- read.csv('majestic_million.csv',header=T,na.strings=c(""))
#Now we need to check for missing values and look how many unique values there are for each variable using the sapply() function which applies the
#function passed as argument to each column of the dataframe.
sapply(train.raw,function(x) mean(is.na(x)))
missmap(train.raw, main = "Missing values vs observed")
Quiz9
library(readxl)
library(tm)
library(wordcloud)
library(e1071)
library(gmodels)
require(quanteda)#natural language processing package
require(RColorBrewer)
require(ggplot2)
setwd("C:/Users/ranjith/Desktop/Data Science/")
getwd()
GlobalRank <- read.csv("majestic_million.csv", header = FALSE, stringsAsFactors = FALSE)
GlobalRank <- GlobalRank[,1:2]
colnames(GlobalRank) <- c("Type", "Text")
GlobalRank$Type <- factor(GlobalRank$Type) #attaching the class labels to the corpus message text
str(GlobalRank)
# Check the number of spam and ham messages
table(GlobalRank$Type)
prop.table(table(GlobalRank$Type))
library(tm)
sms_corpus <- VCorpus(x = VectorSource(GlobalRank$Text))
# Print corpus
sms_corpus
# Check the text in some messages and their type
lapply(sms_corpus[5:8], as.character)
GlobalRank$Type[5:8]
corpus_clean <- sms_corpus
# Remove Numbers
corpus_clean <- tm_map(x = corpus_clean, FUN = removeNumbers)
corpus_clean <- tm_map(x = corpus_clean, FUN = removePunctuation)
# Print stopwords()
stopwords()
# Remove stop words
corpus_clean <- tm_map(x = corpus_clean, FUN = removeWords, stopwords())
# Install SnowballC package
#install.packages("SnowballC")
# Load package
library(SnowballC)
# Test the function
wordStem(words = c("cooks", "cooking", "cooked"))
# Stem words in corpus
corpus_clean <- tm_map(x = corpus_clean, FUN = stemDocument)
# Remove extra white spaces
corpus_clean <- tm_map(x = corpus_clean, FUN = stripWhitespace)
# Create Document Term Matrix
DTM <- DocumentTermMatrix(x = corpus_clean)
## Create training and test set
# Create Training Set
DTM_train <- DTM[1:round(nrow(DTM)*0.80, 0), ]
# Create Test Set
DTM_test <- DTM[(round(nrow(DTM)*0.80, 0)+1):nrow(DTM), ]
# Create vectors with labels for the training and test set
train_labels <- GlobalRank[1:round(nrow(GlobalRank)*0.80, 0), ]$Type
test_labels <- GlobalRank[(round(nrow(GlobalRank)*0.80, 0)+1):nrow(DTM), ]$Type
# Check proportion of ham and spam is similar on the training and test set
prop.table(table(train_labels))
prop.table(table(test_labels))
library(wordcloud)
# Create wordcloud for the whole dataset
wordcloud(words = corpus_clean,
min.freq = 100, # minimum number of times a word must be present before it appears
random.order = FALSE, # Arrange most frequent words to be in the center of the word cloud
color = (colors = c("#4575b4","#74add1","#abd9e9","#e0f3f8","#fee090","#fdae61","#f46d43","#d73027")) # Colour gradient for the font
)
threshold <- 0.1 # in %
min_freq = round(DTM$nrow*(threshold/100),0) # calculate minimum frequency
min_freq
# Create vector of most frequent words
frequent_words <- findFreqTerms(x = DTM, lowfreq = min_freq)
str(frequent_words)
# Filter DTM to only have most frequent words
DTM_train_most_frequent <- DTM_train[, frequent_words]
DTM_test_most_frequent <- DTM_test[, frequent_words]
dim(DTM_train_most_frequent)
is_present <- function(x) {
x <- ifelse(test = x > 0, yes = "Yes", no = "No")
}
x <- is_present(c(1, 0, 3, 4, 0, 0))
x
DTM_train_most_frequent <- apply(X = DTM_train_most_frequent,
MARGIN = 2,
FUN = is_present)
DTM_test_most_frequent <- apply(X = DTM_test_most_frequent,
MARGIN = 2,
FUN = is_present)
library(e1071)
IDN_TLD <- naiveBayes(x = DTM_train_most_frequent, y = train_labels)
IDN_TLD$tables$com
IDN_TLD$tables$org
IDN_TLD$tables$eu
test_predictions <- predict(object = IDN_TLD, newdata = DTM_test_most_frequent)
library(caret)
confusionMatrix(data = test_predictions, reference = test_labels, positive = "spam", dnn = c("Prediction", "Actual"))
library(e1071)
IDN_TLD_with_LE <- naiveBayes(x = DTM_train_most_frequent,
y = train_labels,
laplace = 1
)
test_predictions_with_LE <- predict(object = IDN_TLD_with_LE, newdata = DTM_test_most_frequent)
library(caret)
confusionMatrix(data = test_predictions_with_LE, reference = test_labels, positive = "spam", dnn = c("Prediction", "Actual"))
CM <- confusionMatrix(data = test_predictions, reference = test_labels);
naive_Accuracy <- round(CM$overall[["Accuracy"]], 4)*100
CM_LE <- confusionMatrix(data = test_predictions_with_LE, reference = test_labels);
naive_LE_Accuracy <- round(CM_LE$overall[["Accuracy"]], 4)*100
data.frame(`Without Laplace Estimator` = naive_Accuracy,
`With Laplace Estimator` = naive_LE_Accuracy,
row.names = c("Accuracy"))
library(tidyverse)
library(gmodels) # Crosstable
library(tm)
library(wordcloud)
library(e1071)
data.path <- "C:/Users/ranjith/Desktop/Data Science/"
txt.files <- list.files(path=data.path, full.names = T, recursive = T)
txt.files <- txt.files[str_detect(txt.files, "(majestic_million.csv)")]
LoadData <- function(file_list) {
tables <- lapply(file_list, ReadFile)
data.frame <- do.call(rbind, tables)
data.frame$Domain <- factor(data.frame$Domain, levels = c(0, 1),
labels = c("org","com"))
return(data.frame)
}
ReadFile <- function(majestic_million.csv) {
table <- read_delim(majestic_million.csv,
delim = "\t",
col_names = c("TldRank", "Domain"),
quote = "")
file.name <- str_split(majestic_million.csv, "/", simplify = TRUE)[2]
table['source'] <- str_split(file.name, "_", simplify = TRUE)[1]
return(table)
}
data.frame <- LoadData(txt.files)
set.seed(1985)
data.frame <- data.frame[order(runif(n=3000)),]
## Exploring the data
str(data.frame)
summary(data.frame)
head(data.frame)
corpus <- Corpus(VectorSource(data.frame$TldRank))
inspect(corpus[1:5])
#Before splitting the TldRank into words, we will need to perform some common cleaning steps in order to remove punctuation and other characters that may clutter the result. For example, we would like to count hello!, HELLO..., and Hello as instances of the word hello.
clean.corpus <- tm_map(corpus, tolower)
clean.corpus <- tm_map(clean.corpus, removeNumbers)
# Second...
clean.corpus <- tm_map(clean.corpus, removeWords, stopwords())
clean.corpus <- tm_map(clean.corpus, removePunctuation)
clean.corpus <- tm_map(clean.corpus, stripWhitespace)
inspect(clean.corpus[1:5])
clean.corpus.dtm <- DocumentTermMatrix(clean.corpus)
n <- nrow(data.frame)
raw.TldRank.train <- data.frame[1:round(.8 * n),]
raw.TldRank.test <- data.frame[(round(.8 * n)+1):n,]
nn <- length(clean.corpus)
clean.corpus.train <- clean.corpus[1:round(.8 * nn)]
clean.corpus.test <- clean.corpus[(round(.8 * nn)+1):nn]
nnn <- nrow(clean.corpus.dtm)
clean.corpus.dtm.train <- clean.corpus.dtm[1:round(.8 * nnn),]
clean.corpus.dtm.test <- clean.corpus.dtm[(round(.8 * nnn)+1):nnn,]
wordcloud(clean.corpus.train, min.freq = 30, random.order = FALSE)
com <- subset(raw.TldRank.train, Domain == "com")
org <- subset(raw.TldRank.train, Domain == "org")
wordcloud(com$TldRank, max.words = 30, scale = c(3, 0.5))
wordcloud(org$TldRank, max.words = 30, scale = c(3, 0.5))
freq.terms <- findFreqTerms(clean.corpus.dtm.train, 3)
clean.corpus.dtm.freq.train <- DocumentTermMatrix(clean.corpus.train, list(dictionary = freq.terms))
clean.corpus.dtm.freq.test <- DocumentTermMatrix(clean.corpus.test, list(dictionary = freq.terms))
convert_counts <- function(x) {
x <- ifelse(x > 0, 1, 0)
x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
return(x)
}
clean.corpus.dtm.freq.train <- apply(clean.corpus.dtm.freq.train, MARGIN = 2, convert_counts)
clean.corpus.dtm.freq.test <- apply(clean.corpus.dtm.freq.test, MARGIN = 2, convert_counts)
TldRank.classifer <- naiveBayes(clean.corpus.dtm.freq.train, raw.TldRank.train$Domain)
TldRank.pred <- predict(TldRank.classifer, clean.corpus.dtm.freq.test)
CrossTable(TldRank.pred, raw.TldRank.test$Domain,
prop.chisq = FALSE,
prop.t = FALSE,
dnn = c('predicted', 'actual'))
TldRank.classifer.imp <- naiveBayes(clean.corpus.dtm.freq.train,
raw.TldRank.train$Domain,
laplace = 1)
TldRank.pred.imp <- predict(TldRank.classifer.imp,
clean.corpus.dtm.freq.test)
CrossTable(TldRank.pred.imp, raw.TldRank.test$Domain,
prop.chisq = FALSE,
prop.t = FALSE,
dnn = c('predicted', 'actual'))