Course Paper/Project

profilesivaranjithk
Rprograms.docx

QUiz 3

setwd("C:/Users/ranjith/Desktop/Data Science")

getwd()

x <- 3

print(x)

v1 <- c(2,4,6,8,10)

print(v1)

print(v1[3])

v <- c(1:10)

print(v)

print(v[6])

# Import test data

test<-read.csv("C:/Users/ranjith/Desktop/Data Science/top million/top-1m.csv")

test1<-read.csv("C:/Users/ranjith/Desktop/Data Science/top million/top-1m.csv", sep=",")

test2<-read.table("C:/Users/ranjith/Desktop/Data Science/top million/top-1m.csv", sep=",")

write.csv(test2, file="C:/Users/ranjith/Desktop/Data Science/top million/quiz3out.csv")

# Write CSV in R

write.table(test1, file = "quiz3out1.csv",row.names=TRUE, na="",col.names=TRUE, sep=",")

head(test)

tail(test)

summary(test)

head <- head(test)

tail <- tail(test)

sd(test$Number)

var(test$Number)

plot(test$Number)

hist(test$Number)

str(test$Number)

quit()

Quiz 4

setwd("C:/Users/ranjith/Desktop/Data Science")

getwd()

# Import test data

data<-read.csv("C:/Users/ranjith/Desktop/Data Science/top million/top-1m.csv")

print(summary(data))

anscombe<-read.csv("C:/Users/ranjith/Desktop/Data Science/anscombe.csv")

print(summary(anscombe))

sd(anscombe$X)

var(anscombe$X)

sd(anscombe$x1)

var(anscombe$x1)

sd(anscombe$x2)

var(anscombe$x2)

sd(anscombe$x3)

var(anscombe$x3)

sd(anscombe$x4)

var(anscombe$x4)

sd(anscombe$y1)

var(anscombe$y1)

sd(anscombe$y2)

var(anscombe$y2)

sd(anscombe$y3)

var(anscombe$y3)

ff <- y ~ x

mods <- setNames(as.list(1:4), paste0("lm", 1:4))

for(i in 1:4) {

ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)

mods[[i]] <- lmi <- lm(ff, data = anscombe)

print(anova(lmi))

}

sapply(mods, coef)

lapply(mods, function(fm) coef(summary(fm)))

op <- par(mfrow = c(2, 2), mar = 0.1+c(4,4,1,1), oma = c(0, 0, 2, 0))

for(i in 1:4) {

ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)

plot(ff, data = anscombe, col = "red", pch = 21, bg = "orange", cex = 1.2,

xlim = c(3, 19), ylim = c(3, 13))

abline(mods[[i]], col = "blue")

}

mtext("Anscombe's 4 Regression data sets", outer = TRUE, cex = 1.5)

par(op)

plot(sort(data$Number))

hist(sort(data$Number))

plot(density(sort(data$Number)))

library(lattice)

densityplot(data$Number)

densityplot(log(data$Number))

densityplot(data$Number)

densityplot(log(data$Number))

hist(data$Number, breaks=100, main="Website",

xlab="com", col="gray")

abline(v = median(data$Number), col = "magenta", lwd = 4)

rug(data$Website)

boxplot(data$Number,data=data, main="Ranking based on Alexa",

xlab="Sales", ylab="Total")

# Boxplot of MPG by Car Cylinders, using one of R built-in datasets

boxplot(mpg~cyl,data=mtcars, main="Car Milage Data",

xlab="Number of Cylinders", ylab="Miles Per Gallon")

#in our boxplot above, we might want to draw a horizontal line at 12 where the national standard is.

abline(h = 12)

boxplot(data$Number,data=data, main="Ranking based on Alexa",

xlab="Sales", ylab="Total")

# Dot chart of a single numeric vector

dotchart(mtcars$mpg, labels = row.names(mtcars),

cex = 0.6, xlab = "mpg")

library(ROCR)

# Simple Scatterplot

attach(mtcars)

plot(wt, mpg, main="Scatterplot Example",

xlab="Car Weight ", ylab="Miles Per Gallon ", pch=19)

plot(data$Number, data$Website)

lines(lowess(data$Website, data$Number), col="blue")

# Basic Scatterplot Matrix

pairs(data)

pairs(data[0:2])

# Scatterplot Matrices from the car Package

#install.packages("car")

library(car)

#install.packages("ggplot2")

library(ggplot2)

quit()

Quiz 5

install.packages("tidyverse")

library(tidyverse) # data manipulation

install.packages("cluster")

library(cluster) # clustering algorithms

install.packages("factoextra")

library(factoextra) # clustering algorithms & visualization

setwd("C:/Users/ranjith/Desktop/Data Science/top million/")

getwd()

# Import test data

data<-read.csv("C:/Users/ranjith/Desktop/Data Science/top million/majestic_million.csv")

print(summary(data))

data1 <- na.omit(data)

columns <- data[1,]

print(summary(data))

#As we don't want the clustering algorithm to depend to an arbitrary

#variable unit, we start by scaling data using the R function scale:

data1 <- scale(data1)

head(data1)

distance <- get_dist(data1)

print(distance)

# plot cluster library

library(cluster)

# K-Means Cluster Analysis

# simplest example, just the dataset and number of clusters

fit <- kmeans(data1, 2) # 5 cluster solution

# get cluster means

aggregate(data1,by=list(fit$cluster),FUN=mean)

# append cluster assignment

mydata <- data.frame(data1, fit$cluster)

clusplot(mydata, fit$cluster, color=TRUE, shade=TRUE,

labels=2, lines=0)

fit <- kmeans(data1, 8) # 8 cluster solution

# get cluster means

aggregate(data1,by=list(fit$cluster),FUN=mean)

# append cluster assignment

mydata <- data.frame(data1, fit$cluster)

clusplot(mydata, fit$cluster, color=TRUE, shade=TRUE,

labels=2, lines=0)

# K-Means Clustering with 5 clusters

fit <- kmeans(mydata, 5)

# Determine number of clusters

wss <- (nrow(data1)-1)*sum(apply(data1,2,var))

for (i in 2:15) wss[i] <- sum(kmeans(data1,

centers=i)$withinss)

#A plot of the within groups sum of squares by number of clusters extracted can help determine the appropriate number of clusters.

#The analyst looks for a bend in the plot similar to a scree test in factor analysis

# We want (total within-cluster variation) to be the lowest

plot(1:15, wss, type="b", xlab="Number of Clusters",

ylab="Within groups sum of squares")

# Determine number of clusters

wss <- (nrow(data1)-1)*sum(apply(data1,2,var))

for (i in 2:15) wss[i] <- sum(kmeans(data1,

centers=i)$withinss)

plot(1:15, wss, type="b", xlab="Number of Clusters",

ylab="Within groups sum of squares")

# Cluster Plot against 1st 2 principal components

# vary parameters for most readable graph

library(cluster)

clusplot(mydata, fit$cluster, color=TRUE, shade=TRUE,

labels=2, lines=0)

# Centroid Plot against 1st 2 discriminant functions

library(fpc)

plotcluster(mydata, fit$cluster)

fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# try with 25 attempts, 2 clusters

km <- kmeans(data1, centers = 2, nstart = 25)

str(km)

#The output of kmeans is a list with several bits of information. The most important being:

# cluster: A vector of integers (from 1:k) indicating the cluster to which each point is allocated.

#centers: A matrix of cluster centers.

#totss: The total sum of squares.

#withinss: Vector of within-cluster sum of squares, one component per cluster.

#tot.withinss: Total within-cluster sum of squares, i.e. sum(withinss).

#betweenss: The between-cluster sum of squares, i.e. $totss-tot.withinss$.

#size: The number of points in each cluster.

# print the clusters

print(km)

# Plot clusters

fviz_cluster(km, data = data1)

(cl <- kmeans(data1, 8))

plot(data1, col = cl$cluster)

points(cl$centers, col = 1:3, pch = 8, cex = 2)

# sum of squares

ss <- function(x) sum(scale(x, scale = FALSE)^2)

## cluster centers "fitted" to each obs.:

fitted.data1 <- fitted(cl); head(fitted.data1)

resid.data1 <- data1 - fitted(cl)

## Equalities : ----------------------------------

cbind(cl[c("betweenss", "tot.withinss", "totss")], # the same two columns

c(ss(fitted.data1), ss(resid.data1), ss(data1)))

stopifnot(all.equal(cl$ totss, ss(data1)),

all.equal(cl$ tot.withinss, ss(resid.data1)),

## these three are the same:

all.equal(cl$ betweenss, ss(fitted.data1)),

all.equal(cl$ betweenss, cl$totss - cl$tot.withinss),

## and hence also

all.equal(ss(data1), ss(fitted.data1) + ss(resid.data1))

)

kmeans(data1,1)$withinss # trivial one-cluster, (its W.SS == ss(x))

## random starts do help here with too many clusters

## (and are often recommended anyway!):

(cl <- kmeans(x, 5, nstart = 25))

plot(x, col = cl$cluster)

points(cl$centers, col = 1:5, pch = 8)

if(!require(arules)) install.packages("arules")

if(!require(arulesViz)) install.packages("arulesViz")

if(!require(dplyr)) install.packages("dplyr")

if(!require(lubridate)) install.packages("lubridate")

if(!require(ggplot2)) install.packages("ggplot2")

if(!require(knitr)) install.packages("knitr")

if(!require(RColorBrewer)) install.packages("RColorBrewer")

library(arules)

library(arulesViz)

library(dplyr)

library(plyr)

library(lubridate)

library(ggplot2)

library(knitr)

library(RColorBrewer)

library(tidyverse)

setwd("C:/Users/ranjith/Desktop/Data Science/")

getwd()

grocery=read.transactions("groceries.csv",sep=",")

summary(grocery)

grocery[1:5,]%>%inspect

grocery[,1:5]%>%itemFrequency

itemFrequencyPlot(grocery, support=0.08)

itemFrequencyPlot(grocery, topN=10)

image(grocery[1:5,])

sample(grocery,10)%>%image

g_rule=apriori(data=grocery,parameter = list(supp=0.005,conf=0.2,target="rules"))

inspect(head(sort(g_rule, by = "lift")))

plot(g_rule)

head(quality(g_rule))

plot(g_rule, method = "grouped")

plot(g_rule,method = "scatterplot")

plot(g_rule,method = "graph")

inspect(g_rule[1:5])

(g_rule%>%sort(by="lift"))[1:10]%>%inspect

berryrules <- subset(g_rule, items %in% "berries")

inspect(berryrules)

b_rule=as(berryrules,"data.frame")

str(b_rule)

b_rule

# Import my dataset

df <- read.csv("majestic_million.csv")

head(df)

df <- df[complete.cases(df), ] # Drop missing values

# Change Domain and IDN_Domain columns to factors

# Factors are the data objects which are used to categorize the data and store it as levels.

df %>% mutate(Domain = as.factor(Domain),

IDN_Domain = as.factor(IDN_Domain))

# MBA analysis

# From package arules

tr <- read.transactions('majestic_million.csv', format = 'basket', sep=',')

summary(tr)

# plot the frequency of items

itemFrequencyPlot(tr)

itemFrequencyPlot(tr,topN=20,type="absolute",col=brewer.pal(8,'Pastel2'), main="Absolute Item Frequency Plot")

arules::itemFrequencyPlot(tr,

topN=20,

col=brewer.pal(8,'Pastel2'),

main='Relative Item Frequency Plot',

type="relative",

ylab="Item Frequency (Relative)")

# Generate the a priori rules

association.rules <- apriori(tr, parameter = list(supp=0.001, conf=0.8,maxlen=10))

summary(association.rules)

inspect(association.rules[1:10]) # Top 10 association rules

# Select rules which are subsets of larger rules -> Remove rows where the sums of the subsets are > 1

subset.rules <- which(colSums(is.subset(association.rules, association.rules)) > 1) # get subset rules in vector

# What did customers buy before buying "METAL"

metal.association.rules <- apriori(tr, parameter = list(supp=0.001, conf=0.8),appearance = list(default="lhs",rhs="METAL"))

inspect(head(metal.association.rules))

# What did customers buy after buying "METAL"

metal.association.rules2 <- apriori(tr, parameter = list(supp=0.001, conf=0.8),appearance = list(lhs="METAL",default="rhs"))

inspect(head(metal.association.rules2))

# Plotting

# Filter rules with confidence greater than 0.4 or 40%

subRules<-association.rules[quality(association.rules)$confidence>0.4]

#Plot SubRules

plot(subRules)

# Top 10 rules viz

top10subRules <- head(subRules, n = 10, by = "confidence")

plot(top10subRules, method = "graph", engine = "htmlwidget")

# Filter top 20 rules with highest lift

# Paralell Coordinates plot - visualize which products along with which items cause what kind of sales.

# Closer arrows re bought together

subRules2<-head(subRules, n=20, by="lift")

plot(subRules2, method="paracoord")

Quiz 7

setwd("C:/Users/ranjith/Desktop/Data Science/")

getwd()

# Import test data

data=read.csv("majestic_million.csv")

head(data) # display the first 6 observations

print(summary(data))

scatter.smooth(x=data$GlobalRank, y=data$TldRank, main="GlobalRank ~ TldRank")# scatterplot

linearMod <- lm(TldRank ~ GlobalRank, data=data) # build linear regression model on full data

print(linearMod)

print(summary(linearMod))

plot(linearMod$residuals, pch = 16, col = "red")

modelSummary <- summary(linearMod) # capture model summary as an object

modelCoeffs <- modelSummary$coefficients # model coefficients

beta.estimate <- modelCoeffs["GlobalRank", "Estimate"] # get beta estimate for speed

std.error <- modelCoeffs["GlobalRank", "Std. Error"] # get std.error for speed

t_value <- beta.estimate/std.error # calc t statistic

p_value <- 2*pt(-abs(t_value), df=nrow(cars)-ncol(cars)) # calc p Value

f_statistic <- linearMod$fstatistic[1] # fstatistic

f <- summary(linearMod)$fstatistic # parameters for model p-value calc

model_p <- pf(f[1], f[2], f[3], lower=FALSE)

# For model comparison, the model with the lowest AIC and BIC score is preferred.

AIC(linearMod)

BIC(linearMod)

# Build the model on training data -

trainingData=read.csv("majestic_million.csv")

testData=read.csv("majestic_million.csv")

lmMod <- lm(TldRank ~ GlobalRank, data=trainingData) # build the model

distPred <- predict(lmMod, testData) # predict distance

summary (lmMod)

summary (distPred)

# calculate prediction accuracy and error rates

#actuals_preds <- data.frame(cbind(actuals=testData$dist, predicteds=distPred)) # make actuals_predicteds dataframe.

#correlation_accuracy <- cor(actuals_preds) #

#print(head(actuals_preds))

# second dataset, many variables, but all numeric

Top1Million=read.csv("majestic_million.csv")

lmTop1Million = lm(TldRank~., data = Top1Million) #Create the linear regression

summary(lmTop1Million) #Review the results

plot(lmTop1Million$residuals, pch = 16, col = "red")

modelSummary <- summary(lmTop1Million) # capture model summary as an object

modelCoeffs <- modelSummary$coefficients # model coefficients

beta.estimate <- modelCoeffs["GlobalRank", "Estimate"] # get beta estimate for speed

std.error <- modelCoeffs["GlobalRank", "Std. Error"] # get std.error for speed

t_value <- beta.estimate/std.error # calc t statistic

p_value <- 2*pt(-abs(t_value), df=nrow(cars)-ncol(cars)) # calc p Value

f_statistic <- lmTop1Million$fstatistic[1] # fstatistic

f <- summary(lmTop1Million)$fstatistic # parameters for model p-value calc

model_p <- pf(f[1], f[2], f[3], lower=FALSE)

# For model comparison, the model with the lowest AIC and BIC score is preferred.

AIC(lmTop1Million)

BIC(lmTop1Million)

# Build the model on training data -

Top1Million=read.csv("majestic_million.csv")

library(dplyr)

train<-sample_frac(Top1Million, 0.5)

sid<-as.numeric(rownames(train)) # because rownames() returns character

test<-Top1Million[-sid,]

lmMod <- lm(GlobalRank ~ ., data=train) # build the model

plot(lmMod$residuals, pch = 16, col = "red")

distPred <- predict(lmMod, test) # predict distance

summary (lmMod)

# Import test data

data=read.csv("majestic_million.csv")

head(data) # display the first 6 observations

print(summary(data))

reg <- lm(GlobalRank ~ RefSubNets, data=data)

reg1 <- lm(GlobalRank ~ RefIPs + RefSubNets, data=data)

reg2 <- lm(GlobalRank ~ . , data=data)

print(reg)

print(summary(reg))

print(summary(reg$coefficients))

plot(reg$residuals, pch = 16, col = "red")

library(ggplot2)

qplot(x = GlobalRank, data = data)

qplot(x = TldRank, data = data)

attributes(reg)

print(summary(reg$model))

print(summary(reg$coefficients))

modelSummary <- summary(reg)

modelCoeffs <- modelSummary$coefficients # model coefficients

beta.estimate <- modelCoeffs["GlobalRank", "Estimate"] # get beta estimate for speed

std.error <- modelCoeffs["GlobalRank", "Std. Error"] # get std.error for speed

t_value <- beta.estimate/std.error # calc t statistic

p_value <- 2*pt(-abs(t_value), df=nrow(cars)-ncol(cars)) # calc p Value

f_statistic <- reg$fstatistic[1] # fstatistic

f <- summary(reg)$fstatistic # parameters for model p-value calc

model_p <- pf(f[1], f[2], f[3], lower=FALSE)

# For model comparison, the model with the lowest AIC and BIC score is preferred.

AIC(reg)

BIC(reg)

library(ggplot2)

qplot(x = GlobalRank, data = data)

qplot(x = TldRank, data = data)

Quiz 8

setwd("C:/Users/ranjith/Desktop/Data Science/")

getwd()

# Import test data

data=read.csv("majestic_million.csv")

head(data) # display the first 6 observations

print(summary(data))

# Check distribution between the 2 binary classes

xtabs(~DomainInfo + GlobalRank, data = data)

xtabs(~DomainInfo + TldRank, data = data)

#First, we convert GlobalRank to a factor to indicate that GlobalRank should be treated as a categorical variable.

data$GlobalRank <- factor(data$GlobalRank)

logit <- glm(DomainInfo ~ RefSubNets + TldRank + GlobalRank, data = data, family = "binomial")

summary(logit)

# We can use the confint function to obtain confidence intervals for the coefficient estimates.

## CIs using profiled log-likelihood

confint(logit)

## CIs using standard errors

confint.default(logit)

#To get the exponentiated coefficients, you tell R that you want to exponentiate (exp)

## odds ratios only

exp(coef(logit))

## odds ratios and 95% CI

exp(cbind(OR = coef(logit), confint(logit)))

#Now for example, we can say that for a one unit increase in TldRank, the odds of

#being Domainted to graduate school (versus not being Domainted) increase by a factor of 2.17.

with(logit, null.deviance - deviance)

data=read.csv("majestic_million.csv")

head(data) # display the first 6 observations

print(summary(data))

names(data)

# Histogram of variables

par(mfrow=c(1,13))

for(i in 1:13) {

hist(data[,i], main=names(data)[i])

}

install.packages("Amelia")

library(Amelia)

library(mlbench)

# Check missing data

missmap(data, col=c("blue", "red"), legend=FALSE)

#featutes/target distribution

pairs(data, col=data$DomainInfo)

# Logistics ReRefSubNetsssion

glm.fit <- glm(DomainInfo ~ RefSubNets + TldRank + GlobalRank, data = data, family = "binomial")

summary(glm.fit)

glm.probs <- predict(glm.fit,type = "response")

glm.probs[1:10]

train.raw <- read.csv('majestic_million.csv',header=T,na.strings=c(""))

#Now we need to check for missing values and look how many unique values there are for each variable using the sapply() function which applies the

#function passed as argument to each column of the dataframe.

sapply(train.raw,function(x) mean(is.na(x)))

missmap(train.raw, main = "Missing values vs observed")

Quiz9

library(readxl)

library(tm)

library(wordcloud)

library(e1071)

library(gmodels)

require(quanteda)#natural language processing package

require(RColorBrewer)

require(ggplot2)

setwd("C:/Users/ranjith/Desktop/Data Science/")

getwd()

GlobalRank <- read.csv("majestic_million.csv", header = FALSE, stringsAsFactors = FALSE)

GlobalRank <- GlobalRank[,1:2]

colnames(GlobalRank) <- c("Type", "Text")

GlobalRank$Type <- factor(GlobalRank$Type) #attaching the class labels to the corpus message text

str(GlobalRank)

# Check the number of spam and ham messages

table(GlobalRank$Type)

prop.table(table(GlobalRank$Type))

library(tm)

sms_corpus <- VCorpus(x = VectorSource(GlobalRank$Text))

# Print corpus

sms_corpus

# Check the text in some messages and their type

lapply(sms_corpus[5:8], as.character)

GlobalRank$Type[5:8]

corpus_clean <- sms_corpus

# Remove Numbers

corpus_clean <- tm_map(x = corpus_clean, FUN = removeNumbers)

corpus_clean <- tm_map(x = corpus_clean, FUN = removePunctuation)

# Print stopwords()

stopwords()

# Remove stop words

corpus_clean <- tm_map(x = corpus_clean, FUN = removeWords, stopwords())

# Install SnowballC package

#install.packages("SnowballC")

# Load package

library(SnowballC)

# Test the function

wordStem(words = c("cooks", "cooking", "cooked"))

# Stem words in corpus

corpus_clean <- tm_map(x = corpus_clean, FUN = stemDocument)

# Remove extra white spaces

corpus_clean <- tm_map(x = corpus_clean, FUN = stripWhitespace)

# Create Document Term Matrix

DTM <- DocumentTermMatrix(x = corpus_clean)

## Create training and test set

# Create Training Set

DTM_train <- DTM[1:round(nrow(DTM)*0.80, 0), ]

# Create Test Set

DTM_test <- DTM[(round(nrow(DTM)*0.80, 0)+1):nrow(DTM), ]

# Create vectors with labels for the training and test set

train_labels <- GlobalRank[1:round(nrow(GlobalRank)*0.80, 0), ]$Type

test_labels <- GlobalRank[(round(nrow(GlobalRank)*0.80, 0)+1):nrow(DTM), ]$Type

# Check proportion of ham and spam is similar on the training and test set

prop.table(table(train_labels))

prop.table(table(test_labels))

library(wordcloud)

# Create wordcloud for the whole dataset

wordcloud(words = corpus_clean,

min.freq = 100, # minimum number of times a word must be present before it appears

random.order = FALSE, # Arrange most frequent words to be in the center of the word cloud

color = (colors = c("#4575b4","#74add1","#abd9e9","#e0f3f8","#fee090","#fdae61","#f46d43","#d73027")) # Colour gradient for the font

)

threshold <- 0.1 # in %

min_freq = round(DTM$nrow*(threshold/100),0) # calculate minimum frequency

min_freq

# Create vector of most frequent words

frequent_words <- findFreqTerms(x = DTM, lowfreq = min_freq)

str(frequent_words)

# Filter DTM to only have most frequent words

DTM_train_most_frequent <- DTM_train[, frequent_words]

DTM_test_most_frequent <- DTM_test[, frequent_words]

dim(DTM_train_most_frequent)

is_present <- function(x) {

x <- ifelse(test = x > 0, yes = "Yes", no = "No")

}

x <- is_present(c(1, 0, 3, 4, 0, 0))

x

DTM_train_most_frequent <- apply(X = DTM_train_most_frequent,

MARGIN = 2,

FUN = is_present)

DTM_test_most_frequent <- apply(X = DTM_test_most_frequent,

MARGIN = 2,

FUN = is_present)

library(e1071)

IDN_TLD <- naiveBayes(x = DTM_train_most_frequent, y = train_labels)

IDN_TLD$tables$com

IDN_TLD$tables$org

IDN_TLD$tables$eu

test_predictions <- predict(object = IDN_TLD, newdata = DTM_test_most_frequent)

library(caret)

confusionMatrix(data = test_predictions, reference = test_labels, positive = "spam", dnn = c("Prediction", "Actual"))

library(e1071)

IDN_TLD_with_LE <- naiveBayes(x = DTM_train_most_frequent,

y = train_labels,

laplace = 1

)

test_predictions_with_LE <- predict(object = IDN_TLD_with_LE, newdata = DTM_test_most_frequent)

library(caret)

confusionMatrix(data = test_predictions_with_LE, reference = test_labels, positive = "spam", dnn = c("Prediction", "Actual"))

CM <- confusionMatrix(data = test_predictions, reference = test_labels);

naive_Accuracy <- round(CM$overall[["Accuracy"]], 4)*100

CM_LE <- confusionMatrix(data = test_predictions_with_LE, reference = test_labels);

naive_LE_Accuracy <- round(CM_LE$overall[["Accuracy"]], 4)*100

data.frame(`Without Laplace Estimator` = naive_Accuracy,

`With Laplace Estimator` = naive_LE_Accuracy,

row.names = c("Accuracy"))

library(tidyverse)

library(gmodels) # Crosstable

library(tm)

library(wordcloud)

library(e1071)

data.path <- "C:/Users/ranjith/Desktop/Data Science/"

txt.files <- list.files(path=data.path, full.names = T, recursive = T)

txt.files <- txt.files[str_detect(txt.files, "(majestic_million.csv)")]

LoadData <- function(file_list) {

tables <- lapply(file_list, ReadFile)

data.frame <- do.call(rbind, tables)

data.frame$Domain <- factor(data.frame$Domain, levels = c(0, 1),

labels = c("org","com"))

return(data.frame)

}

ReadFile <- function(majestic_million.csv) {

table <- read_delim(majestic_million.csv,

delim = "\t",

col_names = c("TldRank", "Domain"),

quote = "")

file.name <- str_split(majestic_million.csv, "/", simplify = TRUE)[2]

table['source'] <- str_split(file.name, "_", simplify = TRUE)[1]

return(table)

}

data.frame <- LoadData(txt.files)

set.seed(1985)

data.frame <- data.frame[order(runif(n=3000)),]

## Exploring the data

str(data.frame)

summary(data.frame)

head(data.frame)

corpus <- Corpus(VectorSource(data.frame$TldRank))

inspect(corpus[1:5])

#Before splitting the TldRank into words, we will need to perform some common cleaning steps in order to remove punctuation and other characters that may clutter the result. For example, we would like to count hello!, HELLO..., and Hello as instances of the word hello.

clean.corpus <- tm_map(corpus, tolower)

clean.corpus <- tm_map(clean.corpus, removeNumbers)

# Second...

clean.corpus <- tm_map(clean.corpus, removeWords, stopwords())

clean.corpus <- tm_map(clean.corpus, removePunctuation)

clean.corpus <- tm_map(clean.corpus, stripWhitespace)

inspect(clean.corpus[1:5])

clean.corpus.dtm <- DocumentTermMatrix(clean.corpus)

n <- nrow(data.frame)

raw.TldRank.train <- data.frame[1:round(.8 * n),]

raw.TldRank.test <- data.frame[(round(.8 * n)+1):n,]

nn <- length(clean.corpus)

clean.corpus.train <- clean.corpus[1:round(.8 * nn)]

clean.corpus.test <- clean.corpus[(round(.8 * nn)+1):nn]

nnn <- nrow(clean.corpus.dtm)

clean.corpus.dtm.train <- clean.corpus.dtm[1:round(.8 * nnn),]

clean.corpus.dtm.test <- clean.corpus.dtm[(round(.8 * nnn)+1):nnn,]

wordcloud(clean.corpus.train, min.freq = 30, random.order = FALSE)

com <- subset(raw.TldRank.train, Domain == "com")

org <- subset(raw.TldRank.train, Domain == "org")

wordcloud(com$TldRank, max.words = 30, scale = c(3, 0.5))

wordcloud(org$TldRank, max.words = 30, scale = c(3, 0.5))

freq.terms <- findFreqTerms(clean.corpus.dtm.train, 3)

clean.corpus.dtm.freq.train <- DocumentTermMatrix(clean.corpus.train, list(dictionary = freq.terms))

clean.corpus.dtm.freq.test <- DocumentTermMatrix(clean.corpus.test, list(dictionary = freq.terms))

convert_counts <- function(x) {

x <- ifelse(x > 0, 1, 0)

x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))

return(x)

}

clean.corpus.dtm.freq.train <- apply(clean.corpus.dtm.freq.train, MARGIN = 2, convert_counts)

clean.corpus.dtm.freq.test <- apply(clean.corpus.dtm.freq.test, MARGIN = 2, convert_counts)

TldRank.classifer <- naiveBayes(clean.corpus.dtm.freq.train, raw.TldRank.train$Domain)

TldRank.pred <- predict(TldRank.classifer, clean.corpus.dtm.freq.test)

CrossTable(TldRank.pred, raw.TldRank.test$Domain,

prop.chisq = FALSE,

prop.t = FALSE,

dnn = c('predicted', 'actual'))

TldRank.classifer.imp <- naiveBayes(clean.corpus.dtm.freq.train,

raw.TldRank.train$Domain,

laplace = 1)

TldRank.pred.imp <- predict(TldRank.classifer.imp,

clean.corpus.dtm.freq.test)

CrossTable(TldRank.pred.imp, raw.TldRank.test$Domain,

prop.chisq = FALSE,

prop.t = FALSE,

dnn = c('predicted', 'actual'))