# top of page

# R Code

## Set the working directory and load the Bigmart train/test CSV files

# NOTE(review): absolute, machine-specific path. The relative file names below
# (and the CSV written at the end of the script) resolve against it.
setwd("E:/Business Analytics (sem-2)/New Project")

train_data <- read.csv(file = "Train_UWu5bXk.csv")
test_data <- read.csv(file = "Test_u94Q5KV.csv")

# Show the active working directory.
getwd()

 

## Data visualization
library(ggplot2)

# Scatter plot: outlet sales against item visibility.
ggplot(
  data = train_data,
  mapping = aes(x = Item_Visibility, y = Item_Outlet_Sales)
) +
  geom_point(size = 2.5, color = "orange") +
  labs(title = "Scatter Plot", x = "Item Visibility", y = "Item Outlet Sales")

# Bar plot: outlet sales per item type (x labels rotated so they fit).
ggplot(train_data, aes(x = Item_Type, y = Item_Outlet_Sales)) +
  geom_bar(stat = "identity") +
  labs(title = "Bar Plot") +
  theme(axis.text.x = element_text(angle = 70, vjust = 0.5, color = "navy"))

# Box plot: distribution of Item_MRP within each item type.
ggplot(train_data, aes(x = Item_Type, y = Item_MRP)) +
  geom_boxplot() +
  labs(title = "Box Plot") +
  theme(axis.text.x = element_text(angle = 70, vjust = 0.5, color = "red"))

 

library(VIM)

## Missing value plot for the training data
aggr_plot <- aggr(
  train_data,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(train_data),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)

 

library(corrgram)

## Correlation plot of the training data (shaded lower panel only)
corrgram(
  train_data,
  order = NULL,
  lower.panel = panel.shade,
  upper.panel = NULL,
  text.panel = panel.txt,
  main = "Bigmart Sales Data\n\nCorrelation plot"
)

# Smoothed density scatter of outlet sales against item MRP.
smoothScatter(
  x = train_data$Item_MRP,
  y = train_data$Item_Outlet_Sales,
  main = "scatter Plot"
)

 

## Missing value imputation for continuous variable

# Train: fill missing Item_Weight with the median of the observed train weights.
train_data$Item_Weight <- replace(
  train_data$Item_Weight,
  is.na(train_data$Item_Weight),
  median(train_data$Item_Weight, na.rm = TRUE)
)
table(is.na(train_data$Item_Weight))

# Test: same treatment, using the median of the observed test weights.
test_data$Item_Weight <- replace(
  test_data$Item_Weight,
  is.na(test_data$Item_Weight),
  median(test_data$Item_Weight, na.rm = TRUE)
)
table(is.na(test_data$Item_Weight))

 

## Missing value imputation for categorical variable (train data)

# Inspect the raw Outlet_Size values and their storage class.
table(train_data$Outlet_Size)
class(train_data$Outlet_Size)

# Recode empty strings as NA so they count as missing, then store the column
# as a factor.
train_data$Outlet_Size <- as.character(train_data$Outlet_Size)
train_data$Outlet_Size[!nzchar(train_data$Outlet_Size)] <- NA_character_
train_data$Outlet_Size <- factor(train_data$Outlet_Size)
summary(train_data$Outlet_Size)

# k-nearest-neighbour imputation (k = 5) over the whole training frame.
library(VIM)
train_data <- kNN(train_data, k = 5)
summary(train_data)
head(train_data)

# Keep only the original columns, dropping anything kNN() appended
# (VIM adds *_imp indicator columns by default).
train_data <- subset(
  train_data,
  select = Item_Identifier:Item_Outlet_Sales
)
names(train_data)
summary(train_data)

# Confirm no missing values remain.
table(is.na(train_data))

 

## Missing value imputation for categorical variable (test data)

# Inspect the raw Outlet_Size values and their storage class.
table(test_data$Outlet_Size)
class(test_data$Outlet_Size)

# Recode empty strings as NA so they count as missing, then store the column
# as a factor.
test_data$Outlet_Size <- as.character(test_data$Outlet_Size)
test_data$Outlet_Size[!nzchar(test_data$Outlet_Size)] <- NA_character_
test_data$Outlet_Size <- factor(test_data$Outlet_Size)
summary(test_data$Outlet_Size)

# k-nearest-neighbour imputation (k = 5) over the whole test frame.
test_data <- kNN(test_data, k = 5)
summary(test_data)
head(test_data)

# Keep only the original columns, dropping anything kNN() appended
# (VIM adds *_imp indicator columns by default).
test_data <- subset(
  test_data,
  select = Item_Identifier:Outlet_Type
)
names(test_data)
summary(test_data)

# Confirm no missing values remain.
table(is.na(test_data))

 

## Rename the observations: harmonise Item_Fat_Content spellings
library(plyr)

# "LF" and "low fat" become "Low Fat"; "reg" becomes "Regular".
train_data$Item_Fat_Content <- revalue(
  train_data$Item_Fat_Content,
  c("LF" = "Low Fat", "reg" = "Regular", "low fat" = "Low Fat")
)

# Same recoding for the test data.
test_data$Item_Fat_Content <- revalue(
  test_data$Item_Fat_Content,
  c("LF" = "Low Fat", "reg" = "Regular", "low fat" = "Low Fat")
)

 

## Create a new variable: years between outlet establishment and 2013
train_data$Age_of_Year <- with(train_data, 2013 - Outlet_Establishment_Year)
test_data$Age_of_Year <- with(test_data, 2013 - Outlet_Establishment_Year)

# Drop variables not required in modeling: the two identifier columns and the
# establishment year that Age_of_Year was derived from.
library(dplyr)

train_data <- train_data %>%
  select(-Item_Identifier, -Outlet_Identifier, -Outlet_Establishment_Year)

test_data <- test_data %>%
  select(-Item_Identifier, -Outlet_Identifier, -Outlet_Establishment_Year)

 

## Impute 0 in Item_Visibility

# A visibility of exactly 0 is treated as a missing value. The replacement is
# the median of the training column, computed before any replacement (the same
# value the script used previously). It is reused for the test set so that the
# data passed to predict() is preprocessed the same way as the training data;
# previously only train_data was imputed and test_data kept its zeros.
visibility_median <- median(train_data$Item_Visibility)

train_data$Item_Visibility[train_data$Item_Visibility == 0] <- visibility_median
test_data$Item_Visibility[test_data$Item_Visibility == 0] <- visibility_median

# Distribution of the imputed training values, raw and square-root transformed.
boxplot(train_data$Item_Visibility)

sqrt_visi <- sqrt(train_data$Item_Visibility)
boxplot(sqrt_visi)

 

## Regression model

# Linear model of outlet sales on all remaining columns.
linear_model <- lm(formula = Item_Outlet_Sales ~ ., data = train_data)
summary(linear_model)
plot(linear_model)

# Training RMSE of the linear model.
library(Metrics)
rmse(actual = train_data$Item_Outlet_Sales, predicted = fitted(linear_model))

# Same predictors with a log-transformed response.
log_linear_model <- lm(formula = log(Item_Outlet_Sales) ~ ., data = train_data)
summary(log_linear_model)
plot(log_linear_model)

# Fitted values are on the log scale; exponentiate them so the RMSE is
# computed against sales in their original units.
rmse(
  actual = train_data$Item_Outlet_Sales,
  predicted = exp(fitted(log_linear_model))
)

 

## Decision tree model
library(rpart)
library(e1071)
library(rpart.plot)
library(caret)

# Tree control parameters: 5-fold cross-validation over a grid of complexity
# parameter values 0.01, 0.02, ..., 0.50.
fitControl <- trainControl(method = "cv", number = 5)
cartGrid <- expand.grid(.cp = seq_len(50) * 0.01)

# Decision tree tuned through caret.
tree_model <- train(
  Item_Outlet_Sales ~ .,
  data = train_data,
  method = "rpart",
  trControl = fitControl,
  tuneGrid = cartGrid
)
print(tree_model)

 

## Regression tree fitted directly with rpart at cp = 0.01
main_tree <- rpart(
  Item_Outlet_Sales ~ .,
  data = train_data,
  control = rpart.control(cp = 0.01)
)

# Complexity-parameter table and plot, followed by a plot of the tree.
printcp(main_tree)
plotcp(main_tree)
prp(main_tree)

# In-sample predictions and training RMSE.
pre_score <- predict(main_tree, type = "vector")
rmse(train_data$Item_Outlet_Sales, pre_score)

# rpart.plot() draws and labels the tree itself. text() on an rpart object is
# meant to annotate a tree drawn with plot(), which this script never calls, so
# the former text(main_tree, use.n = TRUE, cex = .8) call is replaced by
# extra = 1, which prints the number of observations in each node.
rpart.plot(
  main_tree,
  uniform = TRUE,
  extra = 1,
  main = "Regression Tree for Item_Outlet_Sales"
)

 

## Final prediction on Bigmart test data

# Score the test set with the rpart tree and attach the predictions.
main_predict <- predict(object = main_tree, newdata = test_data, type = "vector")
test_data[["Item_Outlet_Sales"]] <- main_predict
names(test_data)

# NOTE(review): Item_Identifier and Outlet_Identifier were dropped from
# test_data earlier in the script, so the exported file holds the remaining
# feature columns plus the predicted sales only.
write.csv(x = test_data, file = "Bigmart Predicted sales(test_data).csv")

# bottom of page