# R Code
## Set the working directory and load the Bigmart train/test CSV files.
## NOTE: the path below is machine-specific; relative file names used later in
## the script (reads and the final write) resolve against this directory.
setwd(dir = "E:/Business Analytics (sem-2)/New Project")
getwd()  # show the directory the CSV files are read from

train_data <- read.csv(file = "Train_UWu5bXk.csv")
test_data <- read.csv(file = "Test_u94Q5KV.csv")
## Data visualization
library(ggplot2)

# Scatter plot: outlet sales against item visibility.
ggplot(
  data = train_data,
  mapping = aes(x = Item_Visibility, y = Item_Outlet_Sales)
) +
  geom_point(size = 2.5, color = "orange") +
  labs(title = "Scatter Plot", x = "Item Visibility", y = "Item Outlet Sales")

# Bar plot: outlet sales stacked per item type (x labels rotated to fit).
ggplot(
  data = train_data,
  mapping = aes(x = Item_Type, y = Item_Outlet_Sales)
) +
  geom_bar(stat = "identity") +
  ggtitle("Bar Plot") +
  theme(axis.text.x = element_text(angle = 70, vjust = 0.5, color = "navy"))

# Box plot: distribution of Item_MRP within each item type.
ggplot(
  data = train_data,
  mapping = aes(x = Item_Type, y = Item_MRP)
) +
  geom_boxplot() +
  ggtitle("Box Plot") +
  theme(axis.text.x = element_text(angle = 70, vjust = 0.5, color = "red"))
library(VIM)

## Missing-value plot for the training data.
## Only values that are NA at this point are counted as missing.
aggr_plot <- aggr(
  train_data,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(train_data),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)
library(corrgram)

## Correlation plot: shaded lower triangle only, variable names on the diagonal.
## The title is a two-line string, so the literal line break is intentional.
corrgram(
  train_data,
  order = NULL,
  lower.panel = panel.shade,
  upper.panel = NULL,
  text.panel = panel.txt,
  main = "Bigmart Sales Data
Correlation plot"
)

# Smoothed density scatter of outlet sales against Item_MRP.
smoothScatter(
  x = train_data$Item_MRP,
  y = train_data$Item_Outlet_Sales,
  main = "scatter Plot"
)
## Missing-value imputation for the continuous variable Item_Weight:
## NAs are replaced by the median of the observed weights of the same data set.
train_data$Item_Weight <- replace(
  train_data$Item_Weight,
  is.na(train_data$Item_Weight),
  median(train_data$Item_Weight, na.rm = TRUE)
)
table(is.na(train_data$Item_Weight))  # check: no NA should remain

test_data$Item_Weight <- replace(
  test_data$Item_Weight,
  is.na(test_data$Item_Weight),
  median(test_data$Item_Weight, na.rm = TRUE)
)
table(is.na(test_data$Item_Weight))  # check: no NA should remain
## Missing-value imputation for the categorical variable Outlet_Size (train)
table(train_data$Outlet_Size)
class(train_data$Outlet_Size)

# Blank strings stand for missing outlet sizes: recode them to NA and store
# the column as a factor.
train_data$Outlet_Size <- as.character(train_data$Outlet_Size)
train_data$Outlet_Size[which(train_data$Outlet_Size == "")] <- NA
train_data$Outlet_Size <- as.factor(train_data$Outlet_Size)
summary(train_data$Outlet_Size)

# k-nearest-neighbour imputation (k = 5) of the remaining missing values.
library(VIM)
train_data <- kNN(train_data, k = 5)
summary(train_data)
head(train_data)

# Keep only the contiguous column range Item_Identifier .. Item_Outlet_Sales,
# which discards any extra columns appended by kNN().
train_data <- train_data[
  ,
  match("Item_Identifier", names(train_data)):
    match("Item_Outlet_Sales", names(train_data)),
  drop = FALSE
]
names(train_data)
summary(train_data)
table(is.na(train_data))  # check: no NA should remain
## Missing-value imputation for the categorical variable Outlet_Size (test)
table(test_data$Outlet_Size)
class(test_data$Outlet_Size)

# Blank strings stand for missing outlet sizes: recode them to NA and store
# the column as a factor.
test_data$Outlet_Size <- as.character(test_data$Outlet_Size)
test_data$Outlet_Size[which(test_data$Outlet_Size == "")] <- NA
test_data$Outlet_Size <- as.factor(test_data$Outlet_Size)
summary(test_data$Outlet_Size)

# k-nearest-neighbour imputation (k = 5) of the remaining missing values.
test_data <- kNN(test_data, k = 5)
summary(test_data)
head(test_data)

# Keep only the contiguous column range Item_Identifier .. Outlet_Type,
# which discards any extra columns appended by kNN().
test_data <- test_data[
  ,
  match("Item_Identifier", names(test_data)):
    match("Outlet_Type", names(test_data)),
  drop = FALSE
]
names(test_data)
summary(test_data)
table(is.na(test_data))  # check: no NA should remain
## Rename the observations: harmonise the Item_Fat_Content labels
library(plyr)

# Map the spelling variants "LF" / "low fat" to "Low Fat" and "reg" to
# "Regular". The two revalue() passes are applied in the same order to both
# data sets.
harmonise_fat_content <- function(fat_content) {
  fat_content <- revalue(fat_content, c("LF" = "Low Fat", "reg" = "Regular"))
  revalue(fat_content, c("low fat" = "Low Fat"))
}

train_data$Item_Fat_Content <- harmonise_fat_content(train_data$Item_Fat_Content)
test_data$Item_Fat_Content <- harmonise_fat_content(test_data$Item_Fat_Content)
## Create a new variable: Age_of_Year = 2013 minus the outlet's
## establishment year.
train_data$Age_of_Year <- with(train_data, 2013 - Outlet_Establishment_Year)
test_data$Age_of_Year <- with(test_data, 2013 - Outlet_Establishment_Year)

# Drop variables not required in modelling (the identifiers, and the
# establishment year that Age_of_Year now replaces).
library(dplyr)
train_data <- select(
  train_data,
  -Item_Identifier, -Outlet_Identifier, -Outlet_Establishment_Year
)
test_data <- select(
  test_data,
  -Item_Identifier, -Outlet_Identifier, -Outlet_Establishment_Year
)
# Impute zeros in Item_Visibility.
# A visibility of exactly 0 is treated as a placeholder for a missing value and
# is replaced by the median visibility of the training data (computed over all
# training rows, zeros included, as before).
# The same training-derived value is applied to test_data as well, so the data
# used for prediction goes through the same preprocessing as the data the
# models are fitted on.
visibility_median <- median(train_data$Item_Visibility)

train_data$Item_Visibility <- ifelse(
  train_data$Item_Visibility == 0,
  visibility_median,
  train_data$Item_Visibility
)
test_data$Item_Visibility <- ifelse(
  test_data$Item_Visibility == 0,
  visibility_median,
  test_data$Item_Visibility
)

# Inspect the distribution on the raw and on the square-root scale.
# sqrt_visi is only plotted here; it is not written back to train_data.
boxplot(train_data$Item_Visibility)
sqrt_visi <- sqrt(train_data$Item_Visibility)
boxplot(sqrt_visi)
## Regression models
library(Metrics)

# Linear model of Item_Outlet_Sales on all remaining columns of train_data.
linear_model <- lm(formula = Item_Outlet_Sales ~ ., data = train_data)
summary(linear_model)
plot(linear_model)  # diagnostic plots
# In-sample RMSE.
rmse(
  actual = train_data$Item_Outlet_Sales,
  predicted = fitted(linear_model)
)

# Same predictors with a log-transformed response.
log_linear_model <- lm(formula = log(Item_Outlet_Sales) ~ ., data = train_data)
summary(log_linear_model)
plot(log_linear_model)  # diagnostic plots
# Fitted values are on the log scale: exponentiate before computing the RMSE
# so it is comparable with the plain linear model.
rmse(
  actual = train_data$Item_Outlet_Sales,
  predicted = exp(fitted(log_linear_model))
)
## Decision tree model
library(rpart)
library(e1071)
library(rpart.plot)
library(caret)

# Tree control parameters: 5-fold cross-validation over a grid of complexity
# parameters cp = 0.01, 0.02, ..., 0.50.
fitControl <- trainControl(method = "cv", number = 5)
cartGrid <- expand.grid(.cp = (1:50) * 0.01)

# Cross-validated search over cp; the printed summary reports the resampled
# performance for each candidate value.
tree_model <- train(
  Item_Outlet_Sales ~ .,
  data = train_data,
  method = "rpart",
  trControl = fitControl,
  tuneGrid = cartGrid
)
print(tree_model)

# Final regression tree, fitted directly with rpart at a fixed cp = 0.01.
main_tree <- rpart(
  Item_Outlet_Sales ~ .,
  data = train_data,
  control = rpart.control(cp = 0.01)
)
printcp(main_tree)
plotcp(main_tree)
prp(main_tree)

# In-sample predictions and RMSE of the tree.
pre_score <- predict(main_tree, type = "vector")
rmse(train_data$Item_Outlet_Sales, pre_score)

# Labelled tree plot. rpart.plot() draws its own node labels; extra = 101 adds
# the number (and percentage) of observations to every node. A separate
# text(main_tree, use.n = TRUE) call is intentionally not made: text() on an
# rpart object is meant to annotate a tree drawn with plot(), not one drawn
# with rpart.plot().
rpart.plot(
  main_tree,
  uniform = TRUE,
  extra = 101,
  main = "Regression Tree for Item_Outlet_Sales"
)
## Final prediction on the Bigmart test data
# Score the test set with the fitted regression tree and attach the
# predictions as a new Item_Outlet_Sales column.
main_predict <- predict(
  object = main_tree,
  newdata = test_data,
  type = "vector"
)
test_data$Item_Outlet_Sales <- main_predict
names(test_data)

# Export the test data together with the predicted sales; the file is written
# relative to the current working directory.
write.csv(x = test_data, file = "Bigmart Predicted sales(test_data).csv")