You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
#' Engineer KPI features for one product sub-category and roll them up by week
#'
#' Adds list price, promotional offer, prepaid-order share, a price-tag
#' segment and media adstock columns to the order-level data, re-bases the
#' week number, dummy-encodes the categorical columns and aggregates
#' everything to one row per week.
#'
#' @param data Order-level data frame. The body reads, among others, the
#'   columns `gmv`, `units`, `product_mrp`, `s1_fact.order_payment_type`,
#'   `Year`, `Month`, `week`, `product_analytic_vertical`, the media spend
#'   columns (`TV`, `Digital`, ...), `NPS Score` and `holiday_count`.
#' @param media_investment Weekly media investment table. Columns 3 onward are
#'   turned into adstock series; it must have one row per entry of `1:53`.
#'   Defaults to the `media_investment_weekly_final` object from the calling
#'   environment, which is what the function used before this argument existed.
#' @param adstock_rate Carry-over rate used by the recursive adstock filter.
#' @return A data frame with one row per (re-based) week holding the
#'   aggregated numeric metrics merged with the weekly sums of the dummy
#'   variables.
product_features <- function(data,
                             media_investment = media_investment_weekly_final,
                             adstock_rate = 0.50) {
  ## 1. KPI - List price for all the products
  data$list_price <- data$gmv / data$units

  ## 2. KPI - Promotional offer for all the products
  data$promotional_offer <-
    (data$product_mrp - data$list_price) / data$product_mrp

  ## 3. KPI - Payment mode indicator (1 = "Prepaid", 0 = anything else)
  data$payment_ind <- ifelse(data$s1_fact.order_payment_type == "Prepaid", 1, 0)

  ## 4. KPI - Prepaid order percentage
  # Total orders placed per Year/Month/week
  total_order <- aggregate(payment_ind ~ Year + Month + week, data, FUN = NROW)
  # Prepaid ("online") orders per Year/Month/week
  online_order <- aggregate(payment_ind ~ Year + Month + week, data = data,
                            FUN = sum)
  order_merged <- merge(total_order, online_order,
                        by = c("Month", "Year", "week"), all.x = TRUE)
  # Share of prepaid orders among all orders
  order_merged$per_order <- order_merged$payment_ind.y / order_merged$payment_ind.x
  # Keep only the keys and the new ratio (drops payment_ind.x / payment_ind.y)
  order_merged <- order_merged[, c("Month", "Year", "week", "per_order")]
  data <- merge(data, order_merged, by = c("Month", "Year", "week"),
                all.x = TRUE)

  ## 5. KPI - Product category (price tag)
  cluster <- aggregate(cbind(units, list_price, product_mrp) ~ product_analytic_vertical,
                       data, mean)

  if (nrow(cluster) > 2) {
    cluster$units_1 <- scale(cluster$units)
    cluster$list_price_1 <- scale(cluster$list_price)
    cluster$product_mrp_1 <- scale(cluster$product_mrp)
    # NOTE(review): dropping columns 1:3 leaves the *unscaled* product_mrp in
    # the k-means input next to the three scaled columns. Kept as-is because
    # changing it alters the segmentation; confirm whether -c(1:4) was meant.
    k1 <- cluster[, -c(1:3)]

    # k-means is randomly initialised; results vary unless the caller sets a seed
    clust <- kmeans(k1, centers = 3, iter.max = 50, nstart = 50)
    cluster$price_tag <- as.factor(clust$cluster)
    cluster <- cluster[, c(1, 8)]

    # Attach the cluster id of each vertical to the order-level data
    data <- merge(data, cluster, by = c("product_analytic_vertical"),
                  all.x = TRUE)

    # Rename the cluster ids by how many orders fall in each: largest cluster
    # is the mass segment, smallest the premium one, the rest aspiring.
    # table() is aligned one-to-one with levels(data$price_tag) and ignores
    # NA rows, so positions cannot drift the way count() row numbers could.
    tag_counts <- as.vector(table(data$price_tag))
    tag_labels <- rep("Aspiring_Product", length(tag_counts))
    tag_labels[tag_counts == max(tag_counts)] <- "Mass_Product"
    tag_labels[tag_counts == min(tag_counts)] <- "Premium_Product"
    levels(data$price_tag) <- tag_labels
  } else {
    # Two verticals or fewer: skip clustering and tag by mean MRP instead
    data$price_tag <- NA
    # The original applied `$product_analytic_vertical` to the atomic
    # `data$price_tag`, which fails at runtime; the vertical column itself is
    # what has to become a factor for the levels() lookups below.
    data$product_analytic_vertical <- factor(data$product_analytic_vertical)

    mean_mrp <- tapply(data$product_mrp, data$product_analytic_vertical, mean)
    aspiring_idx <- if (mean_mrp[[1]] > mean_mrp[[2]]) 1 else 2
    aspiring_level <- levels(data$product_analytic_vertical)[aspiring_idx]

    data$price_tag[which(data$product_analytic_vertical == aspiring_level)] <-
      "Aspiring_Product"
    data$price_tag[is.na(data$price_tag)] <- "Mass_Product"
  }

  ## 6. KPI - Adstock
  # One adstock series per media column (columns 3 onward of the media table)
  adstock_df <- data.frame(week = 1:53)
  for (i in seq_len(ncol(media_investment))[-(1:2)]) {
    adstock_name <- paste0(colnames(media_investment)[i], "_adstock")
    adstock_df[[adstock_name]] <- stats::filter(x = media_investment[i],
                                                filter = adstock_rate,
                                                method = "recursive")
  }
  # Merged on the original week number, i.e. before the re-basing below
  data <- merge(data, adstock_df, by = c("week"), all.x = TRUE)

  ## Converting the data into weekly format
  # The data runs from July-2015 to June-2016, so June-2015 is the base for
  # week numbering: week 26 becomes 1, week 27 becomes 2, and so on; weeks
  # below 26 are shifted by 28 so they continue after the re-based 2015 weeks.
  data$week <- ifelse(data$week >= 26, data$week - 25, data$week + 28)

  # Drop the variables that are not needed any more
  data <- subset(data, select = -c(Month, Year, product_analytic_sub_category,
                                   month_date, Week_date))

  col_numeric <- c("week", "gmv", "units", "deliverybdays", "deliverycdays",
                   "sla", "product_mrp", "product_procurement_sla")
  col_factor <- c("product_analytic_vertical", "s1_fact.order_payment_type",
                  "wday", "is_special_sale_day", "special_sale_day",
                  "price_tag")

  data[col_numeric] <- lapply(data[col_numeric], as.numeric)
  # sapply(as.factor) used to hand back a character matrix, so the columns
  # never became factors. Going through as.character() keeps the alphabetical
  # level order that the dummy coding below ended up with.
  data[col_factor] <- lapply(data[col_factor],
                             function(x) factor(as.character(x)))

  # Dummy variables for the categorical attributes; the first level of each
  # factor is dropped. The sapply() call is kept verbatim because the names of
  # the resulting columns depend on how it combines the per-factor frames.
  df_dummies <- data[, col_factor]
  dummies <- data.frame(sapply(df_dummies, function(x) data.frame(model.matrix(~x-1,data=df_dummies))[,-1]))
  # data[1] is the `week` key
  dummies <- as.data.frame(cbind(data[1], dummies))

  # Weekly totals of the dummy variables
  dummies_aggregate <- aggregate(. ~ week, dummies, sum, na.rm = TRUE)

  # Weekly aggregation of the numeric variables
  data <- data %>%
    group_by(week) %>%
    summarise(
      gmv = sum(gmv),
      units = sum(units),
      deliverybdays = mean(deliverybdays),
      deliverycdays = mean(deliverycdays),
      sla = mean(sla),
      product_mrp = sum(product_mrp),
      product_procurement_sla = mean(product_procurement_sla),
      Total_Investment = mean(Total_Investment),
      TV = mean(TV),
      Digital = mean(Digital),
      Sponsorship = mean(Sponsorship),
      Content_Marketing = mean(Content_Marketing),
      Online_Marketing = mean(Online_Marketing),
      Affiliates = mean(Affiliates),
      SEM = mean(SEM),
      Radio = mean(Radio),
      Other = mean(Other),
      NPS_Score = mean(`NPS Score`),
      holiday_count = mean(holiday_count),
      list_price = sum(list_price),
      promotional_offer = sum(promotional_offer) / length(week),
      per_order = mean(per_order),
      TV_adstock = mean(TV_adstock),
      Digital_adstock = mean(Digital_adstock),
      Sponsorship_adstock = mean(Sponsorship_adstock),
      Content_Marketing_adstock = mean(Content_Marketing_adstock),
      Online_Marketing_adstock = mean(Online_Marketing_adstock),
      Affiliates_adstock = mean(Affiliates_adstock),
      # NOTE(review): "SEM_adtock" is misspelled, but the name is part of the
      # returned table, so it is kept for existing consumers.
      SEM_adtock = mean(SEM_adstock),
      # Was mean(Radio), which repeated the raw spend instead of the adstock
      Radio_adstock = mean(Radio_adstock),
      Other_adstock = mean(Other_adstock)
    )

  # Merge the dummy and the numeric weekly tables into one data frame
  data <- merge(data, dummies_aggregate, by = c("week"), all.x = TRUE)

  return(data)
}
1.2. Calling the “product_features” function for the 3 Product subcategories to create the Engineered variables and
# Also to convert the whole data into weekly format
# (the first call used to sit on the comment line above and therefore never ran)
GamingAccessory_df <- product_features(eleckart_GamingAccessory)
HomeAudio_df <- product_features(eleckart_HomeAudio)
CameraAccessory_df <- product_features(eleckart_CameraAccessory)
2. EDA for Sub-category :: GamingAccessory [gmv Vs Independent variables]
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2788178 149.0 8441055 450.9 11985735 640.2
## Vcells 96339048 735.1 203636514 1553.7 203636514 1553.7
2.1. Response Curves [‘gmv’ w.r.t all the Independent variables]
# Columns kept for the GamingAccessory response curves; the first column kept
# is the one plotted on the y axis as GMV further down.
GA <- GamingAccessory_final[
  ,
  c(2:19, 21:32, 68:82)
]
2.1.1. Plotting the scatter plot of all the Independent variables w.r.t ‘gmv’
# One scatter plot (with a loess smoother) per predictor column of GA against
# the first column, which is labelled GMV. Element k of the list is the plot
# for column k + 1.
# Each lapply() call has its own `i`, so the lazily evaluated aes() mapping
# keeps pointing at the right column without local() and `<<-`.
GA_plots1 <- lapply(seq_len(ncol(GA))[-1], function(i) {
  ggplot(GA, aes(x = GA[, i], y = GA[, 1])) +
    geom_point() +
    geom_smooth(method = "loess") +
    theme_bw() +
    labs(x = paste0("", colnames(GA[i])), y = "GMV")
})
2.1.2. Plotting all the graphs
# Note: rendering the grid takes a few seconds; wait until all nine panels
# have been drawn.
plot_grid(plotlist = GA_plots1[1:9], align = "h")
3. EDA for Sub-category :: HomeAudio [gmv Vs Independent variables]
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3892405 207.9 8441055 450.9 11985735 640.2
## Vcells 98228170 749.5 203636514 1553.7 203636514 1553.7
3.1. Response Curves [‘gmv’ w.r.t all the Independent variables]
# Columns kept for the HomeAudio response curves; the first column kept is the
# one plotted on the y axis as GMV further down.
HA <- HomeAudio_final[
  ,
  c(2:19, 21:32, 63:77)
]
3.1.1. Plotting the scatter plot of all the Independent variables w.r.t ‘gmv’
# One scatter plot (with a loess smoother) per predictor column of HA against
# the first column, which is labelled GMV. Element k of the list is the plot
# for column k + 1.
# Each lapply() call has its own `i`, so the lazily evaluated aes() mapping
# keeps pointing at the right column without local() and `<<-`.
HA_plots1 <- lapply(seq_len(ncol(HA))[-1], function(i) {
  ggplot(HA, aes(x = HA[, i], y = HA[, 1])) +
    geom_point() +
    geom_smooth(method = "loess") +
    theme_bw() +
    labs(x = paste0("", colnames(HA[i])), y = "GMV")
})
3.1.2. Plotting all the graphs
# Note: rendering the grid takes a few seconds; wait until all nine panels
# have been drawn.
plot_grid(plotlist = HA_plots1[1:9], align = "h")