Skip to content

Commit

Permalink
Change from ggplot2 to rbokeh
Browse files Browse the repository at this point in the history
replace the graphic package ggplot with rbokeh
  • Loading branch information
SixiangHu committed Dec 3, 2015
1 parent 7a2404e commit 3796dde
Show file tree
Hide file tree
Showing 11 changed files with 229 additions and 350 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: DataMan
Type: Package
Title: This is an R package for data cleaning and preliminary data analysis
Version: 0.1.2
Version: 0.4
Date: 2015-01-24
Author: Sixiang Hu <[email protected]>
Maintainer: Sixiang Hu <[email protected]>
Expand All @@ -10,9 +10,9 @@ Description: This package provides some simple function used for: Data Cleaning
Analysis - an enhanced summary function to provide more info about variables in
the given data.
Depends:
ggplot2
rbokeh
Imports:
data.table,scales,reshape2,googleVis,grid,gridExtra,Rcpp (>= 0.11.6),dplyr
data.table,Rcpp (>= 0.11.6),dplyr
License: GPL (>= 2)
LinkingTo: Rcpp, RcppArmadillo
URL: https://github.com/SixiangHu/DataMan
Expand Down
21 changes: 8 additions & 13 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,18 @@ export(liftPlot)
export(modelPlot)
export(resiPlot)
export(tree2data)
export(liftPlot)
importFrom(data.table,":=")
importFrom(data.table,as.data.table)
importFrom(data.table,data.table)
importFrom(data.table,setkey)
importFrom(dplyr,"%>%")
importFrom(dplyr,group_by)
importFrom(dplyr,left_join)
importFrom(dplyr,summarise)
importFrom(ggplot2,aes)
importFrom(ggplot2,element_text)
importFrom(ggplot2,ggplot)
importFrom(googleVis,gvisComboChart)
importFrom(grid,grid.newpage)
importFrom(grid,pushViewport)
importFrom(grid,unit.pmax)
importFrom(grid,viewport)
importFrom(gridExtra,grid.arrange)
importFrom(reshape2,melt)
importFrom(scales,date_format)
importFrom(scales,percent)
importFrom(rbokeh,figure)
importFrom(rbokeh,grid_plot)
importFrom(rbokeh,ly_abline)
importFrom(rbokeh,ly_hexbin)
importFrom(rbokeh,ly_hist)
importFrom(rbokeh,ly_lines)
importFrom(rbokeh,ly_points)
170 changes: 48 additions & 122 deletions R/dataPlot.r
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#' dataPlot
#'
#' @description This function allows you to visualise features of a dataset by specifying dependent and response variable.
#' @usage dataPlot(data,xvar,yvar,byvar=NULL,weights=NULL,interactive=FALSE,newGroupNum=10,...)
#' @usage dataPlot(data,xvar,yvar,byvar=NULL,weights=NULL,newGroupNum=10)
#' @param data a data frame.
#' @param xvar either an integer to specify the position of the dependent variable in the data frame,
#' or a character string to indicate the dependent variable name.
Expand All @@ -11,11 +11,9 @@
#' a numerical vector to specify the weights used for calculating weighted average of response,
#' a character string to specify the name of weight variable in the data frame,
#' an integer to specify the position of the weight variable in the data frame.
#' @param interactive logical. Set true to use googleVis package plotting interactively, which can be used in shiny apps. Currently it doesn't work when using "by" method.
#' @param byvar Optinal. either an integer to specify the position of the <by> variable in the data frame,
#' or a character string to indicate the <by> variable name.
#' @param newGroupNum Optional. An integer specifies number of new bands when levels of current plotting variable `xvar` or `by` is more than 100.
#' @param ... xlim and ylim can be used to set the range of the ggplot2 plot. For example, xlim=c(0,1) means restrict the xaxis within (0,1). This does not work for goolgeVis interactive plot because, because, because it is interactive, which you can zoom in and out with your mouse. :)
#' @param newGroupNum An integer specifies number of new bands when levels of current plotting variable `xvar` or `by` is more than 100.
#' @details
#' Before entering modelling stage, we may want to go through variable by variable in a data set to find the
#' features for response variable. This function provides this functionality.
Expand All @@ -25,32 +23,23 @@
#'
#' @author Sixiang Hu
#' @importFrom data.table as.data.table setkey :=
#' @importFrom ggplot2 aes element_text ggplot
#' @importFrom gridExtra grid.arrange
#' @importFrom grid unit.pmax
#' @importFrom reshape2 melt
#' @importFrom googleVis gvisComboChart
#' @importFrom scales percent date_format
#' @importFrom rbokeh figure ly_lines ly_points ly_hist grid_plot
#' @export dataPlot
#' @examples
#'
#' dataPlot(mtcars,"wt","mpg")
#'
#' dataPlot(mtcars,"wt","mpg",byvar="vs")

dataPlot <- function(data,xvar,yvar,byvar=NULL,weights=NULL,interactive=FALSE,newGroupNum=10,...){

opts.list<-list(...)
opts <- names(list(...))
if("xlim" %in% opts) xlim<-opts.list$xlim
if("ylim" %in% opts) ylim<-opts.list$ylim
if("binwidth" %in% opts) binwidth<-opts.list$binwidth else binwidth <- 1
dataPlot <- function(data,xvar,yvar,byvar=NULL,weights=NULL,
newGroupNum=10){

# Error Trapping
if( is.null(data) ) stop("data set provided is null.")
if( is.null(xvar) ) stop("X variable provided is null.")
if( is.null(yvar) ) stop("Responce variable provided is null.")

# Find data column
if (is.character(xvar)) {
if(!xvar %in% colnames(data)) stop(paste("xvar variable (",xvar,") cannot be found.",""))
x <- data[,which(names(data)==xvar)]
Expand All @@ -64,8 +53,8 @@ dataPlot <- function(data,xvar,yvar,byvar=NULL,weights=NULL,interactive=FALSE,ne

if (is.character(yvar)) {
if(!yvar %in% colnames(data)) stop(paste("yvar variable (",yvar,") cannot be found.",""))
yname <- yvar
y <- data[,which(names(data)==yvar)]
yname <- yvar
}
else if (is.integer(yvar)) {
y <- data[,yvar]
Expand All @@ -88,7 +77,6 @@ dataPlot <- function(data,xvar,yvar,byvar=NULL,weights=NULL,interactive=FALSE,ne
else by <- NULL

if( !is.null(weights) ){
wname = "w"
if (is.character(weights)){
if(!weights %in% colnames(data) ) stop(paste("xvar variable (",weights,") cannot be found.",""))
w <- data[,which(names(data)==weights)]
Expand All @@ -103,131 +91,69 @@ dataPlot <- function(data,xvar,yvar,byvar=NULL,weights=NULL,interactive=FALSE,ne
}
else w <- rep(1,dim(data)[1])

#New Group for xvar which has too much levels.
#New Group for xvar if it has too many levels.
if ( (is.numeric(x) || is.integer(x) ) && nlevels(as.factor(x))>100 ) {
if ( is.null(newGroupNum) ) newGroupNum <- 10

new_band <- seq(min(x, na.rm = TRUE),max(x, na.rm = TRUE),length.out=newGroupNum)
x <- cut(x,new_band,include.lowest = TRUE)
}
#New Group for byvar which has too much levels.

#New Group for byvar if it has too many levels.
if(!is.null(by)){
if ( (is.numeric(by) || is.integer(by)) && nlevels(as.factor(by))>100 ) {
if ( is.null(newGroupNum) ) newGroupNum <- 10

if ( (is.numeric(by) || is.integer(by)) && nlevels(as.factor(by))>20 ) {
new_band <- seq(min(by, na.rm = TRUE),max(by, na.rm = TRUE),length.out=newGroupNum)
by <- cut(by,new_band,include.lowest = TRUE)
}
}

#Data for plot
strTitle <- paste("Observation Analysis on: ",xname)

if (is.null(by)) {
data.plot <- data.table::as.data.table(as.data.frame(cbind(x=x,y=y,w=w),stringsAsFactors=FALSE))
data.plot <- data.table::data.table(x=x,y=y,w=w)
data.table::setkey(data.plot,x)

data.plot <- data.plot[,lapply(.SD,as.numeric),by=x,.SDcols=c("y","w")]
data.agg <- as.data.frame(data.plot[,lapply(.SD,weighted.mean,w=w),by=x,.SDcols=c("y","w")],row.names=c("xvar","weights","observed"))
data.freq <- as.data.frame(data.plot[,sum(w),by=x][,freq:=V1/sum(V1)])

data.melt <- reshape2::melt(data.agg[,-3],id=c("x"))

#line graph

strV1 <- paste("Observation Analysis on: ",xname)

gLine <- ggplot2::ggplot(data=data.melt,aes(x=x,y=value)) +
ggplot2::geom_line(size=1,colour= "magenta3") + ggplot2::geom_point(size=4,fill="white",shape=22)
if(("xlim" %in% opts) && is.numeric(data.melt$x)) gLine <-gLine + ggplot2::scale_x_continuous(limits=xlim)
else if(("xlim" %in% opts) && !is.numeric(data.melt$x)) gLine <-gLine + ggplot2::scale_x_discrete(limits=xlim)
else if(("xlim" %in% opts) && is(data.melt[,"x"],"Date")) gLine <-gLine + ggplot2::scale_x_date(label=date_format("%y%m"),limits=xlim)
if("ylim" %in% opts) gLine <-gLine + ggplot2::ylim(ylim)
gLine <- gLine + ggplot2::xlab("") + ggplot2::ylab(yname)+ ggplot2::ggtitle(strV1)+ theme_mp_line
if(nlevels(as.factor(data.melt$x))>25) gLine <- gLine + ggplot2::theme(axis.text.x = element_text(angle = 90,hjust=0.5,vjust=0.5))

#histogram graph
ghist <- ggplot2::ggplot(data=data.freq,aes(x=x,y=freq))+
ggplot2::geom_histogram(stat="identity",colour="black",fill="yellow")
if(("xlim" %in% opts) && is.numeric(data.melt$x)) ghist <-ghist + ggplot2::scale_x_continuous(limits=xlim)
else if(("xlim" %in% opts) && !is.numeric(data.melt$x)) ghist <-ghist + ggplot2::scale_x_discrete(limits=xlim)
ghist <- ghist + ggplot2::ylab("percent (%)")+
ggplot2::scale_y_continuous(labels = percent)+
ggplot2::xlab("")+ theme_mp_hist

p1 <- ggplot2::ggplot_gtable(ggplot2::ggplot_build(gLine))
p2 <- ggplot2::ggplot_gtable(ggplot2::ggplot_build(ghist))

maxWidth <- grid::unit.pmax(p1$widths[2:3], p2$widths[2:3])

p1$widths[2:3] <- maxWidth
p2$widths[2:3] <- maxWidth

gridExtra::grid.arrange(p1,p2,ncol=1,nrow=2,heights=c(4,1))

if (interactive) {
df <- data.frame(data.agg,freq=data.freq$freq)
gvisSingleOptionList <- list(pointSize=8,
series="[
{targetAxisIndex:0, type:'line',color:'magenta',pointShape: 'square'},
{targetAxisIndex:1, type:'bars',color:'yellow'}]",
crosshair="{trigger:'both'}",
hAxis.title=xname,
theme="maximized",
title=paste0("Observation analysis on ",xname, " Observed"),
vAxes="{1:{format:'##.#%',maxValue:1}}",
explorer="{ actions: ['dragToZoom', 'rightClickToReset'],keepInBounds: true }",
chartArea="{width:'90%',height:'90%'}",
height=750)

plot(googleVis::gvisComboChart(df,xvar="x",yvar="y",options=gvisSingleOptionList))

data.agg <- data.plot[,lapply(.SD,weighted.mean,w=w),by=x,.SDcols=c("y","w")]
data.agg <- data.agg[order(as.factor(x)),]

rbokeh::figure(title = strTitle,ylab=yname,height = 500, width = 900) %>%
rbokeh::ly_lines(as.factor(x),y,data=data.agg) %>%
rbokeh::ly_points(as.factor(x),y,data=data.agg,glyph=0,
hover="<strong>x value:</strong> @x<br><strong>y value:</strong> @y")

if (class(x) %in% c("integer","numeric","Date")) {
p2 <- rbokeh::figure(xlab="",ylab="Frequency",height = 250, width = 900) %>%
rbokeh::ly_hist(x,breaks=nlevels(as.factor(x)))
}
else {
p2 <- rbokeh::figure(xlab="",ylab="Frequency",height = 250, width = 900) %>%
rbokeh::ly_bar(x[order(x)])
}

#rbokeh::grid_plot(list(list(p1),list(p2)),nrow=2,ncol=1,same_axes = c(TRUE, FALSE))
}
else{
data.plot <- data.table::as.data.table(as.data.frame(cbind(x=x,y=y,w=w,by=by),stringsAsFactors=FALSE))
data.table::setkey(data.plot,x,by)

data.plot <- data.plot[,lapply(.SD,as.numeric),by=list(x,by),.SDcols=c("y","w")]

data.agg <- as.data.frame(data.plot[,lapply(.SD,weighted.mean,w=w),by=list(x,by),.SDcols=c("y","w")],row.names=c("xvar","by","weights","observed"))

data.freq <- as.data.frame(data.plot[,sum(w),by=list(x,by)][,freq:=V1/sum(V1)])

data.agg <- data.plot[,lapply(.SD,weighted.mean,w=w),by=list(x,by),.SDcols=c("y","w")]

#line graph
gLine1 <- ggplot2::ggplot(data=data.agg,aes(x=x,y=y,group=factor(by),colour=factor(by)))+
ggplot2::geom_line(size=1) + ggplot2::geom_point(size=4,fill="white")
if("xlim" %in% opts) gLine1 <-gLine1 + xlim(xlim)
if("ylim" %in% opts) gLine1 <-gLine1 + ylim(ylim)
gLine1 <- gLine1+ggplot2::xlab("")+ggplot2::ylab(yname)+ ggplot2::ggtitle(paste("Observation Analysis on: ",xname," by ",byname))+
theme_mp_line
if(nlevels(as.factor(data.agg$x))>25) gLine1 <- gLine1 + ggplot2::theme(axis.text.x = element_text(angle = 90,hjust=0.5,vjust=0.5))

#histogram graph
ghist <- ggplot2::ggplot(data=data.freq,aes(x=x,y=freq,fill=factor(by)))+ ggplot2::geom_histogram(stat="identity",binwidth=1)
if("xlim" %in% opts) ghist <-ghist + ggplot2::xlim(xlim)
ghist <- ghist + ggplot2::xlab("")+ ggplot2::ylab("percent (%)")+ ggplot2::scale_y_continuous(labels = percent) +
theme_mp_hist

p1 <- ggplot2::ggplot_gtable(ggplot2::ggplot_build(gLine1))
p2 <- ggplot2::ggplot_gtable(ggplot2::ggplot_build(ghist))

maxWidth <- grid::unit.pmax(p1$widths[2:3], p2$widths[2:3])

p1$widths[2:3] <- maxWidth
p2$widths[2:3] <- maxWidth
p1 <- rbokeh::figure(title = strTitle,xlab="",ylab=yname,height = 500, width = 900) %>%
rbokeh::ly_lines(x,y,data=data.agg,group=by,color=by) %>%
rbokeh::ly_points(x,y,data=data.agg,group=by,color=by,
hover="<strong>x value:</strong> @x<br><strong>y value:</strong> @y<br><strong>by value:</strong> @by")

if (class(x) %in% c("integer","numeric","Date")) {
p2 <- rbokeh::figure(xlab="",ylab="Frequency",height = 250, width = 900) %>%
rbokeh::ly_hist(x,breaks=nlevels(as.factor(x)))
}
else {
p2 <- rbokeh::figure(xlab="",ylab="Frequency",height = 250, width = 900) %>%
rbokeh::ly_bar(x,color=by,data=data.plot)
}

gridExtra::grid.arrange(p1,p2,ncol=1,nrow=2,heights=c(4,1))
grid_plot(list(p1,p2),nrow=2,ncol=1,byrow=TRUE,same_axes = c(TRUE, FALSE))
}
}

#theme for plot ggplot2 graph
theme_mp_line <- ggplot2::theme_bw() +
ggplot2::theme(text=ggplot2::element_text(face="bold.italic"),
legend.justification=c(1,1),
legend.position=c(1,1))


theme_mp_hist <- ggplot2::theme_bw() +
ggplot2::theme(text=ggplot2::element_text(face="bold.italic"),
axis.text.x = ggplot2::element_blank(),
legend.position = 'none')
}
41 changes: 26 additions & 15 deletions R/liftPlot.r
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@
#' @importFrom dplyr group_by summarise %>%
#' @export liftPlot
#' @examples
#'
#' glm1 <- glm(mpg~cyl,data=mtcars,family=Gamma(log))
#' glm2 <- glm(mpg~cyl+vs,data=mtcars,family=Gamma(log))
#' pred1 <- predict(glm1,mtcars)
#' pred2 <- predict(glm2,mtcars)
#' data <- cbind(pred1,pred2)
#' liftPlot(data)

liftPlot <- function(data,weight=NULL,bucket=20,showas=NULL){
newNameFlag <- TRUE
Expand All @@ -23,37 +30,41 @@ liftPlot <- function(data,weight=NULL,bucket=20,showas=NULL){
if (!is.null(showas) && length(showas) != dim(data)[2]) stop("Names provided in `showas` has a different length with column provided.")
else if (is.null(showas)) newNameFlag <- FALSE

P <- data.frame(Group=NULL,Pred = NULL)
P <- data.frame(Group=NULL,Pred = NULL,ModelNames=NULL)

for(i in 1:dim(data)[2]){
temp <- liftGroup(data[,i],weight,bucket)
temp <- as.data.frame(.liftGroup(data[,i],weight,bucket))

if (newNameFlag) temp$ModelNames <- showas[i]
else temp$ModelNames <- names(data)[i]

else if (!is.null(names(data)[i])) temp$ModelNames <- names(data)[i]
else temp$ModelNames <- paste("Mode; ",i,sep="")

P <- rbind(P,temp)
}

tools <- c("pan", "wheel_zoom", "box_zoom", "box_select", "resize", "reset")

p1 <- figure(tools = tools,height=400) %>%
ly_lines(Group, Pred, data = P, color = ModelNames)

P <- data.frame(P)
p2 <- figure(tools = tools,height=200) %>%
ly_hist(Group,data=P,breaks=bucket)

grid_plot(list(p1, p2), same_axes = TRUE,
nrow=2,ncol=1,byrow=TRUE,width=800)

ggplot(P,aes(x=Group,y=Pred,group=ModelNames,colour=ModelNames)) +
geom_line(size=1.5)+geom_point()+
theme_bw()+
theme(legend.justification=c(1,0), legend.position=c(1,0))+
xlab("Exposure Groups")+ylab("Weighted Predictions")
}

liftGroup <- function(pred,weight=NULL,bucket){
.liftGroup <- function(pred,weight=NULL,bucket){

df <- data.frame(cbind(pred,weight))
df <- df[order(df[,1]),]
df$group <- floor((1:length(pred))/ ceiling(length(pred)/bucket))+1

df_plot <-df %>%
group_by(group) %>%
summarise(wmean = weighted.mean(pred, weight))

df_plot <- data.frame(df_plot)

names(df_plot) <- c("Group","Pred")
df_plot
}
Loading

0 comments on commit 3796dde

Please sign in to comment.