# July_2013_ILO_Report.R

# ******************************************************************************
#
#       Vegetation Index / Rainfall Estimate Validation Procedures 
#       Created by: Anthony Louis D'Agostino (ald2187@columbia.edu)
#       Date Created: June 18, 2013  
#
# ******************************************************************************


# ===========================  Purpose  ======================================== 
# 
# This script performs 3 discrete tasks: 1. Calls existing rainfall estimate and
# VI data from a sniidharita scenario to emulate results produced in earlier
# ILO reports. 2. Creates a rankings comparison against ARC2 for a
# variable-sized, upscaled VI pixel (i.e., user can specify whether or not to
# spatially average over a 2km x 2km grid to determine correlations and payout
# agreement with rainfall). 3. Creates a rankings comparison against ARC2 by
# masking out pixels with VI spatial correlation coefficients below some
# user-specified level.  The remaining pixels are spatially averaged to produce
# a single VI time-series per site.
#
# Some plots will be sent straight to PDF in the Output folder 
# 
# ******************************************************************************


# ============================  Requirements ===================================
# 
# * sitetable.csv - copied from Dry2012Satellite's common.data folder under the
# sniidharita repository, current as of June 18, 2013.  Will identify any
# discrepancies between this script's output and the earlier ILO reporting data 
#
# * ARC2[/RFE] rainfall files from sniidharita script
# 
# * Sloppy construction of the spatial correlation component means the code is
#   currently run multiple times using different correlation thresholds, and
#   the output .csvs are later read to create a master melted file for
#   ggplotting.  This needs to be improved in the near future. (July 17, 2013)
# ******************************************************************************


# =========================  Edit History  ===================================== 

# 7/19(ALD)
#         *   Added error handling to scaling piece
#         *   Included weight.matrix flag to switch between evi.corr.regrid and
#             evi.corr.mask, the latter which is a weighting matrix.  Changes
#             filename and titles accordingly.  To do both, must run once as 
#             TRUE, then again as FALSE.  
#         *   Comparison for scaling section completed - produces a VI spatial
#             correlation .png unnecessarily - to be ignored.  
# 7/18(ALD)
#         *   Added error handling for correlation threshold piece in response
#             to excessive script crashes caused by not downloading DL data 
# 7/17(ALD) 
#         *   Changed default ggsave output to .png: scope for setting default
#             file type globally (to consider)
#         *   Moved sitetable to repo since sites will be removed and this 
#             gives us the opportunity to track which ones. 
#         *   Golba, Haleku removed from analysis 
#         *   Kihen removed because of evi.corr.regrid 
#         *   Commenting out functions on "southern sites" since they have been
#             removed from the analysis.  
#         *   Fixed URL for lagged correlations - need to include month component
#         *   Should restructure bench.corr.compare to sub-function based 
#
# 7/16(ALD) 
#         *   Improved masking function in evi.corr.regrid, continued coding 
#             scaling portion of script, have to write out sc.agree to graph
#             and .csv.  
#
# 7/11(ALD) - fixed url problems in evi.corr.regrid - now have flexibility of 
#             calling function using a Lags argument to produce the lag.df obj.
#
# 7/10(ALD) - added a 'total.worst.years' with output csv in lieu of modifying
#             the arc.worst matrix - this gives total years that are below 
#             threshold and therefore sites to focus on for potential contract
#             problems 
#
#             Need to fix calls to arc.early from first site forloop by calling
#             the ARC_worst_years*.csv.  Will improve spatial correlation data. 
#
#             Created lag.df to capture lag correlation values - could make for 
#             a nice accessory plot 
#
# 7/9(ALD)  - added both woreda and site-level map plots so that in write-up
#             we can refer to specific areas when making comments about results 
#
# 7/3(ALD)  - further documentation work, some pseudo-coding for necessary work
#             problems accessing ggplot2 functions inside function, requires 
#             some repetition of procedures - need to discuss with Helen 
#
# 7/2(ALD)  - improved plotting using ggmap, streamlined documentation 
#
# 7/1(ALD)  - created evi.corr.regrid function in vi_functions.R, 

# ******************************************************************************


# ============================= Notes ========================================== 
#
# Datasets may be applicable only for some regions, given how IRI-DL data is 
# divided into tiles - check beforehand.  
# Need to determine which rainfall products we are comparing against before 
# pushing forward with design -> should move to creating generic functions 
#
# * Currently only focusing on ARC2 data, but have also included RFE rankings 
# * May encounter download problems with IRI-DL, solution is to try again.  
# * At present, specific spatial correlation values are manually entered and file 
#     downloaded - in future should create function which analyzes many 
#     correlation values simultaneously.  Follow-up analysis can currently be 
#     performed by specifying which correlation values to consider, given 
#     filename structure.  
#
# VARIABLES OF INTEREST
# * vi.mat = matrix of agreement percentage with ARC data by VI product
# * arc.ranks = for validating earlier ILO results: gives ECDF ranks across 
#     early and late phases 
# * arc.worst = takes the two [first] worst years for both early and late windows
#     complete 'worst' (due to multiple values) saved in separate file 
# ******************************************************************************


# ---- Initialization --------------------

  # NOTE(review): clearing the workspace and changing the working directory
  # alter the caller's session.  Both are kept because the relative paths used
  # by this script ("vi_functions.R", "sitetable.csv", "Output/") are resolved
  # against the folder set below.
    rm(list = ls())
    library(zoo)
    library(ggmap)  # for visualizations
    library(ggplot2)
    library(reshape2) 
 
#=============================================================================#
  # Specify folder containing this script and related files 
    setwd("~/Documents/Index_Insurance/ILO/ILO_github/")

  # load auxiliary functions like "vi.compare" 
    source("vi_functions.R")

  # Name of the folder for outputted rankings files; created on the first run.
  # dir.exists() is used because the check is specifically for a directory.
    out.path <- "Output/"
    if (!dir.exists(out.path)) {
      dir.create(out.path)
    }
#=============================================================================#


#===================== ARC Worst Years Assessment =============================
#
# Comparing worst years of ARC2 against select VI products.  Results should 
# match output in earlier ILO reports.  
#
#=============================================================================#


# ---- Read in Precipitation Data -----

  # Years covered by the analysis (specified in the Phase One report), stored
  # as character strings so they can index rows/columns by name.
    years <- as.character(2001:2012)

  # Scenario to work with (draws upon the existing file/folder structure).
  # Alternative scenario kept for reference:
  #  base.path <- "~/Documents/Index_Insurance/sniidharita_fist/WIIET_Data/Dry2012satellite/"
    base.path <- "~/Documents/Index_Insurance/sniidharita_fist/WIIET_Data/Dry2012fullassessment/"

  # Site table is read from the working directory rather than from the
  # scenario's common.data folder (earlier location kept for reference):
  #  site.data <- read.csv(paste0(base.path,"common.data/sitetable.csv"), header = TRUE, row.names = 1)
    site.data <- read.csv("sitetable.csv", header = TRUE, row.names = 1)

  # Common dekad files shipped with the scenario.  The calendar file has no
  # header row; its first two lines are skipped and its first column becomes
  # the row names.
    dekad.cal <- read.csv(
      paste0(base.path, "common.data/CalendarYearDayDecad.csv"),
      header = FALSE, skip = 2, row.names = 1
    )
    dekadmonth <- read.csv(paste0(base.path, "common.data/dekadmonth.csv"))

  # Folder under which site-level data is located
    scen <- "scenarios/"
  # Path, relative to a site folder, of the rainfall-by-phase (rbf) file
    rbf.fn <- "/EthRFEadj/rainbyphase.csv"

  # Perform replication analysis of the 2012 ILO Report?  Currently FALSE.
  # PSEUDO - this flag has not been incorporated into a larger if block yet;
  # still need to figure out which variables are needed later in the analysis.
    repl <- FALSE


# PSEUDO - 'phase' is passed to vi.common (in vi_functions), yet all phases are
# really of interest.  A single phase is used here as a shortcut because this
# section reproduces an earlier report rather than developing a new core
# function.  Should be fixed in the future.

  # parameters for the comparison table
    prods     <- c("EVI", "NDVI", "NDWI")
    satellite <- "Mod"
    phases    <- c("Early", "Late")
    phase     <- "Early"
  # share of years treated as "worst" when checking agreement between products
    badyear.thres <- 1/6

  # ARC ecdf rankings, indexed site x year x phase
    arc.ranks <- array(
      data = NA,
      dim = c(nrow = nrow(site.data), length(years), length(phases)),
      dimnames = list(rownames(site.data), years, phases)
    )

  # column names for vi.mat; kept as separate objects because subsets of the
  # columns are addressed per phase later on - this should be fixed
    phaseprod1 <- paste0(phases[1], prods)
    phaseprod2 <- paste0(phases[2], prods)

  # agreement % values, one row per site and one column per phase/product pair
    vi.mat <- matrix(
      nrow = nrow(site.data),
      ncol = length(phases) * length(prods),
      dimnames = list(rownames(site.data), c(phaseprod1, phaseprod2))
    )

  # ecdf rankings for each product, indexed site x year x product
    vi.rankings <- array(
      data = NA,
      dim = c(nrow(site.data), length(years), length(prods)),
      dimnames = list(rownames(site.data), years, prods)
    )

  # worst years for all ARC windows in one matrix; the column names repeat
  # each phase label ("Early Early ... Late Late ...")
    event.years <- length(phases) * badyear.thres * length(years)
    arc.worst <- matrix(
      data = NA,
      nrow = nrow(site.data),
      ncol = event.years,
      dimnames = list(rownames(site.data), rep(phases, each = event.years / 2))
    )

  # complete worst years for all ARC windows, i.e. not restricted to
  # badyear.thres * length(years) entries per window; shorter entries are
  # padded with NA when the matrix is filled
    total.event.years <- length(phases) * length(years)
    colheads <- as.vector(t(outer(phases, seq_along(years), FUN = "paste0")))
    total.worst.years <- matrix(
      data = NA,
      nrow = nrow(site.data),
      ncol = total.event.years,
      dimnames = list(rownames(site.data), colheads)
    )

  # rainfall by phase, one row per site-year (row name = site name + year)
    rbf <- matrix(
      data = NA,
      nrow = nrow(site.data) * length(years),
      ncol = length(phases),
      dimnames = list(
        as.vector(t(outer(rownames(site.data), years, FUN = "paste0"))),
        phases
      )
    )


  # Site-by-site comparison of ARC2 window rainfall against the VI products.
  # Each pass fills this site's entries in rbf, total.worst.years, arc.ranks,
  # vi.mat, vi.rankings and arc.worst.
    for (site in rownames(site.data)){
      
      # for troubleshooting purposes 
        print(paste0("Now calculating comparisons for ", site))
  
      # read in rainfall by phase data, keeping only the years of interest
        rfe <- read.csv(paste0(base.path, scen, site, rbf.fn), header = TRUE, row.names = 1)
        rfe.sub <- rfe[years, ]
        rfe.p1 <- rfe.sub[years, "X1"] # first phase 
        rfe.p2 <- rfe.sub[years, "X3"] # second phase 

       
      ##### Have not worried about RFE for this analysis, but base work is here #  
       
      # generate rankings 
#        Fn1 <- ecdf(rfe.p1)
#        Fn2 <- ecdf(rfe.p2) 
#        r1 <- round(Fn1(rfe.p1), digits = 2)   # ranks for 1st phase
#        r2 <- round(Fn2(rfe.p2), digits = 2)   # ranks for 2nd phase
      
      # identify the years in the bottom "badyear.thres" - problem with ties (what to do?)
#        rfe.years.early <- rownames(rfe.sub)[which(rfe.p1 <= quantile(rfe.p1,probs = badyear.thres))]   
#        rfe.years.late <- rownames(rfe.sub)[which(rfe.p2 <= quantile(rfe.p2,probs = badyear.thres))]
      
      # define sowing windows to calculate ARC rainfall by using dekad calendar 
      # days given for comparison against earlier reports 

      # read in window data from the site-specific contract; the code below
      # reads t$phases, t$swfirst and t$cap, which are presumably defined by
      # contract.R - verify against the contract files
        contract <- paste0(base.path, scen, site, "/payout.data/contract.R")
        source(contract)

      # read ARC precip file in day-year format
        arc <- read.csv(paste0(base.path, scen, site, "/met.data/precip.daily.csv"), header = TRUE, row.names = 1)
      # remove the X preceding the year in each column name
        colnames(arc) <- substr(colnames(arc), 2, 5)

      # keep only the ARC columns for the years in "years"
        arc.sub <- arc[, years]

      # identify the dekadal range for the early and late windows
        e.dek.range <- (t$phases[1,1]+t$swfirst-1):(t$phases[2,1]+t$swfirst-1)
        l.dek.range <- (t$phases[1,3]+t$swfirst-1):(t$phases[2,3]+t$swfirst-1)

      # early window: per-year rainfall summed dekad by dekad, with every
      # dekad's total limited to the contract cap before it is added
        earlies <- 0
        for (i in e.dek.range){
          earlies <- earlies + pmin(colSums(arc.sub[which(dekad.cal == i),]), t$cap)
        }

      # repeat for late window
        lates <- 0
        for (j in l.dek.range){
          lates <- lates + pmin(colSums(arc.sub[which(dekad.cal == j),]), t$cap)
        }

      # write to our rbf matrix
        rbf[paste0(site, years), phases[1]] <- earlies
        rbf[paste0(site, years), phases[2]] <- lates

      # years whose early-window rainfall is at or below the badyear.thres
      # quantile; the untruncated list is kept in *.tot because ties can
      # return more (or fewer) years than badyear.thres * length(years)
        arc.years.early <- colnames(arc.sub)[which(earlies <= quantile(earlies, probs = badyear.thres))]
        arc.years.early.tot <- arc.years.early

      # coerce into the expected vector length (truncates, or pads with NA)
        if (length(arc.years.early) != badyear.thres*length(years)){
          arc.years.early <- arc.years.early[1:(badyear.thres*length(years))]
        }

      # same for the late window
        arc.years.late <- colnames(arc.sub)[which(lates <= quantile(lates, probs = badyear.thres))]
        arc.years.late.tot <- arc.years.late
        if (length(arc.years.late) != badyear.thres*length(years)){
          arc.years.late <- arc.years.late[1:(badyear.thres*length(years))]
        }

      # to accommodate fixed width of total.worst.years - pads each window's
      # list with NA up to length(years) entries
        total.worst.years[site,] <- c(
          c(arc.years.early.tot, rep(NA, (length(years) - length(arc.years.early.tot)))),
          c(arc.years.late.tot, rep(NA, (length(years) - length(arc.years.late.tot))))
        )

      # write-in the ecdf rankings of arc estimates
        arc.Fn <- ecdf(earlies)
        arc.ranks[site,,"Early"] <- round(arc.Fn(earlies), digits = 2)

        arc.Fn <- ecdf(lates)
        arc.ranks[site,,"Late"] <- round(arc.Fn(lates), digits = 2)

      # rankings agreement (overlap) per product, using "vi.compare" from the
      # vi_functions script; vi.mat holds both early and late results, whereas
      # other objects don't.  vi.compare is called with the product only, so it
      # presumably picks up `phase` and `site` from the workspace - confirm.
      # Each phase is evaluated once and the same result is printed and stored
      # (it was previously computed twice per phase).
        phase <- "Early"
        vi.agree <- sapply(prods, vi.compare)
        print(vi.agree)
        vi.mat[site, phaseprod1] <- vi.agree

        phase <- "Late"
        vi.agree <- sapply(prods, vi.compare)
        print(vi.agree)
        vi.mat[site, phaseprod2] <- vi.agree

      # converting matrix to array for incorporation into master array
      # write ecdf rankings values to master array
        ranks.mat <- sapply(prods, vi.ecdf)
        dim(ranks.mat) <- c(1, length(years), length(prods)) #individual sites
        dimnames(ranks.mat) <- list(site, years, prods)
        vi.rankings[site,,] <- ranks.mat

      # truncated worst years for both windows, stored as numbers
        arc.worst[site,] <- as.numeric(c(arc.years.early, arc.years.late))

  } # end of across sites loop 


  # All files below are tagged with the first and last analysis year.

  # ARC2 rbf values, for comparison against sniid results
    write.csv(
      rbf,
      file = paste0(out.path, "ARC2_rbf", years[1], "-", tail(years, 1), ".csv")
    )

  # total worst ARC windows (NA's for shorter strings)
    write.csv(
      total.worst.years,
      file = paste0(out.path, "ARC_total_worst", years[1], "-", tail(years, 1), ".csv")
    )

  # worst ARC years by window
    write.csv(
      arc.worst,
      file = paste0(out.path, "ARC_worst_years", years[1], "-", tail(years, 1), ".csv")
    )

  # single-phase, product-level ecdf rankings: one csv per product
  # how could i approach this using a non-for loop approach?
    f.out <- paste0(out.path, prods, years[1], "-", tail(years, 1), "rankings.csv")
    for (i in seq_along(prods)) {
      write.csv(vi.rankings[, , i], file = f.out[i])
    }
    # comparison of these against existing tables looks good  
  
  # create all-site rankings tables (one per window) with lat/lon coordinates
  # lat/lon & site names are added as explicit columns since they were
  # previously only available through the row names
    arc.early <- cbind(arc.ranks[,,"Early"], site.data[,c("Latitude","Longitude")], rownames(site.data))
      colnames(arc.early)[(length(years)+3)] <- "site"
      rownames(arc.early) <- NULL
    arc.late <- cbind(arc.ranks[,,"Late"], site.data[,c("Latitude","Longitude")], rownames(site.data))
      colnames(arc.late)[(length(years)+3)] <- "site"
    # reset the row names of arc.late (this line previously reset arc.early a
    # second time, leaving arc.late with its site row names)
      rownames(arc.late) <- NULL
  
# make sure we have the same order of sites across objects; everything below
# needs vi.2, so stop with a clear message instead of continuing without it
if (!identical(rownames(site.data), rownames(vi.mat))) {
  stop("Site order differs between site.data and vi.mat - cannot attach coordinates.", call. = FALSE)
}

  # combine latitude and longitude coordinates for each site and reshape to
  # long form in preparation for ggplot2 plotting
  vi.mat <- data.frame(site.data[,c("Latitude", "Longitude")], vi.mat)
  vi.mat <- data.frame(site = rownames(vi.mat), vi.mat)
  rownames(vi.mat) <- NULL
  vi.2 <- melt(vi.mat, id.vars = c("site","Latitude", "Longitude"))

  benchmark <- vi.2   # establish as the baseline against 
                      # which other comparisons are judged


    # set separate df's for early and late windows - easier to conduct comparisons
    # (only the EVI columns are used as the benchmark)
      benchmark.early <- benchmark[which(benchmark[,"variable"] == "EarlyEVI"),] 
      benchmark.late  <- benchmark[which(benchmark[,"variable"] == "LateEVI"),]

  # save table to file
    f.out <- "vi_comparison_table.csv"
    write.csv(vi.mat, file = paste0(out.path, f.out))
    
  # save version sorted by site name
    vi.sort <- vi.mat[with(vi.mat, order(site)),]  
    f.out <- "vi_comparison_table_sorted.csv"
    write.csv(vi.sort, file = paste0(out.path, f.out)) 
    

# ---- Visualization --------------------

  # API key from Cloud Made Maps
  # NOTE(review): the key is committed in plain text - consider whether it
  # should be kept outside the repository.
    cm.key <- "3ba6f5c05bc142209d423981fcbacb4a"  

  # Map scenes with site label text atop the map layer.  The sites were
  # originally split into a northern and a southern scene because much of the
  # land in the larger bounding box contains no sites; the southern scene is
  # commented out.
  #  s.sites <- subset(site.data, Latitude < 9, select = c(Latitude, Longitude, Woreda))
    n.sites <- subset(site.data, Latitude > 12, select = c(Latitude, Longitude, Woreda))
  
  # bounding box coordinates with a 0.1 buffer (make.coords is in vi_functions.R)
  #  s.coords <- make.coords(s.sites, 0.1)
    n.coords <- make.coords(n.sites, 0.1)

  # fetch the cloudmade map for the bounding box returned by make.coords
  #  s.map <- get_cloudmademap(bbox = c(left = s.coords$l, bottom = s.coords$b, right = s.coords$r, top = s.coords$t), api_key = cm.key)  
    n.map <- get_cloudmademap(
      bbox = c(left = n.coords$l, bottom = n.coords$b, right = n.coords$r, top = n.coords$t),
      api_key = cm.key
    )

  # map of site labels, saved to output folder 
  #  ggmap(s.map) + geom_text(aes(x=Longitude, y=Latitude, label=rownames(s.sites)), data = s.sites, size = 3) + labs(title = "Ethiopia Harita Sites (S)")
  #  ggsave(filename = paste0(out.path,"EthSouthSitesMap.png"))

    ggmap(n.map) +
      geom_text(
        aes(x = Longitude, y = Latitude, label = rownames(n.sites)),
        data = n.sites, alpha = 1.0, size = 2
      ) +
      labs(title = "Ethiopia Harita Sites")
    ggsave(filename = paste0(out.path, "EthSitesMap.png"))

  # map of woreda labels, saved to output folder 
  #  ggmap(s.map) + geom_text(aes(x=Longitude, y=Latitude, label=Woreda), data = s.sites, alpha = 1.0, size = 5) + labs(title = "Ethiopia Harita Woredas (S)")
  #  ggsave(filename = paste0(out.path,"EthSouthWoredasMap.png"))

    ggmap(n.map) +
      geom_text(
        aes(x = Longitude, y = Latitude, label = Woreda),
        data = n.sites, alpha = 1.0, size = 2
      ) +
      labs(title = "Ethiopia Harita Woredas")
    ggsave(filename = paste0(out.path, "EthWoredasMap.png"))


# set bounding box coordinates for map background. 
# May consider modifying to exclude Michael Debir for crisper presentation. 
# Problems arise when df points are outside the bounding box grabbed from OSM, 
# therefore ensure they are inside.


  # buffer size in degrees: how far the bounding box extends beyond the
  # outermost selected pixels
  b.s <- 0.2

  # Manual bounding box, used only when override = TRUE.  Order is left,
  # bottom, right, top (left/right: longitude, bottom/top: latitude).  The
  # letters are placeholders and must be replaced with values in degrees.
override.box <- c("L","B","R","T")
    
    override <- FALSE   # if TRUE, overrides default calculation of bounding box 
                        # dimensions in the viz function 

 
    # Either branch leaves the box in eth.coords with elements l, b, r and t,
    # which is what the map calls further down read.  The override branch
    # previously assigned loose variables l/b/r/t instead, which left
    # eth.coords undefined and overwrote the contract object `t`.
    if (override) {
      override.vals <- suppressWarnings(as.numeric(override.box))
      if (length(override.vals) != 4 || any(is.na(override.vals))) {
        stop("override.box must hold four numeric values in degrees: left, bottom, right, top.", call. = FALSE)
      }
      eth.coords <- list(l = override.vals[1], b = override.vals[2], r = override.vals[3], t = override.vals[4])
    } else {
      # default bounding box coordinates, including the earlier specified b.s
      eth.coords <- make.coords(arc.early, b.s)
    }


  # reshape the ranking tables to long form for ggplot2/ggmap use; the fifth
  # column of the melted table is renamed to "Ranking"
    # arc early rankings 
    a_e <- melt(arc.early, id.vars = c("site", "Latitude", "Longitude"))
    names(a_e)[5] <- "Ranking"
    # arc late rankings 
    a_l <- melt(arc.late, id.vars = c("site", "Latitude", "Longitude"))
    names(a_l)[5] <- "Ranking"


  # set standard map base layers   
    theme_set(theme_bw(16))
    outmap <- get_cloudmademap(
      bbox = c(left = eth.coords$l, bottom = eth.coords$b, right = eth.coords$r, top = eth.coords$t),
      api_key = cm.key
    )

    out.stdmap <- get_map(
      location = c(eth.coords$l, eth.coords$b, eth.coords$r, eth.coords$t),
      source = "google"
    )

  # early window ARC rankings, one facet per year; nrow controls how many
  # rows the facets are laid out in
    ggmap(outmap) +
      geom_point(aes(x = Longitude, y = Latitude, color = Ranking), data = a_e) +
      scale_color_gradient(low = "red", high = "green") +
      facet_wrap(~ variable, nrow = 3) +
      labs(title = "ARC Early Window Rankings")
    ggsave(filename = paste0(out.path, "ARCearlyranks.png"), scale = 1.5)

  # late window ARC rankings 
  # wrap in print() to display plot on-screen  
    ggmap(outmap) +
      geom_point(aes(x = Longitude, y = Latitude, color = Ranking), data = a_l) +
      scale_color_gradient(low = "red", high = "green") +
      facet_wrap(~ variable, nrow = 3) +
      labs(title = "ARC Late Window Rankings")
    ggsave(filename = paste0(out.path, "ARClateranks.png"), scale = 1.5)
    

#Comparing worst years of ARC2 against select VI products.  Results should 
# match output in earlier ILO reports.


# ====  These functions should work, but encountering problems =====#
  # Have opted to pull out the commands outside the function - can improve for later iteration 
#  arc.early <- df.melt(arc.early)  # creates a melted version for gg
#  arc.late <- df.melt(arc.late)  # creates a melted version for gg
#  arc.vi.vis(arc.early)
#  arc.vi.vis(arc.late)


# ================= Comparing VI and ARC ranks =================================
# 
# Also used for validating against previous ILO reports, this time comparing 
# individual VI products against ARC and with the badyear.thres value defining
# which are the worst years 
#
# *****************************************************************************#

  # ARC-VI Agreement on Worst Years
    # Serves as the benchmark against which other versions can be compared to
    # evaluate any performance gains from regridding, spatial correlation, etc.
    ggmap(outmap) +
      geom_point(aes(x = Longitude, y = Latitude, color = value), data = vi.2) +
      scale_color_gradient(low = "red", high = "green") +
      facet_wrap(~ variable, nrow = 2) +
      labs(title = "ARC-VI Worst Year Agreement %")
    ggsave(filename = paste0(out.path, "ARCEVIagree.png"), scale = 1.5)


# working version of point-based output 
# print(ggmap(Eth, base_layer = ggplot(aes(x=Longitude, y=Latitude), data = gg.arc.early)) + geom_point(aes(color = factor(Ranking), size = 3)) + facet_wrap(~ variable))  

# working version using scale gradient 
#print(ggmap(Eth, base_layer = ggplot(aes(x=Longitude, y=Latitude), data = gg.arc.early)) + geom_point(aes(color = Ranking)) + scale_color_gradient(low = "red", high = "green") + facet_wrap(~ variable))  


# ======================= VI Grid Size Scaling ================================= 
#
# An alternative to comparing VI results scaled at the same pixel size as ARC2 
# is to 1.) compare across a smaller aggregate pixel size (under 10k x 10k), 
# and 2.) within that smaller box, to only compare pixels whose spatial
# correlation values are not below a user-specified level.
#
# Since much of the code is common to the correlation work below, outputs are
# in that chunk of code.  
# 
# Will create a .csv output file titled with diameter value for comparison  
#
# Width, in km, of the aggregate gridding box.  DOES IT NEED TO BE LESS THAN 10KM?  NEEDS TESTING
    range.number <- 15

# Number of distance points to evaluate and compare across that range
# (including the zero point, which is discarded below)
    pt.num <- 5
#
# Years of interest - the first line reuses the years from the beginning of
# the analysis; uncomment the second to use a different range.
# Preferable to ensure that badyear.thres * length(years) > 2 
        years <- years   
      #  years <- as.character(seq(2002,2012))  


#==============================================================================#

  # default pixel sizes in degrees, from IRI-DL
    MODIS.pixel.size <- 0.00221704
    SPOT.pixel.size  <- 0.008928572

  # kilometers per degree of longitude at the equator; available should we
  # decide to adjust for pixel sizes shrinking towards the poles
    long.eq <- 111.32

  # evenly spaced box widths from 0 to range.number km, each converted to a
  # half-width in degrees; the leading zero-distance value is dropped
    di.range <- (seq(0, range.number, length.out = pt.num) / (2 * long.eq))[-1]

  # clear values for input parameters used in the spatial correlation section  
    lag.start <- lag.end <- 0

  # worst ARC years written earlier in the script - the year range must
  # coincide with the range used at the beginning of the code, since it is
  # part of the file name
    arc.worst.years <- read.csv(
      paste0(out.path, "ARC_worst_years", years[1], "-", years[length(years)], ".csv"),
      header = TRUE, row.names = 1
    )


  # template array filled while looping over sites and distances:
  # rows = sites, columns = y.cols, slices = one per phase/distance pair
  # (all Early slices first, then all Late slices)
    y.cols <- c("Latitude", "Longitude", "Agreement", "Year1", "Year2")
    z.cols <- as.vector(t(outer(phases, di.range, FUN = "paste0")))
    sc.agree <- array(
      data = NA,
      dim = c(nrow(site.data), length(y.cols), length(z.cols)),
      dimnames = list(rownames(site.data), y.cols, z.cols)
    )

  # populate lat/lon values across every slice (the site coordinates are
  # recycled along the third dimension)
    sc.agree[, c("Latitude", "Longitude"), ] <- as.matrix(site.data[, c("Latitude", "Longitude")])

# cycle through the sites: for every site and every box size in di.range,
# fetch the averaged EVI series for the early and late window months, find
# its worst years and record the agreement with the ARC worst years in sc.agree
for (site in rownames(site.data)){
  
    print(paste0("Now performing grid size scaling analysis for ", site))
  
    # the contract is expected to define `t` (t$swfirst, t$phases are read below)
    contract.pth <- paste0(base.path, scen, site, "/payout.data/contract.R")
    source(contract.pth)
  
    # identify window months for downloading VI data 
    # 4 dekads later was the [adjustable] standard delay in sniidharita  
    # delay already inherent in IRI Data Library code in evi.corr.regrid since using
    # shiftdatashort with a 1 month lag     
      dekdelay <- 4 
      midearly <- as.integer(t$swfirst+(t$phases[2,1]+t$phases[1,1])/2)
      midlate <- as.integer(t$swfirst+(t$phases[2,3]+t$phases[1,3])/2)
    # wrap the delayed dekad into 1..36 before using it as a row index; a
    # plain %% 36 maps dekad 36 to index 0, which selects no row at all
    # (assumes dekadmonth has one row per dekad, in order - TODO confirm)
      earlymonth <- as.character(dekadmonth[((midearly + dekdelay - 1) %% 36) + 1, "Month"])
      latemonth <- as.character(dekadmonth[((midlate + dekdelay - 1) %% 36) + 1, "Month"])
  
  # loop across the distance ranges for a given site in the for loop 
  for (di.val in di.range){
    
    # returns re-scaled [xy] averaged EVI values for window months specified by 
    # contract parameters; errors (e.g. failed downloads) are captured so the
    # script can continue
    scale.early <- try(evi.corr.regrid(Lat = site.data[site,"Latitude"], Lon = site.data[site,"Longitude"], Size = di.val, CorrThreshold = 0, RegridSize = MODIS.pixel.size, Month = earlymonth), silent = TRUE)
    scale.late <- try(evi.corr.regrid(Lat = site.data[site,"Latitude"], Lon = site.data[site,"Longitude"], Size = di.val, CorrThreshold = 0, RegridSize = MODIS.pixel.size, Month = latemonth), silent = TRUE)
    
    # on failure fall back to a placeholder df with NA values so the code can
    # continue; the same placeholder file is used for both windows
    if (inherits(scale.early, "try-error")){
      scale.early <- read.csv(paste0(out.path, "ScaleEarlySample.csv"), header = TRUE)
    }
    
    if (inherits(scale.late, "try-error")){
      scale.late <- read.csv(paste0(out.path, "ScaleEarlySample.csv"), header = TRUE)
    }
    

    # FOLLOWING PROCEDURES SHOULD BE PUT INTO FUNCTION 
    
    # now subset to the years of interest and identify which years agree with ARC worst
    # early.worst / late.worst produce worst years, while *.ranks give total ranks vectors
    # Month is taken from the first Time entry only; Year from characters 5-8
    scale.early <- data.frame(scale.early, Month = substr(scale.early$Time,1,3)[1], Year = as.numeric(substr(scale.early$Time, 5, 8)))
    scale.late  <- data.frame(scale.late,  Month = substr(scale.late$Time,1,3)[1], Year = as.numeric(substr(scale.late$Time, 5, 8)))
    
    # drop the original "Time" column 
    scale.early <- subset(scale.early, select = -Time)
    # evi.late  <- subset(evi.late, Year == years, select = -Time)
    scale.late  <- subset(scale.late, select = -Time)
    
    
    # PSEUDO - create code to ensure that all years are providing data for the same month
    
    # subset to appropriate years window 
    scale.early <- scale.early[is.element(scale.early$Year,years),]
    scale.late  <- scale.late[is.element(scale.late$Year,years),]
    
    # return the worst years according to badyear.thres value 
    # capture errors and continue the script if NAs appear
    sc.early.worst <- try(find.worst(scale.early,"aprod"), silent = TRUE)
    sc.late.worst <- try(find.worst(scale.late,"aprod"), silent = TRUE)
  
    if (inherits(sc.early.worst, "try-error")){
      sc.early.worst <- rep(NA, badyear.thres*length(years))
    }
    
    if (inherits(sc.late.worst, "try-error")){
      sc.late.worst <- rep(NA, badyear.thres*length(years))
    }
    
    
    # references the two worst years from imported CSV 
    arc.site.early <- arc.worst.years[site,c("Early","Early.1")]
    arc.site.late <- arc.worst.years[site,c("Late","Late.1")]
    
    # agreement = shared worst years / the longer of the two lists, stored
    # together with the VI worst years themselves
    sc.agree[site,c("Agreement", "Year1", "Year2"),paste0("Early",di.val)] <- c(length(intersect(arc.site.early,sc.early.worst)) / max(length(arc.site.early), length(sc.early.worst)), sc.early.worst)
    
    sc.agree[site,c("Agreement", "Year1", "Year2"),paste0("Late",di.val)] <- c(length(intersect(arc.site.late,sc.late.worst)) / max(length(arc.site.late), length(sc.late.worst)), sc.late.worst)
      
  } # end of di.val for loop 
} # end of sites for loop 


  # ---- Bounding-box scale agreement vs. benchmark model ----------------------
  # Reshapes the per-site / per-box-size agreement array (sc.agree) into long
  # data frames, subtracts the benchmark agreement, then writes a faceted map
  # (PNG) and the underlying table (CSV) to out.path.
  #
  # The third dimension of sc.agree is indexed positionally: the first
  # length(di.range) slices are taken as the early window and the next
  # length(di.range) slices as the late window.
    phases.no <- length(di.range)

#######  Comparison Against Benchmark Model #######

  # split sc.agree along its third dimension; the first column of each result
  # is relabelled "WindowScaleSize"
  scale.early.df <- adply(sc.agree[, , 1:phases.no], c(3))
  scale.late.df <- adply(sc.agree[, , (phases.no + 1):(2 * phases.no)], c(3))
  colnames(scale.early.df)[1] <- colnames(scale.late.df)[1] <- "WindowScaleSize"

  # Turn the slice label into a box-width label: strip the "Early"/"Late"
  # prefix, multiply by 2 * long.eq, round to 3 digits and append " km".
  # (long.eq is defined elsewhere; presumably km per unit of di.val - confirm.)
  # NOTE: this deliberately overwrites the earlier scale.early / scale.late.
    scale.early <- data.frame(scale.early.df, Window = "Early Window", site = rep(unlist(dimnames(sc.agree)[1]), phases.no))
    scale.early[, 1] <- as.factor(paste0(round(as.numeric(gsub("Early", "", scale.early[, "WindowScaleSize"])) * 2 * long.eq, digits = 3), " km"))
    scale.late <- data.frame(scale.late.df, Window = "Late Window", site = rep(unlist(dimnames(sc.agree)[1]), phases.no))
    scale.late[, 1] <- as.factor(paste0(round(as.numeric(gsub("Late", "", scale.late[, "WindowScaleSize"])) * 2 * long.eq, digits = 3), " km"))

  # subtract the benchmark
  # first repeat the benchmark once per box size so rows line up one-to-one
    benchmark.early.scale <- do.call("rbind", rep(list(benchmark.early), phases.no))
    benchmark.late.scale <- do.call("rbind", rep(list(benchmark.late), phases.no))

  # refuse to subtract unless the site columns are exactly identical
    if (identical(scale.early[, "site"], benchmark.early.scale[, "site"])) {
      print("Early scaled rownames agree - Congrats!")
      scale.early[, "Difference"] <- scale.early[, "Agreement"] - benchmark.early.scale[, "value"]
    } else {
      stop("Mismatch between early benchmark and scale.early - try again!")
    }

    if (identical(scale.late[, "site"], benchmark.late.scale[, "site"])) {
      print("Late scaled rownames agree - Congrats!")
      scale.late[, "Difference"] <- scale.late[, "Agreement"] - benchmark.late.scale[, "value"]
    } else {
      stop("Mismatch between late benchmark and scale.late - try again!")
    }

  # combine the windows and produce outputs
    scale.final <- rbind(scale.early, scale.late)

  # Order the facet levels by numeric box width.  The levels are derived from
  # the labels actually present instead of a fixed list, so a different
  # di.range no longer turns every label into NA.
    scale.final$WindowScaleSize <- local({
      size.labels <- unique(as.character(scale.final$WindowScaleSize))
      size.km <- as.numeric(sub(" km", "", size.labels, fixed = TRUE))
      factor(scale.final$WindowScaleSize, levels = size.labels[order(size.km)])
    })

  # output stem: first / last di.range value and the number of box sizes
    fn <- paste0(out.path, "EVIScaleSize_", round(di.range[1], digits = 3), "_", round(di.range[length(di.range)], digits = 3), "_by_", length(di.range), ".")
    ggmap(outmap) + geom_point(aes(x = Longitude, y = Latitude, color = Difference), data = scale.final, size = 3) + scale_color_gradient2(low = "red", high = "green") + facet_grid(Window ~ WindowScaleSize)
    ggsave(filename = paste0(fn, "png"))
    write.csv(scale.final, paste0(fn, "csv"))


#======================= ARC - VI Spatial Correlation ==========================

# SUMMARY: 
# Uses a correlation threshold 'r': all pixels inside the specified bounding
# box whose ARC-VI correlation coefficient exceeds 'r' are spatially averaged,
# generating a single VI value for each time period.  Also includes the option
# to determine the average correlation across a user-specified range of
# lagged months.
#
# Every value set in this section is a global that spatial.comp() (defined
# further down) reads directly, so this section must run before it is called.

  # Specify the correlation coefficient threshold(s) (r) for masking pixels.
  # spatial.comp() is run once per element of this vector.
       # corr.value <- c(0.0, 0.2, 0.6, 0.8)
        corr.value <- c(0.0, 0.2, 0.4, 0.6, 0.8)

  # Specify years of interest.  The active line is a self-assignment, i.e. it
  # reuses the years from the beginning of the analysis; swap to the
  # commented-out line to choose a different range.
  # preferable to ensure that badyear.thres * length(years) > 2 
        years <- years   
      #  years <- as.character(seq(2002,2012))  
  
  # used in IRI-DL URL construction: when following lag values are equivalent, 
  # results are for a single lag, e.g.  T 0 1 0 gives no lag.  default is to 
  # step lags by 1 month each 

  # Do you also want to run an analysis on lagged correlation coefficients?  
  # lag.start / lag.end bound the lags (one "Lag<k>" column per integer between
  # them, inclusive); they are only acted on when Run.Lag is TRUE.
        Run.Lag <- FALSE
        lag.start <- -1
        lag.end <- 3

  # How large around the site pixel do you wish to create a bounding box?
      # Passed as the Size argument of the EVI retrieval functions.
      # If script fails to run, try reducing this value to shrink the bounding box 
      # Value should likely be < 0.5 
      # NOTE(review): 10/111 is presumably ~10 km expressed in degrees
      # (about 111 km per degree) - confirm.
        b.s <- 10/111

  # What spatial resolution do you want to regrid EVI values to?
      # For reference, consider the following pixel sizes (in degrees): 
      #  MODIS.pixel.size  <-  0.00221704 
      #  SPOT.pixel.size   <-  0.008928572 
      # If script fails, try increasing this value 
      # Value should likely be > 0.005 
      # Only passed on (as RegridSize) when weight.matrix is FALSE.
        rg.size <- MODIS.pixel.size

  # Do you want to apply a correlation threshold mask *OR* a correlation weighted 
  # matrix?  The former excludes EVI values for pixels whose correlation value
  # falls below a user-specified value (in corr.value), while the latter keeps 
  # all correlation values, multiplies them by their respective EVI values,
  # then does an ecdf ranking.
  # TRUE  -> weighted matrix  (spatial.comp calls evi.corr.mask)
  # FALSE -> threshold mask   (spatial.comp calls evi.corr.regrid)

        weight.matrix <- TRUE

#=============================================================================#

  # no need to loop over correlation values if using correlation values as 
  # weighting matrix: collapse corr.value to the single value 0 so that the
  # per-threshold loops below run exactly once
      if(weight.matrix){
        corr.value <- 0.0
      }

  # read in auxiliary file again to incorporate revised values 
  # (re-sources the helper definitions from vi_functions.R in the working directory)
    source("vi_functions.R")


# spatial.comp: ARC2 vs. bounding-box EVI worst-year agreement for one
# correlation threshold.
#
# For every site in site.data this function
#   1. sources the site's payout.data/contract.R (expected to provide the
#      contract object `t`) and derives the early / late window dekads from
#      t$swfirst and t$phases,
#   2. requests bounding-box EVI for the month of each window: evi.corr.mask
#      when weight.matrix is TRUE, otherwise evi.corr.regrid with
#      CorrThreshold = corr.value.  If the request errors, the locally stored
#      evicorr_error_template.csv is read instead so the loop keeps going,
#   3. keeps the rows whose Year is in `years` and ranks them with
#      find.worst(, "aprod"); a failure there yields NA worst years,
#   4. records, per window, the share of ARC worst years (from the
#      ARC_worst_years<first>-<last>.csv file in out.path) that the EVI worst
#      years reproduce, or NA when any EVI worst year is NA.
# Both windows are then stacked, mapped to a PNG and written to a CSV.
#
# Args:
#   corr.value: a single correlation threshold.  Used for pixel masking in the
#               non-weighted branch, and stored in the Corr_Thres column and
#               the output file name.
#
# Returns:
#   The value of the final write.csv() call; the function is called for its
#   side effects (console output, PNG and CSV files under out.path).
#
# Globals read: site.data, out.path, years, lag.start, lag.end, base.path,
#   scen, dekadmonth, weight.matrix, b.s, rg.size, Run.Lag, badyear.thres,
#   outmap, plus the helpers evi.corr.mask, evi.corr.regrid and find.worst.
spatial.comp <- function(corr.value){

  agree.cols <- c("Agreement", "Year1", "Year2")
  error.template <- paste0(out.path, "evicorr_error_template.csv")

  # 4 dekads later was the [adjustable] standard delay in sniidharita
  # delay already inherent in IRI Data Library code in evi.corr.regrid since
  # using shiftdatashort with a 1 month lag
  dekdelay <- 4

  # Month label for a dekad number.  The index is wrapped into 1..36; a plain
  # `%% 36` gives 0 for dekad 36, which selects no row at all.
  dekad.month <- function(dekad){
    as.character(dekadmonth[((dekad - 1) %% 36) + 1, "Month"])
  }

  # Bounding-box EVI for one site / month, tagged with the window name.
  # Falls back to the NA-filled error template when the request fails, which
  # is why the worst-year step below needs its own error check.
  fetch.evi <- function(site, month, window){
    evi <- try(
      if (weight.matrix){
        data.frame(evi.corr.mask(Lat = site.data[site, "Latitude"], Lon = site.data[site, "Longitude"], Size = b.s, Month = month), Window = window)
      } else {
        data.frame(evi.corr.regrid(Lat = site.data[site, "Latitude"], Lon = site.data[site, "Longitude"], Size = b.s, RegridSize = rg.size, CorrThreshold = corr.value, Month = month), Window = window)
      },
      silent = TRUE)

    if (inherits(evi, "try-error")){
      message("EVI request failed for ", site, " (", window, " window); using ", error.template)
      evi <- read.csv(error.template, header = TRUE)
    }
    evi
  }

  # Worst years for one window: split "Time" into Month / Year (characters
  # 1-3 and 5-8), drop "Time", keep the years of interest, then rank.
  # Returns NAs when find.worst() fails.
  # PSEUDO - create code to ensure that all years are providing data for the same month
  worst.years <- function(evi){
    evi <- data.frame(evi, Month = substr(evi$Time, 1, 3)[1], Year = as.numeric(substr(evi$Time, 5, 8)))
    evi <- subset(evi, select = -Time)
    evi <- evi[is.element(evi$Year, years), ]

    worst <- try(find.worst(evi, "aprod"), silent = TRUE)
    if (inherits(worst, "try-error")){
      worst <- rep(NA, badyear.thres * length(years))
    }
    worst
  }

  # c(agreement share, worst years).  The share is NA whenever a worst year is
  # NA, to avoid reporting 0% matching when there is simply no data.
  agreement.row <- function(arc, worst){
    if (any(is.na(worst))){
      return(c(NA, worst))
    }
    c(length(intersect(arc, worst)) / max(length(arc), length(worst)), worst)
  }

  # template from which the early / late result tables are drawn; the final
  # agreement values are filled in site by site
  agree.df <- data.frame(Latitude = site.data[, "Latitude"], Longitude = site.data[, "Longitude"], Agreement = NA, Year1 = NA, Year2 = NA)
  rownames(agree.df) <- rownames(site.data)
  agree.early <- agree.df
  agree.late <- agree.df

  # ARC worst years generated earlier in the script for the same year range
  # this file import should somehow be more flexible in the future
  # limiting this portion of the analysis ONLY to the two years from above
  # under the expectation that contract design should result in
  # badyear.thres * length(years) number of "worst years"
  arc.worst.years <- read.csv(paste0(out.path, "ARC_worst_years", years[1], "-", years[length(years)], ".csv"), header = TRUE, row.names = 1)

  # averaged correlation value per site (rows) and lag (columns)
  # NOTE(review): lag.df is local to this function and is not returned, so
  # the values filled in below are discarded when the function exits.
  lag.df <- matrix(nrow = dim(site.data)[1], ncol = (lag.end - lag.start + 1))
  rownames(lag.df) <- rownames(site.data)
  colnames(lag.df) <- paste0("Lag", seq(lag.start, lag.end))

  for (site in rownames(site.data)){

    # site-specific contract parameters decide which dekads to pull VI for
    contract.pth <- paste0(base.path, scen, site, "/payout.data/contract.R")
    source(contract.pth)

    midearly <- as.integer(t$swfirst + (t$phases[2, 1] + t$phases[1, 1]) / 2)
    midlate <- as.integer(t$swfirst + (t$phases[2, 3] + t$phases[1, 3]) / 2)

    earlymonth <- dekad.month(midearly + dekdelay)
    latemonth <- dekad.month(midlate + dekdelay)

    evi.early <- fetch.evi(site, earlymonth, "Early")
    evi.late <- fetch.evi(site, latemonth, "Late")

    # [X Y] average correlation per lag, only when requested
    if (Run.Lag){
      lagval <- data.frame(evi.corr.regrid(Lat = site.data[site, "Latitude"], Lon = site.data[site, "Longitude"], Size = b.s, CorrThreshold = 0, Month = earlymonth, Lags = TRUE))
      lag.df[site, ] <- lagval[, "correlation"]
    }

    early.worst <- worst.years(evi.early)
    late.worst <- worst.years(evi.late)

    print(paste0("In ", site, " the worst ", round(badyear.thres, digits = 2), " early window years for the [", years[1], ",", years[length(years)], "]", " period are ", early.worst[1], ",", early.worst[2]))

    print(paste0("In ", site, " the worst ", round(badyear.thres, digits = 2), " late window years for the [", years[1], ",", years[length(years)], "]", " period are ", late.worst[1], ",", late.worst[2]))

    # the two ARC worst years per window from the imported CSV
    arc.early <- arc.worst.years[site, c("Early", "Early.1")]
    arc.late <- arc.worst.years[site, c("Late", "Late.1")]

    agree.early[site, agree.cols] <- agreement.row(arc.early, early.worst)
    agree.late[site, agree.cols] <- agreement.row(arc.late, late.worst)

  } # end of spatial correlation for loop across sites

  #PSEUDO - want to compare these results against the baseline to identify any performance improvements
  #PSEUDO - insert visualization function here, when it's working

  # long table for the gg functions and the csv export; site names move into
  # a column so the stacked table has no duplicated rownames
  agree.early <- data.frame(agree.early, site = rownames(agree.early), Window = "Early", Corr_Thres = corr.value)
  rownames(agree.early) <- NULL
  agree.late <- data.frame(agree.late, site = rownames(agree.late), Window = "Late", Corr_Thres = corr.value)
  rownames(agree.late) <- NULL
  evi.arc.sc <- rbind(agree.early, agree.late)

  # plot title (ti) and output file stem (fn) depend on the masking mode
  if (weight.matrix){
    ti <- "ARC-EVI Agreement for Pixels with Weighted Correlation Matrix"
    fn <- paste0(out.path, "EVI_SpCorrWeightMatrix_bs", round(b.s, digits = 3), "_rg", rg.size, "_", years[1], "-", years[length(years)], ".")
  } else {
    ti <- paste0("ARC-EVI Agreement for Pixels with r > ", corr.value)
    fn <- paste0(out.path, "EVI_SpCorr_R", corr.value, "_bs", round(b.s, digits = 3), "_rg", rg.size, "_", years[1], "-", years[length(years)], ".")
  }

  # hand the plot object to ggsave explicitly rather than relying on the
  # "last plot" bookkeeping, which is fragile inside a function
  agree.map <- ggmap(outmap) + geom_point(aes(x = Longitude, y = Latitude, color = Agreement), data = evi.arc.sc, size = 3) + scale_color_gradient(low = "red", high = "green") + facet_wrap(~ Window, nrow = 1) + labs(title = ti)
  ggsave(filename = paste0(fn, "png"), plot = agree.map)
  write.csv(evi.arc.sc, paste0(fn, "csv"))

}# end of spatial.comp function 


  # Export and map the per-site averaged lagged EVI-ARC2 correlations.
  # Runs only when Run.Lag is TRUE.
  # NOTE(review): lag.df is looked up in the top-level environment here.  The
  # only place this part of the script fills it is inside spatial.comp(), as a
  # local variable, and spatial.comp() is not called until after this block -
  # confirm where a top-level lag.df is expected to come from.
  if (Run.Lag){
    # write lagged correlation averages to file, with coordinates and site name
    lag.df <- cbind(lag.df, site.data[,c("Latitude","Longitude")], site = rownames(site.data))
    fn <- paste0(out.path,"LagCorrAvg_",lag.start,"_",lag.end,".csv")
    write.csv(lag.df, fn)

    # long format for ggplot; df.melt is a helper defined outside this section
    lag.melt <- df.melt(lag.df)
    lag.melt$value <- as.numeric(lag.melt$value)
    # PSEUDO  # remove "Lag" from variable names ??

    ggmap(outmap) + geom_point(aes(x = Longitude, y = Latitude, color = value), data = lag.melt) + scale_color_gradient(low = "black", high = "red") + facet_wrap(~ variable, nrow = 1) + labs(title = "[X Y] Averaged Lagged EVI-ARC2 Correlation Values")

    # corr.value can hold several thresholds; take the first so that the file
    # name is always a single string (unchanged when corr.value is a scalar)
    ggsave(filename = paste0(out.path,"AvgLagCorr_",corr.value[1],"_.png"))
  } # end if Run.Lag loop 


  # evaluate spatial.comp once for every threshold held in corr.value
    sapply(X = corr.value, FUN = spatial.comp)

    # PSEUDOCODE - insert code to analyze combination of different correlation values here 
  

#######  Comparison Against Benchmark Model #######

    
  # read in the re-scaled grid box data

    # Single-argument adapter around bench.corr.compare so it can be mapped
    # over thresholds; the bounding-box size (b.s) and regrid size (rg.size)
    # are taken from the enclosing environment.
    compare.wrapper <- function(corrVal){
      bench.corr.compare(corrVal, b.s, rg.size)
    }
    sapply(X = corr.value, FUN = compare.wrapper)


# Reads back one "DiffEVI_SpCorr_R<threshold>_..." CSV per correlation
# threshold (presumably written by bench.corr.compare above - confirm), stacks
# them into a single data frame and writes a Window x Corr_Thres faceted map.
# The files are bound in one call instead of growing the data frame inside a
# loop keyed on a floating-point equality test.
  corr.comb.df <- do.call(rbind, lapply(corr.value, function(this.corr){
    read.csv(paste0(out.path, "DiffEVI_SpCorr_R", this.corr, "_bs", round(b.s, digits = 3), "_rg", rg.size, "_", years[1], "-", years[length(years)], ".csv"), header = TRUE)
  }))

ggmap(outmap) + geom_point(aes(x = Longitude, y = Latitude, color = fin), data = corr.comb.df) + scale_colour_gradient2(low = "red", high = "green", "Difference\nin agreement\npercent") + facet_grid(Window ~ Corr_Thres)
ggsave(filename = paste0(out.path, "DiffCorrThresTotalFacet.png"))