Merge pull request #7 from UNSW-CEEM/benchmarking
Added validation process scripts
exfletch authored Feb 15, 2022
2 parents 421e3b5 + 9559bb8 commit 552fc36
Showing 7 changed files with 321 additions and 1 deletion.
10 changes: 10 additions & 0 deletions BDInterface/interface.R
@@ -110,6 +110,16 @@ DBInterface <- R6::R6Class("DBInterface",

con <- RSQLite::dbConnect(RSQLite::SQLite(), self$db_path_name)

# create DB history table
RSQLite::dbExecute(con, "DROP TABLE IF EXISTS db_history")
RSQLite::dbExecute(con, "CREATE TABLE db_history(event TEXT, utc_timestamp TEXT, tool_git_hash TEXT)")
event <- "built"
created_at <- lubridate::now("UTC")
tool_git_hash <- git2r::revparse_single(revision="HEAD")$sha
RSQLite::dbExecute(con, "INSERT INTO db_history VALUES (:event, :timestamp, :hash)",
params = list(event=event, timestamp=created_at, hash=tool_git_hash))
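# Illustrative only (not part of the tool): the build record written above can later be
# inspected with, for example, RSQLite::dbGetQuery(con, "SELECT * FROM db_history")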

# insert timeseries data
RSQLite::dbExecute(con, "DROP TABLE IF EXISTS timeseries")

RSQLite::dbExecute(con, "CREATE TABLE timeseries(
4 changes: 3 additions & 1 deletion install.R
@@ -25,4 +25,6 @@ install.versions(c('padr'), c('0.5.1'), type=install_type)
install.versions(c('sqldf'), c('0.4-11'), type=install_type)
install.versions(c('gridExtra'), c('2.3'), type=install_type)
install.versions(c('rjson'), c('0.2.20'), type=install_type)
install.versions(c('R6'), c('2.5.0'), type=install_type)
install.versions(c('git2r'), c('0.29.0'), type=install_type)
install.versions(c('logging'), c('0.10-108'), type=install_type)
1 change: 1 addition & 0 deletions shiny.R
@@ -1015,6 +1015,7 @@ server <- function(input,output,session){
reconnection_time, ramp_above_threshold, max_power, ufls_status,
pre_event_sampled_seconds, post_event_sampled_seconds,
ufls_status_v, pre_event_v_mean, post_event_v_mean)
v$circuit_summary$tool_hash <- git2r::revparse_single(revision="HEAD")$sha

# Summarise and upscale disconnections on a manufacturer basis.
if (exclude_solar_edge()){
36 changes: 36 additions & 0 deletions validation/README.md
@@ -0,0 +1,36 @@
# Tool result validation process

To ensure that results are not being unintentionally modified by changes to the tool, and that any expected changes are in fact being made, we need to validate/benchmark the results.

Validation datasets will be made available on cloudstor once they are finalised.

## Running validation

1. Run analysis on the sample data with the reference version of the tool
    1. Check out the reference version of the tool (currently the latest master branch)
    2. Build databases for the events in the `DER_disturbance_analysis/validation/data` directory using `build_validation_databases.R`
    3. Run the tool using the metadata from each event directory as the config json
    4. Batch save the results in the same directory as the data, entering `ref` as the file name
2. Run analysis on the sample data with the version of the tool to be tested
    1. Check out the target branch
    2. Build databases for the events in the `DER_disturbance_analysis/validation/data` directory using `build_validation_databases.R`
    3. Run the tool using the metadata from each event directory as the config json
    4. Batch save the results in the same directory as the data, entering `test` as the file name
3. Compare the reference and test results using `validate_results.R` (a minimal sketch follows this list)
    1. Identify whether the results match
    2. Check any discrepancies against the expected impact of the test version of the tool
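
A minimal sketch of step 3, assuming the `ref` and `test` results have already been batch saved alongside the data (the clone path below is illustrative):

```r
# Run the comparison from the repository root; any differences are logged to the console.
setwd("~/DER_disturbance_analysis")       # illustrative path to your clone of the tool
source("validation/validate_results.R")
```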

## Choosing validation datasets

A representative set of validation data is required. To capture this, the current plan is to use data from the 2021-05-25 QLD event, the 2020-01-31 SA event, and at least one event using Tesla data (TBD).

For each event the following process will need to be followed:

1. Build a database from the raw data for the event.
2. Run the normal tool analysis with appropriate settings for that event. Ensure that frequency data is included if necessary, and that all category filters other than "raw" are checked.
3. Batch save the results under a memorable name.
4. Identify appropriate sample circuits based on the results in the circuit summary. Currently the following columns are used to identify unique circuits:
    * `response_category`, `reconnection_compliance_status`, `ufls_status`, `compliance_status`, `Standard_Version`
5. Using the site IDs from the sample results, filter the raw data down to only the included circuits (covering the site_details, circuit_details and raw data files); see the sketch after this list.
6. Save the filtered raw data as a new set of files under `DER_disturbance_analysis/validation/data`, in a sub-directory with a meaningful name representing the event.
7. Copy any required supporting files: the metadata file from the sample selection should be included, as should any necessary network frequency data.
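
A minimal sketch of steps 5 and 6, assuming hypothetical input file names and column names (`site_id`, `c_id`); adjust these to match the actual headers in the batch-saved results and raw data:

```r
# Filter the raw event files down to the sampled circuits (file and column
# names here are illustrative, not the tool's canonical ones).
sample_summary <- read.csv("saved_results_circ_sum.csv")   # circuit summary from step 3
sample_sites <- unique(sample_summary$site_id)

site_details <- read.csv("site_details.csv")
circuit_details <- read.csv("circuit_details.csv")
raw_data <- read.csv("raw_data.csv")

site_subset <- site_details[site_details$site_id %in% sample_sites, ]
circuit_subset <- circuit_details[circuit_details$site_id %in% sample_sites, ]
raw_subset <- raw_data[raw_data$c_id %in% circuit_subset$c_id, ]

out_dir <- "validation/data/my_event"                       # meaningful event name
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
write.csv(site_subset, file.path(out_dir, "ref_site_details.csv"), row.names = FALSE)
write.csv(circuit_subset, file.path(out_dir, "ref_circuit_details.csv"), row.names = FALSE)
write.csv(raw_subset, file.path(out_dir, "ref_raw_data.csv"), row.names = FALSE)
```

The output file names match those expected by `build_validation_databases.R` (`ref_site_details.csv`, `ref_circuit_details.csv`, `ref_raw_data.csv`); the metadata file still needs to be copied in separately (step 7).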
71 changes: 71 additions & 0 deletions validation/build_validation_databases.R
@@ -0,0 +1,71 @@
# To use this file, run it as an R script.
# If you are not running it from the tool's top-level directory, ensure that tool_directory is set to the root
# directory of the DER tool repository.
# Ensure that output_database is set to "ref" if building reference DBs, or "test" if building test DBs.
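# For example (the path below is illustrative), from an R session:
#   setwd("~/DER_disturbance_analysis")
#   source("validation/build_validation_databases.R")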

library("logging")
library("rjson")
logging::basicConfig()

base_directory_name <- basename(getwd())
if (base_directory_name == "DER_disturbance_analysis") {
tool_directory <- getwd()
} else {
print("Script is not being run in DER_disturbance_analysis folder, make sure that tool directory has been set")
tool_directory <- "~/UNSW/MATCH/DER_disturbance_analysis"
}
source(sprintf("%s/BDInterface/interface.R", tool_directory))

data_dirs <- list.dirs('validation/data', recursive=FALSE)
output_database <- "ref" # "test"
required_file_names <- c("ref_circuit_details.csv", "ref_meta_data.json", "ref_raw_data.csv", "ref_site_details.csv")

if (length(data_dirs) > 0){
for (dir in data_dirs){
all_files_in_dir <- list.files(dir)
required_files_in_dir <- required_file_names %in% all_files_in_dir
if (all(required_files_in_dir)){
site_details_path_name <- paste(dir, "/", "ref_site_details.csv", sep="")
circuit_details_path_name <- paste(dir, "/", "ref_circuit_details.csv", sep="")
timeseries_path_name <- paste(dir, "/", "ref_raw_data.csv", sep="")
metadata_path_name <- paste(dir, "/", "ref_meta_data.json", sep="")
db_path_name <- paste(dir, "/", output_database, ".db", sep="")

db <- DBInterface$new()
if (!file.exists(db_path_name)){
db$connect_to_new_database(db_path_name)
logging::loginfo(paste("Creating new database", db_path_name))
} else {
db$connect_to_existing_database(db_path_name)
logging::loginfo(paste("Replacing existing database", db_path_name))
}

db$default_timeseries_column_aliases <- list(utc_tstamp='_ts', c_id='_c_id', voltage='_v', frequency='_f', energy='_e',
duration='_d', power='_p', vmin='vmin', vmax='vmax',
vmean='vmean')
db$build_database(timeseries = timeseries_path_name,
circuit_details = circuit_details_path_name,
site_details = site_details_path_name)

db$add_postcode_lon_lat_to_database(sprintf("%s/inbuilt_data/postcode_lon_lat.csv", tool_directory))

db$add_manufacturer_mapping_table(sprintf("%s/inbuilt_data/manufacturer_mapping.csv", tool_directory))

db$run_data_cleaning_loop(500)

# update metadata
if (file.exists(metadata_path_name)){
metadata <- rjson::fromJSON(file=metadata_path_name)
metadata$database_name <- sprintf("%s/%s/%s.db", tool_directory, dir, output_database)
output_metadata_path <- paste(dir, "/", output_database, "_meta_data.json", sep="")
metadata_conn <- file(output_metadata_path)
writeLines(rjson::toJSON(metadata, indent=4), metadata_conn)
close(metadata_conn)
}
} else {
logging::logerror(sprintf("Required files missing from directory: %s/%s", dir, required_file_names[!required_files_in_dir]))
}
}
} else {
logging::logerror("No data found in directory")
}
4 changes: 4 additions & 0 deletions validation/data/.gitignore
@@ -0,0 +1,4 @@
# ignore all files in this directory to avoid committing data to the repo
*
*/
!.gitignore
196 changes: 196 additions & 0 deletions validation/validate_results.R
@@ -0,0 +1,196 @@
# Validate benchmarking dataset results
# Step 1: Ensure you have read the README in this directory and followed the instructions on creating the reference
# and test data
# Step 2: Run this script from DER disturbance analysis directory
# OR change tool_directory to the location of the DER disturbance analysis repo
# Step 3: Review the results printed to the console. Investigate any differences identified to ensure that only
# expected changes are present.
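# For example, from an R session at the repository root (see Step 2):
#   source("validation/validate_results.R")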

logging::basicConfig()

base_directory_name <- basename(getwd())
if (base_directory_name == "DER_disturbance_analysis") {
tool_directory <- getwd()
} else {
print("Script is not being run in DER_disturbance_analysis folder, make sure that tool directory ahs been set")
tool_directory <- "~/UNSW/MATCH/DER_disturbance_analysis"
}
source(sprintf("%s/BDInterface/interface.R", tool_directory))

data_dirs <- list.dirs('validation/data', recursive=FALSE)

ref_db_name <- "ref.db"
test_db_name <- "test.db"
ref_circuit_summary_fname <- "ref_circ_sum.csv"
ref_underlying_data_fname <- "ref_underlying.csv"
test_circuit_summary_fname <- "test_circ_sum.csv"
test_underlying_data_fname <- "test_underlying.csv"

required_files <- c(
ref_circuit_summary_fname, ref_underlying_data_fname,
test_circuit_summary_fname, test_underlying_data_fname
)

table_name_query <- "SELECT DISTINCT name FROM sqlite_master WHERE type = 'table'"
table_columns_query <- "PRAGMA table_info('%s')"
table_length_query <- "SELECT count(*) AS length FROM %s"
table_all_data_query <- "SELECT * FROM %s"


difference_between_lists <- function(reference_list, test_list) {
results <- list()
results$reference_only <- reference_list[!(reference_list %in% test_list)]
results$test_only <- test_list[!(test_list %in% reference_list)]
results$common <- reference_list[(reference_list %in% test_list)]
return(results)
}


compare_dbs <- function(ref_db_con, test_db_con, compare_values=FALSE, event_name=NA){
diff_found <- FALSE
# compare tables
ref_tables <- RSQLite::dbGetQuery(ref_db_con, table_name_query)$name
test_tables <- RSQLite::dbGetQuery(test_db_con, table_name_query)$name
table_diffs <- difference_between_lists(ref_tables, test_tables)

if (length(table_diffs$reference_only) > 0) {
logging::loginfo(sprintf("%s - The following tables exist only in the REFERENCE database:\n%s",
event_name, toString(table_diffs$reference_only)))
diff_found <- TRUE
}
if (length(table_diffs$test_only) > 0) {
logging::loginfo(sprintf("%s - The following tables exist only in the TEST database:\n%s",
event_name, toString(table_diffs$test_only)))
diff_found <- TRUE
}

# compare columns in tables
for (table in table_diffs$common) {
ref_columns <- RSQLite::dbGetQuery(ref_db_con, sprintf(table_columns_query, table))$name
test_columns <- RSQLite::dbGetQuery(test_db_con, sprintf(table_columns_query, table))$name
column_diffs <- difference_between_lists(ref_columns, test_columns)

if (length(column_diffs$reference_only) > 0) {
logging::loginfo(sprintf("%s - Table %s: the following columns exist only in the REFERENCE data:\n%s",
event_name, table, toString(column_diffs$reference_only)))
diff_found <- TRUE
}
if (length(column_diffs$test_only) > 0) {
logging::loginfo(sprintf("%s - Table %s: the following columns exist only in the TEST data:\n%s",
event_name, table, toString(column_diffs$test_only)))
diff_found <- TRUE
}

# compare volume of data in columns
ref_table_length <- RSQLite::dbGetQuery(ref_db_con, sprintf(table_length_query, table))
test_table_length <- RSQLite::dbGetQuery(test_db_con, sprintf(table_length_query, table))

if (ref_table_length$length[[1]] != test_table_length$length[[1]]) {
logging::loginfo(sprintf(
"%s - Table %s has differing lengths in reference and test data\nref: %s; test: %s",
event_name, table, ref_table_length$length[[1]], test_table_length$length[[1]]
))
diff_found <- TRUE
}

# compare values in table
if (compare_values) {
ref_data <- RSQLite::dbGetQuery(ref_db_con, sprintf(table_all_data_query, table))
test_data <- RSQLite::dbGetQuery(test_db_con, sprintf(table_all_data_query, table))
diffs <- all.equal(ref_data[column_diffs$common], test_data[column_diffs$common])
if (!isTRUE(diffs)){
logging::loginfo(
sprintf(
"%s - Differences found in values:\n%s",
event_name, toString(diffs)
)
)
diff_found <- TRUE
}
}
}
if (!diff_found) {
logging::loginfo(sprintf("%s - No differences found between databases", event_name))
}
}


compare_dfs <- function(reference, test, event_name=NA){
# 1. check for new columns
ref_columns <- names(reference)
test_columns <- names(test)
column_diff <- difference_between_lists(ref_columns, test_columns)

if (length(column_diff$reference_only) > 0) {
logging::loginfo(
sprintf(
"%s - Reference dataframe contains columns not found in test dataframe:\n%s",
event_name, toString(column_diff$reference_only)
)
)
}
if (length(column_diff$test_only) > 0) {
logging::loginfo(
sprintf(
"%s - Test dataframe contains new columns:\n%s",
event_name, toString(column_diff$test_only)
)
)
}

# 2. check for different values in existing columns
diffs <- all.equal(reference[column_diff$common], test[column_diff$common])
if (!isTRUE(diffs)){
logging::loginfo(
sprintf("%s - Differences found in column values:\n%s", event_name, toString(diffs))
)
} else {
logging::loginfo(
sprintf("%s - No differences found between dataframes", event_name)
)
}
return(diffs)
}


if (length(data_dirs) > 0) {
for (dir in data_dirs){
all_files_in_dir <- list.files(dir)
required_files_in_dir <- required_files %in% all_files_in_dir
if (all(required_files_in_dir)){
# check databases
ref_db_path <- sprintf("%s/%s", dir, ref_db_name)
test_db_path <- sprintf("%s/%s", dir, test_db_name)
if (file.exists(ref_db_path) & file.exists(test_db_path)) {
ref_db_con <- RSQLite::dbConnect(RSQLite::SQLite(), sprintf("%s/%s", dir, ref_db_name))
test_db_con <- RSQLite::dbConnect(RSQLite::SQLite(), sprintf("%s/%s", dir, test_db_name))
compare_dbs(ref_db_con, test_db_con, TRUE, dir)

# check csvs
ref_circuit_summary <- read.csv(sprintf("%s/%s", dir, ref_circuit_summary_fname))
ref_underlying_data <- read.csv(sprintf("%s/%s", dir, ref_underlying_data_fname))
test_circuit_summary <- read.csv(sprintf("%s/%s", dir, test_circuit_summary_fname))
test_underlying_data <- read.csv(sprintf("%s/%s", dir, test_underlying_data_fname))

# check circuit summary
logging::loginfo(sprintf("%s - Comparing circuit summaries...", dir))
compare_dfs(ref_circuit_summary, test_circuit_summary, event_name=dir)
# check underlying data
logging::loginfo(sprintf("%s - Comparing underlying data...", dir))
compare_dfs(ref_underlying_data, test_underlying_data, event_name=dir)

RSQLite::dbDisconnect(ref_db_con)
RSQLite::dbDisconnect(test_db_con)
} else {
if (!file.exists(ref_db_path)) {
logging::loginfo(sprintf("Reference database not found, expected at %s", ref_db_path))
}
if (!file.exists(test_db_path)) {
logging::loginfo(sprintf("Test database not found, expected at %s", test_db_path))
}
}
}
}
} else {
logging::logerror("No data found in directory")
}
