Merge pull request #7 from UNSW-CEEM/benchmarking
Added validation process scripts
Showing 7 changed files with 321 additions and 1 deletion.
@@ -0,0 +1,36 @@
# Tool result validation process

To ensure that results are not being unintentionally modified by changes to the tool - or that expected changes ARE being made - we need to validate/benchmark the results.

Validation datasets will be made available on CloudStor once they are finalised.
## Running validation

1. Run analysis on sample data with the reference version of the tool (a sketch of this pass appears after this list)
   1. Check out the reference version of the tool (currently the latest master branch)
   2. Build databases for the events in the `DER_disturbance_analysis/validation/data` directory using `build_validation_databases.R`
   3. Run the tool using the metadata from the event directory as the config JSON
   4. Batch save the results in the same directory as the data, entering `ref` as the file name
2. Run analysis on sample data with the version of the tool to be tested
   1. Check out the target branch
   2. Build databases for the events in the `DER_disturbance_analysis/validation/data` directory using `build_validation_databases.R`
   3. Run the tool using the metadata from the event directory as the config JSON
   4. Batch save the results in the same directory as the data, entering `test` as the file name
3. Compare the reference and test results using `validate_results.R`
   1. Identify whether the results match
   2. Check any discrepancies against the expected impact of the test version of the tool

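The database-building half of each pass can be run non-interactively. Below is a minimal sketch of the reference pass, assuming `build_validation_databases.R` lives under `validation/` in the repository root and that `output_database` inside it has been set to `"ref"` (set it to `"test"` for the second pass on the target branch); the checkout path is the placeholder used in the scripts themselves.

```r
# Sketch of the reference pass (hypothetical paths; adjust to your checkout).
setwd("~/UNSW/MATCH/DER_disturbance_analysis")     # repo root
system("git checkout master")                      # reference version of the tool
source("validation/build_validation_databases.R")  # builds ref.db for each event
# ...then launch the tool, load each event's metadata as the config JSON,
# and batch save the results under the file name "ref".
```
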
## Choosing validation datasets

A representative set of validation data is required. To capture this we are currently planning to use the data from 2021-05-25 from QLD, 2020-01-31 from SA, and at least one event using Tesla data (TBD).

For each event the following process will need to be followed:

1. Build the database from raw data for the event
2. Run the normal tool analysis with appropriate settings for that event. Ensure that frequency data is included if necessary, and that all category filters other than "raw" are checked
3. Batch save the results under a memorable name
4. Identify appropriate sample circuits based on the results in the circuit summary. Currently the following columns are used to identify unique circuits:
   * `response_category`, `reconnection_compliance_status`, `ufls_status`, `compliance_status`, `Standard_Version`
5. Using the site IDs of the sample results, filter the raw data down to only the included circuits (including the site_details, circuit_details and raw data files); a sketch of this step appears after this list
6. Save this filtered raw data as a new set of files under `DER_disturbance_analysis/validation/data`, in a sub-directory with a meaningful name representing the event
7. Copy any required supporting files - the metadata file from the sample selection should be included, as should any necessary network frequency data
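
Step 5 is a straightforward join-and-filter over the three input files. The sketch below shows one way to do it, assuming the sample results carry a `site_id` column and the raw time series carries a `c_id` column, as in the tool's standard input format; `sample_circuit_summary`, the input file names and the event directory are placeholders.

```r
# Hypothetical sketch of step 5: keep only the sampled circuits.
sample_site_ids <- unique(sample_circuit_summary$site_id)  # from the step 4 selection

site_details <- read.csv("site_details.csv")
circuit_details <- read.csv("circuit_details.csv")
raw_data <- read.csv("raw_data.csv")

site_details <- site_details[site_details$site_id %in% sample_site_ids, ]
circuit_details <- circuit_details[circuit_details$site_id %in% sample_site_ids, ]
raw_data <- raw_data[raw_data$c_id %in% circuit_details$c_id, ]

# Save under validation/data/<event> with the file names the build script expects
event_dir <- "validation/data/2021-05-25-qld"  # placeholder event name
write.csv(site_details, file.path(event_dir, "ref_site_details.csv"), row.names = FALSE)
write.csv(circuit_details, file.path(event_dir, "ref_circuit_details.csv"), row.names = FALSE)
write.csv(raw_data, file.path(event_dir, "ref_raw_data.csv"), row.names = FALSE)
```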
@@ -0,0 +1,71 @@
# To use this file, run it as an R script.
# If you are not running this from the tool's top-level directory, ensure that you have set tool_directory to the
# root directory of the DER tool repository.
# Ensure that output_database is set to "ref" if building reference DBs, or "test" if building test DBs.
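#
# For example, for a reference run the assignments below would read as follows
# (the path is a placeholder for wherever the repo is checked out):
#   tool_directory <- "~/code/DER_disturbance_analysis"
#   output_database <- "ref"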

library("logging")
library("rjson")
logging::basicConfig()

base_directory_name <- basename(getwd())
if (base_directory_name == "DER_disturbance_analysis") {
  tool_directory <- getwd()
} else {
  print("Script is not being run from the DER_disturbance_analysis folder; make sure tool_directory has been set")
  tool_directory <- "~/UNSW/MATCH/DER_disturbance_analysis"
}
source(sprintf("%s/BDInterface/interface.R", tool_directory))

data_dirs <- list.dirs("validation/data", recursive = FALSE)
output_database <- "ref"  # set to "test" when building test databases
required_file_names <- c("ref_circuit_details.csv", "ref_meta_data.json", "ref_raw_data.csv", "ref_site_details.csv")

if (length(data_dirs) > 0) {
  for (dir in data_dirs) {
    all_files_in_dir <- list.files(dir)
    required_files_in_dir <- required_file_names %in% all_files_in_dir
    if (all(required_files_in_dir)) {
      site_details_path_name <- paste(dir, "/", "ref_site_details.csv", sep = "")
      circuit_details_path_name <- paste(dir, "/", "ref_circuit_details.csv", sep = "")
      timeseries_path_name <- paste(dir, "/", "ref_raw_data.csv", sep = "")
      metadata_path_name <- paste(dir, "/", "ref_meta_data.json", sep = "")
      db_path_name <- paste(dir, "/", output_database, ".db", sep = "")

      db <- DBInterface$new()
      if (!file.exists(db_path_name)) {
        db$connect_to_new_database(db_path_name)
        logging::loginfo(paste("Creating new database", db_path_name))
      } else {
        db$connect_to_existing_database(db_path_name)
        logging::loginfo(paste("Replacing existing database", db_path_name))
      }

      db$default_timeseries_column_aliases <- list(utc_tstamp = "_ts", c_id = "_c_id", voltage = "_v",
                                                   frequency = "_f", energy = "_e", duration = "_d",
                                                   power = "_p", vmin = "vmin", vmax = "vmax",
                                                   vmean = "vmean")
      db$build_database(timeseries = timeseries_path_name,
                        circuit_details = circuit_details_path_name,
                        site_details = site_details_path_name)

      db$add_postcode_lon_lat_to_database(sprintf("%s/inbuilt_data/postcode_lon_lat.csv", tool_directory))

      db$add_manufacturer_mapping_table(sprintf("%s/inbuilt_data/manufacturer_mapping.csv", tool_directory))

      db$run_data_cleaning_loop(500)

      # update the metadata to point at the newly built database
      # (dir already includes the validation/data prefix)
      if (file.exists(metadata_path_name)) {
        metadata <- rjson::fromJSON(file = metadata_path_name)
        metadata$database_name <- sprintf("%s/%s/%s.db", tool_directory, dir, output_database)
        output_metadata_path <- paste(dir, "/", output_database, "_meta_data.json", sep = "")
        metadata_conn <- file(output_metadata_path)
        writeLines(rjson::toJSON(metadata, indent = 4), metadata_conn)
        close(metadata_conn)
      }
    } else {
      logging::logerror(sprintf("Required files missing from directory %s: %s", dir,
                                toString(required_file_names[!required_files_in_dir])))
    }
  }
} else {
  logging::logerror("No event directories found under validation/data")
}
@@ -0,0 +1,4 @@
# ignore all files in this directory to avoid committing data to the repo
*
*/
!.gitignore
@@ -0,0 +1,196 @@
# Validate benchmarking dataset results
# Step 1: Ensure you have read the README in this directory and followed the instructions on creating the reference
#         and test data
# Step 2: Run this script from the DER_disturbance_analysis directory,
#         OR change tool_directory to the location of the DER_disturbance_analysis repo
# Step 3: Review the results printed to the console. Investigate any differences identified to ensure that only
#         expected changes are present.
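#
# Expected layout per event directory (file names per the constants below):
#   validation/data/<event>/ref.db and test.db
#   validation/data/<event>/ref_circ_sum.csv and ref_underlying.csv
#   validation/data/<event>/test_circ_sum.csv and test_underlying.csv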

logging::basicConfig()

base_directory_name <- basename(getwd())
if (base_directory_name == "DER_disturbance_analysis") {
  tool_directory <- getwd()
} else {
  print("Script is not being run from the DER_disturbance_analysis folder; make sure tool_directory has been set")
  tool_directory <- "~/UNSW/MATCH/DER_disturbance_analysis"
}
source(sprintf("%s/BDInterface/interface.R", tool_directory))

data_dirs <- list.dirs("validation/data", recursive = FALSE)

ref_db_name <- "ref.db"
test_db_name <- "test.db"
ref_circuit_summary_fname <- "ref_circ_sum.csv"
ref_underlying_data_fname <- "ref_underlying.csv"
test_circuit_summary_fname <- "test_circ_sum.csv"
test_underlying_data_fname <- "test_underlying.csv"

required_files <- c(
  ref_circuit_summary_fname, ref_underlying_data_fname,
  test_circuit_summary_fname, test_underlying_data_fname
)

table_name_query <- "SELECT DISTINCT name FROM sqlite_master WHERE type = 'table'"
table_columns_query <- "PRAGMA table_info('%s')"
table_length_query <- "SELECT count(*) AS length FROM %s"
table_all_data_query <- "SELECT * FROM %s"

difference_between_lists <- function(reference_list, test_list) {
  results <- list()
  results$reference_only <- reference_list[!(reference_list %in% test_list)]
  results$test_only <- test_list[!(test_list %in% reference_list)]
  results$common <- reference_list[reference_list %in% test_list]
  return(results)
}
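# For example (hypothetical values):
#   difference_between_lists(c("a", "b"), c("b", "c"))
#   returns list(reference_only = "a", test_only = "c", common = "b")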

compare_dbs <- function(ref_db_con, test_db_con, compare_values = FALSE, event_name = NA) {
  diff_found <- FALSE
  # compare tables
  ref_tables <- RSQLite::dbGetQuery(ref_db_con, table_name_query)$name
  test_tables <- RSQLite::dbGetQuery(test_db_con, table_name_query)$name
  table_diffs <- difference_between_lists(ref_tables, test_tables)

  if (length(table_diffs$reference_only) > 0) {
    logging::loginfo(sprintf("%s - The following tables exist only in the REFERENCE database:\n%s",
                             event_name, toString(table_diffs$reference_only)))
    diff_found <- TRUE
  }
  if (length(table_diffs$test_only) > 0) {
    logging::loginfo(sprintf("%s - The following tables exist only in the TEST database:\n%s",
                             event_name, toString(table_diffs$test_only)))
    diff_found <- TRUE
  }

  # compare columns in tables
  for (table in table_diffs$common) {
    # PRAGMA table_info returns one row per column; the column names are in the `name` field
    ref_columns <- RSQLite::dbGetQuery(ref_db_con, sprintf(table_columns_query, table))$name
    test_columns <- RSQLite::dbGetQuery(test_db_con, sprintf(table_columns_query, table))$name
    column_diffs <- difference_between_lists(ref_columns, test_columns)

    if (length(column_diffs$reference_only) > 0) {
      logging::loginfo(sprintf("%s - Table %s: the following columns exist only in the REFERENCE data:\n%s",
                               event_name, table, toString(column_diffs$reference_only)))
      diff_found <- TRUE
    }
    if (length(column_diffs$test_only) > 0) {
      logging::loginfo(sprintf("%s - Table %s: the following columns exist only in the TEST data:\n%s",
                               event_name, table, toString(column_diffs$test_only)))
      diff_found <- TRUE
    }

    # compare volume of data in tables
    ref_table_length <- RSQLite::dbGetQuery(ref_db_con, sprintf(table_length_query, table))
    test_table_length <- RSQLite::dbGetQuery(test_db_con, sprintf(table_length_query, table))

    if (ref_table_length$length[[1]] != test_table_length$length[[1]]) {
      logging::loginfo(sprintf(
        "%s - Table %s has differing lengths in reference and test data\nref: %s; test: %s",
        event_name, table, ref_table_length$length[[1]], test_table_length$length[[1]]
      ))
      diff_found <- TRUE
    }

    # compare values in table (reference against test over their common columns)
    if (compare_values) {
      ref_data <- RSQLite::dbGetQuery(ref_db_con, sprintf(table_all_data_query, table))
      test_data <- RSQLite::dbGetQuery(test_db_con, sprintf(table_all_data_query, table))
      diffs <- all.equal(ref_data[column_diffs$common], test_data[column_diffs$common])
      if (!isTRUE(diffs)) {
        logging::loginfo(
          sprintf(
            "%s - Differences found in values:\n%s",
            event_name, toString(diffs)
          )
        )
        diff_found <- TRUE
      }
    }
  }
  if (!diff_found) {
    logging::loginfo(sprintf("%s - No differences found between databases", event_name))
  }
}

compare_dfs <- function(reference, test, event_name = NA) {
  # 1. check for new columns
  ref_columns <- names(reference)
  test_columns <- names(test)
  column_diff <- difference_between_lists(ref_columns, test_columns)

  if (length(column_diff$reference_only) > 0) {
    logging::loginfo(
      sprintf(
        "%s - Reference dataframe contains columns not found in test dataframe:\n%s",
        event_name, toString(column_diff$reference_only)
      )
    )
  }
  if (length(column_diff$test_only) > 0) {
    logging::loginfo(
      sprintf(
        "%s - Test dataframe contains new columns:\n%s",
        event_name, toString(column_diff$test_only)
      )
    )
  }

  # 2. check for different values in existing columns
  diffs <- all.equal(reference[column_diff$common], test[column_diff$common])
  if (!isTRUE(diffs)) {
    logging::loginfo(
      sprintf("%s - Differences found in column values:\n%s", event_name, toString(diffs))
    )
  } else {
    logging::loginfo(
      sprintf("%s - No differences found between dataframes", event_name)
    )
  }
  return(diffs)
}
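# Note: all.equal() returns TRUE when the inputs match, and otherwise a character
# vector describing each difference (e.g. the mean relative difference per column),
# which is what compare_dfs logs and returns.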

if (length(data_dirs) > 0) {
  for (dir in data_dirs) {
    all_files_in_dir <- list.files(dir)
    required_files_in_dir <- required_files %in% all_files_in_dir
    if (all(required_files_in_dir)) {
      # check databases
      ref_db_path <- sprintf("%s/%s", dir, ref_db_name)
      test_db_path <- sprintf("%s/%s", dir, test_db_name)
      if (file.exists(ref_db_path) && file.exists(test_db_path)) {
        ref_db_con <- RSQLite::dbConnect(RSQLite::SQLite(), ref_db_path)
        test_db_con <- RSQLite::dbConnect(RSQLite::SQLite(), test_db_path)
        compare_dbs(ref_db_con, test_db_con, compare_values = TRUE, event_name = dir)

        # check csvs
        ref_circuit_summary <- read.csv(sprintf("%s/%s", dir, ref_circuit_summary_fname))
        ref_underlying_data <- read.csv(sprintf("%s/%s", dir, ref_underlying_data_fname))
        test_circuit_summary <- read.csv(sprintf("%s/%s", dir, test_circuit_summary_fname))
        test_underlying_data <- read.csv(sprintf("%s/%s", dir, test_underlying_data_fname))

        # check circuit summary
        logging::loginfo(sprintf("%s - Comparing circuit summaries...", dir))
        compare_dfs(ref_circuit_summary, test_circuit_summary, event_name = dir)
        # check underlying data
        logging::loginfo(sprintf("%s - Comparing underlying data...", dir))
        compare_dfs(ref_underlying_data, test_underlying_data, event_name = dir)

        RSQLite::dbDisconnect(ref_db_con)
        RSQLite::dbDisconnect(test_db_con)
      } else {
        if (!file.exists(ref_db_path)) {
          logging::loginfo(sprintf("Reference database not found, expected at %s", ref_db_path))
        }
        if (!file.exists(test_db_path)) {
          logging::loginfo(sprintf("Test database not found, expected at %s", test_db_path))
        }
      }
    } else {
      logging::logerror(sprintf("Required files missing from directory %s: %s", dir,
                                toString(required_files[!required_files_in_dir])))
    }
  }
} else {
  logging::logerror("No data found in validation/data")
}