mason-lee-laa-cook.R

# Generated by `rjournal_pdf_article()` using `knitr::purl()`: do not edit by hand
# Please edit mason-lee-laa-cook.Rmd to modify this file

## ----setup, include=FALSE-------------------------------------------------------------------
knitr::opts_chunk$set(
  echo = FALSE, 
  warning = FALSE, 
  message = FALSE)


## ----load-libraries-------------------------------------------------------------------------
library(cassowaryr)
library(GGally)
library(plotly)
library(dplyr)
library(tidyr)
library(stringr)
library(patchwork)
library(knitr)
library(kableExtra)
library(ggimg) #for the visual table
library(ggstance) #for vertical dodge on plot
options(digits=2) # for numerical printing


## ----building-blocks2, width = 15, height = 5, out.width = "100%", fig.alt = "A hand drawn illustration of a scatter plot with its respective convex hull, alpha hull, and minimum spanning tree.", fig.cap = "The building blocks for graph-based scagnostics: (a) convex hull, (b) alpha hull and (c) minimal spanning tree. The convex hull is a convex shell around all the data points. The alpha hull contains all the points but allows concavities better capturing some shapes, but it needs tuning. The minimal spanning tree connects all points once, and has a single chain connecting central points.", layout = "l-body"----
library(alphahull)
data("features")
nl <- features |> filter(feature == "ring")
d1 <- draw_convexhull(nl$x, nl$y, clr="#FFD700", fill=TRUE) +
  ggtitle("a. Convex hull") +
  xlab("") + ylab("") +
  theme_void() +
  theme(aspect.ratio=1, axis.text = element_blank())
d2 <- draw_alphahull(nl$x, nl$y, clr="#00a800") +
  ggtitle("b. Alpha hull") +
  xlab("") + ylab("") +
  theme_void() +
  theme(aspect.ratio=1, axis.text = element_blank())
d3 <- draw_mst(nl$x, nl$y) +
  ggtitle("c. Minimal spanning tree") +
  xlab("") + ylab("") +
  theme_void() +
  theme(aspect.ratio=1, axis.text = element_blank())
d1 + d2 + d3


## ----scagdrawn, out.width = "100%", fig.alt = "A hand drawn illustration showing the algorithm of the skinny, outlying and clumpy calculations. In each drawing the edges used in the calculation are highlighted in both the mathematical formula and in the illustration of the scatter plot.", fig.cap = " An visualisation of the calculation used to compute the skinny (top), outlying (middle), and clumpy (bottom) scagnostics. These three measure definitions are quite distinct, and each illustrates a unique method of capturing a visual feature in a scatter plot."----
knitr::include_graphics("figures/drawnmeasures.png")


## ----features-plot, width = 150, height = 150, out.width = "100%", fig.alt = "Fifteen scatter plots of the features data where each scatter plot has a distinct visual shape.", fig.cap = "The scatter plots of the features dataset. These scatter plots were designed to each represent a distinct visual feature, for example the ring scatter plot is a hollow version of disk. The scagnostics need to be able to differentiate these plots."----
mypal <- c("#b2182b", "#d53e4f","#FF4E50", "#FC913A", "#fdae61",
           "#F9D423", "#fee08b" , "#abdda4" , "#a6d96a" , "#66c2a5" ,
           "#66bd63","#3B8183", "#3288bd", "#74add1",  "#abd9e9")

#plot them
ggplot(features, aes(x,y,colour=feature))+
  geom_point() +
  theme_minimal() + 
  facet_wrap(~feature, ncol=5, scales="free") +
  xlab("") + ylab("") +
  theme(legend.position = "none", 
        aspect.ratio= 1, 
        axis.text = element_blank()) +
  scale_colour_manual(values = mypal)


## -------------------------------------------------------------------------------------------
scagissues_tb <- tibble(Scagnostic = c("Striated", 
                             "Sparse", 
                             "Skewed",
                             "Outlying",
                             "Stringy",
                             "Clumpy"), 
                      Issues = c("The striated measure can identify when one variable is discrete, and one is continuous, but is unable to identify two discrete variables. Additionally, since the edges used to calculate the striated measure are a subset of those used by the stringy measure, the two scagnostics are highly correlated, and often identify the same plots.",
                               "While sparse does seem to identify spread out distributions, it rarely returns a value higher than 0.1. The removal of binning means that an infinitely large number of points can cluster in an infinsimally small space in the plot. This cluster can arbitrarily keep the sparse value low, even if the graph outside this cluster is very sparse. With a large number of observations on two continuous variables, this problem becomes unavoidable. Therefore, as the number of observations increase the sparse value approaches 0, independent of the shape of the scatterplot.", 
                               "This measure can identify skewed edge lengths, such as the L shape in the visual table, that typically require a log transformation. The skewed value rarely drop below 0.5 or rise above 0.8. Skewed seems to suffers from a similar issue to sparse reguarding binning.",
                               "By definition an outlier must have all its adjacent edges in the MST above the outlying threshold. This means two or more observations that are close together but away from the main mass of data will not be identified as outliers, which does not align with human intuition. Even if we change the measure such that only one edge needs to be above the outlying threshold, it would only remove a single point. The measure also gives high outlying values to skewed scatter plots.  If the number of points close to the centre of the cluster is large enough, outlying identifies the spread out points to be outliers and returns a large value, once again going against human intuition.",
                               "This measure rarely drops below 0.5 even on data generated from an independent bivariate normal distribution (which should intuitively return a 0). Unlike the other scagnostics on this list, stringy does not depend upon the edge lengths of the MST, so it is hard to say if this issue stems from binning. That being said, it was not reported in the binned version of the scagnostics, and so is likely a result of binning. ",
                               "With the removal of binning, clumpy does not identify a long edge connected to a short edge, but rather identifies any edge connected to an arbitrarily small edge. This means the clumpy measure rarely drops below 0.9, and also fails to order scatterplots similarly to human intuition."))


## ----scagissues-tb-html, eval=is_html_output()----------------------------------------------
# scagissues_tb |>
#   kbl(caption = "Summary of MST scagnostic issues.")


## ----scagissues-tb-pdf, eval=is_latex_output()----------------------------------------------
scagissues_tb |> 
  kbl(caption = "Summary of MST scagnostic issues.", format="latex", booktabs = T)  |>
  column_spec(2, width = "12cm")


## ----scatterplots-as-images,  include=FALSE, eval=FALSE-------------------------------------
# #set theme so all scatter plots in table match
# plot_theme <-  theme_classic() + #theme_minimal() +
#   theme(aspect.ratio=1, axis.title=element_blank(), axis.text = element_blank(),
#         panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
#         panel.border = element_rect(colour = "black", fill=NA, size=4),
#         legend.position = "none"
#         )
# 
# #save scatter plots as images
# plots <- sort(unique(features$feature))
# 
# for (i in seq(length(plots))){
#   holdplot <- features |>
#     filter(feature==plots[i]) |>
#     ggplot(aes(x,y, size=2))+ geom_point(colour=mypal[i]) + plot_theme
#   ggsave(paste0("paper-RJ/figures/", plots[i], ".png"),holdplot) #files already in /figure/
# }
# 


## ----visual-table, width = 150, height = 200, out.width = "100%", fig.alt = "A visual table of the scagnostic values of each of the feature scatter plots where each scagnostic is identifying different features as important.", fig.cap = "A visual table that displays a selection of scagnostics computed on the features data. The rows correspond to different scagnostics and the horizontal axis is the calculated value on a range of 0-1. Thumbnail plots of variable pairs are placed at their scagnostic value, and indicate the type of structure that would produce high or low or medium values. Some scagnostics, e.g. clumpy, need adjustment as they do not correctly order the scagnostics, or range from 0 to 1. Other measures, such as splines work without any changes to their definition.", layout = "l-body-outset"----

# Calculate Scagnostics
features_scagnostics_long <- features |>
  group_by(feature) |>
  summarise(calc_scags(x,y)) |>
  pivot_longer(cols=outlying:dcor, names_to = "scagnostic")

# edit data frame
plot_path_data <- features_scagnostics_long |>
  mutate(plotad = paste0("figures/", feature, ".png"))

# which plots to include in visual table
whichplots <- function(scag, feature){
  pad = FALSE
  # Alphahull measures
  if(all(scag=="convex", feature %in% c("discrete", "ring", "l-shape"))){
    pad = TRUE
  }
  if(all(scag=="skinny", feature %in% c("line", "positive", "disk"))){
    pad = TRUE
  }
  
  # MST measures
  if(all(scag=="outlying", feature %in% c("outliers2","l-shape", "outliers"))){
    pad = TRUE
  }
  if(all(scag=="stringy", feature %in% c("nonlinear1", "gaps"))){
    pad = TRUE
  }
  if(all(scag=="striated", feature %in% c("vlines", "discrete", "weak"))){
    pad = TRUE
  }
  if(all(scag=="clumpy", feature %in% c("vlines", "clusters", "nonlinear"))){
    pad = TRUE
  }
  if(all(scag=="sparse", feature %in% c("weak", "line"))){
    pad = TRUE
  }
  if(all(scag=="skewed", feature %in% c("l-shape", "barrier"))){
    pad = TRUE
  }
  
  # Association Measures
  if(all(scag=="monotonic", feature %in% c("line", "positive", "weak"))){
    pad = TRUE
  }
  if(all(scag=="splines", feature %in% c("nonlinear2", "clusters", "vlines"))){
    pad = TRUE
  }
  if(all(scag=="dcor", feature %in% c("positive", "barrier", "gaps"))){
    pad = TRUE
  }
  pad
}

# Make Visual Table
# Data
plot_data <- plot_path_data |>
  group_by(scagnostic, feature) |>
  mutate(doplot = whichplots(scagnostic, feature)) |>
  ungroup() |>
  filter(doplot==TRUE)

# so i dont have to keep adjusting the image size
s <- length(unique(plot_data$feature))

# plot
visual_table <- ggplot(plot_data, aes(x=value , y=scagnostic))+
  geom_point_img(aes(img = plotad), size = 1.5) + 
  theme_classic() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        legend.position="none") +
  xlim(-0.1,1.1) +
  scale_size_identity()+
  xlab("Scagnostic value") +
  ylab("") +
  #ggtitle("Visual Table of Scagnostic Values")+
  theme(
    axis.line = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    panel.grid.major.y = element_line()
)
#ggsave("paper-RJ/figures/visual_table.png", visual_table, width=10, height=10)
visual_table


## ----striated-vtable, width = 150, height = 50, out.width = "100%", fig.alt = "A visual table of the feature scatter plots on the striated and striated 2 measures where striated 2, unlike striated, gives a value of 1 to a plots that is continuous in one axis and discrete on the other.", fig.cap = "Using a visual table to compare the striated and it's adjusted counterpart, striated2 allows us to visualise the difference in the measures. While the functions may seem similar at a glance, striated2 has a stricter version of discreteness, hence why line and vlines have the same result and plots with no discreteness score a 0."----

# Make Visual Table
# Data
plot_data_striated <- plot_path_data |>
  group_by(scagnostic, feature) |>
  mutate(doplot = ifelse(all(scagnostic %in% c("striated","striated2"), 
                             feature %in% c("vlines", "discrete","line", "disk", "outliers2")),
                         TRUE,
                         FALSE)) |> 
  ungroup() |>
  filter(doplot==TRUE)


# plot
striated_visual_table <- ggplot(plot_data_striated, aes(x=value , y=scagnostic))+
  geom_point_img(aes(img = plotad), size = 2,
                 position=ggstance::position_dodgev(height=0.9)) +
  xlim(-0.05,1.05) +
  scale_size_identity()+
  xlab("Scagnostic value") +
  ylab("") +
  #ggtitle("Striated Comparison") +
  theme_classic() +
  theme(
    panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
    legend.position="none",
    axis.line = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    panel.grid.major.y = element_line()
)
#ggsave("paper-RJ/figures/striated_visual_table.png", striated_visual_table, width=10, height=10)

striated_visual_table


## ----clumpy-vtable, width = 150, height = 50, out.width = "100%", fig.alt = "A visual table of the feature scatter plots on the cluump and clumpy 2 measures where clumpy 2, unlike clumpy, orders the features plots according to how 'clustered' they are and actually ranges from 0 to 1.", fig.cap = "A visual table comparing the scagnostic values of clumpy and clumpy2. We can see the clusters plot is next to last in the ordering of the original clumpy measure, but first in clumpy2. It is clear that clumpy2 achieves a more ballanced distribution and more intuitive plot ordering."----

plot_data_clumpy <- plot_path_data |>
  group_by(scagnostic, feature) |>
  mutate(doplot = ifelse(all(scagnostic %in% c("clumpy","clumpy2"), 
                             feature %in% c("vlines", "clusters","gaps", "outliers", "barrier")),
                         TRUE,
                         FALSE)) |> 
  ungroup() |>
  filter(doplot==TRUE)


# plot
clumpy_visual_table <- ggplot(plot_data_clumpy, aes(x=value , y=scagnostic))+
  geom_point_img(aes(img = plotad), size = 2, 
                 position=ggstance::position_dodgev(height=0.9)) +
  xlim(-0.05,1.05) +
  scale_size_identity()+
  xlab("Scagnostic Value") +
  ylab("") +
  #ggtitle("Clumpy Comparison") +
  theme_classic() +
  theme(
    panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
    legend.position="none",
    axis.line = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    panel.grid.major.y = element_line()
)
#ggsave("paper-RJ/figures/clumpy_visual_table.png", clumpy_visual_table, width=10, height=10)

clumpy_visual_table


## -------------------------------------------------------------------------------------------
datasets_tb <- tibble(dt = c("features", 
                             "anscombe_tidy", 
                             "datasaurus_dozen",
                             "datasaurus_dozen_wide",
                             "numbat",
                             "pk"), 
                      text = c("Simulated data with special features. ",
                               "Data from Anscombes famous example in tidy format.", 
                               "Datasaurus Dozen data in a long tidy format.",
                               "Datasaurus Dozen Data in a wide tidy format.",
                               "A toy data set with a numbat shape hidden among noise variables.",
                               "Parkinsons data from UCI machine learning archive."))


## ----datasets-tb-html, eval=is_html_output()------------------------------------------------
# datasets_tb |>
#   kable(caption = "Cassowaryr data sets", col.names = c("data", "explanation"))


## ----datasets-tb-pdf, eval=is_latex_output()------------------------------------------------
datasets_tb |> 
  kable(caption = "Cassowaryr data sets", format="latex", col.names = c("data", "explanation"), booktabs = T)  |>
  column_spec(1, width = "4cm") |>
  column_spec(2, width = "8cm")


## -------------------------------------------------------------------------------------------
scagfuncs_tb <- tibble(dt = c("scree", 
                             "sc_clumpy", 
                             "sc_clumpy2",
                             "sc_clumpy_r",
                             "sc_convex",
                             "sc_dcor",
                             "sc_monotonic",
                             "sc_outlying",
                             "sc_skewed",
                             "sc_skinny",
                             "sc_sparse",
                             "sc_sparse2",
                             "sc_splines",
                             "sc_striated",
                             "sc_striated2",
                             "sc_stringy"), 
                      text = c("Generates a scree object that contains the Delaunay triangulation of the scater plot.",
                               "Compute the original clumpy scagnostic measure.", 
                               "Compute adjusted clumpy scagnositc measure.",
                               "Compute robust clumpy scagnostic measure.",
                               "Compute the original convex scagnostic measure",
                               "Compute the distance correlation index.",
                               "Compute the Spearman correlation.",
                               "Compute the original outlying scagnostic measure.",
                               "Compute the original skewed scagnostic measure.",
                               "Compute the original skinny scagnostic measure.",
                               "Compute the original sparse scagnostic measure.",
                               "Compute adjusted sparse measure.",
                               "Compute the spline based index.",
                               "Compute the original stirated scagnostic measure.",
                               "Compute angle adjusted striated measure.",
                               "Compute stringy scagnostic measure."))


## ----scagfuncs-tb-html, eval=is_html_output()-----------------------------------------------
# scagfuncs_tb |> kable(caption = "Cassowaryr Scagnostic functions")


## ----scagfuncs-tb-pdf, eval=is_latex_output()-----------------------------------------------
scagfuncs_tb |> kable(caption = "Cassowaryr Scagnostic functions", col.names = c("Function", "Explanation"), format="latex", booktabs = T)  |>
  column_spec(1, width = "3cm") |>
  column_spec(2, width = "10cm")


## -------------------------------------------------------------------------------------------
drawfuncs_tb <- tibble(funcname = c("draw_alphahull", 
                             "draw_convexhull",
                             "draw_mst"), 
                      description = c("Drawing the alpha hull.",
                               "Drawing the convex hull.",
                               "Drawing the MST."))


## ----drawfuncs-tb-html, eval=is_html_output()-----------------------------------------------
# drawfuncs_tb |> kable(caption = "Cassowaryr drawing functions")


## ----drawfuncs-tb-pdf, eval=is_latex_output()-----------------------------------------------
drawfuncs_tb |> kable(caption = "Cassowaryr drawing functions", col.names = c("Function", "Explanation"), format="latex", booktabs = T)  |>
  column_spec(1, width = "3cm")


## ----calc-scags-args------------------------------------------------------------------------
calcfunc_tb <- tibble(argument = c("y", "x", "scags"), 
                      description = c("numeric vector of x values.",
                               "numeric vector of y values.",
                               "collection of strings matching names of scagnostics to calculate: outlying, stringy, striated, striated2, striped, clumpy, clumpy2, sparse, skewed, convex, skinny, monotonic, splines, dcor. The default is to calculate all scagnostics."))


## ----calcfunc-tb-html, eval=is_html_output()------------------------------------------------
# calcfunc_tb |> kable(caption="The main arguments for `calc_scags()`.")


## ----calcfunc-tb-pdf, eval=is_latex_output()------------------------------------------------
calcfunc_tb |> kable(format="latex", caption="The main arguments for calc\\_scags().", col.names = c("Argument", "Explanation"), escape=FALSE, booktabs = T)  |>
  column_spec(2, width = "12cm")


## ----calcscags-example, echo=TRUE-----------------------------------------------------------
features_scags <- features |>
    group_by(feature) |>
    summarise(calc_scags(x,y, c("outlying", "clumpy2", "monotonic")))


## ----featuresscags-html, eval=is_html_output()----------------------------------------------
# features_scags |> kable(caption = "Summary of three scagnostics computed by `calc_scags()` on the long form of the features data.")


## ----featuresscags-pdf, eval=is_latex_output()----------------------------------------------
features_scags |> kable(format="latex", caption = "Summary of three scagnostics computed by calc\\_scags() on the long form of the features data.", escape=FALSE, booktabs = T)  |>
  column_spec(1, width = "3cm")


## ----echo=TRUE, include=TRUE----------------------------------------------------------------
top_scags(features_scags)


## ----outlying-test-plot, width = 100, height = 50, out.width = "100%", fig.alt = "A scatter plot of simulated organised data where there are multiple ways of drawing the minimum spanning tree.", fig.cap = "Plot of simulated data used for testing the outlying scagnostic. The left plot shows the raw data, while the right plot presents the MST generated on that data. When we created the test we expected the red dashed line to be in the MST, but instead the green line that connects points 3 and 4 is. If the red edge is in the MST rather than the black edge, the outlying value on this plot is much higher."----
x1<- c(0,1,1,1,0,0,0,1,1,1,0,0,0,1,1,1, 10, 10, 10)
y1 <- c(0,0,1,2,2,3,4,4,5,6,6,7,8,8,9,10, 10, 6, 5)

t1 <- ggplot(tibble(x1, y1), aes(x1, y1)) + geom_point() +
  geom_text(data=tibble(x1= c(1,10,1,10), y1 = c(10, 10, 6,6), lab=c(1,2,3,4)), 
            aes(x=x1, y=y1, label=lab), 
            colour="#3288bd", nudge_x=0.3, nudge_y=0.3) +
  theme_classic() + ggtitle("Simlulated Data for Outlying Test")
t2 <- draw_mst(x1, y1, out.rm = FALSE) + theme_classic() + 
  ggtitle("MST of Simulated Data") +
  geom_segment(aes(x=0.1, y=0.6, xend=1, yend=0.6), colour="#00a800")+
  geom_segment(aes(x=0.1, y=1, xend=1, yend=1), colour="#fb8072", linetype=2)+
  #geom_segment(aes(x=1, y=1, xend=1, yend=0.7), colour="#FFD700")+
  geom_text(data=tibble(x1= c(0.1,1,0.1,1), y1 = c(1, 1, 0.6,0.6), lab=c(1,2,3,4)), 
            aes(x=x1, y=y1, label=lab), 
            colour="#3288bd", nudge_x=0.03, nudge_y=0.03) +
  theme(legend.position="none", aspect.ratio=1)

t1 + t2 + plot_layout(nrow=1)


## ----AFL DATA, include=FALSE, eval=FALSE----------------------------------------------------
# library(fitzRoy)
# aflw <- fetch_player_stats(2020, comp = "AFLW")
# 
# aflw_num <- aflw |>
#   select_if(is.numeric)
# 
# aflw_num <- aggregate(aflw_num[,5:37],
#   list(aflw$player.player.player.surname),
#   mean)
# aflw_num <- aflw_num |>
#   ungroup()
# 
# aflw_scags <- calc_scags_wide(aflw_num[,2:34])
# 
# #AFLW DATA SAVE
# save(aflw, file="data/aflw.rda")
# save(aflw_num, file="data/aflw_num.rda")
# save(aflw_scags, file="data/aflw_scags.rda")


## ----AFLW Scatter Plots , echo=FALSE--------------------------------------------------------
load("data/aflw_num.rda")
load("data/aflw_scags.rda")

mypal <- c("#66c2a5", "#fc8d62","#8da0cb", "#e78ac3", "#a6d854", "#ffd92f")

#standounts from splom
#outlying and Skewed + highest skinny not 1
p1 <- ggplot(aflw_num, aes(x=disposalEfficiency, y=hitouts, label=Group.1)) + 
  theme_classic()+
  #ggtitle("Plot 1") + 
  geom_point(colour=mypal[1]) 

#high on the 3 associations measures
p2 <- ggplot(aflw_num, aes(x=totalPossessions, y=disposals, label=Group.1)) + 
  theme_classic()+
  #ggtitle("Plot 2") + 
  geom_point(colour=mypal[2]) 

#low on sparse and high on convex
p3 <- ggplot(aflw_num, aes(x=marksInside50, y=goals)) + 
  theme_classic()+
  #ggtitle("Plot 3") + 
  geom_point(colour=mypal[3]) 

#high on clumpy adjusted, low on monotonic
p4 <- ggplot(aflw_num, aes(x=onePercenters, y=handballs)) +
  theme_classic()+
  #ggtitle("Plot 4") + 
  geom_point(colour=mypal[4]) 

#interesting HIGH on striated, moderate on outlying
p5 <- ggplot(aflw_num, aes(x=bounces, y=hitouts, label=Group.1)) + 
  theme_classic()+
  #ggtitle("Plot 5") + 
  geom_point(colour=mypal[5]) 

# me randomly picking two variables
p6 <- ggplot(aflw_num, aes(x=kicks, y=handballs)) + 
  theme_classic()+
  #ggtitle("Plot 6") + 
  geom_point(colour=mypal[6])


## ----AFLW-scatters-interactive, out.width="100%", include=knitr::is_html_output(), eval=knitr::is_html_output(), fig.alt = "Six scatter plots of AFL sports statistics where all six scatter plots are clearly visually distinct from one another.",  fig.cap="Six AFLW sport statistics scatter plots that were identified as identified as interesting by the scagnostics. Plots 1 to 5 had unique values on some combination of the scagnostics, Plot 6 had middling values on all measures. There is a clear difference in structure between these plots indicating the scagnostics ability to identify interesting visual features.", fig.width = 12, fig.height = 8----
# 
# pp1 <- ggplotly(p1, width=650, height=450)
# pp2 <- ggplotly(p2, width=650, height=450)
# pp3 <- ggplotly(p3, width=650, height=450)
# pp4 <- ggplotly(p4, width=650, height=450)
# pp5 <- ggplotly(p5, width=650, height=450)
# pp6 <- ggplotly(p6, width=650, height=450)
# subplot(pp1, pp2, pp3, pp4, pp5, pp6,
#         nrows=2, widths = c(0.33, 0.33, 0.33), heights = c(0.5,0.5))


## ----AFLW-scatters-static, out.width="100%", include=knitr::is_latex_output(), eval=knitr::is_latex_output(), fig.align='center', fig.alt = "Six scatter plots of AFL sports statistics where all six scatter plots are clearly visually distinct from one another.", fig.cap="Six AFLW sport statistics scatter plots that were identified as identified as interesting by the scagnostics. Plots 1 to 5 had unique values on some combination of the scagnostics, Plot 6 had middling values on all measures. There is a clear difference in structure between these plots indicating the scagnostics ability to identify interesting visual features.", fig.width = 12, fig.height = 8----

#titles
p1a <- p1 + ggtitle("Plot 1")
p2a <- p2 + ggtitle("Plot 2")
p3a <- p3 + ggtitle("Plot 3")
p4a <- p4 + ggtitle("Plot 4")
p5a <- p5 + ggtitle("Plot 5")
p6a <- p6 + ggtitle("Plot 6")

p1a + p2a + p3a + p4a + p5a + p6a + plot_layout(nrow=2)


## ----AFL relevent SPLOMS, include= FALSE, echo=FALSE----------------------------------------
test <- aflw_scags |>
  mutate(lab = paste0(Var1, ", ", Var2)) |>
  mutate(plot1=ifelse(lab=="disposalEfficiency, hitouts", TRUE,FALSE),
         plot2=ifelse(lab=="totalPossessions, disposals", TRUE,FALSE),
         plot3=ifelse(lab=="marksInside50, goals", TRUE,FALSE),
         plot4=ifelse(lab=="onePercenters, handballs", TRUE,FALSE),
         plot5=ifelse(lab=="hitouts, bounces", TRUE, FALSE)
         ) |>
  mutate(plotted = any(plot1,plot2,plot3,plot4, plot5))

s1 <- ggplot(test, aes(x=outlying, skewed, colour=plot1, label=lab)) + 
  geom_point() +
  theme_classic() +
  theme(legend.position ="none")+
  scale_colour_manual(values=c("grey", mypal[1]))
  #ggtitle("Outlying vs Skewed")

s2 <- ggplot(test, aes(x=splines, dcor, colour=plot2, label=lab)) + 
  geom_point() +
  theme_classic() +
  theme(legend.position ="none")+
  scale_colour_manual(values=c("grey", mypal[2]))
  #ggtitle("Splines vs Dcor")

s5 <- ggplot(test, aes(x=outlying, y = striated2, colour=plot5, label=lab)) +
  geom_point() +
  theme_classic() +
  theme(legend.position ="none") +
  scale_colour_manual(values=c("grey", mypal[5])) 
  #ggtitle("Outlying vs Striated2")


## ----threeplot-interactive, out.width="100%", include=knitr::is_html_output(), fig.alt = "Three scatter plots of scagnostic data calculated on the AFLW data with the scatter plot each identified as interesting displayed below it.", fig.cap="Three plots that were identified as interesting below the scagnostic scatter plot used to identify it. Each scatter plot of AFLW data is displayed below a plot of the two scagnostic measures it stood out on. One of the most useful ways to identify plots is through scatter plots of the scagnostics.", fig.width = 12, fig.height = 8, eval=knitr::is_html_output(), layout = "l-body"----
# subplot(ggplotly(s1, width=650, height=450),
#         ggplotly(s2, width=650, height=450),
#         ggplotly(s5, width=650, height=450),
#         ggplotly(p1, width=650, height=450),
#         ggplotly(p2, width=650, height=450),
#         ggplotly(p5, width=650, height=450),
#         nrows=2, widths = c(0.3, 0.3, 0.3), heights = c(0.5, 0.5)) |>
#   config(displayModeBar = FALSE)


## ----threeplot-static, out.width="100%", include=knitr::is_latex_output(), fig.alt = "Three scatter plots of scagnostic data calculated on the AFLW data with the scatter plot each identified as interesting displayed below it.", fig.cap="Three plots that were identified as interesting with the scagnostic scatter plot used to identify it. Each scatter plot of AFLW data is displayed below a plot of the two scagnostic measures it stood out on. One of the most useful ways to identify plots is through scatter plots of the scagnostics.", eval=knitr::is_latex_output(), fig.width = 12, fig.height = 8, fig.align='center'----

s1a <- s1 + ggtitle("Outlying vs Skewed")
s2a <- s2 + ggtitle("Splines vs Dcor")
s5a <- s5 + ggtitle("Outlying vs Striated2")

s1a + s2a + s5a + p1a + p2a + p5a + plot_layout(nrow=2)


## ----eval=FALSE-----------------------------------------------------------------------------
# set.seed(26)
# bbh <- read_csv("data/bbh_posterior_samples.csv") |>
#   sample_n(200)
# bbh_scags <- calc_scags_wide(bbh)
# save(bbh_scags, file="data/bbh_scags.rda")
# save(bbh, file="data/bbh_samples.rda")


## ----bbh-scags-plots------------------------------------------------------------------------
load("data/bbh_scags.rda")

newpal = c("#1f78b4", "#33a02c", "#e31a1c")

bbh_scags <- mutate(bbh_scags, combination = paste0(Var1, ", ", Var2)) |>
  mutate(colours = ifelse(combination %in% c("time, ra", "dec, ra", "dec, time"), "a", "z")) |>
  mutate(colours = ifelse(combination=="alpha, time", "b", colours)) |>
  mutate(colours = ifelse(combination %in% c("m2, m1","chi_p, chi_tot"), "c", colours))

bbh1 <- ggplot(bbh_scags, aes(x=convex, y=skinny, colour= colours,
                      label = combination)) +
  geom_point() + 
  theme_classic() +
  scale_colour_manual(values=c(newpal, "grey")) +
  #ggtitle("Convex vs Skinny") + 
  theme(aspect.ratio=1, axis.text = element_blank(), legend.position = "none")

bbh2 <- ggplot(bbh_scags, aes(x=dcor, y=splines, colour= colours,
                      label = combination)) +
  geom_point() + 
  theme_classic() +
  scale_colour_manual(values=c(newpal, "grey")) +
  #ggtitle("Splines vs Dcor") + 
  theme(aspect.ratio=1, axis.text = element_blank(), legend.position = "none")

bbh3 <- ggplot(bbh_scags, aes(x=clumpy, y=clumpy2, colour= colours,
                      label = combination)) +
  geom_point() + 
  theme_classic() +
  scale_colour_manual(values=c(newpal, "grey")) +
  #ggtitle("Clumpy vs Clumpy2") + 
  theme(aspect.ratio=1, axis.text = element_blank(), legend.position = "none")

load("data/bbh_samples.rda")

bbh4 <- ggplot(bbh, aes(x=time, y=ra)) +
  geom_point(colour=newpal[1]) + 
  theme_classic() +
  theme(aspect.ratio=1, axis.text = element_blank())

bbh5 <- ggplot(bbh, aes(x=dec, y=ra)) +
  geom_point(colour=newpal[1]) + 
  theme_classic() +
  theme(aspect.ratio=1, axis.text = element_blank())

bbh6 <- ggplot(bbh, aes(x=dec, y=time)) +
  geom_point(colour=newpal[1]) + 
  theme_classic() +
  theme(aspect.ratio=1, axis.text = element_blank())

bbh7 <- ggplot(bbh, aes(x=m1, y=m2)) +
  geom_point(colour=newpal[3]) + 
  theme_classic() +
  theme(aspect.ratio=1, axis.text = element_blank())

bbh8 <- ggplot(bbh, aes(x=chi_p, y=chi_tot)) +
  geom_point(colour=newpal[3]) + 
  theme_classic() +
  theme(aspect.ratio=1, axis.text = element_blank())

bbh9 <- ggplot(bbh, aes(x=time, y=alpha)) +
  geom_point(colour=newpal[2]) + 
  theme_classic() +
  theme(aspect.ratio=1, axis.text = element_blank())


## ----bbh-scags-interactive, fig.alt = "Three scatter plots of scagnostic data calculated on black hole mergers data, where scatter plots of interest are highlighted in blue, green or red.", fig.cap="Selected pairs of scagnostics computed for the black hole mergers data. The coloured points in these plots align with the plot colours of Figure 11. Groups of parameter combinations can be seen to stand out in the left plot (high on skinny and low on convex) and in the middle plot (high on both dcor and splines). The plot on the right shows clumpy vs clumpy2, where we can see the big impact of the correction for this dataset.", layout = "l-body", out.width="100%", fig.width=9, fig.height=3, eval=knitr::is_html_output()----
# gs1 <- ggplotly(bbh1, width=600, height=500)
# gs2 <- ggplotly(bbh2, width=600, height=500)
# gs3 <- ggplotly(bbh3, width=600, height=500)
# subplot(gs1, gs2, gs3, nrows=1, widths = c(0.33, 0.33, 0.33), heights = 0.43)


## ----bbh-scags-static, fig.alt = "Three scatter plots of scagnostic data calculated on black hole mergers data, where scatter plots of interest are highlighted in blue, green or red.", fig.cap="Selected pairs of scagnostics computed for the black hole mergers data. The coloured points in these plots align with the plot colours of Figure 11. Groups of parameter combinations can be seen to stand out in the left plot (high on skinny and low on convex) and in the middle plot (high on both dcor and splines). The plot on the right shows clumpy vs clumpy2, where we can see a large impact of the correction. ", fig.width = 12, fig.height = 4, out.width="100%", eval=knitr::is_latex_output()----
bbh1 <- bbh1 + ggtitle("Convex vs Skinny")
bbh2 <- bbh2 + ggtitle("Splines vs Dcor")
bbh3 <- bbh3 + ggtitle("Clumpy vs Clumpy2")

bbh1 + bbh2 + bbh3


## ----blackholes, fig.alt = "The Six scatter plots of the black hole mergers data identified in the scagnostics as having interesting visual features where all the scatter plots have an interesting visual shape.", fig.cap="Features in the BBH data that stand out on several of the scagnostics measures (convey, skinny, splines and dcor), showing strong relations between variables including non-linear and non-functional dependencies. The colour of these point align with the plots position in the plots of Figure 10. The final example (time vs alpha) is expected to take high values in clumpy, but only stands out on the corrected clumpy2.", fig.width = 12, fig.height = 8, out.width="100%"----

#subplot(bbh1, bbh2, bbh3, bbh4, bbh5, bbh6,
#        nrows=2, widths = c(0.33, 0.33, 0.33), heights = c(0.5, 0.5))
bbh4 + bbh5 + bbh6 + bbh7 + bbh8 + bbh9 + plot_layout(nrow=2)


## ----eval=FALSE-----------------------------------------------------------------------------
# # Getting the data
# library(compenginets)
# 
# # Playing with other ideas
# cets_macro <- get_cets("macroeconomics")
# cets_micro <- get_cets("micoeconomics")
# save(cets_macro, file="data/cets_macro.rda")
# save(cets_micro, file="data/cets_micro.rda")
# 
# # Comparing two types of time series
# library(feasts)
# get_features <- function(ts_in) {
#   features(as_tsibble(ts_in),
#            value, feature_set("feasts"))
# }
# 
# load(here::here("data/cets_macro.rda"))
# feats_macro <- purrr::map_dfr(cets_macro, get_features)
# save(feats_macro, file=here::here("data/feats_macro.rda"))
# 
# load("data/cets_micro.rda")
# feats_micro <- purrr::map_dfr(cets_micro, get_features)
# save(feats_micro, file = "data/feats_micro.rda")


## -------------------------------------------------------------------------------------------
load("data/feats_macro.rda")
load("data/feats_micro.rda")


scags_macro <- calc_scags_wide(feats_macro[,c(1,3,4,7,9,13,41)], 
  scags = c("convex", "splines", "skinny",
            "outlying", "stringy", "striated2",
            "clumpy2", "sparse", "skewed"))


scags_micro <- calc_scags_wide(feats_micro[,c(1,3,4,7,9,13,41)], 
  scags = c("convex", "splines", "skinny",
            "outlying", "stringy", "striated2",
            "clumpy2","sparse", "skewed"))

scags_macro <- scags_macro |>
  pivot_longer(outlying:splines, 
               names_to = "scags",
               values_to = "macro_value") 

scags_micro <- scags_micro |>
  pivot_longer(outlying:splines, 
               names_to = "scags",
               values_to = "micro_value") 

scags_mac_mic <- full_join(scags_macro, scags_micro, by = c("Var1", "Var2", "scags"))
scags_mac_mic <- scags_mac_mic |>
  mutate(scag_dif = abs(macro_value-micro_value)) |>
  group_by(scags) |>
  arrange(desc(scag_dif))|>
  slice_head(n=1)

scags_mac_mic$Var1 <- str_replace_all(scags_mac_mic$Var1, "_", ' ') 
scags_mac_mic$Var2 <- str_replace_all(scags_mac_mic$Var2, "_", ' ') 
  

## ----scagsmacmic-html, eval=is_html_output()------------------------------------------------
# scags_mac_mic |> kable(caption = "Pairs of time series features that have large differences between macroeconomic and microeconomic series on a range of scagnostics.", escape=FALSE, booktabs = T, col.names = c("Variable 1", "Variable 2", "Scagnostic", "Macro Value", "Micro Value", "Difference"))


## ----scagsmacmic-pdf, eval=is_latex_output()------------------------------------------------
scags_mac_mic |> kable(format="latex", caption = "Pairs of time series features that have large differences between macroeconomic and microeconomic series on a range of scagnostics.", escape=FALSE, booktabs = T, col.names = c("Variable 1", "Variable 2", "Scagnostic", "Macro Value", "Micro Value", "Difference")) 


## ----timeseries, fig.alt = "Three scatter plots of time series feautes calcuated on microeconomic and macro economic series where the plots were identified due to having the biggest difference in a particular scagnostic value.",fig.cap="Interesting differences in the visual features of two groups of time series can be detected by scagnostics. The three plots represent the variable pairs that maximised the differences between outlying (left), skewed (middle) and convex (right). While these distributions may have overlapping means, their differencs in shape have been identified by the scagnostics.", fig.width = 10, fig.height = 4, out.width="100%", layout = "l-body"----
feats_mac_mic <- bind_rows(
  mutate(feats_macro, type = "macro"),
  mutate(feats_micro, type = "micro")
)

p1 <- ggplot(feats_mac_mic, 
       aes(pacf5 , y=diff1_acf1, colour = type)) + 
  geom_point(alpha = 0.5) +
  scale_colour_brewer("", palette="Dark2") +
  #scale_colour_manual(values=c(mypal[1], mypal[5]))+
  theme_classic()+
  ggtitle("Top difference on outlying") + 
  theme(aspect.ratio=1, axis.text = element_blank())
  #facet_wrap(~type, nrow=2) 

p2 <- ggplot(feats_mac_mic, 
       aes(x=curvature, y= trend_strength, colour = type)) + 
  geom_point(alpha = 0.5) +
  scale_colour_brewer("", palette="Dark2") +
  #scale_colour_manual(values=c(mypal[1], mypal[5]))+
  theme_classic()+
  ggtitle("Top difference on skewed") +
  theme(aspect.ratio=1, axis.text = element_blank()) 
  #facet_wrap(~type, nrow=2)

p3 <- ggplot(feats_mac_mic, 
       aes(x=longest_flat_spot, y=trend_strength, colour = type)) + 
  geom_point(alpha = 0.5) +
  ggtitle("Top difference on convex") +
  scale_colour_brewer("", palette="Dark2") +
  #scale_colour_manual(values=c(mypal[1], mypal[5]))+
  theme_classic() +
  theme(aspect.ratio=1, axis.text = element_blank())  
  #facet_wrap(~type, nrow=2)

p1 + p2 + p3 + plot_layout(guides = "collect") & theme(legend.position = "bottom")


## ----tsplots, fig.alt =  "Six line plots, three of macroeconomic time series data and three of microeconomic time series data, where the difference between the two series is in how jaggard they are.", fig.cap="Selection of three series from the 100 of each of two groups, macroeconomics and microeconomics. The difference between the groups appears to be primarily in the jagginess of the two series, which a little surprisingly, is captured by the time series features curvature and trend strength. The way that trend strength is calculated, on closer inspection, could lead to describing jagginess.", fig.width = 12, fig.height = 8, out.width="100%", layout = "l-body"----
load("data/cets_macro.rda")
load("data/cets_micro.rda")

macro_s_ts1 <- cets_macro[[1]] |>
  as_tibble() |> 
  mutate(x = as.numeric(x), 
         t = 1:n(), 
         name = names(cets_macro)[[1]])
macro_s_ts2 <- cets_macro[[2]] |>
  as_tibble() |> 
  mutate(x = as.numeric(x), 
         t = 1:n(), 
         name = names(cets_macro)[[2]])
macro_s_ts3 <- cets_macro[[3]] |>
  as_tibble() |> 
  mutate(x = as.numeric(x), 
         t = 1:n(), 
         name = names(cets_macro)[[3]])
macro_s_ts <- bind_rows(macro_s_ts1,
                        macro_s_ts2,
                        macro_s_ts3)
mac <- ggplot(macro_s_ts, aes(x=t, y=x)) +
  geom_line() + 
  facet_wrap(~name, ncol=1, scales = "free") +
  ggtitle("Macroeconomics") +
  theme_minimal() +
  xlab("") + ylab("") +
  theme(axis.text = element_blank())

micro_s_ts1 <- cets_micro[[1]] |>
  as_tibble() |> 
  mutate(x = as.numeric(x), 
         t = 1:n(), 
         name = names(cets_micro)[[1]])
micro_s_ts2 <- cets_micro[[2]] |>
  as_tibble() |> 
  mutate(x = as.numeric(x), 
         t = 1:n(), 
         name = names(cets_micro)[[2]])
micro_s_ts3 <- cets_micro[[3]] |>
  as_tibble() |> 
  mutate(x = as.numeric(x), 
         t = 1:n(), 
         name = names(cets_micro)[[3]])
micro_s_ts <- bind_rows(micro_s_ts1,
                        micro_s_ts2,
                        micro_s_ts3)
mic <- ggplot(micro_s_ts, aes(x=t, y=x)) + 
  geom_line() + 
  facet_wrap(~name, ncol=1, scales = "free") +
  ggtitle("Microeconomics") +
  xlab("") + ylab("") +
  theme_minimal() +
  theme(axis.text = element_blank())

mac + mic


## ----eval=FALSE-----------------------------------------------------------------------------
# # Pre-processing code
# library(readxl)
# wbi <- read_xlsx("data/World_Development_Indicators.xlsx", n_max = 2436) # Cut off extra stuff
# 
# wbi_wide <- wbi |>
#   mutate(`2018 [YR2018]` = str_replace(`2018 [YR2018]`, "..","")) |>
#   mutate(`2018 [YR2018]` = as.numeric(`2018 [YR2018]`)) |>
#   select(`Country Code`, `Series Code`, `2018 [YR2018]`) |>
#   pivot_wider(names_from = `Series Code`, values_from = `2018 [YR2018]`, id_cols = `Country Code`, values_fn = mean) |>
#   filter(!is.na(`Country Code`))
# 
# # Drop variables with missings, or too similar
# library(naniar)
# s_miss <- miss_summary(wbi_wide)
# gg_miss_var(wbi_wide)
# 
# # Remove vars with more than 15 missing values
# drop <- s_miss$miss_var_summary[[1]] |>
#   filter(pct_miss > 15)
# wbi_wide <- wbi_wide |>
#   select(-drop$variable)
# 
# # Remove countries with more than 10 missing values
# drop_cnt <- s_miss$miss_case_summary[[1]] |>
#   filter(pct_miss > 10)
# wbi_wide <- wbi_wide[-drop_cnt$case,]
# 
# # Check again - still a few sporadic missings
# # The missing value plot shows no missing pattern
# s_miss <- miss_summary(wbi_wide)
# 
# vis_miss(wbi_wide)
# 
# # Remove any variables that are the same
# #scag_cor <- calc_scags_wide(wbi_wide_sub[,-1],
# #    scags = "monotonic")
# #wbi_wide_sub <- wbi_wide_sub |>
# #  select(-NY.GDP.MKTP.CD)
# wbi_wide <- wbi_wide[,-8] # TX.VAL.TECH.MF.ZS too few values, almost all zero
# save(wbi_wide, file="data/wbi_wide.rda")
# 
# # Check individual scag calculations, for testing
# calc_scags(wbi_wide$EN.ATM.CO2E.PC, wbi_wide$NV.AGR.TOTL.ZS, scags = "striped")
# 
# wbi_scags <- calc_scags_wide(wbi_wide[,-1],
#     scags = c("outlying", "stringy",
#               "striated2", "clumpy2", "skewed",
#               "convex", "skinny",
#               "splines", "striped"), out.rm=FALSE)
# save(wbi_scags, file="data/wbi_scags.rda")


## ----wbi------------------------------------------------------------------------------------
load("data/wbi_wide.rda")
load("data/wbi_scags.rda")


wbi_scags_long <- wbi_scags |> 
  pivot_longer(!c(Var1, Var2), names_to = "scag", values_to = "value") |>
  arrange(desc(value))

w1 <- ggplot(wbi_wide,
             aes(.data[[as.character(wbi_scags_long$Var1[1])]], 
                 .data[[as.character(wbi_scags_long$Var2[1])]])) +
  geom_point()

w2 <- ggplot(wbi_wide, 
       aes(.data[[as.character(wbi_scags_long$Var1[2])]],
           .data[[as.character(wbi_scags_long$Var2[2])]])) +
  geom_point()

# Summarise highest on each index
wbi_scags_top <- wbi_scags_long |> 
  group_by(scag) |>
  slice_head(n=1)

w3a <- ggplot(wbi_wide, 
       aes(.data[[as.character(wbi_scags_top$Var1[1])]],
           .data[[as.character(wbi_scags_top$Var2[1])]])) +
  geom_point(aes(label = `Country Code`)) +
  #ggtitle(wbi_scags_top$scag[1]) +
  theme_classic() +
  theme(axis.text = element_blank())

w3b <- ggplot(filter(wbi_wide, AG.LND.AGRI.ZS<10), #remove outlier so it is the scatter plot it used for calc
              aes(.data[[as.character(wbi_scags_top$Var1[2])]],
                  .data[[as.character(wbi_scags_top$Var2[2])]])) +
  geom_point(aes(label = `Country Code`)) +
  #ggtitle(wbi_scags_top$scag[2]) +
  theme_classic() +
  theme(axis.text = element_blank())
# Most tend to be vars with too many 0's

# Summarise highest on each variable pair
wbi_scags_topvars <- wbi_scags_long |> 
  group_by(Var1, Var2) |>
  slice_head(n=1) |>
  ungroup() |>
  arrange(scag)

pair1 <- wbi_scags_topvars |> filter(scag == "clumpy2")

#w4 <- ggplot(wbi_wide, 
#         aes_string(x=as.character(pair1$Var1),
#                    y=as.character(pair1$Var2))) +
#  geom_point(aes(label = `Country Code`)) +
#  theme(aspect.ratio=1) +
#  ggtitle(pair1$scag)

pair2 <- wbi_scags_topvars |> filter(scag == "striated2")

#w5 <- wbi_wide  |>
#  ggplot(aes_string(x=as.character(pair2$Var1),
#                    y=as.character(pair2$Var2))) +
#  geom_point(aes(label = `Country Code`)) +
#  theme(aspect.ratio=1) +
#  ggtitle(pair2$scag)

wbi_scags_topvars <- wbi_scags_topvars |>
  mutate(scag = factor(scag, 
           levels = c("skinny", "striated2",
           "outlying", "clumpy2", "convex",
            "skewed", "striped", "stringy")),
         vars = paste0(Var1, " ", Var2))
w6 <- ggplot(wbi_scags_topvars, 
             aes(x=scag,
                 y=value, label = vars)) +
  xlab("") + ylab("") +
  geom_point(alpha=0.7) + coord_flip()+ 
  theme(aspect.ratio = 1) +
  theme_classic()


## ----wbiinteractive, fig.alt = "Three scatter plots, the first shows the top scagnostic for each scatter plot in the world bank data, the second is the scatter plot that returned the highest clumpy 2 value, while the third is the one that returned the highest convext value. The first plot gives a summary of the overall shape of the data, neither the second or third plot have any interesting visual features.", fig.cap = "Using scagnostics to explore the variety of relationships present in the WBI data. The side-by-side dotplot (left) shows one point for each pair of variables, with its highest scagnostic value among all scagnostics calculated. Most of the pairs of indicators exhibit outliers, skewed, stringy or convex. There is one pair that has clumpy as the highest value. The plots, middle and right, show the pair of variables with highest value on clumpy2 and convex, respectively. (Mouseover allows identification of variable pairs, and countries.)",  include=knitr::is_html_output(), eval=knitr::is_html_output(), fig.height=3.5, fig.width=9, layout = "l-body"----
# ws1 <- ggplotly(w6)
# ws2 <- ggplotly(w3a)
# ws3 <- ggplotly(w3b)
# subplot(ws1, ws2, ws3, nrows=1, widths = c(0.33, 0.33, 0.33), heights = 0.6)


## ----wbistatic, fig.alt = "Three scatter plots, the first shows the top scagnostic for each scatter plot in the world bank data, the second is the scatter plot that returned the highest clumpy 2 value, while the third is the one that returned the highest convext value. The first plot gives a summary of the overall shape of the data, neither the second or third plot have any interesting visual features.", fig.cap="Using scagnostics to explore the variety of relationships present in the WBI data. The side-by-side dotplot (left) shows one point for each pair of variables, with its highest scagnostic value among all scagnostics calculated. Most of the pairs of indicators exhibit outliers, skewed, stringy or convex. There is one pair that has clumpy as the highest value. The plots, middle and right, show the pair of variables with highest value on clumpy2 and convex, respectively.", fig.height=3.5, fig.width=9, out.width="100%", include=knitr::is_latex_output(), eval=knitr::is_latex_output()----
w6 <- w6 + ggtitle("Top scagnostic for each pair") 
w3a <- w3a + ggtitle("Top clumpy2")
w3b <- w3b + ggtitle("Top convex")
w6 + w3a + w3b


## ----include=FALSE, eval=FALSE--------------------------------------------------------------
# # Spell check
# library(spelling)
# rmd <- "mason-lee-laa-cook.rmd"
# ignore <- readLines("WORDLIST")
# check_spelling <- spell_check_files(
#   rmd,
#   ignore = ignore,
#   lang = "en_GB"
# )
# if (nrow(check_spelling) > 0) {
#   print(check_spelling)
#   stop("Check spelling in Rmd files!")
# }
# 
# # Code to knit if issues
# # pandoc::pandoc_activate(version = '3.1.6')
# # rmarkdown::render('mason-lee-laa-cook.Rmd', output_format = 'all')