-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmason-lee-laa-cook.R
1005 lines (820 loc) · 48.5 KB
/
mason-lee-laa-cook.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Generated by `rjournal_pdf_article()` using `knitr::purl()`: do not edit by hand
# Please edit mason-lee-laa-cook.Rmd to modify this file
## ----setup, include=FALSE-------------------------------------------------------------------
knitr::opts_chunk$set(
echo = FALSE,
warning = FALSE,
message = FALSE)
## ----load-libraries-------------------------------------------------------------------------
library(cassowaryr)
library(GGally)
library(plotly)
library(dplyr)
library(tidyr)
library(stringr)
library(patchwork)
library(knitr)
library(kableExtra)
library(ggimg) #for the visual table
library(ggstance) #for vertical dodge on plot
options(digits=2) # for numerical printing
## ----building-blocks2, width = 15, height = 5, out.width = "100%", fig.alt = "A hand drawn illustration of a scatter plot with its respective convex hull, alpha hull, and minimum spanning tree.", fig.cap = "The building blocks for graph-based scagnostics: (a) convex hull, (b) alpha hull and (c) minimal spanning tree. The convex hull is a convex shell around all the data points. The alpha hull contains all the points but allows concavities better capturing some shapes, but it needs tuning. The minimal spanning tree connects all points once, and has a single chain connecting central points.", layout = "l-body"----
library(alphahull)
data("features")
nl <- features |> filter(feature == "ring")
d1 <- draw_convexhull(nl$x, nl$y, clr="#FFD700", fill=TRUE) +
ggtitle("a. Convex hull") +
xlab("") + ylab("") +
theme_void() +
theme(aspect.ratio=1, axis.text = element_blank())
d2 <- draw_alphahull(nl$x, nl$y, clr="#00a800") +
ggtitle("b. Alpha hull") +
xlab("") + ylab("") +
theme_void() +
theme(aspect.ratio=1, axis.text = element_blank())
d3 <- draw_mst(nl$x, nl$y) +
ggtitle("c. Minimal spanning tree") +
xlab("") + ylab("") +
theme_void() +
theme(aspect.ratio=1, axis.text = element_blank())
d1 + d2 + d3
## ----scagdrawn, out.width = "100%", fig.alt = "A hand drawn illustration showing the algorithm of the skinny, outlying and clumpy calculations. In each drawing the edges used in the calculation are highlighted in both the mathematical formula and in the illustration of the scatter plot.", fig.cap = " An visualisation of the calculation used to compute the skinny (top), outlying (middle), and clumpy (bottom) scagnostics. These three measure definitions are quite distinct, and each illustrates a unique method of capturing a visual feature in a scatter plot."----
knitr::include_graphics("figures/drawnmeasures.png")
## ----features-plot, width = 150, height = 150, out.width = "100%", fig.alt = "Fifteen scatter plots of the features data where each scatter plot has a distinct visual shape.", fig.cap = "The scatter plots of the features dataset. These scatter plots were designed to each represent a distinct visual feature, for example the ring scatter plot is a hollow version of disk. The scagnostics need to be able to differentiate these plots."----
mypal <- c("#b2182b", "#d53e4f","#FF4E50", "#FC913A", "#fdae61",
"#F9D423", "#fee08b" , "#abdda4" , "#a6d96a" , "#66c2a5" ,
"#66bd63","#3B8183", "#3288bd", "#74add1", "#abd9e9")
#plot them
ggplot(features, aes(x,y,colour=feature))+
geom_point() +
theme_minimal() +
facet_wrap(~feature, ncol=5, scales="free") +
xlab("") + ylab("") +
theme(legend.position = "none",
aspect.ratio= 1,
axis.text = element_blank()) +
scale_colour_manual(values = mypal)
## -------------------------------------------------------------------------------------------
scagissues_tb <- tibble(Scagnostic = c("Striated",
"Sparse",
"Skewed",
"Outlying",
"Stringy",
"Clumpy"),
Issues = c("The striated measure can identify when one variable is discrete, and one is continuous, but is unable to identify two discrete variables. Additionally, since the edges used to calculate the striated measure are a subset of those used by the stringy measure, the two scagnostics are highly correlated, and often identify the same plots.",
"While sparse does seem to identify spread out distributions, it rarely returns a value higher than 0.1. The removal of binning means that an infinitely large number of points can cluster in an infinsimally small space in the plot. This cluster can arbitrarily keep the sparse value low, even if the graph outside this cluster is very sparse. With a large number of observations on two continuous variables, this problem becomes unavoidable. Therefore, as the number of observations increase the sparse value approaches 0, independent of the shape of the scatterplot.",
"This measure can identify skewed edge lengths, such as the L shape in the visual table, that typically require a log transformation. The skewed value rarely drop below 0.5 or rise above 0.8. Skewed seems to suffers from a similar issue to sparse reguarding binning.",
"By definition an outlier must have all its adjacent edges in the MST above the outlying threshold. This means two or more observations that are close together but away from the main mass of data will not be identified as outliers, which does not align with human intuition. Even if we change the measure such that only one edge needs to be above the outlying threshold, it would only remove a single point. The measure also gives high outlying values to skewed scatter plots. If the number of points close to the centre of the cluster is large enough, outlying identifies the spread out points to be outliers and returns a large value, once again going against human intuition.",
"This measure rarely drops below 0.5 even on data generated from an independent bivariate normal distribution (which should intuitively return a 0). Unlike the other scagnostics on this list, stringy does not depend upon the edge lengths of the MST, so it is hard to say if this issue stems from binning. That being said, it was not reported in the binned version of the scagnostics, and so is likely a result of binning. ",
"With the removal of binning, clumpy does not identify a long edge connected to a short edge, but rather identifies any edge connected to an arbitrarily small edge. This means the clumpy measure rarely drops below 0.9, and also fails to order scatterplots similarly to human intuition."))
## ----scagissues-tb-html, eval=is_html_output()----------------------------------------------
# scagissues_tb |>
# kbl(caption = "Summary of MST scagnostic issues.")
## ----scagissues-tb-pdf, eval=is_latex_output()----------------------------------------------
scagissues_tb |>
kbl(caption = "Summary of MST scagnostic issues.", format="latex", booktabs = T) |>
column_spec(2, width = "12cm")
## ----scatterplots-as-images, include=FALSE, eval=FALSE-------------------------------------
# #set theme so all scatter plots in table match
# plot_theme <- theme_classic() + #theme_minimal() +
# theme(aspect.ratio=1, axis.title=element_blank(), axis.text = element_blank(),
# panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
# panel.border = element_rect(colour = "black", fill=NA, size=4),
# legend.position = "none"
# )
#
# #save scatter plots as images
# plots <- sort(unique(features$feature))
#
# for (i in seq(length(plots))){
# holdplot <- features |>
# filter(feature==plots[i]) |>
# ggplot(aes(x,y, size=2))+ geom_point(colour=mypal[i]) + plot_theme
# ggsave(paste0("paper-RJ/figures/", plots[i], ".png"),holdplot) #files already in /figure/
# }
#
## ----visual-table, width = 150, height = 200, out.width = "100%", fig.alt = "A visual table of the scagnostic values of each of the feature scatter plots where each scagnostic is identifying different features as important.", fig.cap = "A visual table that displays a selection of scagnostics computed on the features data. The rows correspond to different scagnostics and the horizontal axis is the calculated value on a range of 0-1. Thumbnail plots of variable pairs are placed at their scagnostic value, and indicate the type of structure that would produce high or low or medium values. Some scagnostics, e.g. clumpy, need adjustment as they do not correctly order the scagnostics, or range from 0 to 1. Other measures, such as splines work without any changes to their definition.", layout = "l-body-outset"----
# Calculate Scagnostics
features_scagnostics_long <- features |>
group_by(feature) |>
summarise(calc_scags(x,y)) |>
pivot_longer(cols=outlying:dcor, names_to = "scagnostic")
# edit data frame
plot_path_data <- features_scagnostics_long |>
mutate(plotad = paste0("figures/", feature, ".png"))
# which plots to include in visual table
whichplots <- function(scag, feature){
pad = FALSE
# Alphahull measures
if(all(scag=="convex", feature %in% c("discrete", "ring", "l-shape"))){
pad = TRUE
}
if(all(scag=="skinny", feature %in% c("line", "positive", "disk"))){
pad = TRUE
}
# MST measures
if(all(scag=="outlying", feature %in% c("outliers2","l-shape", "outliers"))){
pad = TRUE
}
if(all(scag=="stringy", feature %in% c("nonlinear1", "gaps"))){
pad = TRUE
}
if(all(scag=="striated", feature %in% c("vlines", "discrete", "weak"))){
pad = TRUE
}
if(all(scag=="clumpy", feature %in% c("vlines", "clusters", "nonlinear"))){
pad = TRUE
}
if(all(scag=="sparse", feature %in% c("weak", "line"))){
pad = TRUE
}
if(all(scag=="skewed", feature %in% c("l-shape", "barrier"))){
pad = TRUE
}
# Association Measures
if(all(scag=="monotonic", feature %in% c("line", "positive", "weak"))){
pad = TRUE
}
if(all(scag=="splines", feature %in% c("nonlinear2", "clusters", "vlines"))){
pad = TRUE
}
if(all(scag=="dcor", feature %in% c("positive", "barrier", "gaps"))){
pad = TRUE
}
pad
}
# Make Visual Table
# Data
plot_data <- plot_path_data |>
group_by(scagnostic, feature) |>
mutate(doplot = whichplots(scagnostic, feature)) |>
ungroup() |>
filter(doplot==TRUE)
# so i dont have to keep adjusting the image size
s <- length(unique(plot_data$feature))
# plot
visual_table <- ggplot(plot_data, aes(x=value , y=scagnostic))+
geom_point_img(aes(img = plotad), size = 1.5) +
theme_classic() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
legend.position="none") +
xlim(-0.1,1.1) +
scale_size_identity()+
xlab("Scagnostic value") +
ylab("") +
#ggtitle("Visual Table of Scagnostic Values")+
theme(
axis.line = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank(),
panel.grid.major.y = element_line()
)
#ggsave("paper-RJ/figures/visual_table.png", visual_table, width=10, height=10)
visual_table
## ----striated-vtable, width = 150, height = 50, out.width = "100%", fig.alt = "A visual table of the feature scatter plots on the striated and striated 2 measures where striated 2, unlike striated, gives a value of 1 to a plots that is continuous in one axis and discrete on the other.", fig.cap = "Using a visual table to compare the striated and it's adjusted counterpart, striated2 allows us to visualise the difference in the measures. While the functions may seem similar at a glance, striated2 has a stricter version of discreteness, hence why line and vlines have the same result and plots with no discreteness score a 0."----
# Make Visual Table
# Data
plot_data_striated <- plot_path_data |>
group_by(scagnostic, feature) |>
mutate(doplot = ifelse(all(scagnostic %in% c("striated","striated2"),
feature %in% c("vlines", "discrete","line", "disk", "outliers2")),
TRUE,
FALSE)) |>
ungroup() |>
filter(doplot==TRUE)
# plot
striated_visual_table <- ggplot(plot_data_striated, aes(x=value , y=scagnostic))+
geom_point_img(aes(img = plotad), size = 2,
position=ggstance::position_dodgev(height=0.9)) +
xlim(-0.05,1.05) +
scale_size_identity()+
xlab("Scagnostic value") +
ylab("") +
#ggtitle("Striated Comparison") +
theme_classic() +
theme(
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
legend.position="none",
axis.line = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank(),
panel.grid.major.y = element_line()
)
#ggsave("paper-RJ/figures/striated_visual_table.png", striated_visual_table, width=10, height=10)
striated_visual_table
## ----clumpy-vtable, width = 150, height = 50, out.width = "100%", fig.alt = "A visual table of the feature scatter plots on the cluump and clumpy 2 measures where clumpy 2, unlike clumpy, orders the features plots according to how 'clustered' they are and actually ranges from 0 to 1.", fig.cap = "A visual table comparing the scagnostic values of clumpy and clumpy2. We can see the clusters plot is next to last in the ordering of the original clumpy measure, but first in clumpy2. It is clear that clumpy2 achieves a more ballanced distribution and more intuitive plot ordering."----
plot_data_clumpy <- plot_path_data |>
group_by(scagnostic, feature) |>
mutate(doplot = ifelse(all(scagnostic %in% c("clumpy","clumpy2"),
feature %in% c("vlines", "clusters","gaps", "outliers", "barrier")),
TRUE,
FALSE)) |>
ungroup() |>
filter(doplot==TRUE)
# plot
clumpy_visual_table <- ggplot(plot_data_clumpy, aes(x=value , y=scagnostic))+
geom_point_img(aes(img = plotad), size = 2,
position=ggstance::position_dodgev(height=0.9)) +
xlim(-0.05,1.05) +
scale_size_identity()+
xlab("Scagnostic Value") +
ylab("") +
#ggtitle("Clumpy Comparison") +
theme_classic() +
theme(
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
legend.position="none",
axis.line = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank(),
panel.grid.major.y = element_line()
)
#ggsave("paper-RJ/figures/clumpy_visual_table.png", clumpy_visual_table, width=10, height=10)
clumpy_visual_table
## -------------------------------------------------------------------------------------------
datasets_tb <- tibble(dt = c("features",
"anscombe_tidy",
"datasaurus_dozen",
"datasaurus_dozen_wide",
"numbat",
"pk"),
text = c("Simulated data with special features. ",
"Data from Anscombes famous example in tidy format.",
"Datasaurus Dozen data in a long tidy format.",
"Datasaurus Dozen Data in a wide tidy format.",
"A toy data set with a numbat shape hidden among noise variables.",
"Parkinsons data from UCI machine learning archive."))
## ----datasets-tb-html, eval=is_html_output()------------------------------------------------
# datasets_tb |>
# kable(caption = "Cassowaryr data sets", col.names = c("data", "explanation"))
## ----datasets-tb-pdf, eval=is_latex_output()------------------------------------------------
datasets_tb |>
kable(caption = "Cassowaryr data sets", format="latex", col.names = c("data", "explanation"), booktabs = T) |>
column_spec(1, width = "4cm") |>
column_spec(2, width = "8cm")
## -------------------------------------------------------------------------------------------
scagfuncs_tb <- tibble(dt = c("scree",
"sc_clumpy",
"sc_clumpy2",
"sc_clumpy_r",
"sc_convex",
"sc_dcor",
"sc_monotonic",
"sc_outlying",
"sc_skewed",
"sc_skinny",
"sc_sparse",
"sc_sparse2",
"sc_splines",
"sc_striated",
"sc_striated2",
"sc_stringy"),
text = c("Generates a scree object that contains the Delaunay triangulation of the scater plot.",
"Compute the original clumpy scagnostic measure.",
"Compute adjusted clumpy scagnositc measure.",
"Compute robust clumpy scagnostic measure.",
"Compute the original convex scagnostic measure",
"Compute the distance correlation index.",
"Compute the Spearman correlation.",
"Compute the original outlying scagnostic measure.",
"Compute the original skewed scagnostic measure.",
"Compute the original skinny scagnostic measure.",
"Compute the original sparse scagnostic measure.",
"Compute adjusted sparse measure.",
"Compute the spline based index.",
"Compute the original stirated scagnostic measure.",
"Compute angle adjusted striated measure.",
"Compute stringy scagnostic measure."))
## ----scagfuncs-tb-html, eval=is_html_output()-----------------------------------------------
# scagfuncs_tb |> kable(caption = "Cassowaryr Scagnostic functions")
## ----scagfuncs-tb-pdf, eval=is_latex_output()-----------------------------------------------
scagfuncs_tb |> kable(caption = "Cassowaryr Scagnostic functions", col.names = c("Function", "Explanation"), format="latex", booktabs = T) |>
column_spec(1, width = "3cm") |>
column_spec(2, width = "10cm")
## -------------------------------------------------------------------------------------------
drawfuncs_tb <- tibble(funcname = c("draw_alphahull",
"draw_convexhull",
"draw_mst"),
description = c("Drawing the alpha hull.",
"Drawing the convex hull.",
"Drawing the MST."))
## ----drawfuncs-tb-html, eval=is_html_output()-----------------------------------------------
# drawfuncs_tb |> kable(caption = "Cassowaryr drawing functions")
## ----drawfuncs-tb-pdf, eval=is_latex_output()-----------------------------------------------
drawfuncs_tb |> kable(caption = "Cassowaryr drawing functions", col.names = c("Function", "Explanation"), format="latex", booktabs = T) |>
column_spec(1, width = "3cm")
## ----calc-scags-args------------------------------------------------------------------------
calcfunc_tb <- tibble(argument = c("y", "x", "scags"),
description = c("numeric vector of x values.",
"numeric vector of y values.",
"collection of strings matching names of scagnostics to calculate: outlying, stringy, striated, striated2, striped, clumpy, clumpy2, sparse, skewed, convex, skinny, monotonic, splines, dcor. The default is to calculate all scagnostics."))
## ----calcfunc-tb-html, eval=is_html_output()------------------------------------------------
# calcfunc_tb |> kable(caption="The main arguments for `calc_scags()`.")
## ----calcfunc-tb-pdf, eval=is_latex_output()------------------------------------------------
calcfunc_tb |> kable(format="latex", caption="The main arguments for calc\\_scags().", col.names = c("Argument", "Explanation"), escape=FALSE, booktabs = T) |>
column_spec(2, width = "12cm")
## ----calcscags-example, echo=TRUE-----------------------------------------------------------
features_scags <- features |>
group_by(feature) |>
summarise(calc_scags(x,y, c("outlying", "clumpy2", "monotonic")))
## ----featuresscags-html, eval=is_html_output()----------------------------------------------
# features_scags |> kable(caption = "Summary of three scagnostics computed by `calc_scags()` on the long form of the features data.")
## ----featuresscags-pdf, eval=is_latex_output()----------------------------------------------
features_scags |> kable(format="latex", caption = "Summary of three scagnostics computed by calc\\_scags() on the long form of the features data.", escape=FALSE, booktabs = T) |>
column_spec(1, width = "3cm")
## ----echo=TRUE, include=TRUE----------------------------------------------------------------
top_scags(features_scags)
## ----outlying-test-plot, width = 100, height = 50, out.width = "100%", fig.alt = "A scatter plot of simulated organised data where there are multiple ways of drawing the minimum spanning tree.", fig.cap = "Plot of simulated data used for testing the outlying scagnostic. The left plot shows the raw data, while the right plot presents the MST generated on that data. When we created the test we expected the red dashed line to be in the MST, but instead the green line that connects points 3 and 4 is. If the red edge is in the MST rather than the black edge, the outlying value on this plot is much higher."----
x1<- c(0,1,1,1,0,0,0,1,1,1,0,0,0,1,1,1, 10, 10, 10)
y1 <- c(0,0,1,2,2,3,4,4,5,6,6,7,8,8,9,10, 10, 6, 5)
t1 <- ggplot(tibble(x1, y1), aes(x1, y1)) + geom_point() +
geom_text(data=tibble(x1= c(1,10,1,10), y1 = c(10, 10, 6,6), lab=c(1,2,3,4)),
aes(x=x1, y=y1, label=lab),
colour="#3288bd", nudge_x=0.3, nudge_y=0.3) +
theme_classic() + ggtitle("Simlulated Data for Outlying Test")
t2 <- draw_mst(x1, y1, out.rm = FALSE) + theme_classic() +
ggtitle("MST of Simulated Data") +
geom_segment(aes(x=0.1, y=0.6, xend=1, yend=0.6), colour="#00a800")+
geom_segment(aes(x=0.1, y=1, xend=1, yend=1), colour="#fb8072", linetype=2)+
#geom_segment(aes(x=1, y=1, xend=1, yend=0.7), colour="#FFD700")+
geom_text(data=tibble(x1= c(0.1,1,0.1,1), y1 = c(1, 1, 0.6,0.6), lab=c(1,2,3,4)),
aes(x=x1, y=y1, label=lab),
colour="#3288bd", nudge_x=0.03, nudge_y=0.03) +
theme(legend.position="none", aspect.ratio=1)
t1 + t2 + plot_layout(nrow=1)
## ----AFL DATA, include=FALSE, eval=FALSE----------------------------------------------------
# library(fitzRoy)
# aflw <- fetch_player_stats(2020, comp = "AFLW")
#
# aflw_num <- aflw |>
# select_if(is.numeric)
#
# aflw_num <- aggregate(aflw_num[,5:37],
# list(aflw$player.player.player.surname),
# mean)
# aflw_num <- aflw_num |>
# ungroup()
#
# aflw_scags <- calc_scags_wide(aflw_num[,2:34])
#
# #AFLW DATA SAVE
# save(aflw, file="data/aflw.rda")
# save(aflw_num, file="data/aflw_num.rda")
# save(aflw_scags, file="data/aflw_scags.rda")
## ----AFLW Scatter Plots , echo=FALSE--------------------------------------------------------
load("data/aflw_num.rda")
load("data/aflw_scags.rda")
mypal <- c("#66c2a5", "#fc8d62","#8da0cb", "#e78ac3", "#a6d854", "#ffd92f")
#standounts from splom
#outlying and Skewed + highest skinny not 1
p1 <- ggplot(aflw_num, aes(x=disposalEfficiency, y=hitouts, label=Group.1)) +
theme_classic()+
#ggtitle("Plot 1") +
geom_point(colour=mypal[1])
#high on the 3 associations measures
p2 <- ggplot(aflw_num, aes(x=totalPossessions, y=disposals, label=Group.1)) +
theme_classic()+
#ggtitle("Plot 2") +
geom_point(colour=mypal[2])
#low on sparse and high on convex
p3 <- ggplot(aflw_num, aes(x=marksInside50, y=goals)) +
theme_classic()+
#ggtitle("Plot 3") +
geom_point(colour=mypal[3])
#high on clumpy adjusted, low on monotonic
p4 <- ggplot(aflw_num, aes(x=onePercenters, y=handballs)) +
theme_classic()+
#ggtitle("Plot 4") +
geom_point(colour=mypal[4])
#interesting HIGH on striated, moderate on outlying
p5 <- ggplot(aflw_num, aes(x=bounces, y=hitouts, label=Group.1)) +
theme_classic()+
#ggtitle("Plot 5") +
geom_point(colour=mypal[5])
# me randomly picking two variables
p6 <- ggplot(aflw_num, aes(x=kicks, y=handballs)) +
theme_classic()+
#ggtitle("Plot 6") +
geom_point(colour=mypal[6])
## ----AFLW-scatters-interactive, out.width="100%", include=knitr::is_html_output(), eval=knitr::is_html_output(), fig.alt = "Six scatter plots of AFL sports statistics where all six scatter plots are clearly visually distinct from one another.", fig.cap="Six AFLW sport statistics scatter plots that were identified as identified as interesting by the scagnostics. Plots 1 to 5 had unique values on some combination of the scagnostics, Plot 6 had middling values on all measures. There is a clear difference in structure between these plots indicating the scagnostics ability to identify interesting visual features.", fig.width = 12, fig.height = 8----
#
# pp1 <- ggplotly(p1, width=650, height=450)
# pp2 <- ggplotly(p2, width=650, height=450)
# pp3 <- ggplotly(p3, width=650, height=450)
# pp4 <- ggplotly(p4, width=650, height=450)
# pp5 <- ggplotly(p5, width=650, height=450)
# pp6 <- ggplotly(p6, width=650, height=450)
# subplot(pp1, pp2, pp3, pp4, pp5, pp6,
# nrows=2, widths = c(0.33, 0.33, 0.33), heights = c(0.5,0.5))
## ----AFLW-scatters-static, out.width="100%", include=knitr::is_latex_output(), eval=knitr::is_latex_output(), fig.align='center', fig.alt = "Six scatter plots of AFL sports statistics where all six scatter plots are clearly visually distinct from one another.", fig.cap="Six AFLW sport statistics scatter plots that were identified as identified as interesting by the scagnostics. Plots 1 to 5 had unique values on some combination of the scagnostics, Plot 6 had middling values on all measures. There is a clear difference in structure between these plots indicating the scagnostics ability to identify interesting visual features.", fig.width = 12, fig.height = 8----
#titles
p1a <- p1 + ggtitle("Plot 1")
p2a <- p2 + ggtitle("Plot 2")
p3a <- p3 + ggtitle("Plot 3")
p4a <- p4 + ggtitle("Plot 4")
p5a <- p5 + ggtitle("Plot 5")
p6a <- p6 + ggtitle("Plot 6")
p1a + p2a + p3a + p4a + p5a + p6a + plot_layout(nrow=2)
## ----AFL relevent SPLOMS, include= FALSE, echo=FALSE----------------------------------------
test <- aflw_scags |>
mutate(lab = paste0(Var1, ", ", Var2)) |>
mutate(plot1=ifelse(lab=="disposalEfficiency, hitouts", TRUE,FALSE),
plot2=ifelse(lab=="totalPossessions, disposals", TRUE,FALSE),
plot3=ifelse(lab=="marksInside50, goals", TRUE,FALSE),
plot4=ifelse(lab=="onePercenters, handballs", TRUE,FALSE),
plot5=ifelse(lab=="hitouts, bounces", TRUE, FALSE)
) |>
mutate(plotted = any(plot1,plot2,plot3,plot4, plot5))
s1 <- ggplot(test, aes(x=outlying, skewed, colour=plot1, label=lab)) +
geom_point() +
theme_classic() +
theme(legend.position ="none")+
scale_colour_manual(values=c("grey", mypal[1]))
#ggtitle("Outlying vs Skewed")
s2 <- ggplot(test, aes(x=splines, dcor, colour=plot2, label=lab)) +
geom_point() +
theme_classic() +
theme(legend.position ="none")+
scale_colour_manual(values=c("grey", mypal[2]))
#ggtitle("Splines vs Dcor")
s5 <- ggplot(test, aes(x=outlying, y = striated2, colour=plot5, label=lab)) +
geom_point() +
theme_classic() +
theme(legend.position ="none") +
scale_colour_manual(values=c("grey", mypal[5]))
#ggtitle("Outlying vs Striated2")
## ----threeplot-interactive, out.width="100%", include=knitr::is_html_output(), fig.alt = "Three scatter plots of scagnostic data calculated on the AFLW data with the scatter plot each identified as interesting displayed below it.", fig.cap="Three plots that were identified as interesting below the scagnostic scatter plot used to identify it. Each scatter plot of AFLW data is displayed below a plot of the two scagnostic measures it stood out on. One of the most useful ways to identify plots is through scatter plots of the scagnostics.", fig.width = 12, fig.height = 8, eval=knitr::is_html_output(), layout = "l-body"----
# subplot(ggplotly(s1, width=650, height=450),
# ggplotly(s2, width=650, height=450),
# ggplotly(s5, width=650, height=450),
# ggplotly(p1, width=650, height=450),
# ggplotly(p2, width=650, height=450),
# ggplotly(p5, width=650, height=450),
# nrows=2, widths = c(0.3, 0.3, 0.3), heights = c(0.5, 0.5)) |>
# config(displayModeBar = FALSE)
## ----threeplot-static, out.width="100%", include=knitr::is_latex_output(), fig.alt = "Three scatter plots of scagnostic data calculated on the AFLW data with the scatter plot each identified as interesting displayed below it.", fig.cap="Three plots that were identified as interesting with the scagnostic scatter plot used to identify it. Each scatter plot of AFLW data is displayed below a plot of the two scagnostic measures it stood out on. One of the most useful ways to identify plots is through scatter plots of the scagnostics.", eval=knitr::is_latex_output(), fig.width = 12, fig.height = 8, fig.align='center'----
s1a <- s1 + ggtitle("Outlying vs Skewed")
s2a <- s2 + ggtitle("Splines vs Dcor")
s5a <- s5 + ggtitle("Outlying vs Striated2")
s1a + s2a + s5a + p1a + p2a + p5a + plot_layout(nrow=2)
## ----eval=FALSE-----------------------------------------------------------------------------
# set.seed(26)
# bbh <- read_csv("data/bbh_posterior_samples.csv") |>
# sample_n(200)
# bbh_scags <- calc_scags_wide(bbh)
# save(bbh_scags, file="data/bbh_scags.rda")
# save(bbh, file="data/bbh_samples.rda")
## ----bbh-scags-plots------------------------------------------------------------------------
load("data/bbh_scags.rda")
newpal = c("#1f78b4", "#33a02c", "#e31a1c")
bbh_scags <- mutate(bbh_scags, combination = paste0(Var1, ", ", Var2)) |>
mutate(colours = ifelse(combination %in% c("time, ra", "dec, ra", "dec, time"), "a", "z")) |>
mutate(colours = ifelse(combination=="alpha, time", "b", colours)) |>
mutate(colours = ifelse(combination %in% c("m2, m1","chi_p, chi_tot"), "c", colours))
bbh1 <- ggplot(bbh_scags, aes(x=convex, y=skinny, colour= colours,
label = combination)) +
geom_point() +
theme_classic() +
scale_colour_manual(values=c(newpal, "grey")) +
#ggtitle("Convex vs Skinny") +
theme(aspect.ratio=1, axis.text = element_blank(), legend.position = "none")
bbh2 <- ggplot(bbh_scags, aes(x=dcor, y=splines, colour= colours,
label = combination)) +
geom_point() +
theme_classic() +
scale_colour_manual(values=c(newpal, "grey")) +
#ggtitle("Splines vs Dcor") +
theme(aspect.ratio=1, axis.text = element_blank(), legend.position = "none")
bbh3 <- ggplot(bbh_scags, aes(x=clumpy, y=clumpy2, colour= colours,
label = combination)) +
geom_point() +
theme_classic() +
scale_colour_manual(values=c(newpal, "grey")) +
#ggtitle("Clumpy vs Clumpy2") +
theme(aspect.ratio=1, axis.text = element_blank(), legend.position = "none")
load("data/bbh_samples.rda")
bbh4 <- ggplot(bbh, aes(x=time, y=ra)) +
geom_point(colour=newpal[1]) +
theme_classic() +
theme(aspect.ratio=1, axis.text = element_blank())
bbh5 <- ggplot(bbh, aes(x=dec, y=ra)) +
geom_point(colour=newpal[1]) +
theme_classic() +
theme(aspect.ratio=1, axis.text = element_blank())
bbh6 <- ggplot(bbh, aes(x=dec, y=time)) +
geom_point(colour=newpal[1]) +
theme_classic() +
theme(aspect.ratio=1, axis.text = element_blank())
bbh7 <- ggplot(bbh, aes(x=m1, y=m2)) +
geom_point(colour=newpal[3]) +
theme_classic() +
theme(aspect.ratio=1, axis.text = element_blank())
bbh8 <- ggplot(bbh, aes(x=chi_p, y=chi_tot)) +
geom_point(colour=newpal[3]) +
theme_classic() +
theme(aspect.ratio=1, axis.text = element_blank())
bbh9 <- ggplot(bbh, aes(x=time, y=alpha)) +
geom_point(colour=newpal[2]) +
theme_classic() +
theme(aspect.ratio=1, axis.text = element_blank())
## ----bbh-scags-interactive, fig.alt = "Three scatter plots of scagnostic data calculated on black hole mergers data, where scatter plots of interest are highlighted in blue, green or red.", fig.cap="Selected pairs of scagnostics computed for the black hole mergers data. The coloured points in these plots align with the plot colours of Figure 11. Groups of parameter combinations can be seen to stand out in the left plot (high on skinny and low on convex) and in the middle plot (high on both dcor and splines). The plot on the right shows clumpy vs clumpy2, where we can see the big impact of the correction for this dataset.", layout = "l-body", out.width="100%", fig.width=9, fig.height=3, eval=knitr::is_html_output()----
# gs1 <- ggplotly(bbh1, width=600, height=500)
# gs2 <- ggplotly(bbh2, width=600, height=500)
# gs3 <- ggplotly(bbh3, width=600, height=500)
# subplot(gs1, gs2, gs3, nrows=1, widths = c(0.33, 0.33, 0.33), heights = 0.43)
## ----bbh-scags-static, fig.alt = "Three scatter plots of scagnostic data calculated on black hole mergers data, where scatter plots of interest are highlighted in blue, green or red.", fig.cap="Selected pairs of scagnostics computed for the black hole mergers data. The coloured points in these plots align with the plot colours of Figure 11. Groups of parameter combinations can be seen to stand out in the left plot (high on skinny and low on convex) and in the middle plot (high on both dcor and splines). The plot on the right shows clumpy vs clumpy2, where we can see a large impact of the correction. ", fig.width = 12, fig.height = 4, out.width="100%", eval=knitr::is_latex_output()----
bbh1 <- bbh1 + ggtitle("Convex vs Skinny")
bbh2 <- bbh2 + ggtitle("Splines vs Dcor")
bbh3 <- bbh3 + ggtitle("Clumpy vs Clumpy2")
bbh1 + bbh2 + bbh3
## ----blackholes, fig.alt = "The Six scatter plots of the black hole mergers data identified in the scagnostics as having interesting visual features where all the scatter plots have an interesting visual shape.", fig.cap="Features in the BBH data that stand out on several of the scagnostics measures (convey, skinny, splines and dcor), showing strong relations between variables including non-linear and non-functional dependencies. The colour of these point align with the plots position in the plots of Figure 10. The final example (time vs alpha) is expected to take high values in clumpy, but only stands out on the corrected clumpy2.", fig.width = 12, fig.height = 8, out.width="100%"----
#subplot(bbh1, bbh2, bbh3, bbh4, bbh5, bbh6,
# nrows=2, widths = c(0.33, 0.33, 0.33), heights = c(0.5, 0.5))
bbh4 + bbh5 + bbh6 + bbh7 + bbh8 + bbh9 + plot_layout(nrow=2)
## ----eval=FALSE-----------------------------------------------------------------------------
# # Getting the data
# library(compenginets)
#
# # Playing with other ideas
# cets_macro <- get_cets("macroeconomics")
# cets_micro <- get_cets("micoeconomics")
# save(cets_macro, file="data/cets_macro.rda")
# save(cets_micro, file="data/cets_micro.rda")
#
# # Comparing two types of time series
# library(feasts)
# get_features <- function(ts_in) {
# features(as_tsibble(ts_in),
# value, feature_set("feasts"))
# }
#
# load(here::here("data/cets_macro.rda"))
# feats_macro <- purrr::map_dfr(cets_macro, get_features)
# save(feats_macro, file=here::here("data/feats_macro.rda"))
#
# load("data/cets_micro.rda")
# feats_micro <- purrr::map_dfr(cets_micro, get_features)
# save(feats_micro, file = "data/feats_micro.rda")
## -------------------------------------------------------------------------------------------
load("data/feats_macro.rda")
load("data/feats_micro.rda")
scags_macro <- calc_scags_wide(feats_macro[,c(1,3,4,7,9,13,41)],
scags = c("convex", "splines", "skinny",
"outlying", "stringy", "striated2",
"clumpy2", "sparse", "skewed"))
scags_micro <- calc_scags_wide(feats_micro[,c(1,3,4,7,9,13,41)],
scags = c("convex", "splines", "skinny",
"outlying", "stringy", "striated2",
"clumpy2","sparse", "skewed"))
scags_macro <- scags_macro |>
pivot_longer(outlying:splines,
names_to = "scags",
values_to = "macro_value")
scags_micro <- scags_micro |>
pivot_longer(outlying:splines,
names_to = "scags",
values_to = "micro_value")
scags_mac_mic <- full_join(scags_macro, scags_micro, by = c("Var1", "Var2", "scags"))
scags_mac_mic <- scags_mac_mic |>
mutate(scag_dif = abs(macro_value-micro_value)) |>
group_by(scags) |>
arrange(desc(scag_dif))|>
slice_head(n=1)
scags_mac_mic$Var1 <- str_replace_all(scags_mac_mic$Var1, "_", ' ')
scags_mac_mic$Var2 <- str_replace_all(scags_mac_mic$Var2, "_", ' ')
## ----scagsmacmic-html, eval=is_html_output()------------------------------------------------
# scags_mac_mic |> kable(caption = "Pairs of time series features that have large differences between macroeconomic and microeconomic series on a range of scagnostics.", escape=FALSE, booktabs = T, col.names = c("Variable 1", "Variable 2", "Scagnostic", "Macro Value", "Micro Value", "Difference"))
## ----scagsmacmic-pdf, eval=is_latex_output()------------------------------------------------
scags_mac_mic |> kable(format="latex", caption = "Pairs of time series features that have large differences between macroeconomic and microeconomic series on a range of scagnostics.", escape=FALSE, booktabs = T, col.names = c("Variable 1", "Variable 2", "Scagnostic", "Macro Value", "Micro Value", "Difference"))
## ----timeseries, fig.alt = "Three scatter plots of time series feautes calcuated on microeconomic and macro economic series where the plots were identified due to having the biggest difference in a particular scagnostic value.",fig.cap="Interesting differences in the visual features of two groups of time series can be detected by scagnostics. The three plots represent the variable pairs that maximised the differences between outlying (left), skewed (middle) and convex (right). While these distributions may have overlapping means, their differencs in shape have been identified by the scagnostics.", fig.width = 10, fig.height = 4, out.width="100%", layout = "l-body"----
feats_mac_mic <- bind_rows(
mutate(feats_macro, type = "macro"),
mutate(feats_micro, type = "micro")
)
p1 <- ggplot(feats_mac_mic,
aes(pacf5 , y=diff1_acf1, colour = type)) +
geom_point(alpha = 0.5) +
scale_colour_brewer("", palette="Dark2") +
#scale_colour_manual(values=c(mypal[1], mypal[5]))+
theme_classic()+
ggtitle("Top difference on outlying") +
theme(aspect.ratio=1, axis.text = element_blank())
#facet_wrap(~type, nrow=2)
p2 <- ggplot(feats_mac_mic,
aes(x=curvature, y= trend_strength, colour = type)) +
geom_point(alpha = 0.5) +
scale_colour_brewer("", palette="Dark2") +
#scale_colour_manual(values=c(mypal[1], mypal[5]))+
theme_classic()+
ggtitle("Top difference on skewed") +
theme(aspect.ratio=1, axis.text = element_blank())
#facet_wrap(~type, nrow=2)
p3 <- ggplot(feats_mac_mic,
aes(x=longest_flat_spot, y=trend_strength, colour = type)) +
geom_point(alpha = 0.5) +
ggtitle("Top difference on convex") +
scale_colour_brewer("", palette="Dark2") +
#scale_colour_manual(values=c(mypal[1], mypal[5]))+
theme_classic() +
theme(aspect.ratio=1, axis.text = element_blank())
#facet_wrap(~type, nrow=2)
p1 + p2 + p3 + plot_layout(guides = "collect") & theme(legend.position = "bottom")
## ----tsplots, fig.alt = "Six line plots, three of macroeconomic time series data and three of microeconomic time series data, where the difference between the two series is in how jaggard they are.", fig.cap="Selection of three series from the 100 of each of two groups, macroeconomics and microeconomics. The difference between the groups appears to be primarily in the jagginess of the two series, which a little surprisingly, is captured by the time series features curvature and trend strength. The way that trend strength is calculated, on closer inspection, could lead to describing jagginess.", fig.width = 12, fig.height = 8, out.width="100%", layout = "l-body"----
load("data/cets_macro.rda")
load("data/cets_micro.rda")
macro_s_ts1 <- cets_macro[[1]] |>
as_tibble() |>
mutate(x = as.numeric(x),
t = 1:n(),
name = names(cets_macro)[[1]])
macro_s_ts2 <- cets_macro[[2]] |>
as_tibble() |>
mutate(x = as.numeric(x),
t = 1:n(),
name = names(cets_macro)[[2]])
macro_s_ts3 <- cets_macro[[3]] |>
as_tibble() |>
mutate(x = as.numeric(x),
t = 1:n(),
name = names(cets_macro)[[3]])
macro_s_ts <- bind_rows(macro_s_ts1,
macro_s_ts2,
macro_s_ts3)
mac <- ggplot(macro_s_ts, aes(x=t, y=x)) +
geom_line() +
facet_wrap(~name, ncol=1, scales = "free") +
ggtitle("Macroeconomics") +
theme_minimal() +
xlab("") + ylab("") +
theme(axis.text = element_blank())
micro_s_ts1 <- cets_micro[[1]] |>
as_tibble() |>
mutate(x = as.numeric(x),
t = 1:n(),
name = names(cets_micro)[[1]])
micro_s_ts2 <- cets_micro[[2]] |>
as_tibble() |>
mutate(x = as.numeric(x),
t = 1:n(),
name = names(cets_micro)[[2]])
micro_s_ts3 <- cets_micro[[3]] |>
as_tibble() |>
mutate(x = as.numeric(x),
t = 1:n(),
name = names(cets_micro)[[3]])
micro_s_ts <- bind_rows(micro_s_ts1,
micro_s_ts2,
micro_s_ts3)
mic <- ggplot(micro_s_ts, aes(x=t, y=x)) +
geom_line() +
facet_wrap(~name, ncol=1, scales = "free") +
ggtitle("Microeconomics") +
xlab("") + ylab("") +
theme_minimal() +
theme(axis.text = element_blank())
mac + mic
## ----eval=FALSE-----------------------------------------------------------------------------
# # Pre-processing code
# library(readxl)
# wbi <- read_xlsx("data/World_Development_Indicators.xlsx", n_max = 2436) # Cut off extra stuff
#
# wbi_wide <- wbi |>
# mutate(`2018 [YR2018]` = str_replace(`2018 [YR2018]`, "..","")) |>
# mutate(`2018 [YR2018]` = as.numeric(`2018 [YR2018]`)) |>
# select(`Country Code`, `Series Code`, `2018 [YR2018]`) |>
# pivot_wider(names_from = `Series Code`, values_from = `2018 [YR2018]`, id_cols = `Country Code`, values_fn = mean) |>
# filter(!is.na(`Country Code`))
#
# # Drop variables with missings, or too similar
# library(naniar)
# s_miss <- miss_summary(wbi_wide)
# gg_miss_var(wbi_wide)
#
# # Remove vars with more than 15 missing values
# drop <- s_miss$miss_var_summary[[1]] |>
# filter(pct_miss > 15)
# wbi_wide <- wbi_wide |>
# select(-drop$variable)
#
# # Remove countries with more than 10 missing values
# drop_cnt <- s_miss$miss_case_summary[[1]] |>
# filter(pct_miss > 10)
# wbi_wide <- wbi_wide[-drop_cnt$case,]
#
# # Check again - still a few sporadic missings
# # The missing value plot shows no missing pattern
# s_miss <- miss_summary(wbi_wide)
#
# vis_miss(wbi_wide)
#
# # Remove any variables that are the same
# #scag_cor <- calc_scags_wide(wbi_wide_sub[,-1],
# # scags = "monotonic")
# #wbi_wide_sub <- wbi_wide_sub |>
# # select(-NY.GDP.MKTP.CD)
# wbi_wide <- wbi_wide[,-8] # TX.VAL.TECH.MF.ZS too few values, almost all zero
# save(wbi_wide, file="data/wbi_wide.rda")
#
# # Check individual scag calculations, for testing
# calc_scags(wbi_wide$EN.ATM.CO2E.PC, wbi_wide$NV.AGR.TOTL.ZS, scags = "striped")
#
# wbi_scags <- calc_scags_wide(wbi_wide[,-1],
# scags = c("outlying", "stringy",
# "striated2", "clumpy2", "skewed",
# "convex", "skinny",
# "splines", "striped"), out.rm=FALSE)
# save(wbi_scags, file="data/wbi_scags.rda")
## ----wbi------------------------------------------------------------------------------------
load("data/wbi_wide.rda")
load("data/wbi_scags.rda")
wbi_scags_long <- wbi_scags |>
pivot_longer(!c(Var1, Var2), names_to = "scag", values_to = "value") |>
arrange(desc(value))
w1 <- ggplot(wbi_wide,
aes(.data[[as.character(wbi_scags_long$Var1[1])]],
.data[[as.character(wbi_scags_long$Var2[1])]])) +
geom_point()
w2 <- ggplot(wbi_wide,
aes(.data[[as.character(wbi_scags_long$Var1[2])]],
.data[[as.character(wbi_scags_long$Var2[2])]])) +
geom_point()
# Summarise highest on each index
wbi_scags_top <- wbi_scags_long |>
group_by(scag) |>
slice_head(n=1)
w3a <- ggplot(wbi_wide,
aes(.data[[as.character(wbi_scags_top$Var1[1])]],
.data[[as.character(wbi_scags_top$Var2[1])]])) +
geom_point(aes(label = `Country Code`)) +
#ggtitle(wbi_scags_top$scag[1]) +
theme_classic() +
theme(axis.text = element_blank())
w3b <- ggplot(filter(wbi_wide, AG.LND.AGRI.ZS<10), #remove outlier so it is the scatter plot it used for calc
aes(.data[[as.character(wbi_scags_top$Var1[2])]],
.data[[as.character(wbi_scags_top$Var2[2])]])) +
geom_point(aes(label = `Country Code`)) +
#ggtitle(wbi_scags_top$scag[2]) +
theme_classic() +
theme(axis.text = element_blank())
# Most tend to be vars with too many 0's
# Summarise highest on each variable pair
wbi_scags_topvars <- wbi_scags_long |>
group_by(Var1, Var2) |>
slice_head(n=1) |>
ungroup() |>
arrange(scag)
pair1 <- wbi_scags_topvars |> filter(scag == "clumpy2")
#w4 <- ggplot(wbi_wide,
# aes_string(x=as.character(pair1$Var1),
# y=as.character(pair1$Var2))) +
# geom_point(aes(label = `Country Code`)) +
# theme(aspect.ratio=1) +
# ggtitle(pair1$scag)
pair2 <- wbi_scags_topvars |> filter(scag == "striated2")
#w5 <- wbi_wide |>
# ggplot(aes_string(x=as.character(pair2$Var1),
# y=as.character(pair2$Var2))) +
# geom_point(aes(label = `Country Code`)) +
# theme(aspect.ratio=1) +
# ggtitle(pair2$scag)
wbi_scags_topvars <- wbi_scags_topvars |>
mutate(scag = factor(scag,
levels = c("skinny", "striated2",
"outlying", "clumpy2", "convex",
"skewed", "striped", "stringy")),
vars = paste0(Var1, " ", Var2))
w6 <- ggplot(wbi_scags_topvars,
aes(x=scag,
y=value, label = vars)) +
xlab("") + ylab("") +
geom_point(alpha=0.7) + coord_flip()+
theme(aspect.ratio = 1) +
theme_classic()
## ----wbiinteractive, fig.alt = "Three scatter plots, the first shows the top scagnostic for each scatter plot in the world bank data, the second is the scatter plot that returned the highest clumpy 2 value, while the third is the one that returned the highest convext value. The first plot gives a summary of the overall shape of the data, neither the second or third plot have any interesting visual features.", fig.cap = "Using scagnostics to explore the variety of relationships present in the WBI data. The side-by-side dotplot (left) shows one point for each pair of variables, with its highest scagnostic value among all scagnostics calculated. Most of the pairs of indicators exhibit outliers, skewed, stringy or convex. There is one pair that has clumpy as the highest value. The plots, middle and right, show the pair of variables with highest value on clumpy2 and convex, respectively. (Mouseover allows identification of variable pairs, and countries.)", include=knitr::is_html_output(), eval=knitr::is_html_output(), fig.height=3.5, fig.width=9, layout = "l-body"----
# ws1 <- ggplotly(w6)
# ws2 <- ggplotly(w3a)
# ws3 <- ggplotly(w3b)
# subplot(ws1, ws2, ws3, nrows=1, widths = c(0.33, 0.33, 0.33), heights = 0.6)
## ----wbistatic, fig.alt = "Three scatter plots, the first shows the top scagnostic for each scatter plot in the world bank data, the second is the scatter plot that returned the highest clumpy 2 value, while the third is the one that returned the highest convext value. The first plot gives a summary of the overall shape of the data, neither the second or third plot have any interesting visual features.", fig.cap="Using scagnostics to explore the variety of relationships present in the WBI data. The side-by-side dotplot (left) shows one point for each pair of variables, with its highest scagnostic value among all scagnostics calculated. Most of the pairs of indicators exhibit outliers, skewed, stringy or convex. There is one pair that has clumpy as the highest value. The plots, middle and right, show the pair of variables with highest value on clumpy2 and convex, respectively.", fig.height=3.5, fig.width=9, out.width="100%", include=knitr::is_latex_output(), eval=knitr::is_latex_output()----
w6 <- w6 + ggtitle("Top scagnostic for each pair")
w3a <- w3a + ggtitle("Top clumpy2")
w3b <- w3b + ggtitle("Top convex")
w6 + w3a + w3b
## ----include=FALSE, eval=FALSE--------------------------------------------------------------
# # Spell check
# library(spelling)
# rmd <- "mason-lee-laa-cook.rmd"
# ignore <- readLines("WORDLIST")
# check_spelling <- spell_check_files(
# rmd,
# ignore = ignore,
# lang = "en_GB"
# )
# if (nrow(check_spelling) > 0) {
# print(check_spelling)
# stop("Check spelling in Rmd files!")
# }