NationalSecurityAgency · ldhardy · Jan 7, 2025 · Jan 7, 2025 · Jan 7, 2025 · Jan 7, 2025
diff --git a/pom.xml b/pom.xml
@@ -64,6 +64,7 @@
     <dep.guava.version>32.1.2-jre</dep.guava.version>
     <dep.httpclient.version>5.2.1</dep.httpclient.version>
     <dep.httpcore.version>5.2.1</dep.httpcore.version>
+    <dep.icu4j.version>73.2</dep.icu4j.version>
     <dep.jackson.version>2.15.2</dep.jackson.version>
     <dep.jakarta.xml.bind-api.version>3.0.1</dep.jakarta.xml.bind-api.version>
     <dep.janino.version>3.1.12</dep.janino.version>
@@ -305,6 +306,12 @@
         <type>pom</type>
         <scope>import</scope>
       </dependency>
+      <dependency>
+        <groupId>com.ibm.icu</groupId>
+        <artifactId>icu4j</artifactId>
+        <version>${dep.icu4j.version}</version>
+        <scope>test</scope>
+      </dependency>
       <dependency>
         <groupId>org.junit.jupiter</groupId>
         <artifactId>junit-jupiter</artifactId>
@@ -523,6 +530,11 @@
       <artifactId>error_prone_annotations</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>com.ibm.icu</groupId>
+      <artifactId>icu4j</artifactId>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.glassfish.jersey.test-framework.providers</groupId>
       <artifactId>jersey-test-framework-provider-jetty</artifactId>

diff --git a/src/test/java/emissary/test/util/ComplexUnicodeSamples.java b/src/test/java/emissary/test/util/ComplexUnicodeSamples.java
@@ -0,0 +1,188 @@
+package emissary.test.util;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * A class that provides some tricky samples. These samples can be used in testing to make sure our code and the 3rd
+ * party libraries we choose can handle unusual cases.
+ * <p>
+ * Each example contains detailed explanation and links to useful reference materials.
+ */
+public final class ComplexUnicodeSamples {
+
+    private ComplexUnicodeSamples() {} // static-only utility class; the private constructor blocks instantiation
+
+    /**
+     * Builds a string made of a single graphical unit (in this case an emoji) that is composed of 5 Unicode scalar
+     * values. The user-perceived string is one facepalming emoji, and a user would expect to hit the arrow key once to
+     * move the cursor across this one emoji on the screen. The UTF-8 encoded byte array is 17 bytes long. One emoji, 17
+     * UTF-8 bytes.
+     * <p>
+     * SCALAR 1: First, there’s a base character that means a person face palming.
+     * <p>
+     * SCALAR 2: By default, the person would have a cartoonish yellow color. The next character is an emoji skintone
+     * modifier that changes the color of the person’s skin (and, in practice, also the color of the person’s hair).
+     * <p>
+     * SCALAR 3 and 4: By default, the gender of the person is undefined, and e.g. Apple defaults to what they consider a
+     * male appearance and e.g. Google defaults to what they consider a female appearance. The next two scalar values pick a
+     * male-typical appearance specifically regardless of font and vendor. Instead of being an emoji-specific modifier like
+     * the skin tone, the gender specification uses an emoji-predating gender symbol (MALE SIGN) explicitly ligated using
+     * the ZERO WIDTH JOINER with the (skin-toned) face-palming person. (Whether it is a good or a bad idea that the skin
+     * tone and gender specifications use different mechanisms is out of scope here.)
+     * <p>
+     * SCALAR 5: Finally, VARIATION SELECTOR-16 makes it explicit that we want a multicolor emoji rendering instead of a
+     * monochrome dingbat rendering.
+     *
+     * @return the Java string containing this one facepalming dude emoji with a not-yellow skin tone.
+     *
+     * @see ComplexUnicodeSamplesTest#demonstrateMetadataAboutFacePalmDude()
+     * @see <a href="https://hsivonen.fi/string-length/">https://hsivonen.fi/string-length/</a>
+     */
+    public static String getFacePalmingMaleControlSkintone() {
+
+        // SCALAR 1: U+1F926 FACE PALM (written below as a UTF-16 surrogate pair)
+        // The Java escape sequence can be looked up at
+        // https://www.fileformat.info/info/unicode/char/1f926/index.htm
+        // code units in UTF-32: 1
+        // code units in UTF-16: 2
+        // code units in UTF-8: 4
+        // bytes in UTF-32: 4
+        // bytes in UTF-16: 4
+        // bytes in UTF-8: 4
+        String facePalm = "\uD83E\uDD26";
+
+        // SCALAR 2: U+1F3FC EMOJI MODIFIER FITZPATRICK TYPE-3 (also a surrogate pair)
+        // https://www.fileformat.info/info/unicode/char/1f3fc/index.htm
+        // code units in UTF-32: 1
+        // code units in UTF-16: 2
+        // code units in UTF-8: 4
+        // bytes in UTF-32: 4
+        // bytes in UTF-16: 4
+        // bytes in UTF-8: 4
+        String skinTone = "\uD83C\uDFFC";
+
+        // SCALAR 3: U+200D ZERO WIDTH JOINER
+        // code units in UTF-32: 1
+        // code units in UTF-16: 1
+        // code units in UTF-8: 3
+        // bytes in UTF-32: 4
+        // bytes in UTF-16: 2
+        // bytes in UTF-8: 3
+        String zeroWidthJoiner = "\u200D";
+
+        // SCALAR 4: U+2642 MALE SIGN
+        // code units in UTF-32: 1
+        // code units in UTF-16: 1
+        // code units in UTF-8: 3
+        // bytes in UTF-32: 4
+        // bytes in UTF-16: 2
+        // bytes in UTF-8: 3
+        String maleSign = "\u2642";
+
+        // SCALAR 5: U+FE0F VARIATION SELECTOR-16
+        // code units in UTF-32: 1
+        // code units in UTF-16: 1
+        // code units in UTF-8: 3
+        // bytes in UTF-32: 4
+        // bytes in UTF-16: 2
+        // bytes in UTF-8: 3
+        String variationSelector16 = "\uFE0F";
+
+        // Glue the scalars together in order; the sequence only forms one grapheme in exactly this order.
+        return String.join("",
+                facePalm, skinTone, zeroWidthJoiner, maleSign, variationSelector16);
+    }
+
+
+    /**
+     * Supplies XML snippets paired with the text a correct parser should extract from them. Useful for testing that our
+     * code and any 3rd party XML library we are using is handling unicode within XML correctly.
+     *
+     * @return A map of strings where the key is the XML node containing an XML-escaped surrogate pair unicode value and the
+     *         value is the properly extracted java string value with un-escaped unicode strings.
+     * @see <a href=
+     *      "https://github.com/FasterXML/woodstox/pull/174/files">https://github.com/FasterXML/woodstox/pull/174/files</a>
+     */
+    public static Map<String, String> getXmlSamples() {
+        // See https://github.com/FasterXML/woodstox/pull/174/files
+        // The first four samples spell the same surrogate pair in different ways, so they share one expected text.
+        String singlePair = "surrogate pair: \uD83C\uDF85.";
+
+        Map<String, String> samples = new HashMap<>();
+        // Both surrogates as numeric (decimal) references
+        samples.put("<root>surrogate pair: &#55356;&#57221;.</root>", singlePair);
+        // Hex high surrogate followed by numeric low surrogate
+        samples.put("<root>surrogate pair: &#xD83C;&#57221;.</root>", singlePair);
+        // Numeric high surrogate followed by hex low surrogate
+        samples.put("<root>surrogate pair: &#55356;&#xDF85;.</root>", singlePair);
+        // Both surrogates as hex references
+        samples.put("<root>surrogate pair: &#xD83C;&#xDF85;.</root>", singlePair);
+
+        // Two surrogate pairs back to back
+        samples.put("<root>surrogate pair: &#55356;&#57221;&#55356;&#57220;.</root>",
+                "surrogate pair: \uD83C\uDF85\uD83C\uDF84.");
+        // A surrogate pair followed by a simple (non-surrogate) entity
+        samples.put("<root>surrogate pair: &#55356;&#57221;&#8482;.</root>",
+                "surrogate pair: \uD83C\uDF85\u2122.");
+
+        return samples;
+    }
+
+    /**
+     * Counts the characters a user would perceive, using the character instance of the JDK's own BreakIterator.
+     * <p>
+     * This will not work properly in versions of java earlier than Java 20; once we get to Java 20, it should.
+     * <p>
+     * Character boundary analysis allows users to interact with characters as they expect to, for example, when moving the
+     * cursor through a text string. Character boundary analysis provides correct navigation through character strings,
+     * regardless of how the character is stored. The boundaries returned may be those of supplementary characters,
+     * combining character sequences, or ligature clusters. For example, an accented character might be stored as a base
+     * character and a diacritical mark. What users consider to be a character can differ between languages.
+     *
+     * @param text - the string to analyze.
+     * @return the count of user-perceived graphemes as based on the character break iterator. In versions of java earlier
+     *         than Java 20, this will not function as expected.
+     *
+     * @see <a href=
+     *      "https://horstmann.com/unblog/2023-10-03/index.html">https://horstmann.com/unblog/2023-10-03/index.html</a> -
+     *      Scroll to the section titled "Just Use Strings"
+     */
+    public static int countGraphemesUsingJavaBuiltInBreakIterator(String text) {
+        java.text.BreakIterator boundaries = java.text.BreakIterator.getCharacterInstance();
+        boundaries.setText(text);
+
+        // Every boundary after the start closes one cluster; DONE means the text is exhausted.
+        int clusters = 0;
+        while (boundaries.next() != java.text.BreakIterator.DONE) {
+            clusters++;
+        }
+
+        return clusters;
+    }
+
+    /**
+     * Counts user-perceived characters using the industry-standard ICU4J library provided by IBM.
+     * <p>
+     * NOTE: Updating the version of this library might change which unicode database is referenced for these calculations.
+     * We should strive to keep this library as up-to-date as possible in both test and production source code.
+     *
+     * @param text the string to analyze
+     * @return a count of how many user-perceived glyphs/graphemes are present in the string. If you placed a cursor directly
+     *         to the left (or right for right-to-left string), and pressed the arrow key to traverse the string, how many
+     *         times would you need to press the arrow key to traverse to the right-most end of the string (or leftmost for
+     *         R-to-L strings).
+     */
+    public static int countGraphemesUsingIcu4J(String text) {
+        com.ibm.icu.text.BreakIterator boundaries = com.ibm.icu.text.BreakIterator.getCharacterInstance();
+        boundaries.setText(text);
+
+        int clusters = 0;
+        while (boundaries.next() != com.ibm.icu.text.BreakIterator.DONE) {
+            clusters++;
+        }
+
+        return clusters;
+    }
+
+}
diff --git a/src/test/java/emissary/test/util/ComplexUnicodeSamplesTest.java b/src/test/java/emissary/test/util/ComplexUnicodeSamplesTest.java
@@ -0,0 +1,143 @@
+package emissary.test.util;
+
+import com.ibm.icu.text.Normalizer2;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.condition.DisabledForJreRange;
+import org.junit.jupiter.api.condition.EnabledForJreRange;
+import org.junit.jupiter.api.condition.JRE;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+class ComplexUnicodeSamplesTest {
+
+    /**
+     * Interesting observations about face palm dude emoji.
+     * <p>
+     * We’ve seen four different lengths so far:
+     *
+     * <ul>
+     * <li>Number of UTF-8 code units (17 in this case)</li>
+     * <li>Number of UTF-16 code units (7 in this case)</li>
+     * <li>Number of UTF-32 code units or Unicode scalar values (5 in this case)</li>
+     * <li>Number of extended grapheme clusters (1 in this case)</li>
+     * </ul>
+     * Given a valid Unicode string and a version of Unicode, all of the above are well-defined and it holds that each item
+     * higher on the list is greater than or equal to the items lower on the list.
+     * <p>
+     * One of these is not like the others, though: The first three numbers have an unchanging definition for any valid
+     * Unicode string whether it contains currently assigned scalar values or whether it is from the future and contains
+     * unassigned scalar values as far as software written today is aware. Also, computing the first three lengths does not
+     * involve lookups from the Unicode database. However, the last item depends on the Unicode version and involves lookups
+     * from the Unicode database. If a string contains scalar values that are unassigned as far as the copy of the Unicode
+     * database that the program is using is aware, the program will potentially overcount extended grapheme clusters in the
+     * string compared to a program whose copy of the Unicode database is newer and has assignments for those scalar values
+     * (and some of those assignments turn out to be combining characters).
+     */
+    @Test
+    void demonstrateMetadataAboutFacePalmDude() {
+
+        String facepalm = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone();
+
+        // SCALAR 1 is 4 UTF8 bytes
+        // SCALAR 2 is 4 UTF8 bytes
+        // SCALAR 3 is 3 UTF8 bytes
+        // SCALAR 4 is 3 UTF8 bytes
+        // SCALAR 5 is 3 UTF8 bytes
+        // TOTAL : 17 UTF8 bytes
+        assertEquals(17, facepalm.getBytes(StandardCharsets.UTF_8).length);
+        assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));
+
+        // SCALAR 1 is 4 UTF16 bytes
+        // SCALAR 2 is 4 UTF16 bytes
+        // SCALAR 3 is 2 UTF16 bytes
+        // SCALAR 4 is 2 UTF16 bytes
+        // SCALAR 5 is 2 UTF16 bytes
+        // TOTAL : 14 UTF16 bytes if no BOM is needed
+        // Java typically defaults to UTF-16BE
+        assertEquals(14, facepalm.getBytes(StandardCharsets.UTF_16BE).length);
+        assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_16BE), StandardCharsets.UTF_16BE));
+        assertEquals(14, facepalm.getBytes(StandardCharsets.UTF_16LE).length);
+        assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_16LE), StandardCharsets.UTF_16LE));
+
+        // When the endianness isn't specified, 2 bytes are used for the byte order marker
+        // The BOM is a special character (U+FEFF) used to indicate the endianness (byte order)
+        // of a UTF-16 encoded file or stream. In UTF-16, the BOM can be either:
+        // FE FF (Big Endian)
+        // FF FE (Little Endian)
+        assertEquals(16, facepalm.getBytes(StandardCharsets.UTF_16).length);
+        assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_16), StandardCharsets.UTF_16));
+
+        // 5 UTF-32 characters at 4 bytes per character
+        Charset utf32 = Charset.forName("UTF-32");
+        assertEquals(20, facepalm.getBytes(utf32).length);
+        assertEquals(facepalm, new String(facepalm.getBytes(utf32), utf32));
+
+        // single byte encoding is not going to produce what you want
+        assertEquals(5, facepalm.getBytes(StandardCharsets.ISO_8859_1).length);
+        assertNotEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.ISO_8859_1));
+
+        assertEquals(5, facepalm.codePointCount(0, facepalm.length()));
+
+        // ICU4J BreakIterator gets it right
+        assertEquals(1, ComplexUnicodeSamples.countGraphemesUsingIcu4J(facepalm));
+
+        // See
+        // demonstrateMetadataAboutFacePalmDudeForJava20()
+        // and
+        // demonstrateMetadataAboutFacePalmDudePriorToJava20()
+        // to see how using the intrinsic java BreakIterator doesn't
+        // get it right until Java 20.
+
+
+        // It's already normalized in its natural form.
+        Normalizer2 nfcDecomp = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
+        Normalizer2 nfkcDecomp = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
+        assertTrue(nfcDecomp.isNormalized(facepalm));
+        assertTrue(nfkcDecomp.isNormalized(facepalm));
+
+        Normalizer2 nfcComp = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
+        Normalizer2 nfkcComp = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE);
+        assertTrue(nfcComp.isNormalized(facepalm));
+        assertTrue(nfkcComp.isNormalized(facepalm));
+
+    }
+
+    @Test
+    @EnabledForJreRange(min = JRE.JAVA_20, disabledReason = "This test only valid for Java 20 and later.")
+    void demonstrateMetadataAboutFacePalmDudeForJava20() {
+        String emoji = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone();
+        assertEquals(1, ComplexUnicodeSamples.countGraphemesUsingJavaBuiltInBreakIterator(emoji));
+    }
+
+    @Test
+    @DisabledForJreRange(min = JRE.JAVA_20, disabledReason = "This test only valid for Java versions up to not including Java 20.")
+    void demonstrateMetadataAboutFacePalmDudePriorToJava20() {
+        // The right answer would be 1, but the built-in iterator reports 4 until Java 20.
+        String emoji = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone();
+        assertEquals(4, ComplexUnicodeSamples.countGraphemesUsingJavaBuiltInBreakIterator(emoji));
+    }
+
+    @Test
+    @EnabledForJreRange(min = JRE.JAVA_17, disabledReason = "This test only valid for Java 17 and later.")
+    void demonstrateMetadataAboutFacePalmDudeForJava17AndLater() {
+        int copies = 27;
+        String repeated = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone().repeat(copies);
+        assertEquals(copies, repeated.split("\\b{g}").length);
+    }
+
+    @Test
+    @DisabledForJreRange(min = JRE.JAVA_17, disabledReason = "This test only valid for Java versions up to not including Java 17.")
+    void demonstrateMetadataAboutFacePalmDudePriorToJava17() {
+        // The right answer would be 27, but each emoji is split into 3 pieces until Java 17.
+        int copies = 27;
+        String repeated = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone().repeat(copies);
+        assertEquals(copies * 3, repeated.split("\\b{g}").length);
+    }
+
+
+}