Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding a utility to provide test data for specific unicode handling issues. #1044

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
12 changes: 12 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
<dep.guava.version>32.1.2-jre</dep.guava.version>
<dep.httpclient.version>5.2.1</dep.httpclient.version>
<dep.httpcore.version>5.2.1</dep.httpcore.version>
<dep.icu4j.version>73.2</dep.icu4j.version>
<dep.jackson.version>2.15.2</dep.jackson.version>
<dep.jakarta.xml.bind-api.version>3.0.1</dep.jakarta.xml.bind-api.version>
<dep.janino.version>3.1.12</dep.janino.version>
Expand Down Expand Up @@ -305,6 +306,12 @@
<type>pom</type>
<scope>import</scope>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>${dep.icu4j.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
Expand Down Expand Up @@ -523,6 +530,11 @@
<artifactId>error_prone_annotations</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.glassfish.jersey.test-framework.providers</groupId>
<artifactId>jersey-test-framework-provider-jetty</artifactId>
Expand Down
188 changes: 188 additions & 0 deletions src/test/java/emissary/test/util/ComplexUnicodeSamples.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
package emissary.test.util;

import java.util.HashMap;
import java.util.Map;

/**
* A class that provides some tricky samples. These samples can be used in testing to make sure our code and the 3rd
* party libraries we choose can handle unusual cases.
* <p>
* Each example contains detailed explanation and links to useful reference materials.
*/
public final class ComplexUnicodeSamples {

private ComplexUnicodeSamples() {}

/**
 * Returns a string holding exactly one user-perceived character (an emoji) that is built from 5 Unicode scalar
 * values. On screen the string is a single face-palming emoji, and a user would expect to hit the arrow key once to
 * move the cursor across it. Encoded as UTF-8 it occupies 17 bytes: one emoji, 17 UTF-8 bytes.
 * <p>
 * SCALAR 1: The base character, a person face palming.
 * <p>
 * SCALAR 2: An emoji skin tone modifier. Without it the person is drawn in a cartoonish yellow; with it the color of
 * the person's skin (and, in practice, also the color of the person's hair) changes.
 * <p>
 * SCALAR 3 and 4: ZERO WIDTH JOINER followed by MALE SIGN. Without them the gender of the person is undefined and
 * each vendor picks its own default appearance. Unlike the emoji-specific skin tone modifier, the gender is expressed
 * with an emoji-predating gender symbol that is explicitly ligated to the (skin-toned) face-palming person by the
 * joiner, which selects a male-typical appearance regardless of font and vendor.
 * <p>
 * SCALAR 5: VARIATION SELECTOR-16, which explicitly requests a multicolor emoji rendering instead of a monochrome
 * dingbat rendering.
 *
 * @return the Java string containing this one face-palming dude emoji with a not-yellow skin tone.
 *
 * @see ComplexUnicodeSamplesTest#demonstrateMetadataAboutFacePalmDude()
 * @see <a href="https://hsivonen.fi/string-length/">https://hsivonen.fi/string-length/</a>
 */
public static String getFacePalmingMaleControlSkintone() {

    // SCALAR 1: U+1F926 FACE PALM
    // Lookup for how to represent it in Java:
    // https://www.fileformat.info/info/unicode/char/1f926/index.htm
    // Code units: 1 (UTF-32), 2 (UTF-16), 4 (UTF-8)
    // Bytes:      4 (UTF-32), 4 (UTF-16), 4 (UTF-8)
    final String facePalm = "\uD83E\uDD26";

    // SCALAR 2: U+1F3FC EMOJI MODIFIER FITZPATRICK TYPE-3
    // https://www.fileformat.info/info/unicode/char/1f3fc/index.htm
    // Code units: 1 (UTF-32), 2 (UTF-16), 4 (UTF-8)
    // Bytes:      4 (UTF-32), 4 (UTF-16), 4 (UTF-8)
    final String skinToneModifier = "\uD83C\uDFFC";

    // SCALAR 3: U+200D ZERO WIDTH JOINER
    // Code units: 1 (UTF-32), 1 (UTF-16), 3 (UTF-8)
    // Bytes:      4 (UTF-32), 2 (UTF-16), 3 (UTF-8)
    final String zeroWidthJoiner = "\u200D";

    // SCALAR 4: U+2642 MALE SIGN
    // Code units: 1 (UTF-32), 1 (UTF-16), 3 (UTF-8)
    // Bytes:      4 (UTF-32), 2 (UTF-16), 3 (UTF-8)
    final String maleSign = "\u2642";

    // SCALAR 5: U+FE0F VARIATION SELECTOR-16
    // Code units: 1 (UTF-32), 1 (UTF-16), 3 (UTF-8)
    // Bytes:      4 (UTF-32), 2 (UTF-16), 3 (UTF-8)
    final String variationSelector16 = "\uFE0F";

    // The order matters: base, modifier, joiner, gender sign, variation selector.
    return facePalm + skinToneModifier + zeroWidthJoiner + maleSign + variationSelector16;
}


/**
 * Supplies XML snippets for checking that our code, and any 3rd party XML library we use, handles Unicode inside
 * XML correctly. Every snippet spells a supplementary character as a pair of numeric character references (decimal,
 * hexadecimal, or a mix of both), one reference per UTF-16 surrogate.
 * <p>
 * A new, mutable map is created on every call.
 *
 * @return a map whose keys are XML nodes containing XML-escaped surrogate pairs and whose values are the properly
 *         extracted Java strings with the escapes resolved.
 * @see <a href=
 *      "https://github.com/FasterXML/woodstox/pull/174/files">https://github.com/FasterXML/woodstox/pull/174/files</a>
 */
public static Map<String, String> getXmlSamples() {
    // Sample data taken from https://github.com/FasterXML/woodstox/pull/174/files

    // Four of the snippets differ only in how the same surrogate pair is escaped, so they share one expected value.
    final String singlePair = "surrogate pair: \uD83C\uDF85.";
    final String twoPairs = "surrogate pair: \uD83C\uDF85\uD83C\uDF84.";
    final String pairThenSimpleEntity = "surrogate pair: \uD83C\uDF85\u2122.";

    final Map<String, String> expectedByXml = new HashMap<>();

    // decimal + decimal
    expectedByXml.put("<root>surrogate pair: &#55356;&#57221;.</root>", singlePair);
    // hex + decimal
    expectedByXml.put("<root>surrogate pair: &#xD83C;&#57221;.</root>", singlePair);
    // decimal + hex
    expectedByXml.put("<root>surrogate pair: &#55356;&#xDF85;.</root>", singlePair);
    // hex + hex
    expectedByXml.put("<root>surrogate pair: &#xD83C;&#xDF85;.</root>", singlePair);
    // two consecutive surrogate pairs
    expectedByXml.put("<root>surrogate pair: &#55356;&#57221;&#55356;&#57220;.</root>", twoPairs);
    // a surrogate pair followed by an ordinary (single code unit) character reference
    expectedByXml.put("<root>surrogate pair: &#55356;&#57221;&#8482;.</root>", pairThenSimpleEntity);

    return expectedByXml;
}

/**
 * Counts user-perceived characters with the JDK's own character {@link java.text.BreakIterator}.
 * <p>
 * This will not work properly in versions of Java earlier than Java 20. Once we get to Java 20, this method should
 * work properly.
 * <p>
 * Character boundary analysis allows users to interact with characters as they expect to, for example, when moving
 * the cursor through a text string. It provides correct navigation through character strings regardless of how the
 * character is stored. The boundaries returned may be those of supplementary characters, combining character
 * sequences, or ligature clusters. For example, an accented character might be stored as a base character and a
 * diacritical mark. What users consider to be a character can differ between languages.
 *
 * @param text the string to analyze.
 * @return the count of user-perceived graphemes according to the character break iterator. In versions of Java
 *         earlier than Java 20, this will not function as expected.
 * @see <a href=
 *      "https://horstmann.com/unblog/2023-10-03/index.html">https://horstmann.com/unblog/2023-10-03/index.html</a> -
 *      Scroll to the section titled "Just Use Strings"
 */
public static int countGraphemesUsingJavaBuiltInBreakIterator(String text) {
    // Fully qualified on purpose: ICU4J ships a class with the same simple name.
    final java.text.BreakIterator boundaries = java.text.BreakIterator.getCharacterInstance();
    boundaries.setText(text);

    // Each successful next() steps over one character boundary, i.e. one grapheme.
    int graphemes = 0;
    while (boundaries.next() != java.text.BreakIterator.DONE) {
        graphemes++;
    }
    return graphemes;
}

/**
 * Counts user-perceived characters with the industry-standard ICU4J library provided by IBM.
 * <p>
 * NOTE: Updating the version of this library might change which Unicode database is referenced for these
 * calculations. We should strive to keep this library as up-to-date as possible in both test and production source
 * code.
 *
 * @param text the string to analyze
 * @return a count of how many user-perceived glyphs/graphemes are present in the string. If you placed a cursor
 *         directly to the left of the string (or to the right, for a right-to-left string) and pressed the arrow key
 *         to traverse it, this is how many times you would need to press the key to reach the right-most end of the
 *         string (or the left-most end, for right-to-left strings).
 */
public static int countGraphemesUsingIcu4J(String text) {
    // Fully qualified on purpose: the JDK ships a class with the same simple name.
    final com.ibm.icu.text.BreakIterator boundaries = com.ibm.icu.text.BreakIterator.getCharacterInstance();
    boundaries.setText(text);

    // Each successful next() steps over one character boundary, i.e. one grapheme.
    int graphemes = 0;
    while (boundaries.next() != com.ibm.icu.text.BreakIterator.DONE) {
        graphemes++;
    }
    return graphemes;
}

}
143 changes: 143 additions & 0 deletions src/test/java/emissary/test/util/ComplexUnicodeSamplesTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
package emissary.test.util;

import com.ibm.icu.text.Normalizer2;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.DisabledForJreRange;
import org.junit.jupiter.api.condition.EnabledForJreRange;
import org.junit.jupiter.api.condition.JRE;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

class ComplexUnicodeSamplesTest {

/**
 * Interesting observations about the face palm dude emoji.
 * <p>
 * We've seen four different lengths so far:
 *
 * <ul>
 * <li>Number of UTF-8 code units (17 in this case)</li>
 * <li>Number of UTF-16 code units (7 in this case)</li>
 * <li>Number of UTF-32 code units or Unicode scalar values (5 in this case)</li>
 * <li>Number of extended grapheme clusters (1 in this case)</li>
 * </ul>
 * Given a valid Unicode string and a version of Unicode, all of the above are well-defined and it holds that each
 * item higher on the list is greater than or equal to the items lower on the list.
 * <p>
 * One of these is not like the others, though: The first three numbers have an unchanging definition for any valid
 * Unicode string, whether it contains currently assigned scalar values or whether it is from the future and contains
 * scalar values that are unassigned as far as software written today is aware. Also, computing the first three
 * lengths does not involve lookups from the Unicode database. However, the last item depends on the Unicode version
 * and involves lookups from the Unicode database. If a string contains scalar values that are unassigned as far as
 * the copy of the Unicode database that the program is using is aware, the program will potentially overcount
 * extended grapheme clusters in the string compared to a program whose copy of the Unicode database is newer and has
 * assignments for those scalar values (and some of those assignments turn out to be combining characters).
 */
@Test
void demonstrateMetadataAboutFacePalmDude() {

    String facepalm = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone();

    // SCALAR 1 is 4 UTF8 bytes
    // SCALAR 2 is 4 UTF8 bytes
    // SCALAR 3 is 3 UTF8 bytes
    // SCALAR 4 is 3 UTF8 bytes
    // SCALAR 5 is 3 UTF8 bytes
    // TOTAL : 17 UTF8 bytes
    byte[] utf8 = facepalm.getBytes(StandardCharsets.UTF_8);
    assertEquals(17, utf8.length);
    // Decode with an explicit charset: new String(byte[]) uses the platform default charset, which is not
    // guaranteed to be UTF-8 before Java 18 and would make this round-trip fail on such platforms.
    assertEquals(facepalm, new String(utf8, StandardCharsets.UTF_8));

    // SCALAR 1 is 4 UTF16 bytes
    // SCALAR 2 is 4 UTF16 bytes
    // SCALAR 3 is 2 UTF16 bytes
    // SCALAR 4 is 2 UTF16 bytes
    // SCALAR 5 is 2 UTF16 bytes
    // TOTAL : 14 UTF16 bytes if no BOM is needed
    // Java typically defaults to UTF-16BE
    assertEquals(14, facepalm.getBytes(StandardCharsets.UTF_16BE).length);
    assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_16BE), StandardCharsets.UTF_16BE));
    assertEquals(14, facepalm.getBytes(StandardCharsets.UTF_16LE).length);
    assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_16LE), StandardCharsets.UTF_16LE));

    // When the endianness isn't specified, 2 bytes are used for the byte order mark (BOM).
    // The BOM is a special character (U+FEFF) used to indicate the endianness (byte order)
    // of a UTF-16 encoded file or stream. In UTF-16, the BOM can be either:
    // FE FF (Big Endian)
    // FF FE (Little Endian)
    assertEquals(16, facepalm.getBytes(StandardCharsets.UTF_16).length);
    assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_16), StandardCharsets.UTF_16));

    // 5 UTF-32 characters at 4 bytes per character.
    // UTF-32 has no constant in StandardCharsets, so look it up once by name.
    Charset utf32 = Charset.forName("UTF-32");
    assertEquals(20, facepalm.getBytes(utf32).length);
    assertEquals(facepalm, new String(facepalm.getBytes(utf32), utf32));

    // A single byte encoding is not going to produce what you want: the round-trip is lossy.
    assertEquals(5, facepalm.getBytes(StandardCharsets.ISO_8859_1).length);
    assertNotEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.ISO_8859_1));

    // Number of Unicode scalar values (code points).
    assertEquals(5, facepalm.codePointCount(0, facepalm.length()));

    // ICU4J BreakIterator gets it right
    assertEquals(1, ComplexUnicodeSamples.countGraphemesUsingIcu4J(facepalm));

    // See
    // demonstrateMetadataAboutFacePalmDudeForJava20()
    // and
    // demonstrateMetadataAboutFacePalmDudePriorToJava20()
    // to see how using the intrinsic java BreakIterator doesn't
    // get it right until Java 20.

    // It's already normalized in its natural form.
    // The "nfc"/"nfkc" data files combined with Mode.DECOMPOSE yield the NFD/NFKD normalizers.
    Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
    Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
    assertTrue(nfd.isNormalized(facepalm));
    assertTrue(nfkd.isNormalized(facepalm));

    // The same data files combined with Mode.COMPOSE yield the NFC/NFKC normalizers.
    Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
    Normalizer2 nfkc = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE);
    assertTrue(nfc.isNormalized(facepalm));
    assertTrue(nfkc.isNormalized(facepalm));

}

@Test
@EnabledForJreRange(min = JRE.JAVA_20, disabledReason = "This test only valid for Java 20 and later.")
void demonstrateMetadataAboutFacePalmDudeForJava20() {
    // From Java 20 on, the built-in BreakIterator sees the whole emoji sequence as a single grapheme.
    final int graphemeCount = ComplexUnicodeSamples.countGraphemesUsingJavaBuiltInBreakIterator(
            ComplexUnicodeSamples.getFacePalmingMaleControlSkintone());
    assertEquals(1, graphemeCount);
}

@Test
@DisabledForJreRange(min = JRE.JAVA_20, disabledReason = "This test only valid for Java versions up to not including Java 20.")
void demonstrateMetadataAboutFacePalmDudePriorToJava20() {
    // The correct answer is 1, but the built-in BreakIterator reports 4 until Java 20.
    final int graphemeCount = ComplexUnicodeSamples.countGraphemesUsingJavaBuiltInBreakIterator(
            ComplexUnicodeSamples.getFacePalmingMaleControlSkintone());
    assertEquals(4, graphemeCount);
}

@Test
@EnabledForJreRange(min = JRE.JAVA_17, disabledReason = "This test only valid for Java 17 and later.")
void demonstrateMetadataAboutFacePalmDudeForJava17AndLater() {
    final int repetitions = 27;
    final String repeated = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone().repeat(repetitions);

    // Splitting on the regex grapheme-cluster boundary should yield one piece per emoji.
    final String[] clusters = repeated.split("\\b{g}");
    assertEquals(repetitions, clusters.length);
}

@Test
@DisabledForJreRange(min = JRE.JAVA_17, disabledReason = "This test only valid for Java versions up to not including Java 17.")
void demonstrateMetadataAboutFacePalmDudePriorToJava17() {
    final int repetitions = 27;
    final String repeated = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone().repeat(repetitions);

    // The correct answer is 27 (one piece per emoji), but until Java 17 each emoji is split into 3 pieces.
    final String[] clusters = repeated.split("\\b{g}");
    assertEquals(repetitions * 3, clusters.length);
}


}
Loading