Patch summary: adds index-code support for UTF-16 surrogate characters. The throwing
SURROGATE_CHAR_HANDLER is replaced by computed high/low surrogate handlers, the char is
passed to CharHandler.getInlineBytes, and an "emoticons" test database is added.

diff --git a/src/main/java/io/github/spannm/jackcess/DatabaseBuilder.java b/src/main/java/io/github/spannm/jackcess/DatabaseBuilder.java
index a39ee3f..14de376 100644
--- a/src/main/java/io/github/spannm/jackcess/DatabaseBuilder.java
+++ b/src/main/java/io/github/spannm/jackcess/DatabaseBuilder.java
@@ -243,7 +243,7 @@ public DatabaseBuilder withIgnoreBrokenSystemCatalogIndex(boolean ignore) {
     }
 
     /**
-     * Opens an existingnew Database using the configured information.
+     * Opens an existing Database using the configured information.
      */
     public Database open() throws IOException {
         return DatabaseImpl.open(_mdbFile, _readOnly, _channel, _autoSync, _charset,
diff --git a/src/main/java/io/github/spannm/jackcess/impl/General97IndexCodes.java b/src/main/java/io/github/spannm/jackcess/impl/General97IndexCodes.java
index 693278b..8ecfd6e 100644
--- a/src/main/java/io/github/spannm/jackcess/impl/General97IndexCodes.java
+++ b/src/main/java/io/github/spannm/jackcess/impl/General97IndexCodes.java
@@ -105,7 +105,7 @@ void writeNonNullIndexTextValue(
             char c = str.charAt(i);
             CharHandler ch = getCharHandler(c);
 
-            byte[] bytes = ch.getInlineBytes();
+            byte[] bytes = ch.getInlineBytes(c);
             if (bytes != null) {
                 // write the "inline" codes immediately
                 bout.write(bytes);
diff --git a/src/main/java/io/github/spannm/jackcess/impl/GeneralLegacyIndexCodes.java b/src/main/java/io/github/spannm/jackcess/impl/GeneralLegacyIndexCodes.java
index 84f2195..7d948c7 100644
--- a/src/main/java/io/github/spannm/jackcess/impl/GeneralLegacyIndexCodes.java
+++ b/src/main/java/io/github/spannm/jackcess/impl/GeneralLegacyIndexCodes.java
@@ -109,6 +109,13 @@ public CharHandler parseCodes(String[] codeStrings) {
                 return parseSignificantCodes(codeStrings);
             }
         },
+        SURROGATE("Q") {
+            @Override
+            public CharHandler parseCodes(String[] _codeStrings) {
+                // these are not parsed from the codes files
+                throw new UnsupportedOperationException();
+            }
+        },
         IGNORED("X") {
             @Override
             public CharHandler parseCodes(String[] codeStrings) {
@@ -135,7 +142,7 @@ public String getPrefixCode() {
     abstract static class CharHandler {
         public abstract Type getType();
 
-        public byte[] getInlineBytes() {
+        public byte[] getInlineBytes(char c) {
             return null;
         }
 
@@ -176,7 +183,7 @@ public Type getType() {
         }
 
         @Override
-        public byte[] getInlineBytes() {
+        public byte[] getInlineBytes(char c) {
             return _bytes;
         }
     }
@@ -199,7 +206,7 @@ public Type getType() {
         }
 
         @Override
-        public byte[] getInlineBytes() {
+        public byte[] getInlineBytes(char c) {
             return _bytes;
         }
 
@@ -272,7 +279,7 @@ public Type getType() {
         }
 
         @Override
-        public byte[] getInlineBytes() {
+        public byte[] getInlineBytes(char c) {
             return _bytes;
         }
 
@@ -303,7 +310,7 @@ public Type getType() {
         }
 
         @Override
-        public byte[] getInlineBytes() {
+        public byte[] getInlineBytes(char c) {
             return _bytes;
         }
 
@@ -321,21 +328,67 @@ public Type getType() {
         }
     };
 
-    /**
-     * alternate shared CharHandler instance for "surrogate" chars (which we do not handle)
-     */
-    static final CharHandler SURROGATE_CHAR_HANDLER = new CharHandler() {
+    /** Surrogate index bytes are computed on the fly into one re-used 2-byte buffer per thread; consume the result before the next call. */
+    private static final ThreadLocal<byte[]> SURROGATE_CHAR_BUF = ThreadLocal.withInitial(() -> new byte[2]);
+    private static final byte[] SURROGATE_EXTRA_BYTES = {0x3f};
+
+    private abstract static class SurrogateCharHandler extends CharHandler {
         @Override
         public Type getType() {
-            return Type.IGNORED;
+            return Type.SURROGATE;
         }
 
         @Override
-        public byte[] getInlineBytes() {
-            throw new IllegalStateException(
-                "Surrogate pair chars are not handled");
+        public byte[] getExtraBytes() {
+            return SURROGATE_EXTRA_BYTES;
         }
-    };
+
+        protected static byte[] toInlineBytes(int _idxC) {
+            byte[] bytes = SURROGATE_CHAR_BUF.get();
+            bytes[0] = (byte) ((_idxC >>> 8) & 0xFF);
+            bytes[1] = (byte) (_idxC & 0xFF);
+            return bytes;
+        }
+    }
+
+    /**
+     * shared CharHandler instance for "high surrogate" chars (which are computed)
+     */
+    static final CharHandler HIGH_SURROGATE_CHAR_HANDLER = new SurrogateCharHandler() {
+        @Override
+        public byte[] getInlineBytes(char c) {
+            // the high surrogate bytes seem to be computed from a fixed offset
+            int idxC = asUnsignedChar(c) - 10238;
+            return toInlineBytes(idxC);
+        }
+    };
+
+    /**
+     * shared CharHandler instance for "low surrogate" chars (which are computed)
+     */
+    static final CharHandler LOW_SURROGATE_CHAR_HANDLER = new SurrogateCharHandler() {
+        @Override
+        public byte[] getInlineBytes(char c) {
+            // the low surrogate bytes are computed with a specific offset based on
+            // the char's location in a 1024 character block.
+            int charOffset = (asUnsignedChar(c) - 0xdc00) % 1024;
+
+            int idxOffset = 0;
+            if (charOffset < 8) {
+                idxOffset = 9992;
+            } else if (charOffset < (8 + 254)) {
+                idxOffset = 9990;
+            } else if (charOffset < (8 + 254 + 254)) {
+                idxOffset = 9988;
+            } else if (charOffset < (8 + 254 + 254 + 254)) {
+                idxOffset = 9986;
+            } else {
+                idxOffset = 9984;
+            }
+            int idxC = asUnsignedChar(c) - idxOffset;
+            return toInlineBytes(idxC);
+        }
+    };
 
     static final char FIRST_CHAR = (char) 0x0000;
     static final char LAST_CHAR = (char) 0x00FF;
@@ -356,8 +409,7 @@ private static final class ExtCodes {
         private static final CharHandler[] VALUES = loadCodes(EXT_CODES_FILE, FIRST_EXT_CHAR, LAST_EXT_CHAR);
     }
 
-    static final GeneralLegacyIndexCodes GEN_LEG_INSTANCE =
-        new GeneralLegacyIndexCodes();
+    static final GeneralLegacyIndexCodes GEN_LEG_INSTANCE = new GeneralLegacyIndexCodes();
 
     GeneralLegacyIndexCodes() {
     }
@@ -394,9 +446,12 @@ static CharHandler[] loadCodes(String codesFilePath, char firstChar, char lastCh
             for (int i = start; i <= end; ++i) {
                 char c = (char) i;
                 CharHandler ch = null;
-                if (Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
+                if (Character.isHighSurrogate(c)) {
+                    // surrogate chars are not included in the codes files
+                    ch = HIGH_SURROGATE_CHAR_HANDLER;
+                } else if (Character.isLowSurrogate(c)) {
                     // surrogate chars are not included in the codes files
-                    ch = SURROGATE_CHAR_HANDLER;
+                    ch = LOW_SURROGATE_CHAR_HANDLER;
                 } else {
                     String codeLine = reader.readLine();
                     ch = parseCodes(prefixMap, codeLine);
@@ -542,7 +597,7 @@ void writeNonNullIndexTextValue(
             CharHandler ch = getCharHandler(c);
 
             int curCharOffset = charOffset;
-            byte[] bytes = ch.getInlineBytes();
+            byte[] bytes = ch.getInlineBytes(c);
             if (bytes != null) {
                 // write the "inline" codes immediately
                 bout.write(bytes);
diff --git a/src/test/java/io/github/spannm/jackcess/impl/IndexCodesTest.java b/src/test/java/io/github/spannm/jackcess/impl/IndexCodesTest.java
index 4ab5012..8e3cd9f 100644
--- a/src/test/java/io/github/spannm/jackcess/impl/IndexCodesTest.java
+++ b/src/test/java/io/github/spannm/jackcess/impl/IndexCodesTest.java
@@ -16,6 +16,7 @@
 
 package io.github.spannm.jackcess.impl;
 
+import static io.github.spannm.jackcess.test.Basename.EMOTICONS;
 import static io.github.spannm.jackcess.test.Basename.INDEX_CODES;
 
 import io.github.spannm.jackcess.*;
@@ -56,7 +57,7 @@ public class IndexCodesTest extends AbstractBaseTest {
     }};
 
     @ParameterizedTest(name = "[{index}] {0}")
-    @TestDbReadOnlySource(INDEX_CODES)
+    @TestDbReadOnlySource({INDEX_CODES, EMOTICONS})
     void testIndexCodes(TestDb testDb) throws Exception {
         try (Database db = testDb.openMem()) {
             db.setDateTimeType(DateTimeType.DATE);
diff --git a/src/test/java/io/github/spannm/jackcess/test/Basename.java b/src/test/java/io/github/spannm/jackcess/test/Basename.java
index c52a5a8..861c5f4 100644
--- a/src/test/java/io/github/spannm/jackcess/test/Basename.java
+++ b/src/test/java/io/github/spannm/jackcess/test/Basename.java
@@ -18,6 +18,7 @@ public enum Basename {
     COMP_INDEX,
     DEL,
     DEL_COL,
+    EMOTICONS,
     EXT_DATE,
     FIXED_NUMERIC,
     FIXED_TEXT,
diff --git a/src/test/java/io/github/spannm/jackcess/test/TestUtil.java b/src/test/java/io/github/spannm/jackcess/test/TestUtil.java
index fd6d830..97895fe 100644
--- a/src/test/java/io/github/spannm/jackcess/test/TestUtil.java
+++ b/src/test/java/io/github/spannm/jackcess/test/TestUtil.java
@@ -249,17 +249,25 @@ private static Map massageRow(Map row) throws IO
         return row;
     }
 
-    static void dumpIndex(Index index) throws IOException {
-        dumpIndex(index, new PrintWriter(System.out, true));
-    }
+    static void dumpIndex(Index index) throws Exception {
+        dumpIndex(index, Integer.MAX_VALUE);
+    }
+
+    static void dumpIndex(Index index, int limit) throws Exception {
+        dumpIndex(index, new PrintWriter(System.out, true), limit);
+    }
 
-    static void dumpIndex(Index index, PrintWriter writer) throws IOException {
+    static void dumpIndex(Index index, PrintWriter writer, int limit) throws IOException {
         writer.println("INDEX: " + index);
         IndexData.EntryCursor ec = ((IndexImpl) index).cursor();
         IndexData.Entry lastE = ec.getLastEntry();
         IndexData.Entry e = null;
+        int count = 0;
         while ((e = ec.getNextEntry()) != lastE) {
             writer.println(e);
+            if (++count >= limit) {
+                break;
+            }
         }
     }
 
diff --git a/src/test/resources/data/V2010/emoticonsV2010.accdb b/src/test/resources/data/V2010/emoticonsV2010.accdb
new file mode 100755
index 0000000..8b70b95
Binary files /dev/null and b/src/test/resources/data/V2010/emoticonsV2010.accdb differ