Skip to content

Commit

Permalink
Add support for surrogate pairs in text indexes (e.g. emoticons), fix…
Browse files Browse the repository at this point in the history
…es #157

Retrofit from
git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@1413 f203690c-595d-4dc9-a70b-905162fa7fd2
  • Loading branch information
spannm committed Oct 30, 2024
1 parent 32b895c commit 0d0407c
Show file tree
Hide file tree
Showing 7 changed files with 91 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ public DatabaseBuilder withIgnoreBrokenSystemCatalogIndex(boolean ignore) {
}

/**
* Opens an existingnew Database using the configured information.
* Opens an existing Database using the configured information.
*/
public Database open() throws IOException {
return DatabaseImpl.open(_mdbFile, _readOnly, _channel, _autoSync, _charset,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ void writeNonNullIndexTextValue(
char c = str.charAt(i);
CharHandler ch = getCharHandler(c);

byte[] bytes = ch.getInlineBytes();
byte[] bytes = ch.getInlineBytes(c);
if (bytes != null) {
// write the "inline" codes immediately
bout.write(bytes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,13 @@ public CharHandler parseCodes(String[] codeStrings) {
return parseSignificantCodes(codeStrings);
}
},
SURROGATE("Q") {
    /**
     * Surrogate handlers are never built from a codes file, so this type
     * cannot parse codes.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public CharHandler parseCodes(String[] _codeStrings) {
        // these are not parsed from the codes files
        throw new UnsupportedOperationException(
            "Surrogate char handlers are not parsed from the codes files");
    }
},
IGNORED("X") {
@Override
public CharHandler parseCodes(String[] codeStrings) {
Expand All @@ -135,7 +142,7 @@ public String getPrefixCode() {
abstract static class CharHandler {
public abstract Type getType();

public byte[] getInlineBytes() {
public byte[] getInlineBytes(char c) {
// base implementation: no inline bytes for any char; the index writers only
// emit inline codes when this result is non-null
return null;
}

Expand Down Expand Up @@ -176,7 +183,7 @@ public Type getType() {
}

@Override
public byte[] getInlineBytes() {
public byte[] getInlineBytes(char c) {
// the char argument is not used here: this handler returns its _bytes field as-is
return _bytes;
}
}
Expand All @@ -199,7 +206,7 @@ public Type getType() {
}

@Override
public byte[] getInlineBytes() {
public byte[] getInlineBytes(char c) {
// the char argument is not used here: this handler returns its _bytes field as-is
return _bytes;
}

Expand Down Expand Up @@ -272,7 +279,7 @@ public Type getType() {
}

@Override
public byte[] getInlineBytes() {
public byte[] getInlineBytes(char c) {
// the char argument is not used here: this handler returns its _bytes field as-is
return _bytes;
}

Expand Down Expand Up @@ -303,7 +310,7 @@ public Type getType() {
}

@Override
public byte[] getInlineBytes() {
public byte[] getInlineBytes(char c) {
// the char argument is not used here: this handler returns its _bytes field as-is
return _bytes;
}

Expand All @@ -321,21 +328,67 @@ public Type getType() {
}
};

/**
* alternate shared CharHandler instance for "surrogate" chars (which we do not handle)
*/
static final CharHandler SURROGATE_CHAR_HANDLER = new CharHandler() {
/** the surrogate char buffers are computed on the fly. Re-use a buffer for those. */
private static final ThreadLocal<byte[]> SURROGATE_CHAR_BUF = ThreadLocal.withInitial(() -> new byte[2]);
private static final byte[] SURROGATE_EXTRA_BYTES = {0x3f};

private abstract static class SurrogateCharHandler extends CharHandler {
@Override
public Type getType() {
return Type.IGNORED;
return Type.SURROGATE;
}

@Override
public byte[] getInlineBytes() {
throw new IllegalStateException(
"Surrogate pair chars are not handled");
public byte[] getExtraBytes() {
// every surrogate handler returns the same shared SURROGATE_EXTRA_BYTES array ({0x3f})
return SURROGATE_EXTRA_BYTES;
}
};

/**
 * Splits the low 16 bits of the given index code into two bytes, most
 * significant byte first.
 * <p>
 * The result is the calling thread's buffer from {@code SURROGATE_CHAR_BUF},
 * so its contents are overwritten by the next call on the same thread.
 *
 * @param _idxC the computed index code
 * @return the thread's shared 2-byte buffer holding the code
 */
protected static byte[] toInlineBytes(int _idxC) {
    final byte[] buf = SURROGATE_CHAR_BUF.get();
    // a narrowing cast keeps only the low 8 bits, so no explicit mask is needed
    buf[1] = (byte) _idxC;
    buf[0] = (byte) (_idxC >>> 8);
    return buf;
}
}

/**
 * Shared CharHandler instance for "high surrogate" chars. The inline bytes
 * are computed from the char value on each call.
 */
static final CharHandler HIGH_SURROGATE_CHAR_HANDLER = new SurrogateCharHandler() {
    @Override
    public byte[] getInlineBytes(char c) {
        // the high surrogate bytes seem to be computed from a fixed offset
        return toInlineBytes(asUnsignedChar(c) - 10238);
    }
};

/**
 * Shared CharHandler instance for "low surrogate" chars. The inline bytes
 * are computed from the char value on each call.
 */
static final CharHandler LOW_SURROGATE_CHAR_HANDLER = new SurrogateCharHandler() {
    @Override
    public byte[] getInlineBytes(char c) {
        // the low surrogate bytes are computed with a specific offset based on
        // the char's location in a 1024 character block
        final int codeUnit = asUnsignedChar(c);
        final int posInBlock = (codeUnit - 0xdc00) % 1024;
        return toInlineBytes(codeUnit - indexOffsetFor(posInBlock));
    }

    /**
     * Picks the offset to subtract for a position in the 1024 char block:
     * the first 8 positions use 9992, then each following run of 254
     * positions uses a value 2 lower, ending at 9984.
     */
    private int indexOffsetFor(int posInBlock) {
        if (posInBlock < 8) {
            return 9992;
        }
        if (posInBlock < (8 + 254)) {
            return 9990;
        }
        if (posInBlock < (8 + 254 + 254)) {
            return 9988;
        }
        if (posInBlock < (8 + 254 + 254 + 254)) {
            return 9986;
        }
        return 9984;
    }
};

static final char FIRST_CHAR = (char) 0x0000;
static final char LAST_CHAR = (char) 0x00FF;
Expand All @@ -356,8 +409,7 @@ private static final class ExtCodes {
private static final CharHandler[] VALUES = loadCodes(EXT_CODES_FILE, FIRST_EXT_CHAR, LAST_EXT_CHAR);
}

static final GeneralLegacyIndexCodes GEN_LEG_INSTANCE =
new GeneralLegacyIndexCodes();
static final GeneralLegacyIndexCodes GEN_LEG_INSTANCE = new GeneralLegacyIndexCodes();

GeneralLegacyIndexCodes() {
}
Expand Down Expand Up @@ -394,9 +446,12 @@ static CharHandler[] loadCodes(String codesFilePath, char firstChar, char lastCh
for (int i = start; i <= end; ++i) {
char c = (char) i;
CharHandler ch = null;
if (Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
if (Character.isHighSurrogate(c)) {
// surrogate chars are not included in the codes files
ch = HIGH_SURROGATE_CHAR_HANDLER;
} else if (Character.isLowSurrogate(c)) {
// surrogate chars are not included in the codes files
ch = SURROGATE_CHAR_HANDLER;
ch = LOW_SURROGATE_CHAR_HANDLER;
} else {
String codeLine = reader.readLine();
ch = parseCodes(prefixMap, codeLine);
Expand Down Expand Up @@ -542,7 +597,7 @@ void writeNonNullIndexTextValue(
CharHandler ch = getCharHandler(c);

int curCharOffset = charOffset;
byte[] bytes = ch.getInlineBytes();
byte[] bytes = ch.getInlineBytes(c);
if (bytes != null) {
// write the "inline" codes immediately
bout.write(bytes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

package io.github.spannm.jackcess.impl;

import static io.github.spannm.jackcess.test.Basename.EMOTICONS;
import static io.github.spannm.jackcess.test.Basename.INDEX_CODES;

import io.github.spannm.jackcess.*;
Expand Down Expand Up @@ -56,7 +57,7 @@ public class IndexCodesTest extends AbstractBaseTest {
}};

@ParameterizedTest(name = "[{index}] {0}")
@TestDbReadOnlySource(INDEX_CODES)
@TestDbReadOnlySource({INDEX_CODES, EMOTICONS})
void testIndexCodes(TestDb testDb) throws Exception {
try (Database db = testDb.openMem()) {
db.setDateTimeType(DateTimeType.DATE);
Expand Down
1 change: 1 addition & 0 deletions src/test/java/io/github/spannm/jackcess/test/Basename.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ public enum Basename {
COMP_INDEX,
DEL,
DEL_COL,
EMOTICONS,
EXT_DATE,
FIXED_NUMERIC,
FIXED_TEXT,
Expand Down
16 changes: 12 additions & 4 deletions src/test/java/io/github/spannm/jackcess/test/TestUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -249,17 +249,25 @@ private static Map<String, Object> massageRow(Map<String, Object> row) throws IO
return row;
}

static void dumpIndex(Index index) throws IOException {
dumpIndex(index, new PrintWriter(System.out, true));
}
/**
 * Dumps the given index to {@code System.out}, passing
 * {@code Integer.MAX_VALUE} as the entry limit.
 *
 * @param index the index to dump
 * @throws IOException declared by the writer-based overload this delegates to
 */
static void dumpIndex(Index index) throws IOException {
    dumpIndex(index, Integer.MAX_VALUE);
}

/**
 * Dumps the given index to {@code System.out} through an auto-flushing
 * {@link PrintWriter}.
 *
 * @param index the index to dump
 * @param limit entry limit handed to the writer-based overload
 * @throws IOException declared by the writer-based overload this delegates to
 */
static void dumpIndex(Index index, int limit) throws IOException {
    // only IOException can propagate from the delegate, so declare that
    // instead of the overly broad Exception
    dumpIndex(index, new PrintWriter(System.out, true), limit);
}

static void dumpIndex(Index index, PrintWriter writer) throws IOException {
/**
 * Writes an {@code "INDEX: ..."} header line followed by the index entries,
 * one per line, to the given writer.
 * <p>
 * Output stops when the cursor returns its last-entry sentinel or once
 * {@code limit} entries have been written, whichever comes first.
 *
 * @param index  the index to dump; must be an {@code IndexImpl}
 * @param writer destination for the dump
 * @param limit  maximum number of entries to write; values {@code <= 0} write
 *               only the header
 * @throws IOException declared for reading the index entries
 */
static void dumpIndex(Index index, PrintWriter writer, int limit) throws IOException {
    writer.println("INDEX: " + index);
    IndexData.EntryCursor ec = ((IndexImpl) index).cursor();
    IndexData.Entry lastE = ec.getLastEntry();
    IndexData.Entry e;
    int count = 0;
    // test the limit before fetching, so exactly `limit` entries are printed;
    // the previous post-print "count++ > limit" check let limit + 2 through
    while (count < limit && (e = ec.getNextEntry()) != lastE) {
        writer.println(e);
        count++;
    }
}

Expand Down
Binary file not shown.

0 comments on commit 0d0407c

Please sign in to comment.