From 433a00c0ee70ee285987f7ee9125be791bb22b86 Mon Sep 17 00:00:00 2001
From: Max Hniebergall <137079448+maxhniebergall@users.noreply.github.com>
Date: Tue, 26 Nov 2024 18:00:19 -0500
Subject: [PATCH] [ML] Fix for Deberta tokenizer when input sequence exceeds 512 tokens (#117595)

* Add test and fix

* Update docs/changelog/117595.yaml

* Remove test which wasn't working
---
 docs/changelog/117595.yaml                    |  5 +++
 .../nlp/tokenizers/NlpTokenizer.java          | 23 ++++++++++++++
 .../nlp/TextSimilarityProcessorTests.java     | 31 +++++++++++++++++++
 .../tokenizers/DebertaV2TokenizerTests.java   |  4 +--
 4 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100644 docs/changelog/117595.yaml

diff --git a/docs/changelog/117595.yaml b/docs/changelog/117595.yaml
new file mode 100644
index 000000000000..9360c372ac97
--- /dev/null
+++ b/docs/changelog/117595.yaml
@@ -0,0 +1,5 @@
+pr: 117595
+summary: Fix for Deberta tokenizer when input sequence exceeds 512 tokens
+area: Machine Learning
+type: bug
+issues: []
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/NlpTokenizer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/NlpTokenizer.java
index 0b4a5b651d8d..930dbee30479 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/NlpTokenizer.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/NlpTokenizer.java
@@ -331,6 +331,29 @@ public List tokenize(String seq1, String seq2, Tokeni
                     tokenIdsSeq2 = tokenIdsSeq2.subList(0, maxSequenceLength() - extraTokens - tokenIdsSeq1.size());
                     tokenPositionMapSeq2 = tokenPositionMapSeq2.subList(0, maxSequenceLength() - extraTokens - tokenIdsSeq1.size());
                 }
+                case BALANCED -> {
+                    isTruncated = true;
+                    int firstSequenceLength = 0;
+
+                    if (tokenIdsSeq2.size() > (maxSequenceLength() - getNumExtraTokensForSeqPair()) / 2) {
+                        firstSequenceLength = min(tokenIdsSeq1.size(), (maxSequenceLength() - getNumExtraTokensForSeqPair()) / 2);
+                    } else {
+                        firstSequenceLength = min(
+                            tokenIdsSeq1.size(),
+                            maxSequenceLength() - tokenIdsSeq2.size() - getNumExtraTokensForSeqPair()
+                        );
+                    }
+                    int secondSequenceLength = min(
+                        tokenIdsSeq2.size(),
+                        maxSequenceLength() - firstSequenceLength - getNumExtraTokensForSeqPair()
+                    );
+
+                    tokenIdsSeq1 = tokenIdsSeq1.subList(0, firstSequenceLength);
+                    tokenPositionMapSeq1 = tokenPositionMapSeq1.subList(0, firstSequenceLength);
+
+                    tokenIdsSeq2 = tokenIdsSeq2.subList(0, secondSequenceLength);
+                    tokenPositionMapSeq2 = tokenPositionMapSeq2.subList(0, secondSequenceLength);
+                }
                 case NONE -> throw ExceptionsHelper.badRequestException(
                     "Input too large. The tokenized input length [{}] exceeds the maximum sequence length [{}]",
                     numTokens,
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/TextSimilarityProcessorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/TextSimilarityProcessorTests.java
index 3590793b81ab..7460e17055a0 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/TextSimilarityProcessorTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/TextSimilarityProcessorTests.java
@@ -10,11 +10,13 @@
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.xpack.core.ml.inference.results.TextSimilarityInferenceResults;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.BertTokenization;
+import org.elasticsearch.xpack.core.ml.inference.trainedmodel.DebertaV2Tokenization;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.TextSimilarityConfig;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.VocabularyConfig;
 import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.BertTokenizationResult;
 import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.BertTokenizer;
+import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.DebertaV2Tokenizer;
 import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.TokenizationResult;
 import org.elasticsearch.xpack.ml.inference.pytorch.results.PyTorchInferenceResult;
 
@@ -22,6 +24,8 @@
 import java.util.List;
 
 import static org.elasticsearch.xpack.ml.inference.nlp.tokenizers.BertTokenizerTests.TEST_CASED_VOCAB;
+import static org.elasticsearch.xpack.ml.inference.nlp.tokenizers.DebertaV2TokenizerTests.TEST_CASE_SCORES;
+import static org.elasticsearch.xpack.ml.inference.nlp.tokenizers.DebertaV2TokenizerTests.TEST_CASE_VOCAB;
 import static org.hamcrest.Matchers.closeTo;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.is;
@@ -62,6 +66,33 @@ public void testProcessor() throws IOException {
         assertThat(result.predictedValue(), closeTo(42, 1e-6));
     }
 
+    public void testBalancedTruncationWithLongInput() throws IOException {
+        String question = "Is Elasticsearch scalable?";
+        StringBuilder longInputBuilder = new StringBuilder();
+        for (int i = 0; i < 1000; i++) {
+            longInputBuilder.append(TEST_CASE_VOCAB.get(randomIntBetween(0, TEST_CASE_VOCAB.size() - 1))).append(i).append(" ");
+        }
+        String longInput = longInputBuilder.toString().trim();
+
+        DebertaV2Tokenization tokenization = new DebertaV2Tokenization(false, true, null, Tokenization.Truncate.BALANCED, -1);
+        DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(TEST_CASE_VOCAB, TEST_CASE_SCORES, tokenization).build();
+        TextSimilarityConfig textSimilarityConfig = new TextSimilarityConfig(
+            question,
+            new VocabularyConfig(""),
+            tokenization,
+            "result",
+            TextSimilarityConfig.SpanScoreFunction.MAX
+        );
+        TextSimilarityProcessor processor = new TextSimilarityProcessor(tokenizer);
+        TokenizationResult tokenizationResult = processor.getRequestBuilder(textSimilarityConfig)
+            .buildRequest(List.of(longInput), "1", Tokenization.Truncate.BALANCED, -1, null)
+            .tokenization();
+
+        // Assert that the tokenization result is as expected
+        assertThat(tokenizationResult.anyTruncated(), is(true));
+        assertThat(tokenizationResult.getTokenization(0).tokenIds().length, equalTo(512));
+    }
+
     public void testResultFunctions() {
         BertTokenization tokenization = new BertTokenization(false, true, 384, Tokenization.Truncate.NONE, 128);
         BertTokenizer tokenizer = BertTokenizer.builder(TEST_CASED_VOCAB, tokenization).build();
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java
index a8461de8630a..fc070ec25dc6 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java
@@ -23,7 +23,7 @@
 
 public class DebertaV2TokenizerTests extends ESTestCase {
 
-    private static final List TEST_CASE_VOCAB = List.of(
+    public static final List TEST_CASE_VOCAB = List.of(
         DebertaV2Tokenizer.CLASS_TOKEN,
         DebertaV2Tokenizer.PAD_TOKEN,
         DebertaV2Tokenizer.SEPARATOR_TOKEN,
@@ -48,7 +48,7 @@ public class DebertaV2TokenizerTests extends ESTestCase {
         "<0xAD>",
         "▁"
     );
-    private static final List TEST_CASE_SCORES = List.of(
+    public static final List TEST_CASE_SCORES = List.of(
         0.0,
         0.0,
         0.0,