From 217403e7e6b0085991a2b0d8cbb18048a4420a99 Mon Sep 17 00:00:00 2001
From: Milan Straka
Date: Wed, 15 Nov 2023 16:17:27 +0100
Subject: [PATCH] Fix a bug causing a sentence not to end on empty line... if
 the previous line ended with whitespace.

---
 CHANGES                             |   4 ++
 src/tokenizer/czech_tokenizer.cpp   |  80 +++++++++++-----------
 src/tokenizer/czech_tokenizer.rl    |   2 +-
 src/tokenizer/english_tokenizer.cpp | 101 ++++++++++++++--------------
 src/tokenizer/english_tokenizer.rl  |   2 +-
 src/tokenizer/generic_tokenizer.cpp |  79 +++++++++++-----------
 src/tokenizer/generic_tokenizer.rl  |   2 +-
 7 files changed, 133 insertions(+), 137 deletions(-)

diff --git a/CHANGES b/CHANGES
index 13f3344..fcfd697 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,5 +1,9 @@
 Version 1.11.2-dev
 ------------------
+- Fix a bug in the tokenizers, which did not end a sentence on an
+  empty line if the preceding line ended with whitespace. However,
+  only the REST server and the API were affected; the `run_*` binaries
+  read input a paragraph at a time, and so circumvent the problem.
 
 
 Version 1.11.1 [18 Feb 23]
diff --git a/src/tokenizer/czech_tokenizer.cpp b/src/tokenizer/czech_tokenizer.cpp
index acf7125..19d8f7f 100644
--- a/src/tokenizer/czech_tokenizer.cpp
+++ b/src/tokenizer/czech_tokenizer.cpp
@@ -17,13 +17,13 @@ namespace morphodita {
 static const char _czech_tokenizer_cond_offsets[] = {
  0, 0, 0, 0, 0, 0, 0, 0,
  2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2
+ 2, 2, 2, 2
 };
 
 static const char _czech_tokenizer_cond_lengths[] = {
  0, 0, 0, 0, 0, 0, 0, 2,
  0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0
+ 0, 0, 0, 0
 };
 
 static const short _czech_tokenizer_cond_keys[] = {
@@ -36,8 +36,8 @@ static const char _czech_tokenizer_cond_spaces[] = {
 
 static const unsigned char _czech_tokenizer_key_offsets[] = {
  0, 0, 17, 29, 43, 46, 51, 54,
- 89, 94, 98, 101, 105, 110, 111, 116,
- 117, 122, 136, 143, 148, 151, 163
+ 89, 94, 99, 100, 105, 106, 111, 125,
+ 132, 137, 140, 152
 };
 
 static const short _czech_tokenizer_trans_keys[] = {
@@ -52,34 +52,33 @@ static const short _czech_tokenizer_trans_keys[] = {
  131u, 135u, 142u, 147u, 157u, 159u, 160u, 301u,
  557u, 811u, 1067u, 0u, 42u, 48u, 57u, 58u,
  64u, 65u, 90u, 91u, 96u, 97u, 122u, 123u,
- 255u, 9u, 10u, 13u, 32u, 147u, 9u, 13u,
- 32u, 147u, 9u, 32u, 147u, 9u, 10u, 32u,
- 147u, 9u, 10u, 13u, 32u, 147u, 13u, 9u,
- 10u, 13u, 32u, 147u, 10u, 9u, 10u, 13u,
- 32u, 147u, 13u, 32u, 34u, 39u, 41u, 59u,
- 93u, 125u, 139u, 141u, 147u, 161u, 9u, 10u,
- 44u, 46u, 69u, 101u, 159u, 48u, 57u, 69u,
- 101u, 159u, 48u, 57u, 159u, 48u, 57u, 129u,
- 131u, 135u, 151u, 155u, 157u, 65u, 90u, 97u,
- 122u, 142u, 143u, 159u, 48u, 57u, 0
+ 255u, 9u, 10u, 13u, 32u, 147u, 9u, 10u,
+ 13u, 32u, 147u, 13u, 9u, 10u, 13u, 32u,
+ 147u, 10u, 9u, 10u, 13u, 32u, 147u, 13u,
+ 32u, 34u, 39u, 41u, 59u, 93u, 125u, 139u,
+ 141u, 147u, 161u, 9u, 10u, 44u, 46u, 69u,
+ 101u, 159u, 48u, 57u, 69u, 101u, 159u, 48u,
+ 57u, 159u, 48u, 57u, 129u, 131u, 135u, 151u,
+ 155u, 157u, 65u, 90u, 97u, 122u, 142u, 143u,
+ 159u, 48u, 57u, 0
 };
 
 static const char _czech_tokenizer_single_lengths[] = {
  0, 13, 10, 12, 1, 3, 1, 21,
- 5, 4, 3, 4, 5, 1, 5, 1,
- 5, 12, 5, 3, 1, 6, 1
+ 5, 5, 1, 5, 1, 5, 12, 5,
+ 3, 1, 6, 1
 };
 
 static const char _czech_tokenizer_range_lengths[] = {
  0, 2, 1, 1, 1, 1, 1, 7,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 1, 1, 1, 1, 3, 1
+ 0, 0, 0, 0, 0, 0, 1, 1,
+ 1, 1, 3, 1
 };
 
 static const unsigned char _czech_tokenizer_index_offsets[] = {
  0, 0, 16, 28, 42, 45, 50, 53,
- 82, 88, 93, 97, 102, 108, 110, 116,
- 118, 124, 138, 145, 150, 153, 163
+ 82, 88, 94, 96, 102, 
104, 110, 124, + 131, 136, 139, 149 }; static const char _czech_tokenizer_indicies[] = { @@ -93,49 +92,48 @@ static const char _czech_tokenizer_indicies[] = { 10, 13, 9, 13, 9, 13, 16, 16, 16, 16, 10, 16, 15, 13, 9, 17, 9, 17, 9, 15, 9, 16, 9, 16, - 9, 14, 10, 19, 20, 10, 10, 18, - 10, 21, 10, 10, 18, 10, 10, 10, - 18, 10, 21, 10, 10, 18, 10, 22, - 23, 10, 10, 18, 25, 24, 10, 22, - 26, 10, 10, 18, 25, 24, 10, 23, - 26, 10, 10, 18, 4, 4, 5, 5, - 5, 5, 5, 5, 5, 5, 4, 5, - 4, 27, 28, 28, 29, 29, 15, 15, - 27, 29, 29, 6, 6, 27, 8, 8, - 27, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 27, 15, 15, 27, 0 + 9, 14, 10, 11, 12, 10, 10, 18, + 10, 19, 20, 10, 10, 18, 22, 21, + 10, 19, 23, 10, 10, 18, 22, 21, + 10, 20, 23, 10, 10, 18, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, + 4, 5, 4, 24, 25, 25, 26, 26, + 15, 15, 24, 26, 26, 6, 6, 24, + 8, 8, 24, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 24, 15, 15, 24, + 0 }; static const char _czech_tokenizer_trans_targs[] = { - 7, 1, 2, 7, 1, 3, 19, 6, - 20, 7, 8, 12, 16, 17, 0, 18, - 21, 22, 7, 9, 11, 10, 13, 14, - 7, 7, 15, 7, 4, 5 + 7, 1, 2, 7, 1, 3, 16, 6, + 17, 7, 8, 9, 13, 14, 0, 15, + 18, 19, 7, 10, 11, 7, 7, 12, + 7, 4, 5 }; static const char _czech_tokenizer_trans_actions[] = { 1, 0, 0, 2, 3, 0, 4, 0, 0, 7, 0, 0, 0, 4, 0, 4, - 0, 0, 8, 0, 0, 0, 0, 0, - 9, 10, 0, 11, 0, 0 + 0, 0, 8, 0, 0, 9, 10, 0, + 11, 0, 0 }; static const char _czech_tokenizer_to_state_actions[] = { 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0 + 0, 0, 0, 0 }; static const char _czech_tokenizer_from_state_actions[] = { 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0 + 0, 0, 0, 0 }; static const unsigned char _czech_tokenizer_eof_trans[] = { 0, 1, 1, 1, 1, 1, 1, 0, - 19, 19, 19, 19, 19, 25, 19, 25, - 19, 28, 28, 28, 28, 28, 28 + 19, 19, 22, 19, 22, 19, 25, 25, + 25, 25, 25, 25 }; static const int czech_tokenizer_start = 7; diff --git a/src/tokenizer/czech_tokenizer.rl b/src/tokenizer/czech_tokenizer.rl index dd8e865..d443d77 100644 --- a/src/tokenizer/czech_tokenizer.rl +++ b/src/tokenizer/czech_tokenizer.rl @@ -152,7 +152,7 @@ bool czech_tokenizer::next_sentence(vector& tokens) { fhold; }; - eol eol + (whitespace* eol -- eol eol) eol { if (!tokens.empty()) fbreak; current = te; diff --git a/src/tokenizer/english_tokenizer.cpp b/src/tokenizer/english_tokenizer.cpp index 328a28e..31a70c1 100644 --- a/src/tokenizer/english_tokenizer.cpp +++ b/src/tokenizer/english_tokenizer.cpp @@ -234,14 +234,14 @@ static const char _english_tokenizer_cond_offsets[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2 + 2, 2 }; static const char _english_tokenizer_cond_lengths[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0 + 0, 0 }; static const short _english_tokenizer_cond_keys[] = { @@ -254,9 +254,9 @@ static const char _english_tokenizer_cond_spaces[] = { static const unsigned char _english_tokenizer_key_offsets[] = { 0, 0, 17, 29, 43, 46, 49, 52, - 55, 60, 63, 98, 103, 107, 110, 114, - 119, 120, 125, 126, 131, 145, 152, 156, - 161, 164, 179, 192, 206 + 55, 60, 63, 98, 103, 108, 109, 114, + 115, 120, 134, 141, 145, 150, 153, 168, + 181, 195 }; static const short _english_tokenizer_trans_keys[] = { @@ -273,41 +273,39 @@ static const short _english_tokenizer_trans_keys[] = { 301u, 557u, 811u, 1067u, 0u, 42u, 48u, 57u, 58u, 64u, 65u, 90u, 91u, 96u, 97u, 122u, 123u, 255u, 9u, 10u, 13u, 32u, 147u, 9u, - 13u, 32u, 147u, 9u, 32u, 147u, 9u, 10u, - 32u, 147u, 9u, 10u, 13u, 32u, 147u, 13u, - 9u, 10u, 
13u, 32u, 147u, 10u, 9u, 10u, - 13u, 32u, 147u, 13u, 32u, 34u, 39u, 41u, - 59u, 93u, 125u, 139u, 141u, 147u, 161u, 9u, - 10u, 44u, 46u, 69u, 101u, 159u, 48u, 57u, - 44u, 46u, 69u, 101u, 69u, 101u, 159u, 48u, - 57u, 159u, 48u, 57u, 39u, 45u, 129u, 131u, + 10u, 13u, 32u, 147u, 13u, 9u, 10u, 13u, + 32u, 147u, 10u, 9u, 10u, 13u, 32u, 147u, + 13u, 32u, 34u, 39u, 41u, 59u, 93u, 125u, + 139u, 141u, 147u, 161u, 9u, 10u, 44u, 46u, + 69u, 101u, 159u, 48u, 57u, 44u, 46u, 69u, + 101u, 69u, 101u, 159u, 48u, 57u, 159u, 48u, + 57u, 39u, 45u, 129u, 131u, 135u, 151u, 155u, + 157u, 161u, 65u, 90u, 97u, 122u, 142u, 143u, + 45u, 129u, 131u, 135u, 151u, 155u, 157u, 65u, + 90u, 97u, 122u, 142u, 143u, 39u, 129u, 131u, 135u, 151u, 155u, 157u, 161u, 65u, 90u, 97u, - 122u, 142u, 143u, 45u, 129u, 131u, 135u, 151u, - 155u, 157u, 65u, 90u, 97u, 122u, 142u, 143u, - 39u, 129u, 131u, 135u, 151u, 155u, 157u, 161u, - 65u, 90u, 97u, 122u, 142u, 143u, 159u, 48u, - 57u, 0 + 122u, 142u, 143u, 159u, 48u, 57u, 0 }; static const char _english_tokenizer_single_lengths[] = { 0, 13, 10, 12, 1, 1, 1, 1, - 3, 1, 21, 5, 4, 3, 4, 5, - 1, 5, 1, 5, 12, 5, 4, 3, - 1, 9, 7, 8, 1 + 3, 1, 21, 5, 5, 1, 5, 1, + 5, 12, 5, 4, 3, 1, 9, 7, + 8, 1 }; static const char _english_tokenizer_range_lengths[] = { 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 7, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 1, 0, 1, - 1, 3, 3, 3, 1 + 0, 1, 1, 0, 1, 1, 3, 3, + 3, 1 }; static const unsigned char _english_tokenizer_index_offsets[] = { 0, 0, 16, 28, 42, 45, 48, 51, - 54, 59, 62, 91, 97, 102, 106, 111, - 117, 119, 125, 127, 133, 147, 154, 159, - 164, 167, 180, 191, 203 + 54, 59, 62, 91, 97, 103, 105, 111, + 113, 119, 133, 140, 145, 150, 153, 166, + 177, 189 }; static const char _english_tokenizer_indicies[] = { @@ -322,58 +320,57 @@ static const char _english_tokenizer_indicies[] = { 15, 13, 16, 12, 16, 12, 16, 19, 19, 19, 19, 13, 19, 18, 16, 12, 20, 12, 20, 12, 18, 12, 19, 12, - 19, 12, 17, 13, 22, 23, 13, 13, - 21, 13, 24, 13, 13, 21, 13, 13, - 13, 21, 13, 24, 13, 13, 21, 13, - 25, 26, 13, 13, 21, 28, 27, 13, - 25, 29, 13, 13, 21, 28, 27, 13, - 26, 29, 13, 13, 21, 4, 4, 5, - 5, 5, 5, 5, 5, 5, 5, 4, - 5, 4, 30, 31, 32, 33, 33, 18, - 18, 30, 31, 32, 33, 33, 30, 33, - 33, 9, 9, 30, 11, 11, 30, 34, - 35, 19, 19, 19, 19, 19, 19, 34, - 19, 19, 19, 30, 35, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 30, 34, - 19, 19, 19, 19, 19, 19, 34, 19, - 19, 19, 30, 18, 18, 30, 0 + 19, 12, 17, 13, 14, 15, 13, 13, + 21, 13, 22, 23, 13, 13, 21, 25, + 24, 13, 22, 26, 13, 13, 21, 25, + 24, 13, 23, 26, 13, 13, 21, 4, + 4, 5, 5, 5, 5, 5, 5, 5, + 5, 4, 5, 4, 27, 28, 29, 30, + 30, 18, 18, 27, 28, 29, 30, 30, + 27, 30, 30, 9, 9, 27, 11, 11, + 27, 31, 32, 19, 19, 19, 19, 19, + 19, 31, 19, 19, 19, 27, 32, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 27, 31, 19, 19, 19, 19, 19, 19, + 31, 19, 19, 19, 27, 18, 18, 27, + 0 }; static const char _english_tokenizer_trans_targs[] = { 10, 1, 2, 10, 1, 3, 5, 6, - 22, 23, 9, 24, 10, 11, 15, 19, - 20, 0, 21, 25, 28, 10, 12, 14, - 13, 16, 17, 10, 10, 18, 10, 4, - 7, 8, 26, 27 + 19, 20, 9, 21, 10, 11, 12, 16, + 17, 0, 18, 22, 25, 10, 13, 14, + 10, 10, 15, 10, 4, 7, 8, 23, + 24 }; static const char _english_tokenizer_trans_actions[] = { 1, 0, 0, 2, 3, 0, 0, 0, 4, 4, 0, 0, 7, 0, 0, 0, 4, 0, 4, 0, 0, 8, 0, 0, - 0, 0, 0, 9, 10, 0, 11, 0, - 0, 0, 0, 0 + 9, 10, 0, 11, 0, 0, 0, 0, + 0 }; static const char _english_tokenizer_to_state_actions[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0 + 0, 0 }; static const char _english_tokenizer_from_state_actions[] = 
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0 + 0, 0 }; static const unsigned char _english_tokenizer_eof_trans[] = { 0, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 0, 22, 22, 22, 22, 22, - 28, 22, 28, 22, 31, 31, 31, 31, - 31, 31, 31, 31, 31 + 1, 1, 0, 22, 22, 25, 22, 25, + 22, 28, 28, 28, 28, 28, 28, 28, + 28, 28 }; static const int english_tokenizer_start = 10; diff --git a/src/tokenizer/english_tokenizer.rl b/src/tokenizer/english_tokenizer.rl index f06da6d..f92ad17 100644 --- a/src/tokenizer/english_tokenizer.rl +++ b/src/tokenizer/english_tokenizer.rl @@ -137,7 +137,7 @@ bool english_tokenizer::next_sentence(vector& tokens) { fhold; }; - eol eol + (whitespace* eol -- eol eol) eol { if (!tokens.empty()) fbreak; current = te; diff --git a/src/tokenizer/generic_tokenizer.cpp b/src/tokenizer/generic_tokenizer.cpp index 086bfdf..af25bf1 100644 --- a/src/tokenizer/generic_tokenizer.cpp +++ b/src/tokenizer/generic_tokenizer.cpp @@ -17,13 +17,13 @@ namespace morphodita { static const char _generic_tokenizer_cond_offsets[] = { 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2 + 2, 2, 2, 2 }; static const char _generic_tokenizer_cond_lengths[] = { 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0 + 0, 0, 0, 0 }; static const short _generic_tokenizer_cond_keys[] = { @@ -36,8 +36,8 @@ static const char _generic_tokenizer_cond_spaces[] = { static const unsigned char _generic_tokenizer_key_offsets[] = { 0, 0, 17, 29, 43, 46, 51, 54, - 89, 94, 98, 101, 105, 110, 111, 116, - 117, 122, 136, 142, 147, 150, 162 + 89, 94, 99, 100, 105, 106, 111, 125, + 131, 136, 139, 151 }; static const short _generic_tokenizer_trans_keys[] = { @@ -52,34 +52,33 @@ static const short _generic_tokenizer_trans_keys[] = { 131u, 135u, 142u, 147u, 157u, 159u, 160u, 301u, 557u, 811u, 1067u, 0u, 42u, 48u, 57u, 58u, 64u, 65u, 90u, 91u, 96u, 97u, 122u, 123u, - 255u, 9u, 10u, 13u, 32u, 147u, 9u, 13u, - 32u, 147u, 9u, 32u, 147u, 9u, 10u, 32u, - 147u, 9u, 10u, 13u, 32u, 147u, 13u, 9u, - 10u, 13u, 32u, 147u, 10u, 9u, 10u, 13u, - 32u, 147u, 13u, 32u, 34u, 39u, 41u, 59u, - 93u, 125u, 139u, 141u, 147u, 161u, 9u, 10u, - 46u, 69u, 101u, 159u, 48u, 57u, 69u, 101u, - 159u, 48u, 57u, 159u, 48u, 57u, 129u, 131u, - 135u, 151u, 155u, 157u, 65u, 90u, 97u, 122u, - 142u, 143u, 159u, 48u, 57u, 0 + 255u, 9u, 10u, 13u, 32u, 147u, 9u, 10u, + 13u, 32u, 147u, 13u, 9u, 10u, 13u, 32u, + 147u, 10u, 9u, 10u, 13u, 32u, 147u, 13u, + 32u, 34u, 39u, 41u, 59u, 93u, 125u, 139u, + 141u, 147u, 161u, 9u, 10u, 46u, 69u, 101u, + 159u, 48u, 57u, 69u, 101u, 159u, 48u, 57u, + 159u, 48u, 57u, 129u, 131u, 135u, 151u, 155u, + 157u, 65u, 90u, 97u, 122u, 142u, 143u, 159u, + 48u, 57u, 0 }; static const char _generic_tokenizer_single_lengths[] = { 0, 13, 10, 12, 1, 3, 1, 21, - 5, 4, 3, 4, 5, 1, 5, 1, - 5, 12, 4, 3, 1, 6, 1 + 5, 5, 1, 5, 1, 5, 12, 4, + 3, 1, 6, 1 }; static const char _generic_tokenizer_range_lengths[] = { 0, 2, 1, 1, 1, 1, 1, 7, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 3, 1 + 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 3, 1 }; static const unsigned char _generic_tokenizer_index_offsets[] = { 0, 0, 16, 28, 42, 45, 50, 53, - 82, 88, 93, 97, 102, 108, 110, 116, - 118, 124, 138, 144, 149, 152, 162 + 82, 88, 94, 96, 102, 104, 110, 124, + 130, 135, 138, 148 }; static const char _generic_tokenizer_indicies[] = { @@ -93,49 +92,47 @@ static const char _generic_tokenizer_indicies[] = { 10, 13, 9, 13, 9, 13, 16, 16, 16, 16, 10, 16, 15, 13, 9, 17, 9, 17, 9, 15, 9, 16, 9, 16, - 9, 14, 10, 19, 20, 10, 10, 18, - 10, 
21, 10, 10, 18, 10, 10, 10, - 18, 10, 21, 10, 10, 18, 10, 22, - 23, 10, 10, 18, 25, 24, 10, 22, - 26, 10, 10, 18, 25, 24, 10, 23, - 26, 10, 10, 18, 4, 4, 5, 5, - 5, 5, 5, 5, 5, 5, 4, 5, - 4, 27, 28, 29, 29, 15, 15, 27, - 29, 29, 6, 6, 27, 8, 8, 27, - 16, 16, 16, 16, 16, 16, 16, 16, - 16, 27, 15, 15, 27, 0 + 9, 14, 10, 11, 12, 10, 10, 18, + 10, 19, 20, 10, 10, 18, 22, 21, + 10, 19, 23, 10, 10, 18, 22, 21, + 10, 20, 23, 10, 10, 18, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, + 4, 5, 4, 24, 25, 26, 26, 15, + 15, 24, 26, 26, 6, 6, 24, 8, + 8, 24, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 24, 15, 15, 24, 0 }; static const char _generic_tokenizer_trans_targs[] = { - 7, 1, 2, 7, 1, 3, 19, 6, - 20, 7, 8, 12, 16, 17, 0, 18, - 21, 22, 7, 9, 11, 10, 13, 14, - 7, 7, 15, 7, 4, 5 + 7, 1, 2, 7, 1, 3, 16, 6, + 17, 7, 8, 9, 13, 14, 0, 15, + 18, 19, 7, 10, 11, 7, 7, 12, + 7, 4, 5 }; static const char _generic_tokenizer_trans_actions[] = { 1, 0, 0, 2, 3, 0, 4, 0, 0, 7, 0, 0, 0, 4, 0, 4, - 0, 0, 8, 0, 0, 0, 0, 0, - 9, 10, 0, 11, 0, 0 + 0, 0, 8, 0, 0, 9, 10, 0, + 11, 0, 0 }; static const char _generic_tokenizer_to_state_actions[] = { 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0 + 0, 0, 0, 0 }; static const char _generic_tokenizer_from_state_actions[] = { 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0 + 0, 0, 0, 0 }; static const unsigned char _generic_tokenizer_eof_trans[] = { 0, 1, 1, 1, 1, 1, 1, 0, - 19, 19, 19, 19, 19, 25, 19, 25, - 19, 28, 28, 28, 28, 28, 28 + 19, 19, 22, 19, 22, 19, 25, 25, + 25, 25, 25, 25 }; static const int generic_tokenizer_start = 7; diff --git a/src/tokenizer/generic_tokenizer.rl b/src/tokenizer/generic_tokenizer.rl index c798689..68466e1 100644 --- a/src/tokenizer/generic_tokenizer.rl +++ b/src/tokenizer/generic_tokenizer.rl @@ -75,7 +75,7 @@ bool generic_tokenizer::next_sentence(vector& tokens) { fhold; }; - eol eol + (whitespace* eol -- eol eol) eol { if (!tokens.empty()) fbreak; current = te;