From 69c2dbb000175d8492a5e7c0fd920368d421ac85 Mon Sep 17 00:00:00 2001 From: Nick Frasser <1693461+nfrasser@users.noreply.github.com> Date: Tue, 3 Dec 2024 23:30:04 -0500 Subject: [PATCH] Combine mixed word/number tokens Prevents some extensions like somefile.mp4 from getting interpreted as URLs --- packages/linkify-plugin-hashtag/src/hashtag.mjs | 6 +++++- packages/linkifyjs/src/scanner.mjs | 15 ++++++++++++++- packages/linkifyjs/src/text.mjs | 2 ++ test/spec/linkifyjs/parser.test.mjs | 1 + test/spec/linkifyjs/scanner.test.mjs | 12 ++++++------ 5 files changed, 28 insertions(+), 8 deletions(-) diff --git a/packages/linkify-plugin-hashtag/src/hashtag.mjs b/packages/linkify-plugin-hashtag/src/hashtag.mjs index 491c058..fa6428d 100644 --- a/packages/linkify-plugin-hashtag/src/hashtag.mjs +++ b/packages/linkify-plugin-hashtag/src/hashtag.mjs @@ -8,7 +8,7 @@ const HashtagToken = createTokenClass('hashtag', { isLink: true }); */ export default function hashtag({ scanner, parser }) { // Various tokens that may compose a hashtag - const { POUND, UNDERSCORE, FULLWIDTHMIDDLEDOT } = scanner.tokens; + const { POUND, UNDERSCORE, FULLWIDTHMIDDLEDOT, ASCIINUMERICAL, ALPHANUMERICAL } = scanner.tokens; const { alpha, numeric, alphanumeric, emoji } = scanner.tokens.groups; // Take or create a transition from start to the '#' sign (non-accepting) @@ -18,10 +18,14 @@ export default function hashtag({ scanner, parser }) { const HashPrefix = Hash.tt(UNDERSCORE); const Hashtag = new State(HashtagToken); + Hash.tt(ASCIINUMERICAL, Hashtag); + Hash.tt(ALPHANUMERICAL, Hashtag); Hash.ta(numeric, HashPrefix); Hash.ta(alpha, Hashtag); Hash.ta(emoji, Hashtag); Hash.ta(FULLWIDTHMIDDLEDOT, Hashtag); + HashPrefix.tt(ASCIINUMERICAL, Hashtag); + HashPrefix.tt(ALPHANUMERICAL, Hashtag); HashPrefix.ta(alpha, Hashtag); HashPrefix.ta(emoji, Hashtag); HashPrefix.ta(FULLWIDTHMIDDLEDOT, Hashtag); diff --git a/packages/linkifyjs/src/scanner.mjs b/packages/linkifyjs/src/scanner.mjs index 4ebc285..406df67 
100644 --- a/packages/linkifyjs/src/scanner.mjs +++ b/packages/linkifyjs/src/scanner.mjs @@ -99,15 +99,24 @@ export function init(customSchemes = []) { const Num = tr(Start, re.DIGIT, tk.NUM, { [fsm.numeric]: true }); tr(Num, re.DIGIT, Num); + const Asciinumeric = tr(Num, re.ASCII_LETTER, tk.ASCIINUMERICAL, { [fsm.asciinumeric]: true }); + const Alphanumeric = tr(Num, re.LETTER, tk.ALPHANUMERICAL, { [fsm.alphanumeric]: true }); // State which emits a word token const Word = tr(Start, re.ASCII_LETTER, tk.WORD, { [fsm.ascii]: true }); + tr(Word, re.DIGIT, Asciinumeric); tr(Word, re.ASCII_LETTER, Word); + tr(Asciinumeric, re.DIGIT, Asciinumeric); + tr(Asciinumeric, re.ASCII_LETTER, Asciinumeric); // Same as previous, but specific to non-fsm.ascii alphabet words const UWord = tr(Start, re.LETTER, tk.UWORD, { [fsm.alpha]: true }); tr(UWord, re.ASCII_LETTER); // Non-accepting + tr(UWord, re.DIGIT, Alphanumeric); tr(UWord, re.LETTER, UWord); + tr(Alphanumeric, re.DIGIT, Alphanumeric); + tr(Alphanumeric, re.ASCII_LETTER); // Non-accepting + tr(Alphanumeric, re.LETTER, Alphanumeric); // Accepting (loops on Alphanumeric) // Whitespace jumps // Tokens of only non-newline whitespace are arbitrarily long @@ -132,10 +141,14 @@ export function init(customSchemes = []) { // Generates states for top-level domains // Note that this is most accurate when tlds are in alphabetical order - const wordjr = [[re.ASCII_LETTER, Word]]; + const wordjr = [ + [re.ASCII_LETTER, Word], + [re.DIGIT, Asciinumeric], + ]; const uwordjr = [ [re.ASCII_LETTER, null], [re.LETTER, UWord], + [re.DIGIT, Alphanumeric], ]; for (let i = 0; i < tlds.length; i++) { fastts(Start, tlds[i], tk.TLD, tk.WORD, wordjr); diff --git a/packages/linkifyjs/src/text.mjs b/packages/linkifyjs/src/text.mjs index 3a44c36..b0f7892 100644 --- a/packages/linkifyjs/src/text.mjs +++ b/packages/linkifyjs/src/text.mjs @@ -6,6 +6,8 @@ Identifiers for token outputs from the regexp scanner // A valid web domain token export const WORD = 'WORD'; // only 
contains a-z export const UWORD = 'UWORD'; // contains letters other than a-z, used for IDN +export const ASCIINUMERICAL = 'ASCIINUMERICAL'; // contains a-z, 0-9 +export const ALPHANUMERICAL = 'ALPHANUMERICAL'; // contains numbers and letters other than a-z, used for IDN // Special case of word export const LOCALHOST = 'LOCALHOST'; diff --git a/test/spec/linkifyjs/parser.test.mjs b/test/spec/linkifyjs/parser.test.mjs index 6fefb43..f7b8124 100644 --- a/test/spec/linkifyjs/parser.test.mjs +++ b/test/spec/linkifyjs/parser.test.mjs @@ -318,6 +318,7 @@ const tests = [ [Url, Text], ['https://google.com', '\ufffcthis'], ], + ['some string with somefile.mp4 token', [Text], ['some string with somefile.mp4 token']], ]; describe('linkifyjs/parser#run()', () => { diff --git a/test/spec/linkifyjs/scanner.test.mjs b/test/spec/linkifyjs/scanner.test.mjs index 7a3ced1..f58f764 100644 --- a/test/spec/linkifyjs/scanner.test.mjs +++ b/test/spec/linkifyjs/scanner.test.mjs @@ -36,10 +36,10 @@ const tests = [ ], ["!,;'", [t.EXCLAMATION, t.COMMA, t.SEMI, t.APOSTROPHE], ['!', ',', ';', "'"]], ['hello', [t.WORD], ['hello']], - ['Hello123', [t.WORD, t.NUM], ['Hello', '123']], - ['hello123world', [t.WORD, t.NUM, t.TLD], ['hello', '123', 'world']], + ['Hello123', [t.ASCIINUMERICAL], ['Hello123']], + ['hello123world', [t.ASCIINUMERICAL], ['hello123world']], ['0123', [t.NUM], ['0123']], - ['123abc', [t.NUM, t.TLD], ['123', 'abc']], + ['123abc', [t.ASCIINUMERICAL], ['123abc']], ['http', [t.SLASH_SCHEME], ['http']], ['http:', [t.SLASH_SCHEME, t.COLON], ['http', ':']], ['https:', [t.SLASH_SCHEME, t.COLON], ['https', ':']], @@ -66,10 +66,10 @@ const tests = [ ['local', [t.WORD], ['local']], ['localhost', [t.LOCALHOST], ['localhost']], ['localhosts', [t.WORD], ['localhosts']], - ['500px', [t.NUM, t.WORD], ['500', 'px']], + ['500px', [t.ASCIINUMERICAL], ['500px']], ['500-px', [t.NUM, t.HYPHEN, t.WORD], ['500', '-', 'px']], - ['-500px', [t.HYPHEN, t.NUM, t.WORD], ['-', '500', 'px']], - ['500px-', 
[t.NUM, t.WORD, t.HYPHEN], ['500', 'px', '-']], + ['-500px', [t.HYPHEN, t.ASCIINUMERICAL], ['-', '500px']], + ['500px-', [t.ASCIINUMERICAL, t.HYPHEN], ['500px', '-']], ['123-456', [t.NUM, t.HYPHEN, t.NUM], ['123', '-', '456']], ['foo\u00a0bar', [t.TLD, t.WS, t.TLD], ['foo', '\u00a0', 'bar']], // nbsp ['çïrâ.ca', [t.UWORD, t.WORD, t.UWORD, t.DOT, t.TLD], ['çï', 'r', 'â', '.', 'ca']],