From 0a2af9a33be37a81e95fc3e7c057b94573903550 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Tue, 18 Jul 2023 20:46:11 +0200 Subject: [PATCH] refactor: experiment using unicode decomposition+recomposition and the regenerate lib for pattern matching --- index.js | 447 +++------------------------------------------------ package.json | 4 +- test.js | 72 ++++----- 3 files changed, 65 insertions(+), 458 deletions(-) diff --git a/index.js b/index.js index cc97f02..3dc0275 100644 --- a/index.js +++ b/index.js @@ -1,426 +1,31 @@ -var characterMap = { - "À": "A", - "Á": "A", - "Â": "A", - "Ã": "A", - "Ä": "A", - "Å": "A", - "Ấ": "A", - "Ắ": "A", - "Ẳ": "A", - "Ẵ": "A", - "Ặ": "A", - "Æ": "AE", - "Ầ": "A", - "Ằ": "A", - "Ȃ": "A", - "Ç": "C", - "Ḉ": "C", - "È": "E", - "É": "E", - "Ê": "E", - "Ë": "E", - "Ế": "E", - "Ḗ": "E", - "Ề": "E", - "Ḕ": "E", - "Ḝ": "E", - "Ȇ": "E", - "Ì": "I", - "Í": "I", - "Î": "I", - "Ï": "I", - "Ḯ": "I", - "Ȋ": "I", - "Ð": "D", - "Ñ": "N", - "Ò": "O", - "Ó": "O", - "Ô": "O", - "Õ": "O", - "Ö": "O", - "Ø": "O", - "Ố": "O", - "Ṍ": "O", - "Ṓ": "O", - "Ȏ": "O", - "Ù": "U", - "Ú": "U", - "Û": "U", - "Ü": "U", - "Ý": "Y", - "à": "a", - "á": "a", - "â": "a", - "ã": "a", - "ä": "a", - "å": "a", - "ấ": "a", - "ắ": "a", - "ẳ": "a", - "ẵ": "a", - "ặ": "a", - "æ": "ae", - "ầ": "a", - "ằ": "a", - "ȃ": "a", - "ç": "c", - "ḉ": "c", - "è": "e", - "é": "e", - "ê": "e", - "ë": "e", - "ế": "e", - "ḗ": "e", - "ề": "e", - "ḕ": "e", - "ḝ": "e", - "ȇ": "e", - "ì": "i", - "í": "i", - "î": "i", - "ï": "i", - "ḯ": "i", - "ȋ": "i", - "ð": "d", - "ñ": "n", - "ò": "o", - "ó": "o", - "ô": "o", - "õ": "o", - "ö": "o", - "ø": "o", - "ố": "o", - "ṍ": "o", - "ṓ": "o", - "ȏ": "o", - "ù": "u", - "ú": "u", - "û": "u", - "ü": "u", - "ý": "y", - "ÿ": "y", - "Ā": "A", - "ā": "a", - "Ă": "A", - "ă": "a", - "Ą": "A", - "ą": "a", - "Ć": "C", - "ć": "c", - "Ĉ": "C", - "ĉ": "c", - "Ċ": "C", - "ċ": "c", - "Č": "C", - "č": "c", - "C̆": "C", - "c̆": "c", - "Ď": "D", - "ď": "d", - "Đ": "D", - "đ": "d", - "Ē": "E", - "ē": "e", - "Ĕ": "E", - "ĕ": "e", - "Ė": "E", - "ė": "e", - "Ę": "E", - "ę": "e", - "Ě": "E", - "ě": "e", - "Ĝ": "G", - "Ǵ": "G", - "ĝ": "g", - "ǵ": "g", - "Ğ": "G", - "ğ": "g", - "Ġ": "G", - "ġ": "g", - "Ģ": "G", - "ģ": "g", - "Ĥ": "H", - "ĥ": "h", - "Ħ": "H", - "ħ": "h", - "Ḫ": "H", - "ḫ": "h", - "Ĩ": "I", - "ĩ": "i", - "Ī": "I", - "ī": "i", - "Ĭ": "I", - "ĭ": "i", - "Į": "I", - "į": "i", - "İ": "I", - "ı": "i", - "IJ": "IJ", - "ij": "ij", - "Ĵ": "J", - "ĵ": "j", - "Ķ": "K", - "ķ": "k", - "Ḱ": "K", - "ḱ": "k", - "K̆": "K", - "k̆": "k", - "Ĺ": "L", - "ĺ": "l", - "Ļ": "L", - "ļ": "l", - "Ľ": "L", - "ľ": "l", - "Ŀ": "L", - "ŀ": "l", - "Ł": "l", - "ł": "l", - "Ḿ": "M", - "ḿ": "m", - "M̆": "M", - "m̆": "m", - "Ń": "N", - "ń": "n", - "Ņ": "N", - "ņ": "n", - "Ň": "N", - "ň": "n", - "ʼn": "n", - "N̆": "N", - "n̆": "n", - "Ō": "O", - "ō": "o", - "Ŏ": "O", - "ŏ": "o", - "Ő": "O", - "ő": "o", - "Œ": "OE", - "œ": "oe", - "P̆": "P", - "p̆": "p", - "Ŕ": "R", - "ŕ": "r", - "Ŗ": "R", - "ŗ": "r", - "Ř": "R", - "ř": "r", - "R̆": "R", - "r̆": "r", - "Ȓ": "R", - "ȓ": "r", - "Ś": "S", - "ś": "s", - "Ŝ": "S", - "ŝ": "s", - "Ş": "S", - "Ș": "S", - "ș": "s", - "ş": "s", - "Š": "S", - "š": "s", - "ß": "ss", - "Ţ": "T", - "ţ": "t", - "ț": "t", - "Ț": "T", - "Ť": "T", - "ť": "t", - "Ŧ": "T", - "ŧ": "t", - "T̆": "T", - "t̆": "t", - "Ũ": "U", - "ũ": "u", - "Ū": "U", - "ū": "u", - "Ŭ": "U", - "ŭ": "u", - "Ů": "U", - "ů": "u", - "Ű": "U", - "ű": "u", - "Ų": "U", - "ų": "u", - "Ȗ": "U", - "ȗ": "u", - "V̆": "V", - "v̆": "v", - "Ŵ": "W", - "ŵ": "w", - "Ẃ": "W", - "ẃ": "w", - "X̆": "X", - "x̆": "x", - "Ŷ": "Y", - "ŷ": "y", - "Ÿ": "Y", - "Y̆": "Y", - "y̆": "y", - "Ź": "Z", - "ź": "z", - "Ż": "Z", - "ż": "z", - "Ž": "Z", - "ž": "z", - "ſ": "s", - "ƒ": "f", - "Ơ": "O", - "ơ": "o", - "Ư": "U", - "ư": "u", - "Ǎ": "A", - "ǎ": "a", - "Ǐ": "I", - "ǐ": "i", - "Ǒ": "O", - "ǒ": "o", - "Ǔ": "U", - "ǔ": "u", - "Ǖ": "U", - "ǖ": "u", - "Ǘ": "U", - "ǘ": "u", - "Ǚ": "U", - "ǚ": "u", - "Ǜ": "U", - "ǜ": "u", - "Ứ": "U", - "ứ": "u", - "Ṹ": "U", - "ṹ": "u", - "Ǻ": "A", - "ǻ": "a", - "Ǽ": "AE", - "ǽ": "ae", - "Ǿ": "O", - "ǿ": "o", - "Þ": "TH", - "þ": "th", - "Ṕ": "P", - "ṕ": "p", - "Ṥ": "S", - "ṥ": "s", - "X́": "X", - "x́": "x", - "Ѓ": "Г", - "ѓ": "г", - "Ќ": "К", - "ќ": "к", - "A̋": "A", - "a̋": "a", - "E̋": "E", - "e̋": "e", - "I̋": "I", - "i̋": "i", - "Ǹ": "N", - "ǹ": "n", - "Ồ": "O", - "ồ": "o", - "Ṑ": "O", - "ṑ": "o", - "Ừ": "U", - "ừ": "u", - "Ẁ": "W", - "ẁ": "w", - "Ỳ": "Y", - "ỳ": "y", - "Ȁ": "A", - "ȁ": "a", - "Ȅ": "E", - "ȅ": "e", - "Ȉ": "I", - "ȉ": "i", - "Ȍ": "O", - "ȍ": "o", - "Ȑ": "R", - "ȑ": "r", - "Ȕ": "U", - "ȕ": "u", - "B̌": "B", - "b̌": "b", - "Č̣": "C", - "č̣": "c", - "Ê̌": "E", - "ê̌": "e", - "F̌": "F", - "f̌": "f", - "Ǧ": "G", - "ǧ": "g", - "Ȟ": "H", - "ȟ": "h", - "J̌": "J", - "ǰ": "j", - "Ǩ": "K", - "ǩ": "k", - "M̌": "M", - "m̌": "m", - "P̌": "P", - "p̌": "p", - "Q̌": "Q", - "q̌": "q", - "Ř̩": "R", - "ř̩": "r", - "Ṧ": "S", - "ṧ": "s", - "V̌": "V", - "v̌": "v", - "W̌": "W", - "w̌": "w", - "X̌": "X", - "x̌": "x", - "Y̌": "Y", - "y̌": "y", - "A̧": "A", - "a̧": "a", - "B̧": "B", - "b̧": "b", - "Ḑ": "D", - "ḑ": "d", - "Ȩ": "E", - "ȩ": "e", - "Ɛ̧": "E", - "ɛ̧": "e", - "Ḩ": "H", - "ḩ": "h", - "I̧": "I", - "i̧": "i", - "Ɨ̧": "I", - "ɨ̧": "i", - "M̧": "M", - "m̧": "m", - "O̧": "O", - "o̧": "o", - "Q̧": "Q", - "q̧": "q", - "U̧": "U", - "u̧": "u", - "X̧": "X", - "x̧": "x", - "Z̧": "Z", - "z̧": "z", - "й":"и", - "Й":"И", - "ё":"е", - "Ё":"Е", -}; +const regenerate = require('regenerate') -var chars = Object.keys(characterMap).join('|'); -var allAccents = new RegExp(chars, 'g'); -var firstAccent = new RegExp(chars, ''); +// unicode combining marks +// see: https://github.com/pelias/pelias/issues/829#issuecomment-542614645 +// ref: https://en.wikipedia.org/wiki/Combining_character +const COMBINING_MARKS = regenerate() + .add(0x200D) // ZERO WIDTH JOINER (U+200D) + .addRange(0x0300, 0x036F) // Combining Diacritical Marks (0300–036F) + .addRange(0x1AB0, 0x1AFF) // Combining Diacritical Marks Extended (1AB0–1AFF) + .addRange(0x1DC0, 0x1DFF) // Combining Diacritical Marks Supplement (1DC0–1DFF) + .addRange(0x20D0, 0x20FF) // Combining Diacritical Marks for Symbols (20D0–20FF) + .addRange(0xFE00, 0xFE0F) // Variation Selectors (FE00-FE0F) + .addRange(0xFE20, 0xFE2F) // Combining Half Marks (FE20–FE2F) + .add(0x3099) // combining dakuten (U+3099) + .add(0x309A) // combining handakuten (U+309A) + .toRegExp('g') -function matcher(match) { - return characterMap[match]; +const removeAccents = function (string) { + return string + .normalize('NFKD') + .replace(COMBINING_MARKS, '') + .normalize('NFKC') } -var removeAccents = function(string) { - return string.replace(allAccents, matcher); -}; - -var hasAccents = function(string) { - return !!string.match(firstAccent); -}; +const hasAccents = function (string) { + return string !== removeAccents(string) +} -module.exports = removeAccents; -module.exports.has = hasAccents; -module.exports.remove = removeAccents; +module.exports = removeAccents +module.exports.has = hasAccents +module.exports.remove = removeAccents diff --git a/package.json b/package.json index d4d50f3..f39b5c4 100644 --- a/package.json +++ b/package.json @@ -3,7 +3,9 @@ "version": "0.4.4", "description": "Converting the accented characters to their corresponding non-accented ASCII characters.", "main": "index.js", - "dependencies": {}, + "dependencies": { + "regenerate": "^1.4.2" + }, "devDependencies": { "tape": "^4.2.2" }, diff --git a/test.js b/test.js index 0568de6..8d32618 100644 --- a/test.js +++ b/test.js @@ -1,50 +1,50 @@ -var tape = require('tape'); -var removeAccents = require('./'); +const tape = require('tape') +let removeAccents = require('./') -tape('remove accents from string', function(t) { - var input = 'ÀÁÂÃÄÅẤẮÆẦẰÇḈÈÉÊËẾḖỀḔÌÍÎÏḮÐÑÒÓÔÕÖØỐṌṒÙÚÛÜÝàáâãäåấắæầằçḉèéêëếḗềḕìíîïḯñòóôõöøốṍṓùúûüýÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģǴǵĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķḰḱĹĺĻļĽľĿŀŁłḾḿŃńŅņŇňʼnŌōŎŏŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵẂẃŶŷŸŹźŻżŽžſƒƠơƯưǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜỨứṸṹǺǻǼǽǾǿðÞþṔṕṤṥX́x́ЃѓЌќA̋a̋E̋e̋I̋i̋ǸǹỒồṐṑỪừẀẁỲỳȀȁȄȅȈȉȌȍȐȑȔȕẲẴẶḜẳẵặḝC̆c̆ḪḫK̆k̆M̆m̆N̆n̆P̆p̆R̆r̆T̆t̆V̆v̆X̆x̆Y̆y̆ȂȆȊȎȃȇȋȏȒȓȖȗșțȘȚB̌b̌F̌f̌ǦǧȞȟJ̌ǰǨǩM̌m̌P̌p̌Q̌q̌ṦṧV̌v̌W̌w̌X̌x̌Y̌y̌A̧a̧B̧b̧ḐḑȨȩƐ̧ɛ̧ḨḩI̧i̧Ɨ̧ɨ̧M̧m̧O̧o̧Q̧q̧U̧u̧X̧x̧Z̧z̧ß'; - var output = removeAccents(input); - var expected = 'AAAAAAAAAEAACCEEEEEEEEIIIIIDNOOOOOOOOOUUUUYaaaaaaaaaeaacceeeeeeeeiiiiinooooooooouuuuyyAaAaAaCcCcCcCcDdDdEeEeEeEeEeGgGgGgGgGgHhHhIiIiIiIiIiIJijJjKkKkLlLlLlLlllMmNnNnNnnOoOoOoOEoeRrRrRrSsSsSsSsTtTtTtUuUuUuUuUuUuWwWwYyYZzZzZzsfOoUuAaIiOoUuUuUuUuUuUuUuAaAEaeOodTHthPpSsXxГгКкAaEeIiNnOoOoUuWwYyAaEeIiOoRrUuAAAEaaaeCcHhKkMmNnPpRrTtVvXxYyAEIOaeioRrUustSTBbFfGgHhJjKkMmPpQqSsVvWwXxYyAaBbDdEeEeHhIiIiMmOoQqUuXxZzss'; +tape('remove accents from string', function (t) { + const input = 'ÀÁÂÃÄÅẤẮÆẦẰÇḈÈÉÊËẾḖỀḔÌÍÎÏḮÐÑÒÓÔÕÖØỐṌṒÙÚÛÜÝàáâãäåấắæầằçḉèéêëếḗềḕìíîïḯñòóôõöøốṍṓùúûüýÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģǴǵĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķḰḱĹĺĻļĽľĿŀŁłḾḿŃńŅņŇňʼnŌōŎŏŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵẂẃŶŷŸŹźŻżŽžſƒƠơƯưǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜỨứṸṹǺǻǼǽǾǿðÞþṔṕṤṥX́x́ЃѓЌќA̋a̋E̋e̋I̋i̋ǸǹỒồṐṑỪừẀẁỲỳȀȁȄȅȈȉȌȍȐȑȔȕẲẴẶḜẳẵặḝC̆c̆ḪḫK̆k̆M̆m̆N̆n̆P̆p̆R̆r̆T̆t̆V̆v̆X̆x̆Y̆y̆ȂȆȊȎȃȇȋȏȒȓȖȗșțȘȚB̌b̌F̌f̌ǦǧȞȟJ̌ǰǨǩM̌m̌P̌p̌Q̌q̌ṦṧV̌v̌W̌w̌X̌x̌Y̌y̌A̧a̧B̧b̧ḐḑȨȩƐ̧ɛ̧ḨḩI̧i̧Ɨ̧ɨ̧M̧m̧O̧o̧Q̧q̧U̧u̧X̧x̧Z̧z̧ß' + let output = removeAccents(input) + let expected = 'AAAAAAAAAEAACCEEEEEEEEIIIIIDNOOOOOOOOOUUUUYaaaaaaaaaeaacceeeeeeeeiiiiinooooooooouuuuyyAaAaAaCcCcCcCcDdDdEeEeEeEeEeGgGgGgGgGgHhHhIiIiIiIiIiIJijJjKkKkLlLlLlLlllMmNnNnNnnOoOoOoOEoeRrRrRrSsSsSsSsTtTtTtUuUuUuUuUuUuWwWwYyYZzZzZzsfOoUuAaIiOoUuUuUuUuUuUuUuAaAEaeOodTHthPpSsXxГгКкAaEeIiNnOoOoUuWwYyAaEeIiOoRrUuAAAEaaaeCcHhKkMmNnPpRrTtVvXxYyAEIOaeioRrUustSTBbFfGgHhJjKkMmPpQqSsVvWwXxYyAaBbDdEeEeHhIiIiMmOoQqUuXxZzss' - t.same( output, expected ); + t.same(output, expected) - t.end(); -}); + t.end() +}) -tape('remove cyrillic accents from string', function(t) { - var input = 'ЁёЙй'; - var output = removeAccents(input); - var expected = 'ЕеИи'; +tape('remove cyrillic accents from string', function (t) { + const input = 'ЁёЙй' + let output = removeAccents(input) + let expected = 'ЕеИи' - t.same( output, expected ); + t.same(output, expected) - t.end(); -}); + t.end() +}) -tape('do not modify non-accented strings', function(t) { - var input = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123456789.,:;~`!@#$%^&*()-_=+[]{}\'"|\\<>?/eEиИ'; - var output = removeAccents(input); +tape('do not modify non-accented strings', function (t) { + const input = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123456789.,:;~`!@#$%^&*()-_=+[]{}\'"|\\<>?/eEиИ' + let output = removeAccents(input) - t.same( output, input ); + t.same(output, input) - t.end(); -}); + t.end() +}) -tape('.has can detect accents', function(t) { - t.equal(removeAccents.has('À'), true); - t.equal(removeAccents.has('Löwe'), true); +tape('.has can detect accents', function (t) { + t.equal(removeAccents.has('À'), true) + t.equal(removeAccents.has('Löwe'), true) - t.equal(removeAccents.has('A'), false); - t.equal(removeAccents.has('Panther'), false); + t.equal(removeAccents.has('A'), false) + t.equal(removeAccents.has('Panther'), false) - t.end(); -}); + t.end() +}) -tape('.remove method', function(t) { - t.same(removeAccents.toString(), removeAccents.remove.toString()); +tape('.remove method', function (t) { + t.same(removeAccents.toString(), removeAccents.remove.toString()) - t.same(removeAccents.remove('cat'), 'cat'); - t.same(removeAccents.remove('Pokémon'), 'Pokemon'); - t.same(removeAccents.remove('Straße'), 'Strasse'); - t.end(); -}); + t.same(removeAccents.remove('cat'), 'cat') + t.same(removeAccents.remove('Pokémon'), 'Pokemon') + t.same(removeAccents.remove('Straße'), 'Straße') + t.end() +})