From 8027cc893a19b184fc57f836ce016ce27a96879d Mon Sep 17 00:00:00 2001 From: aymen Date: Fri, 14 Jun 2019 13:11:52 +0100 Subject: [PATCH] =?UTF-8?q?Filtrer=20les=20symboles=20ind=C3=A9sirables?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CommonVoice-Data/bano.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CommonVoice-Data/bano.py b/CommonVoice-Data/bano.py index 4d37d921..e390d381 100644 --- a/CommonVoice-Data/bano.py +++ b/CommonVoice-Data/bano.py @@ -38,6 +38,7 @@ (re.compile(r'(\s|^)0(\s|$|,)'), r'\g<1>zéro\g<2>'), (re.compile(r'(\s|^)0(\s|$|,)'), r'\g<1>zéro\g<2>'), ] +FILTER_SYMBOLES_REG=re.compile(r'[\{\}\[\]«»_\|\(\)\\…(^—)=&\*/µ#’@℗`~¹½¼¾¿º±↨↑↓▼→▲←↔∟§°‼¸‰‘¶“”•—´☺☻♥♦♠♣•◘○◙♂►♀☼♫♪¢¦Ξ≈˜†√ƒοΔδΛΓκιςζυσρΣγτθΘφΦηχξβωγΩΨ◊░▒▓│├╚┼┬┴└┐┤╝╗╬╣║ßÞ═™›³ª¯¬®]+') def format_address(address, template): @@ -62,6 +63,7 @@ def format_address(address, template): str = maybe_normalize(str, mapping=normalizers) str = filter_numbers(str) + str = FILTER_SYMBOLES_REG.sub('', str) return str.strip()