From 16787f4a5d7c39e47abfbf145a5a06dff01b51e5 Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Thu, 25 Mar 2021 11:13:49 +0100 Subject: [PATCH] Implemented Unicode NFC Normalization --- folia/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/folia/main.py b/folia/main.py index a459183..47d813e 100644 --- a/folia/main.py +++ b/folia/main.py @@ -33,6 +33,7 @@ import bz2 import gzip import random +import unicodedata from socket import getfqdn @@ -1438,9 +1439,9 @@ def text(self, cls='current', retaintokenisation=False, previousdelimiter="",str l = len(s) for j, line in enumerate(e.split("\n")): if self.preservespace: - s2 = line.strip("\r") #strip only artefacts of DOS-style line endings, leave all intact + s2 = unicodedata.normalize('NFC', line.strip("\r")) #strip only artefacts of DOS-style line endings, leave all intact else: - s2 = norm_spaces(line.strip(" \r")) #strips leading and trailing whitespace per line (proycon/folia#88) + s2 = unicodedata.normalize('NFC', norm_spaces(line.strip(" \r"))) #strips leading and trailing whitespace per line (proycon/folia#88) #norm_spaces strips multi-spaces in the middle #also strips artefacts of DOS-style line-endings if j > 0 and s2 and len(s) != l: