Skip to content

Commit

Permalink
Implemented Unicode NFC Normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
proycon committed Mar 25, 2021
1 parent 9e1f5a7 commit 16787f4
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions folia/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import bz2
import gzip
import random
+import unicodedata
from socket import getfqdn


Expand Down Expand Up @@ -1438,9 +1439,9 @@ def text(self, cls='current', retaintokenisation=False, previousdelimiter="",str
l = len(s)
for j, line in enumerate(e.split("\n")):
if self.preservespace:
-s2 = line.strip("\r") #strip only artefacts of DOS-style line endings, leave all intact
+s2 = unicodedata.normalize('NFC', line.strip("\r")) #strip only artefacts of DOS-style line endings, leave all intact
else:
-s2 = norm_spaces(line.strip(" \r")) #strips leading and trailing whitespace per line (proycon/folia#88)
+s2 = unicodedata.normalize('NFC', norm_spaces(line.strip(" \r"))) #strips leading and trailing whitespace per line (proycon/folia#88)
#norm_spaces strips multi-spaces in the middle
#also strips artefacts of DOS-style line-endings
if j > 0 and s2 and len(s) != l:
Expand Down

0 comments on commit 16787f4

Please sign in to comment.