From 3313369a395b9bd6bc57e0fe81a86a82f644d881 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 2 Dec 2022 10:15:02 +0100 Subject: [PATCH] contract duplicate spaces in transliteration string There are some pathological cases where an isolated letter may be deleted because it is in itself meaningless. If this happens in the middle of a sentence, then the transliteration contains two consecutive spaces. Add a final rule to fix this. See #2909. --- settings/icu_tokenizer.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 16339970..f30578a2 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -24,6 +24,7 @@ transliteration: - ":: lower ()" - "[^a-z0-9[:Space:]] >" - ":: NFC ()" + - "[:Space:]+ > ' '" sanitizers: - step: clean-housenumbers filter-kind: -- 2.39.5