From 13ed184efd71c76fc0c69d9ab800ae44d82f9994 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 16 Feb 2022 20:36:30 +0100 Subject: [PATCH] housenumber analyzer: avoid creating too many variants Housenumber fields with lots of text are likely bad data. So is data with many changes from letter to digit. Exclude them from adding optional spaces. --- .../tokenizer/token_analysis/housenumbers.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/nominatim/tokenizer/token_analysis/housenumbers.py b/nominatim/tokenizer/token_analysis/housenumbers.py index 6a838e00..96e86b28 100644 --- a/nominatim/tokenizer/token_analysis/housenumbers.py +++ b/nominatim/tokenizer/token_analysis/housenumbers.py @@ -15,17 +15,18 @@ from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantG RE_NON_DIGIT = re.compile('[^0-9]') RE_DIGIT_ALPHA = re.compile(r'(\d)\s*([^\d\s␣])') RE_ALPHA_DIGIT = re.compile(r'([^\s\d␣])\s*(\d)') +RE_NAMED_PART = re.compile(r'[a-z]{4}') ### Configuration section -def configure(rules, normalization_rules): +def configure(rules, normalization_rules): # pylint: disable=W0613 """ All behaviour is currently hard-coded. """ return None ### Analysis section -def create(normalizer, transliterator, config): +def create(normalizer, transliterator, config): # pylint: disable=W0613 """ Create a new token analysis instance for this module. """ return HousenumberTokenAnalysis(normalizer, transliterator) @@ -48,8 +49,14 @@ class HousenumberTokenAnalysis: return name norm = self.trans.transliterate(self.norm.transliterate(name)) - norm = RE_DIGIT_ALPHA.sub(r'\1␣\2', norm) - norm = RE_ALPHA_DIGIT.sub(r'\1␣\2', norm) + # If there is a significant non-numeric part, use as is. + if RE_NAMED_PART.search(norm) is None: + # Otherwise add optional spaces between digits and letters. + (norm_opt, cnt1) = RE_DIGIT_ALPHA.subn(r'\1␣\2', norm) + (norm_opt, cnt2) = RE_ALPHA_DIGIT.subn(r'\1␣\2', norm_opt) + # Avoid creating too many variants per number. + if cnt1 + cnt2 <= 4: + return norm_opt return norm -- 2.39.5