From 362088775ff54a91947ca7df11b83b53bcdb24d3 Mon Sep 17 00:00:00 2001 From: marc tobias Date: Thu, 30 Oct 2025 13:29:57 +0100 Subject: [PATCH] postcode sanitizer skips postcodes which are only zeros --- src/nominatim_db/data/postcode_format.py | 8 +++++++- test/python/tokenizer/sanitizers/test_clean_postcodes.py | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/nominatim_db/data/postcode_format.py b/src/nominatim_db/data/postcode_format.py index 0e6635f9..fb9d342e 100644 --- a/src/nominatim_db/data/postcode_format.py +++ b/src/nominatim_db/data/postcode_format.py @@ -29,6 +29,9 @@ class CountryPostcodeMatcher: self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?({pc_pattern})\\s*') self.pattern = re.compile(pc_pattern) + # We want to exclude 0000, 00-000, 000 00 etc + self.zero_pattern = re.compile(r'^[0\- ]+$') + self.output = config.get('output', r'\g<0>') def match(self, postcode: str) -> Optional[Match[str]]: @@ -40,7 +43,10 @@ class CountryPostcodeMatcher: normalized = self.norm_pattern.fullmatch(postcode.upper()) if normalized: - return self.pattern.fullmatch(normalized.group(1)) + match = self.pattern.fullmatch(normalized.group(1)) + if match and self.zero_pattern.match(match.string): + return None + return match return None diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py index 242e9301..433ae2b9 100644 --- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py +++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py @@ -237,3 +237,9 @@ def test_postcode_default_pattern_pass(sanitize, postcode): @pytest.mark.sanitizer_params(convert_to_address=False, default_pattern='[A-Z0-9- ]{3,12}') def test_postcode_default_pattern_fail(sanitize, postcode): assert sanitize(country='an', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('00000', '00-000', 'PL-00000', 'PL 00-000')) 
+@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_zeros(sanitize, postcode): + assert sanitize(country='pl', postcode=postcode) == [] -- 2.39.5