postcodes: introduce a default pattern for countries without a country-specific postcode pattern

author Sarah Hoffmann <lonvia@denofr.de>

Tue, 24 May 2022 16:25:37 +0000 (18:25 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 24 May 2022 16:25:37 +0000 (18:25 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py

index 42beea37feb68a1fee5da91e18e6aee95bdec228..c6292a2942b217ae866c8b43ded81a5aa75ff089 100644 (file)
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -75,6 +75,12 @@ class _PostcodeSanitizer:
              else:
                  raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
  
+        default_pattern = config.get('default-pattern')
+        if default_pattern is not None and isinstance(default_pattern, str):
+            self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern})
+        else:
+            self.default_matcher = None
+
  
      def __call__(self, obj):
          if not obj.address:
@@ -103,14 +109,16 @@ class _PostcodeSanitizer:
          if country in self.country_without_postcode:
              return None
  
-        matcher = self.country_matcher.get(country)
-        if matcher is not None:
-            match = matcher.match(postcode)
-            if match is None:
-                return None
-            return matcher.normalize(match), ' '.join(match.groups())
+        matcher = self.country_matcher.get(country, self.default_matcher)
+        if matcher is None:
+            return postcode.upper(), ''
+
+        match = matcher.match(postcode)
+        if match is None:
+            return None
+
+        return matcher.normalize(match), ' '.join(match.groups())
  
-        return postcode.upper(), ''
  
  
  
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml

index 544bd81db01ee0dc17f13fce5ad859959b186812..f682bbcdf8ad4c38a4e891e55f5360e70a7a28d0 100644 (file)
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -34,6 +34,7 @@ sanitizers:
          - (\A|.*,)[^\d,]{3,}(,.*|\Z)
      - step: clean-postcodes
        convert-to-address: yes
+      default-pattern: [A-Z0-9- ]{3,12}
      - step: split-name-list
      - step: strip-brace-terms
      - step: tag-analyzer-by-language
diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py

index 228c2f3a1a9adf3331c4536ce03468a875f96426..443761962566cc694b4bf7c2765598f72ead5e09 100644 (file)
--- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py
+++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
@@ -88,3 +88,15 @@ def test_postcode_sweden_pass(sanitize, postcode):
  def test_postcode_sweden_fail(sanitize, postcode):
      assert sanitize(country='se', postcode=postcode) == []
  
+
+@pytest.mark.parametrize("postcode", ('AB1', '123-456-7890', '1 as 44'))
+@pytest.mark.sanitizer_params(default_pattern='[A-Z0-9- ]{3,12}')
+def test_postcode_default_pattern_pass(sanitize, postcode):  # expects each value back as an uppercased 'postcode' entry
+    assert sanitize(country='an', postcode=postcode) == [('postcode', postcode.upper())]
+
+
+@pytest.mark.parametrize("postcode", ('C', '12', 'ABC123DEF 456', '1234,5678', '11223;11224'))
+@pytest.mark.sanitizer_params(convert_to_address=False, default_pattern='[A-Z0-9- ]{3,12}')
+def test_postcode_default_pattern_fail(sanitize, postcode):  # expects an empty result for each of these values
+    assert sanitize(country='an', postcode=postcode) == []
+
author	Sarah Hoffmann <lonvia@denofr.de>
	Tue, 24 May 2022 16:25:37 +0000 (18:25 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
nominatim/tokenizer/sanitizers/clean_postcodes.py		patch \| blob \| history
settings/icu_tokenizer.yaml		patch \| blob \| history
test/python/tokenizer/sanitizers/test_clean_postcodes.py		patch \| blob \| history