generalize filter-kind parameter for sanitizers

author Sarah Hoffmann <lonvia@denofr.de>

Thu, 20 Jan 2022 14:42:42 +0000 (15:42 +0100)

committer Sarah Hoffmann <lonvia@denofr.de>

Thu, 20 Jan 2022 14:42:42 +0000 (15:42 +0100)
author Sarah Hoffmann <lonvia@denofr.de>
Thu, 20 Jan 2022 14:42:42 +0000 (15:42 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Thu, 20 Jan 2022 14:42:42 +0000 (15:42 +0100)
diff --git a/nominatim/tokenizer/sanitizers/clean_housenumbers.py b/nominatim/tokenizer/sanitizers/clean_housenumbers.py

index b65880c38e5179e167b8a2711e31b079e900d8ab..9777a7fcdcc6dc74103f727644cb8da5f81beaa9 100644 (file)
--- a/nominatim/tokenizer/sanitizers/clean_housenumbers.py
+++ b/nominatim/tokenizer/sanitizers/clean_housenumbers.py
@@ -5,19 +5,24 @@
  # Copyright (C) 2022 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
-Sanitizer that cleans and normalizes housenumbers.
+Sanitizer that cleans and normalizes house numbers.
  
  Arguments:
      delimiters: Define the set of characters to be used for
-                splitting a list of housenumbers into parts. (default: ',;')
+                splitting a list of house numbers into parts. (default: ',;')
+    filter-kind: Define the address tags that are considered to be a
+                 house number. Either takes a single string or a list of strings,
+                 where each string is a regular expression. An address item
+                 is considered a house number if the 'kind' fully matches any
+                 of the given regular expressions. (default: 'housenumber')
  
  """
-from nominatim.tokenizer.sanitizers.helpers import create_split_regex
+from nominatim.tokenizer.sanitizers.helpers import create_split_regex, create_kind_filter
  
  class _HousenumberSanitizer:
  
      def __init__(self, config):
-        self.kinds = config.get('filter-kind', ('housenumber', ))
+        self.filter_kind = create_kind_filter(config, 'housenumber')
          self.split_regexp = create_split_regex(config)
  
  
@@ -27,7 +32,7 @@ class _HousenumberSanitizer:
  
          new_address = []
          for item in obj.address:
-            if item.kind in self.kinds:
+            if self.filter_kind(item):
                  new_address.extend(item.clone(kind='housenumber', name=n) for n in self.sanitize(item.name))
              else:
                  # Don't touch other address items.
diff --git a/nominatim/tokenizer/sanitizers/helpers.py b/nominatim/tokenizer/sanitizers/helpers.py

index 78b9a831c32030306de2aedd536a0407d84f73f9..b92914e18acdde699d63e57640a3ea32d5cd234c 100644 (file)
--- a/nominatim/tokenizer/sanitizers/helpers.py
+++ b/nominatim/tokenizer/sanitizers/helpers.py
@@ -27,3 +27,26 @@ def create_split_regex(config, default=',;'):
          raise UsageError("Empty 'delimiter' parameter not allowed for sanitizer.")
  
      return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
+
+
+def create_kind_filter(config, default=None):
+    """ Create a filter function for the name kind from the 'filter-kind'
+        config parameter. The filter function takes a name item and returns
+        True when the item passes the filter.
+
+        If the parameter is empty, the filter lets all items pass. If the
+        parameter is a string, it is interpreted as a single regular expression
+        that must match the full kind string. If the parameter is a list then
+        any of the regular expressions in the list must match to pass.
+    """
+    filters = config.get('filter-kind', default)
+
+    if not filters:
+        return lambda _: True
+
+    if isinstance(filters, str):
+        regex = re.compile(filters)
+        return lambda name: regex.fullmatch(name.kind)
+
+    regexes = [re.compile(regex) for regex in filters]
+    return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py

index 1305029a6969e6111767ec0fafbbfaba3fd24a31..964a90162b1a49cf6e32fbace4f57d8614be2d6b 100644 (file)
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -33,17 +33,14 @@ Arguments:
  import re
  
  from nominatim.tools import country_info
+from nominatim.tokenizer.sanitizers.helpers import create_kind_filter
  
  class _AnalyzerByLanguage:
      """ Processor for tagging the language of names in a place.
      """
  
      def __init__(self, config):
-        if 'filter-kind' in config:
-            self.regexes = [re.compile(regex) for regex in config['filter-kind']]
-        else:
-            self.regexes = None
-
+        self.filter_kind = create_kind_filter(config)
          self.replace = config.get('mode', 'replace') != 'append'
          self.whitelist = config.get('whitelist')
  
@@ -63,13 +60,6 @@ class _AnalyzerByLanguage:
                          self.deflangs[ccode] = clangs
  
  
-    def _kind_matches(self, kind):
-        if self.regexes is None:
-            return True
-
-        return any(regex.fullmatch(kind) for regex in self.regexes)
-
-
      def _suffix_matches(self, suffix):
          if self.whitelist is None:
              return len(suffix) in (2, 3) and suffix.islower()
@@ -84,7 +74,7 @@ class _AnalyzerByLanguage:
          more_names = []
  
          for name in (n for n in obj.names
-                     if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
+                     if not n.has_attr('analyzer') and self.filter_kind(n)):
              if name.suffix:
                  langs = [name.suffix] if self._suffix_matches(name.suffix) else None
              else:
diff --git a/test/python/tokenizer/sanitizers/test_helpers.py b/test/python/tokenizer/sanitizers/test_helpers.py

index a0a1d29c088fd5e24ab4b48a524d2304dda724a9..911fbdd7692eed7520085561cb1915d55e7b3f58 100644 (file)
--- a/test/python/tokenizer/sanitizers/test_helpers.py
+++ b/test/python/tokenizer/sanitizers/test_helpers.py
@@ -10,6 +10,7 @@ Tests for sanitizer helper functions.
  import pytest
  
  from nominatim.errors import UsageError
+from nominatim.tokenizer.place_sanitizer import PlaceName
  import nominatim.tokenizer.sanitizers.helpers as helpers
  
  @pytest.mark.parametrize('inp', ('fg34', 'f\\f', 'morning [glory]', '56.78'))
@@ -41,3 +42,38 @@ def test_create_split_regex_custom(delimiter):
  def test_create_split_regex_empty_delimiter():
      with pytest.raises(UsageError):
          regex = helpers.create_split_regex({'delimiters': ''})
+
+
+@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*'))
+def test_create_kind_filter_no_params(inp):
+    filt = helpers.create_kind_filter({})
+
+    assert filt(PlaceName('something', inp, ''))
+
+
+@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
+def test_create_kind_filter_custom_regex_positive(kind):
+    filt = helpers.create_kind_filter({'filter-kind': '.*de'})
+
+    assert filt(PlaceName('something', kind, ''))
+
+
+@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
+def test_create_kind_filter_custom_regex_negative(kind):
+    filt = helpers.create_kind_filter({'filter-kind': '.*de'})
+
+    assert not filt(PlaceName('something', kind, ''))
+
+
+@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
+def test_create_kind_filter_many_positive(kind):
+    filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
+
+    assert filt(PlaceName('something', kind, ''))
+
+
+@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
+def test_create_kind_filter_many_negative(kind):
+    filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
+
+    assert not filt(PlaceName('something', kind, ''))
author	Sarah Hoffmann <lonvia@denofr.de>
	Thu, 20 Jan 2022 14:42:42 +0000 (15:42 +0100)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Thu, 20 Jan 2022 14:42:42 +0000 (15:42 +0100)
nominatim/tokenizer/sanitizers/clean_housenumbers.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/helpers.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py		patch \| blob \| history
test/python/tokenizer/sanitizers/test_helpers.py		patch \| blob \| history