git.openstreetmap.org Git - nominatim.git/commitdiff
Merge pull request #2588 from lonvia/housenumber-sanitizer
author Sarah Hoffmann <lonvia@denofr.de>
Thu, 20 Jan 2022 16:44:24 +0000 (17:44 +0100)
committer GitHub <noreply@github.com>
Thu, 20 Jan 2022 16:44:24 +0000 (17:44 +0100)
Move housenumber parsing into sanitizer

13 files changed:
docs/customize/Tokenizers.md
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/sanitizers/clean_housenumbers.py [new file with mode: 0644]
nominatim/tokenizer/sanitizers/helpers.py [new file with mode: 0644]
nominatim/tokenizer/sanitizers/split_name_list.py
nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
settings/icu_tokenizer.yaml
test/bdd/db/query/housenumbers.feature [new file with mode: 0644]
test/python/pytest.ini [new file with mode: 0644]
test/python/tokenizer/sanitizers/test_clean_housenumbers.py [new file with mode: 0644]
test/python/tokenizer/sanitizers/test_helpers.py [new file with mode: 0644]
test/python/tokenizer/sanitizers/test_split_name_list.py
test/python/tokenizer/test_icu.py

index 5c766f50a3bf47055c41bcdbd39fd2f87d30e598..f75bc6a5c9da88fad8b92bc046337d2fcebff863 100644 (file)
@@ -181,6 +181,13 @@ The following is a list of sanitizers that are shipped with Nominatim.
     rendering:
         heading_level: 6
 
+##### clean-housenumbers
+
+::: nominatim.tokenizer.sanitizers.clean_housenumbers
+    selection:
+        members: False
+    rendering:
+        heading_level: 6
 
 
 #### Token Analysis
index 7b820c9da04173b4e585b816c828b597d775f87b..cfbb44e3d356c85a9317657fb7df38dd08b3539a 100644 (file)
@@ -413,14 +413,16 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
 
     def _process_place_address(self, token_info, address):
-        hnrs = []
+        hnrs = set()
         addr_terms = []
         streets = []
         for item in address:
             if item.kind == 'postcode':
                 self._add_postcode(item.name)
-            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(item.name)
+            elif item.kind == 'housenumber':
+                norm_name = self._make_standard_hnr(item.name)
+                if norm_name:
+                    hnrs.add(norm_name)
             elif item.kind == 'street':
                 streets.extend(self._retrieve_full_tokens(item.name))
             elif item.kind == 'place':
@@ -431,8 +433,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 
         if hnrs:
-            hnrs = self._split_housenumbers(hnrs)
-            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+            token_info.add_housenumbers(self.conn, hnrs)
 
         if addr_terms:
             token_info.add_address_terms(addr_terms)
@@ -545,24 +546,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 self._cache.postcodes.add(postcode)
 
 
-    @staticmethod
-    def _split_housenumbers(hnrs):
-        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
-            # split numbers if necessary
-            simple_list = []
-            for hnr in hnrs:
-                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
-
-            if len(simple_list) > 1:
-                hnrs = list(set(simple_list))
-            else:
-                hnrs = simple_list
-
-        return hnrs
-
-
-
-
 class _TokenInfo:
     """ Collect token information to be sent back to the database.
     """
diff --git a/nominatim/tokenizer/sanitizers/clean_housenumbers.py b/nominatim/tokenizer/sanitizers/clean_housenumbers.py
new file mode 100644 (file)
index 0000000..49f9b4f
--- /dev/null
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Sanitizer that preprocesses address tags for house numbers. The sanitizer
+makes it possible to
+
+* define which tags are to be considered house numbers (see 'filter-kind')
+* split house number lists into individual numbers (see 'delimiters')
+
+Arguments:
+    delimiters: Define the set of characters to be used for
+                splitting a list of house numbers into parts. (default: ',;')
+    filter-kind: Define the address tags that are considered to be a
+                 house number. Either takes a single string or a list of strings,
+                 where each string is a regular expression. An address item
+                 is considered a house number if the 'kind' fully matches any
+                 of the given regular expressions. (default: 'housenumber')
+
+"""
+from nominatim.tokenizer.sanitizers.helpers import create_split_regex, create_kind_filter
+
+class _HousenumberSanitizer:
+
+    def __init__(self, config):
+        self.filter_kind = create_kind_filter(config, 'housenumber')
+        self.split_regexp = create_split_regex(config)
+
+
+    def __call__(self, obj):
+        if not obj.address:
+            return
+
+        new_address = []
+        for item in obj.address:
+            if self.filter_kind(item):
+                new_address.extend(item.clone(kind='housenumber', name=n)
+                                   for n in self.sanitize(item.name))
+            else:
+                # Don't touch other address items.
+                new_address.append(item)
+
+        obj.address = new_address
+
+
+    def sanitize(self, value):
+        """ Extract housenumbers in a regularized format from an OSM value.
+
+            The function works as a generator that yields all valid housenumbers
+            that can be created from the value.
+        """
+        for hnr in self.split_regexp.split(value):
+            if hnr:
+                yield from self._regularize(hnr)
+
+
+    @staticmethod
+    def _regularize(hnr):
+        yield hnr
+
+
+def create(config):
+    """ Create a housenumber processing function.
+    """
+
+    return _HousenumberSanitizer(config)
diff --git a/nominatim/tokenizer/sanitizers/helpers.py b/nominatim/tokenizer/sanitizers/helpers.py
new file mode 100644 (file)
index 0000000..b92914e
--- /dev/null
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Helper functions for sanitizers.
+"""
+import re
+
+from nominatim.errors import UsageError
+
+def create_split_regex(config, default=',;'):
+    """ Converts the 'delimiter' parameter in the configuration into a
+        compiled regular expression that can be used to split the names on the
+        delimiters. The regular expression makes sure that the resulting names
+        are stripped and that repeated delimiters
+        are ignored but it will still create empty fields on occasion. The
+        code needs to filter those.
+
+        The 'default' parameter defines the delimiter set to be used when
+        not explicitly configured.
+    """
+    delimiter_set = set(config.get('delimiters', default))
+    if not delimiter_set:
+        raise UsageError("Empty 'delimiter' parameter not allowed for sanitizer.")
+
+    return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
+
+
+def create_kind_filter(config, default=None):
+    """ Create a filter function for the name kind from the 'filter-kind'
+        config parameter. The filter functions takes a name item and returns
+        True when the item passes the filter.
+
+        If the parameter is empty, the filter lets all items pass. If the
+        parameter is a string, it is interpreted as a single regular expression
+        that must match the full kind string. If the parameter is a list then
+        any of the regular expressions in the list must match to pass.
+    """
+    filters = config.get('filter-kind', default)
+
+    if not filters:
+        return lambda _: True
+
+    if isinstance(filters, str):
+        regex = re.compile(filters)
+        return lambda name: regex.fullmatch(name.kind)
+
+    regexes = [re.compile(regex) for regex in filters]
+    return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
index 3250c668a9d44b700c49f6c0903d1a064cad3810..13921c3e104385f27489b4404034e523aa389ef3 100644 (file)
@@ -9,21 +9,15 @@ Sanitizer that splits lists of names into their components.
 
 Arguments:
     delimiters: Define the set of characters to be used for
-                splitting the list. (default: `,;`)
+                splitting the list. (default: ',;')
 """
-import re
-
-from nominatim.errors import UsageError
+from nominatim.tokenizer.sanitizers.helpers import create_split_regex
 
 def create(func):
     """ Create a name processing function that splits name values with
         multiple values into their components.
     """
-    delimiter_set = set(func.get('delimiters', ',;'))
-    if not delimiter_set:
-        raise UsageError("Set of delimiters in split-name-list sanitizer is empty.")
-
-    regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set)))
+    regexp = create_split_regex(func)
 
     def _process(obj):
         if not obj.names:
index 2c97a0bc817282ce784105d78216b1d0160c902a..cbf32179f504ed9f9a535fd432e1c3c0d2a526c6 100644 (file)
@@ -13,7 +13,7 @@ Arguments:
 
     filter-kind: Restrict the names the sanitizer should be applied to
                  to the given tags. The parameter expects a list of
-                 regular expressions which are matched against `kind`.
+                 regular expressions which are matched against 'kind'.
                  Note that a match against the full string is expected.
     whitelist: Restrict the set of languages that should be tagged.
                Expects a list of acceptable suffixes. When unset,
@@ -30,20 +30,15 @@ Arguments:
           any analyzer tagged) is retained. (default: replace)
 
 """
-import re
-
 from nominatim.tools import country_info
+from nominatim.tokenizer.sanitizers.helpers import create_kind_filter
 
 class _AnalyzerByLanguage:
     """ Processor for tagging the language of names in a place.
     """
 
     def __init__(self, config):
-        if 'filter-kind' in config:
-            self.regexes = [re.compile(regex) for regex in config['filter-kind']]
-        else:
-            self.regexes = None
-
+        self.filter_kind = create_kind_filter(config)
         self.replace = config.get('mode', 'replace') != 'append'
         self.whitelist = config.get('whitelist')
 
@@ -63,13 +58,6 @@ class _AnalyzerByLanguage:
                         self.deflangs[ccode] = clangs
 
 
-    def _kind_matches(self, kind):
-        if self.regexes is None:
-            return True
-
-        return any(regex.fullmatch(kind) for regex in self.regexes)
-
-
     def _suffix_matches(self, suffix):
         if self.whitelist is None:
             return len(suffix) in (2, 3) and suffix.islower()
@@ -84,7 +72,7 @@ class _AnalyzerByLanguage:
         more_names = []
 
         for name in (n for n in obj.names
-                     if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
+                     if not n.has_attr('analyzer') and self.filter_kind(n)):
             if name.suffix:
                 langs = [name.suffix] if self._suffix_matches(name.suffix) else None
             else:
index c6601faf0645186740c06b99eaec124744aba0e0..bf51f56344e12ae4e6986d8fc60683fa77a8d002 100644 (file)
@@ -27,6 +27,11 @@ transliteration:
 sanitizers:
     - step: split-name-list
     - step: strip-brace-terms
+    - step: clean-housenumbers
+      filter-kind:
+        - housenumber
+        - conscriptionnumber
+        - streetnumber
     - step: tag-analyzer-by-language
       filter-kind: [".*name.*"]
       whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
diff --git a/test/bdd/db/query/housenumbers.feature b/test/bdd/db/query/housenumbers.feature
new file mode 100644 (file)
index 0000000..63bd898
--- /dev/null
@@ -0,0 +1,55 @@
+@DB
+Feature: Searching of house numbers
+    Test for specialised treatment of housenumbers
+
+    Background:
+        Given the grid
+         | 1 |   | 2 |   | 3 |
+         |   | 9 |   |   |   |
+         |   |   |   |   | 4 |
+
+
+    Scenario: A simple numeral housenumber is found
+        Given the places
+         | osm | class    | type | housenr | geometry |
+         | N1  | building | yes  | 45      | 9        |
+        And the places
+         | osm | class   | type | name       | geometry |
+         | W10 | highway | path | North Road | 1,2,3    |
+        When importing
+        And sending search query "45, North Road"
+        Then results contain
+         | osm |
+         | N1  |
+        When sending search query "North Road 45"
+        Then results contain
+         | osm |
+         | N1  |
+
+
+    Scenario Outline: Each housenumber in a list is found
+        Given the places
+         | osm | class    | type | housenr | geometry |
+         | N1  | building | yes  | <hnrs>  | 9        |
+        And the places
+         | osm | class   | type | name     | geometry |
+         | W10 | highway | path | Multistr | 1,2,3    |
+        When importing
+        When sending search query "2 Multistr"
+        Then results contain
+         | osm |
+         | N1  |
+        When sending search query "4 Multistr"
+        Then results contain
+         | osm |
+         | N1  |
+        When sending search query "12 Multistr"
+        Then results contain
+         | osm |
+         | N1  |
+
+     Examples:
+        | hnrs |
+        | 2;4;12 |
+        | 2,4,12 |
+        | 2, 4, 12 |
diff --git a/test/python/pytest.ini b/test/python/pytest.ini
new file mode 100644 (file)
index 0000000..a3bfd12
--- /dev/null
@@ -0,0 +1,3 @@
+[pytest]
+markers =
+    sanitizer_params
diff --git a/test/python/tokenizer/sanitizers/test_clean_housenumbers.py b/test/python/tokenizer/sanitizers/test_clean_housenumbers.py
new file mode 100644 (file)
index 0000000..5784619
--- /dev/null
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for the sanitizer that normalizes housenumbers.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+@pytest.fixture
+def sanitize(request):
+    sanitizer_args = {'step': 'clean-housenumbers'}
+    for mark in request.node.iter_markers(name="sanitizer_params"):
+        sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()})
+
+    def _run(**kwargs):
+        place = PlaceInfo({'address': kwargs})
+        _, address = PlaceSanitizer([sanitizer_args]).process_names(place)
+
+        return sorted([(p.kind, p.name) for p in address])
+
+    return _run
+
+
+def test_simple_number(sanitize):
+    assert sanitize(housenumber='34') == [('housenumber', '34')]
+
+
+@pytest.mark.parametrize('number', ['1;2;3', '1,2,3', '1; 3 ,2',
+                                    '2,,3,1', '1;2;3;;', ';3;2;1'])
+def test_housenumber_lists(sanitize, number):
+    assert sanitize(housenumber=number) == \
+           [('housenumber', '1'), ('housenumber', '2'), ('housenumber', '3')]
+
+
+@pytest.mark.sanitizer_params(filter_kind=('number', 'streetnumber'))
+def test_filter_kind(sanitize):
+    assert sanitize(housenumber='34', number='4', badnumber='65') == \
+            [('badnumber', '65'), ('housenumber', '34'), ('housenumber', '4')]
diff --git a/test/python/tokenizer/sanitizers/test_helpers.py b/test/python/tokenizer/sanitizers/test_helpers.py
new file mode 100644 (file)
index 0000000..911fbdd
--- /dev/null
@@ -0,0 +1,79 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for sanitizer helper functions.
+"""
+import pytest
+
+from nominatim.errors import UsageError
+from nominatim.tokenizer.place_sanitizer import PlaceName
+import nominatim.tokenizer.sanitizers.helpers as helpers
+
+@pytest.mark.parametrize('inp', ('fg34', 'f\\f', 'morning [glory]', '56.78'))
+def test_create_split_regex_no_params_unsplit(inp):
+    regex = helpers.create_split_regex({})
+
+    assert list(regex.split(inp)) == [inp]
+
+
+@pytest.mark.parametrize('inp,outp', [('here,there', ['here', 'there']),
+                                      ('ying;;yang', ['ying', 'yang']),
+                                      (';a; ;c;d,', ['', 'a', '', 'c', 'd', '']),
+                                      ('1,  3  ,5', ['1', '3', '5'])
+                                     ])
+def test_create_split_regex_no_params_split(inp, outp):
+    regex = helpers.create_split_regex({})
+
+    assert list(regex.split(inp)) == outp
+
+
+@pytest.mark.parametrize('delimiter', ['.', '\\', '[]', '   ', '/.*+'])
+def test_create_split_regex_custom(delimiter):
+    regex = helpers.create_split_regex({'delimiters': delimiter})
+
+    assert list(regex.split(f'out{delimiter}house')) == ['out', 'house']
+    assert list(regex.split('out,house')) == ['out,house']
+
+
+def test_create_split_regex_empty_delimiter():
+    with pytest.raises(UsageError):
+        regex = helpers.create_split_regex({'delimiters': ''})
+
+
+@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*'))
+def test_create_kind_filter_no_params(inp):
+    filt = helpers.create_kind_filter({})
+
+    assert filt(PlaceName('something', inp, ''))
+
+
+@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
+def test_create_kind_filter_custom_regex_positive(kind):
+    filt = helpers.create_kind_filter({'filter-kind': '.*de'})
+
+    assert filt(PlaceName('something', kind, ''))
+
+
+@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
+def test_create_kind_filter_custom_regex_negative(kind):
+    filt = helpers.create_kind_filter({'filter-kind': '.*de'})
+
+    assert not filt(PlaceName('something', kind, ''))
+
+
+@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
+def test_create_kind_filter_many_positive(kind):
+    filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
+
+    assert filt(PlaceName('something', kind, ''))
+
+
+@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
+def test_create_kind_filter_many_negative(kind):
+    filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
+
+    assert not filt(PlaceName('something', kind, ''))
index 2db9071b74ae5636d77e6e38c8ffa183f094b67d..47bd1e44cb0806ac6c331bced65e7219d90660d5 100644 (file)
@@ -5,7 +5,7 @@
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
-Tests for the sanitizer that splitts multivalue lists.
+Tests for the sanitizer that splits multivalue lists.
 """
 import pytest
 
index a19578c95517e5e94a8077315cf65d33e6c323f4..a3839365a750baa9c39ad8555acea1416fee9c79 100644 (file)
@@ -400,7 +400,9 @@ class TestPlaceAddress:
 
     @pytest.fixture(autouse=True)
     def setup(self, analyzer, sql_functions):
-        with analyzer(trans=(":: upper()", "'🜵' > ' '")) as anl:
+        hnr = {'step': 'clean-housenumbers',
+               'filter-kind': ['housenumber', 'conscriptionnumber', 'streetnumber']}
+        with analyzer(trans=(":: upper()", "'🜵' > ' '"), sanitizers=[hnr]) as anl:
             self.analyzer = anl
             yield anl
 
@@ -446,13 +448,6 @@ class TestPlaceAddress:
         assert info['hnr_tokens'] == "{-1}"
 
 
-    def test_process_place_housenumbers_lists(self, getorcreate_hnr_id):
-        info = self.process_address(conscriptionnumber='1; 2;3')
-
-        assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
-        assert info['hnr_tokens'] == "{-1,-2,-3}"
-
-
     def test_process_place_housenumbers_duplicates(self, getorcreate_hnr_id):
         info = self.process_address(housenumber='134',
                                     conscriptionnumber='134',