rendering:
heading_level: 6
+##### clean-housenumbers
+
+::: nominatim.tokenizer.sanitizers.clean_housenumbers
+ selection:
+ members: False
+ rendering:
+ heading_level: 6
#### Token Analysis
def _process_place_address(self, token_info, address):
- hnrs = []
+ hnrs = set()
addr_terms = []
streets = []
for item in address:
if item.kind == 'postcode':
self._add_postcode(item.name)
- elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
- hnrs.append(item.name)
+ elif item.kind == 'housenumber':
+ norm_name = self._make_standard_hnr(item.name)
+ if norm_name:
+ hnrs.add(norm_name)
elif item.kind == 'street':
streets.extend(self._retrieve_full_tokens(item.name))
elif item.kind == 'place':
addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
if hnrs:
- hnrs = self._split_housenumbers(hnrs)
- token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+ token_info.add_housenumbers(self.conn, hnrs)
if addr_terms:
token_info.add_address_terms(addr_terms)
self._cache.postcodes.add(postcode)
- @staticmethod
- def _split_housenumbers(hnrs):
- if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
- # split numbers if necessary
- simple_list = []
- for hnr in hnrs:
- simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
-
- if len(simple_list) > 1:
- hnrs = list(set(simple_list))
- else:
- hnrs = simple_list
-
- return hnrs
-
-
-
-
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Sanitizer that preprocesses address tags for house numbers. The sanitizer
+allows you to
+
+* define which tags are to be considered house numbers (see 'filter-kind')
+* split house number lists into individual numbers (see 'delimiters')
+
+Arguments:
+ delimiters: Define the set of characters to be used for
+ splitting a list of house numbers into parts. (default: ',;')
+ filter-kind: Define the address tags that are considered to be a
+ house number. Either takes a single string or a list of strings,
+ where each string is a regular expression. An address item
+ is considered a house number if the 'kind' fully matches any
+ of the given regular expressions. (default: 'housenumber')
+
+"""
+from nominatim.tokenizer.sanitizers.helpers import create_split_regex, create_kind_filter
+
+class _HousenumberSanitizer:
+ """ Callable sanitizer that rewrites the address items of a place.
+
+ Address items accepted by the 'filter-kind' filter are replaced by
+ one item of kind 'housenumber' for each value that results from
+ splitting the item name on the configured delimiters. All other
+ address items are passed through unchanged.
+ """
+
+ def __init__(self, config):
+ # Without an explicit 'filter-kind' only items whose kind is exactly
+ # 'housenumber' are processed.
+ self.filter_kind = create_kind_filter(config, 'housenumber')
+ # Compiled from the 'delimiters' parameter of the configuration.
+ self.split_regexp = create_split_regex(config)
+
+
+ def __call__(self, obj):
+ """ Replace obj.address with the sanitized list of address items.
+
+ Nothing is changed when the object has no address items.
+ """
+ if not obj.address:
+ return
+
+ new_address = []
+ for item in obj.address:
+ if self.filter_kind(item):
+ # One new item per extracted house number. The kind is forced
+ # to 'housenumber' whatever the original kind was.
+ # NOTE(review): presumably clone() carries over the remaining
+ # attributes of the item - confirm against PlaceName.clone().
+ new_address.extend(item.clone(kind='housenumber', name=n)
+ for n in self.sanitize(item.name))
+ else:
+ # Don't touch other address items.
+ new_address.append(item)
+
+ obj.address = new_address
+
+
+ def sanitize(self, value):
+ """ Extract housenumbers in a regularized format from an OSM value.
+
+ The function works as a generator that yields all valid housenumbers
+ that can be created from the value.
+ """
+ for hnr in self.split_regexp.split(value):
+ # The split expression may produce empty fields, skip them.
+ if hnr:
+ yield from self._regularize(hnr)
+
+
+ @staticmethod
+ def _regularize(hnr):
+ # Currently a pass-through: the house number is yielded unchanged.
+ yield hnr
+
+
+def create(config):
+ """ Create a housenumber processing function.
+
+ Returns a _HousenumberSanitizer instance built from 'config'.
+ The instance is callable through its __call__ method.
+ """
+
+ return _HousenumberSanitizer(config)
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Helper functions for sanitizers.
+"""
+import re
+
+from nominatim.errors import UsageError
+
+def create_split_regex(config, default=',;'):
+    """ Convert the 'delimiters' parameter in the configuration into a
+        compiled regular expression that can be used to split names on the
+        delimiters. The regular expression makes sure that the resulting
+        names are stripped and that repeated delimiters are ignored but it
+        will still create empty fields on occasion (for example for a
+        leading or trailing delimiter). The caller needs to filter those.
+
+        The 'default' parameter defines the delimiter set to be used when
+        not explicitly configured.
+
+        Raises a UsageError when the set of delimiters is empty.
+    """
+    delimiter_set = set(config.get('delimiters', default))
+    if not delimiter_set:
+        raise UsageError("Empty 'delimiters' parameter not allowed for sanitizer.")
+
+    # re.escape() only escapes characters that are special in a regular
+    # expression. Blindly prefixing every character with a backslash would
+    # turn letters and digits into escape sequences: a delimiter 'd' would
+    # become '\d' and split on every digit. Sorting keeps the resulting
+    # pattern reproducible between runs.
+    char_class = ''.join(re.escape(d) for d in sorted(delimiter_set))
+
+    return re.compile('\\s*[{}]+\\s*'.format(char_class))
+
+
+def create_kind_filter(config, default=None):
+    """ Create a filter function for the name kind from the 'filter-kind'
+        config parameter. The filter function takes a name item and returns
+        True when the item passes the filter and False otherwise.
+
+        If the parameter is empty, the filter lets all items pass. If the
+        parameter is a string, it is interpreted as a single regular expression
+        that must match the full kind string. If the parameter is a list then
+        any of the regular expressions in the list must match to pass.
+
+        The 'default' parameter defines the filter to be used when
+        'filter-kind' is not explicitly configured.
+    """
+    filters = config.get('filter-kind', default)
+
+    if not filters:
+        return lambda _: True
+
+    # Treat a single expression like a one-element list, so that both
+    # variants share one code path and always return a proper boolean
+    # instead of a match object.
+    if isinstance(filters, str):
+        filters = [filters]
+
+    regexes = [re.compile(regex) for regex in filters]
+
+    return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
Arguments:
delimiters: Define the set of characters to be used for
- splitting the list. (default: `,;`)
+ splitting the list. (default: ',;')
"""
-import re
-
-from nominatim.errors import UsageError
+from nominatim.tokenizer.sanitizers.helpers import create_split_regex
def create(func):
""" Create a name processing function that splits name values with
multiple values into their components.
"""
- delimiter_set = set(func.get('delimiters', ',;'))
- if not delimiter_set:
- raise UsageError("Set of delimiters in split-name-list sanitizer is empty.")
-
- regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set)))
+ regexp = create_split_regex(func)
def _process(obj):
if not obj.names:
filter-kind: Restrict the names the sanitizer should be applied to
to the given tags. The parameter expects a list of
- regular expressions which are matched against `kind`.
+ regular expressions which are matched against 'kind'.
Note that a match against the full string is expected.
whitelist: Restrict the set of languages that should be tagged.
Expects a list of acceptable suffixes. When unset,
any analyzer tagged) is retained. (default: replace)
"""
-import re
-
from nominatim.tools import country_info
+from nominatim.tokenizer.sanitizers.helpers import create_kind_filter
class _AnalyzerByLanguage:
""" Processor for tagging the language of names in a place.
"""
def __init__(self, config):
- if 'filter-kind' in config:
- self.regexes = [re.compile(regex) for regex in config['filter-kind']]
- else:
- self.regexes = None
-
+ self.filter_kind = create_kind_filter(config)
self.replace = config.get('mode', 'replace') != 'append'
self.whitelist = config.get('whitelist')
self.deflangs[ccode] = clangs
- def _kind_matches(self, kind):
- if self.regexes is None:
- return True
-
- return any(regex.fullmatch(kind) for regex in self.regexes)
-
-
def _suffix_matches(self, suffix):
if self.whitelist is None:
return len(suffix) in (2, 3) and suffix.islower()
more_names = []
for name in (n for n in obj.names
- if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
+ if not n.has_attr('analyzer') and self.filter_kind(n)):
if name.suffix:
langs = [name.suffix] if self._suffix_matches(name.suffix) else None
else:
sanitizers:
- step: split-name-list
- step: strip-brace-terms
+ - step: clean-housenumbers
+ filter-kind:
+ - housenumber
+ - conscriptionnumber
+ - streetnumber
- step: tag-analyzer-by-language
filter-kind: [".*name.*"]
whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
--- /dev/null
+@DB
+Feature: Searching of house numbers
+ Test for specialised treatment of housenumbers
+
+ Background:
+ Given the grid
+ | 1 | | 2 | | 3 |
+ | | 9 | | | |
+ | | | | | 4 |
+
+
+ Scenario: A simple numeral housenumber is found
+ Given the places
+ | osm | class | type | housenr | geometry |
+ | N1 | building | yes | 45 | 9 |
+ And the places
+ | osm | class | type | name | geometry |
+ | W10 | highway | path | North Road | 1,2,3 |
+ When importing
+ And sending search query "45, North Road"
+ Then results contain
+ | osm |
+ | N1 |
+ When sending search query "North Road 45"
+ Then results contain
+ | osm |
+ | N1 |
+
+
+ Scenario Outline: Each housenumber in a list is found
+ Given the places
+ | osm | class | type | housenr | geometry |
+ | N1 | building | yes | <hnrs> | 9 |
+ And the places
+ | osm | class | type | name | geometry |
+ | W10 | highway | path | Multistr | 1,2,3 |
+ When importing
+ When sending search query "2 Multistr"
+ Then results contain
+ | osm |
+ | N1 |
+ When sending search query "4 Multistr"
+ Then results contain
+ | osm |
+ | N1 |
+ When sending search query "12 Multistr"
+ Then results contain
+ | osm |
+ | N1 |
+
+ Examples:
+ | hnrs |
+ | 2;4;12 |
+ | 2,4,12 |
+ | 2, 4, 12 |
--- /dev/null
+[pytest]
+markers =
+ sanitizer_params
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for the sanitizer that normalizes housenumbers.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+@pytest.fixture
+def sanitize(request):
+ sanitizer_args = {'step': 'clean-housenumbers'}
+ for mark in request.node.iter_markers(name="sanitizer_params"):
+ sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()})
+
+ def _run(**kwargs):
+ place = PlaceInfo({'address': kwargs})
+ _, address = PlaceSanitizer([sanitizer_args]).process_names(place)
+
+ return sorted([(p.kind, p.name) for p in address])
+
+ return _run
+
+
+def test_simple_number(sanitize):
+ assert sanitize(housenumber='34') == [('housenumber', '34')]
+
+
+@pytest.mark.parametrize('number', ['1;2;3', '1,2,3', '1; 3 ,2',
+ '2,,3,1', '1;2;3;;', ';3;2;1'])
+def test_housenumber_lists(sanitize, number):
+ assert sanitize(housenumber=number) == \
+ [('housenumber', '1'), ('housenumber', '2'), ('housenumber', '3')]
+
+
+@pytest.mark.sanitizer_params(filter_kind=('number', 'streetnumber'))
+def test_filter_kind(sanitize):
+ assert sanitize(housenumber='34', number='4', badnumber='65') == \
+ [('badnumber', '65'), ('housenumber', '34'), ('housenumber', '4')]
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for sanitizer helper functions.
+"""
+import pytest
+
+from nominatim.errors import UsageError
+from nominatim.tokenizer.place_sanitizer import PlaceName
+import nominatim.tokenizer.sanitizers.helpers as helpers
+
+@pytest.mark.parametrize('inp', ('fg34', 'f\\f', 'morning [glory]', '56.78'))
+def test_create_split_regex_no_params_unsplit(inp):
+ regex = helpers.create_split_regex({})
+
+ assert list(regex.split(inp)) == [inp]
+
+
+@pytest.mark.parametrize('inp,outp', [('here,there', ['here', 'there']),
+ ('ying;;yang', ['ying', 'yang']),
+ (';a; ;c;d,', ['', 'a', '', 'c', 'd', '']),
+ ('1, 3 ,5', ['1', '3', '5'])
+ ])
+def test_create_split_regex_no_params_split(inp, outp):
+ regex = helpers.create_split_regex({})
+
+ assert list(regex.split(inp)) == outp
+
+
+@pytest.mark.parametrize('delimiter', ['.', '\\', '[]', ' ', '/.*+'])
+def test_create_split_regex_custom(delimiter):
+ regex = helpers.create_split_regex({'delimiters': delimiter})
+
+ assert list(regex.split(f'out{delimiter}house')) == ['out', 'house']
+ assert list(regex.split('out,house')) == ['out,house']
+
+
+def test_create_split_regex_empty_delimiter():
+ with pytest.raises(UsageError):
+ regex = helpers.create_split_regex({'delimiters': ''})
+
+
+@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*'))
+def test_create_kind_filter_no_params(inp):
+ filt = helpers.create_kind_filter({})
+
+ assert filt(PlaceName('something', inp, ''))
+
+
+@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
+def test_create_kind_filter_custom_regex_positive(kind):
+ filt = helpers.create_kind_filter({'filter-kind': '.*de'})
+
+ assert filt(PlaceName('something', kind, ''))
+
+
+@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
+def test_create_kind_filter_custom_regex_negative(kind):
+ filt = helpers.create_kind_filter({'filter-kind': '.*de'})
+
+ assert not filt(PlaceName('something', kind, ''))
+
+
+@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
+def test_create_kind_filter_many_positive(kind):
+ filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
+
+ assert filt(PlaceName('something', kind, ''))
+
+
+@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
+def test_create_kind_filter_many_negative(kind):
+ filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
+
+ assert not filt(PlaceName('something', kind, ''))
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
-Tests for the sanitizer that splitts multivalue lists.
+Tests for the sanitizer that splits multivalue lists.
"""
import pytest
@pytest.fixture(autouse=True)
def setup(self, analyzer, sql_functions):
- with analyzer(trans=(":: upper()", "'🜵' > ' '")) as anl:
+ hnr = {'step': 'clean-housenumbers',
+ 'filter-kind': ['housenumber', 'conscriptionnumber', 'streetnumber']}
+ with analyzer(trans=(":: upper()", "'🜵' > ' '"), sanitizers=[hnr]) as anl:
self.analyzer = anl
yield anl
assert info['hnr_tokens'] == "{-1}"
- def test_process_place_housenumbers_lists(self, getorcreate_hnr_id):
- info = self.process_address(conscriptionnumber='1; 2;3')
-
- assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
- assert info['hnr_tokens'] == "{-1,-2,-3}"
-
-
def test_process_place_housenumbers_duplicates(self, getorcreate_hnr_id):
info = self.process_address(housenumber='134',
conscriptionnumber='134',