rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
- # Make sure country information is available to analyzers and sanatizers.
+ # Make sure country information is available to analyzers and sanitizers.
nominatim.tools.country_info.setup_country_config(config)
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
import importlib
from nominatim.errors import UsageError
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
class PlaceName:
""" A searchable name for a place together with properties.
raise UsageError("Sanitizer rule is missing the 'step' attribute.")
module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
handler_module = importlib.import_module(module_name)
- self.handlers.append(handler_module.create(func))
+ self.handlers.append(handler_module.create(SanitizerConfig(func)))
def process_names(self, place):
where each string is a regular expression. An address item
is considered a house number if the 'kind' fully matches any
of the given regular expressions. (default: 'housenumber')
-
+ convert-to-name: Define house numbers that should be treated as a name
+ instead of a house number. Either takes a single string
+ or a list of strings, where each string is a regular
+ expression that must match the full house number value.
"""
-from nominatim.tokenizer.sanitizers.helpers import create_split_regex, create_kind_filter
+import re
class _HousenumberSanitizer:
def __init__(self, config):
- self.filter_kind = create_kind_filter(config, 'housenumber')
- self.split_regexp = create_split_regex(config)
+ self.filter_kind = config.get_filter_kind('housenumber')
+ self.split_regexp = config.get_delimiter()
- nameregexps = config.get('is-a-name', [])
+ nameregexps = config.get_string_list('convert-to-name', [])
self.is_name_regexp = [re.compile(r) for r in nameregexps]
new_address = []
for item in obj.address:
if self.filter_kind(item):
- if self.treat_as_name(item.name):
+ if self._treat_as_name(item.name):
obj.names.append(item.clone(kind='housenumber'))
else:
new_address.extend(item.clone(kind='housenumber', name=n)
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Configuration for Sanitizers.
+"""
+from collections import UserDict
+import re
+
+from nominatim.errors import UsageError
+
+class SanitizerConfig(UserDict):
+    """ Dictionary with configuration options for a sanitizer.
+
+        In addition to the usual dictionary functions, the class provides
+        accessors to standard sanitizer options that are used by many of the
+        sanitizers.
+    """
+
+    def get_string_list(self, param, default=tuple()):
+        """ Extract a configuration parameter as a string list.
+
+            If the parameter value is a simple string, it is returned as a
+            one-item list. If the parameter value does not exist, a list
+            copy of the given default is returned (or None when the default
+            is None). If the parameter value is a list or tuple, it is
+            checked to contain only strings and returned as a list.
+
+            Raises a UsageError when the value is neither a string nor a
+            list/tuple consisting only of strings.
+        """
+        values = self.data.get(param, None)
+
+        if values is None:
+            return None if default is None else list(default)
+
+        if isinstance(values, str):
+            return [values]
+
+        if not isinstance(values, (list, tuple)) \
+           or any(not isinstance(value, str) for value in values):
+            raise UsageError(f"Parameter '{param}' must be string or list of strings.")
+
+        # Always hand out a real list, also when the configuration used a tuple.
+        return list(values)
+
+
+    def get_delimiter(self, default=',;'):
+        """ Return the 'delimiters' parameter in the configuration as a
+            compiled regular expression that can be used to split the names
+            on the delimiters. The regular expression makes sure that the
+            resulting names are stripped and that repeated delimiters
+            are ignored but it will still create empty fields on occasion.
+            The code needs to filter those.
+
+            The 'default' parameter defines the delimiter set to be used when
+            not explicitly configured.
+
+            Raises a UsageError when the delimiter set is empty.
+        """
+        delimiter_set = set(self.data.get('delimiters', default))
+        if not delimiter_set:
+            raise UsageError("Empty 'delimiters' parameter not allowed for sanitizer.")
+
+        # re.escape() only escapes characters that are special in a regular
+        # expression. Blindly prefixing a backslash would turn letters and
+        # digits into escape sequences (e.g. 'd' into the digit class '\d').
+        # Sorting keeps the generated pattern independent of set ordering.
+        charclass = ''.join(re.escape(d) for d in sorted(delimiter_set))
+
+        return re.compile(f'\\s*[{charclass}]+\\s*')
+
+
+    def get_filter_kind(self, *default):
+        """ Return a filter function for the name kind from the 'filter-kind'
+            config parameter. The filter function takes a name item and
+            returns True when the item passes the filter.
+
+            If the parameter is empty, the filter lets all items pass. If the
+            parameter is a string, it is interpreted as a single regular
+            expression that must match the full kind string. If the parameter
+            is a list then any of the regular expressions in the list must
+            match to pass.
+
+            The positional 'default' arguments are the expressions to use
+            when 'filter-kind' is not configured.
+        """
+        filters = self.get_string_list('filter-kind', default)
+
+        if not filters:
+            return lambda _: True
+
+        regexes = [re.compile(regex) for regex in filters]
+
+        return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
+++ /dev/null
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# This file is part of Nominatim. (https://nominatim.org)
-#
-# Copyright (C) 2022 by the Nominatim developer community.
-# For a full list of authors see the git log.
-"""
-Helper functions for sanitizers.
-"""
-import re
-
-from nominatim.errors import UsageError
-
-def create_split_regex(config, default=',;'):
- """ Converts the 'delimiter' parameter in the configuration into a
- compiled regular expression that can be used to split the names on the
- delimiters. The regular expression makes sure that the resulting names
- are stripped and that repeated delimiters
- are ignored but it will still create empty fields on occasion. The
- code needs to filter those.
-
- The 'default' parameter defines the delimiter set to be used when
- not explicitly configured.
- """
- delimiter_set = set(config.get('delimiters', default))
- if not delimiter_set:
- raise UsageError("Empty 'delimiter' parameter not allowed for sanitizer.")
-
- return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
-
-
-def create_kind_filter(config, default=None):
- """ Create a filter function for the name kind from the 'filter-kind'
- config parameter. The filter functions takes a name item and returns
- True when the item passes the filter.
-
- If the parameter is empty, the filter lets all items pass. If the
- paramter is a string, it is interpreted as a single regular expression
- that must match the full kind string. If the parameter is a list then
- any of the regular expressions in the list must match to pass.
- """
- filters = config.get('filter-kind', default)
-
- if not filters:
- return lambda _: True
-
- if isinstance(filters, str):
- regex = re.compile(filters)
- return lambda name: regex.fullmatch(name.kind)
-
- regexes = [re.compile(regex) for regex in filters]
- return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
delimiters: Define the set of characters to be used for
splitting the list. (default: ',;')
"""
-from nominatim.tokenizer.sanitizers.helpers import create_split_regex
-
-def create(func):
+def create(config):
""" Create a name processing function that splits name values with
multiple values into their components.
"""
- regexp = create_split_regex(func)
+ regexp = config.get_delimiter()
def _process(obj):
if not obj.names:
"""
from nominatim.tools import country_info
-from nominatim.tokenizer.sanitizers.helpers import create_kind_filter
class _AnalyzerByLanguage:
""" Processor for tagging the language of names in a place.
"""
def __init__(self, config):
- self.filter_kind = create_kind_filter(config)
+ self.filter_kind = config.get_filter_kind()
self.replace = config.get('mode', 'replace') != 'append'
self.whitelist = config.get('whitelist')
- self.__compute_default_languages(config.get('use-defaults', 'no'))
+ self._compute_default_languages(config.get('use-defaults', 'no'))
- def __compute_default_languages(self, use_defaults):
+ def _compute_default_languages(self, use_defaults):
self.deflangs = {}
if use_defaults in ('mono', 'all'):
- housenumber
- conscriptionnumber
- streetnumber
- is-a-name:
+ convert-to-name:
- (\A|.*,)[^\d,]{3,}(,.*|\Z)
- step: split-name-list
- step: strip-brace-terms
def test_filter_kind(sanitize):
assert sanitize(housenumber='34', number='4', badnumber='65') == \
[('badnumber', '65'), ('housenumber', '34'), ('housenumber', '4')]
+
+
+@pytest.mark.parametrize('number', ('6523', 'n/a', '4'))
+def test_convert_to_name_converted(number):
+ # Every parametrized value fully matches one of the 'convert-to-name'
+ # expressions below, so the house number is expected to show up in the
+ # names of the place and to be absent from its address.
+ sanitizer_args = {'step': 'clean-housenumbers',
+ 'convert-to-name': (r'\d+', 'n/a')}
+
+ place = PlaceInfo({'address': {'housenumber': number}})
+ names, address = PlaceSanitizer([sanitizer_args]).process_names(place)
+
+ assert ('housenumber', number) in set((p.kind, p.name) for p in names)
+ assert 'housenumber' not in set(p.kind for p in address)
+
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
-Tests for sanitizer helper functions.
+Tests for sanitizer configuration helper functions.
"""
import pytest
from nominatim.errors import UsageError
from nominatim.tokenizer.place_sanitizer import PlaceName
-import nominatim.tokenizer.sanitizers.helpers as helpers
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
@pytest.mark.parametrize('inp', ('fg34', 'f\\f', 'morning [glory]', '56.78'))
def test_create_split_regex_no_params_unsplit(inp):
- regex = helpers.create_split_regex({})
+ regex = SanitizerConfig().get_delimiter()
assert list(regex.split(inp)) == [inp]
('1, 3 ,5', ['1', '3', '5'])
])
def test_create_split_regex_no_params_split(inp, outp):
- regex = helpers.create_split_regex({})
+ regex = SanitizerConfig().get_delimiter()
assert list(regex.split(inp)) == outp
@pytest.mark.parametrize('delimiter', ['.', '\\', '[]', ' ', '/.*+'])
def test_create_split_regex_custom(delimiter):
- regex = helpers.create_split_regex({'delimiters': delimiter})
+ regex = SanitizerConfig({'delimiters': delimiter}).get_delimiter()
assert list(regex.split(f'out{delimiter}house')) == ['out', 'house']
assert list(regex.split('out,house')) == ['out,house']
def test_create_split_regex_empty_delimiter():
with pytest.raises(UsageError):
- regex = helpers.create_split_regex({'delimiters': ''})
+ regex = SanitizerConfig({'delimiters': ''}).get_delimiter()
@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*'))
def test_create_kind_filter_no_params(inp):
- filt = helpers.create_kind_filter({})
+ filt = SanitizerConfig().get_filter_kind()
assert filt(PlaceName('something', inp, ''))
@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
def test_create_kind_filter_custom_regex_positive(kind):
- filt = helpers.create_kind_filter({'filter-kind': '.*de'})
+ filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
assert filt(PlaceName('something', kind, ''))
@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
def test_create_kind_filter_custom_regex_negative(kind):
- filt = helpers.create_kind_filter({'filter-kind': '.*de'})
+ filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
assert not filt(PlaceName('something', kind, ''))
@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
def test_create_kind_filter_many_positive(kind):
- filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
+ filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
assert filt(PlaceName('something', kind, ''))
@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
def test_create_kind_filter_many_negative(kind):
- filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
+ filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
assert not filt(PlaceName('something', kind, ''))