git.openstreetmap.org Git - nominatim.git/blob - test/python/tokenizer/sanitizers/test_split_name_list.py
clean_housenumbers: make kinds and delimiters configurable
[nominatim.git] / test / python / tokenizer / sanitizers / test_split_name_list.py
1 # SPDX-License-Identifier: GPL-2.0-only
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tests for the sanitizer that splits multivalue lists.
9 """
10 import pytest
11
12 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
13 from nominatim.indexer.place_info import PlaceInfo
14
15 from nominatim.errors import UsageError
16
def run_sanitizer_on(**kwargs):
    """
    Run the 'split-name-list' step with its default configuration.

    The keyword arguments are used as the 'name' dictionary of the place.
    Returns the sorted list of (name, kind, suffix) tuples of the names
    produced by the sanitizer.
    """
    place = PlaceInfo({'name': kwargs})
    sanitizer = PlaceSanitizer([{'step': 'split-name-list'}])
    names, _ = sanitizer.process_names(place)

    result = [(item.name, item.kind, item.suffix) for item in names]
    result.sort()

    return result
22
23
def sanitize_with_delimiter(delimiter, name):
    """
    Run the 'split-name-list' step with a custom 'delimiters' setting.

    `name` is the value of the single 'name' tag of the place. Returns
    the sorted list of the resulting name strings.
    """
    place = PlaceInfo({'name': {'name': name}})
    step = {'step': 'split-name-list', 'delimiters': delimiter}
    names, _ = PlaceSanitizer([step]).process_names(place)

    parts = [item.name for item in names]
    parts.sort()

    return parts
30
31
def test_simple():
    """
    A value without delimiters, including the empty string, is expected
    to come back as a single unchanged name.
    """
    for value in ('ABC', ''):
        assert run_sanitizer_on(name=value) == [(value, 'name', None)]
35
36
def test_splits():
    """
    Values are expected to be split at ';' and ',' with surrounding
    whitespace removed; the kind of the tag is kept for every part.
    """
    expected_names = [(part, 'name', None) for part in ('A', 'B', 'C')]
    assert run_sanitizer_on(name='A;B;C') == expected_names

    expected_short = [('House', 'short_name', None),
                      ('boat', 'short_name', None)]
    assert run_sanitizer_on(short_name=' House, boat ') == expected_short
43
44
def test_empty_fields():
    """
    Parts that are empty or consist only of whitespace are expected to
    be dropped from the result.
    """
    only_b = [('B', 'name', None)]
    a_and_b = [('A', 'name', None)] + only_b

    assert run_sanitizer_on(name='A;;B') == a_and_b
    assert run_sanitizer_on(name='A; ,B') == a_and_b
    assert run_sanitizer_on(name=' ;B') == only_b
    assert run_sanitizer_on(name='B,') == only_b
52
53
def test_custom_delimiters():
    """
    With a 'delimiters' setting, splitting is expected to happen only at
    the configured characters.
    """
    # Each case: (delimiters, input value, expected sorted names).
    cases = (
        (':', '12:45,3', ['12', '45,3']),
        ('\\', 'a;\\b!#@ \\', ['a;', 'b!#@']),
        ('[]', 'foo[to]be', ['be', 'foo', 'to']),
        (' ', 'morning  sun', ['morning', 'sun']),
    )

    for delimiters, value, expected in cases:
        assert sanitize_with_delimiter(delimiters, value) == expected
59
60
def test_empty_delimiter_set():
    """
    An empty 'delimiters' setting is expected to be rejected with a
    UsageError.
    """
    with pytest.raises(UsageError):
        sanitize_with_delimiter(delimiter='', name='abc')
64
65
def test_no_name_list():
    """
    A place with only an address part is expected to yield no names and
    to keep its single address entry.
    """
    place = PlaceInfo({'address': {'housenumber': '3'}})
    sanitizer = PlaceSanitizer([{'step': 'split-name-list'}])
    names, addresses = sanitizer.process_names(place)

    assert not names
    assert len(addresses) == 1