1 # SPDX-License-Identifier: GPL-2.0-only
\r
3 # This file is part of Nominatim. (https://nominatim.org)
\r
5 # Copyright (C) 2023 by the Nominatim developer community.
\r
6 # For a full list of authors see the git log.
\r
8 Sanitizer which prevents certain tags from getting into the search index.
\r
It removes tags which match all properties given below.
\r
13 type: Define which type of tags should be considered for removal.
\r
14 There are two types of tags 'name' and 'address' tags.
\r
15 Takes a string 'name' or 'address'. (default: 'name')
\r
17 filter-kind: Define which 'kind' of tags should be removed.
\r
18 Takes a string or list of strings where each
\r
19 string is a regular expression. A tag is considered
\r
20 to be a candidate for removal if its 'kind' property
\r
21 fully matches any of the given regular expressions.
\r
22 Note that by default all 'kind' of tags are considered.
\r
24 suffix: Define the 'suffix' property of the tags which should be
\r
25 removed. Takes a string or list of strings where each
\r
26 string is a regular expression. A tag is considered to be a
\r
27 candidate for removal if its 'suffix' property fully
\r
28 matches any of the given regular expressions. Note that by
\r
29 default tags with any suffix value are considered including
\r
30 those which don't have a suffix at all.
\r
32 name: Define the 'name' property corresponding to the 'kind' property
\r
33 of the tag. Takes a string or list of strings where each string
\r
34 is a regular expression. A tag is considered to be a candidate
\r
35 for removal if its name fully matches any of the given regular
\r
expressions. Note that by default tags with any 'name' are considered.
\r
39 country_code: Define the country code of places whose tags should be
\r
considered for removal. Takes a string or list of strings
\r
41 where each string is a two-letter lower-case country code.
\r
42 Note that by default tags of places with any country code
\r
are considered including those which don't have a country code at all.
\r
46 rank_address: Define the address rank of places whose tags should be
\r
47 considered for removal. Takes a string or list of strings
\r
where each string is a number or a range of numbers of the form <from>-<to>.
\r
50 Note that default is '0-30', which means that tags of all
\r
51 places are considered.
\r
52 See https://nominatim.org/release-docs/latest/customize/Ranking/#address-rank
\r
53 to learn more about address rank.
\r
57 from typing import Callable, List, Tuple, Sequence
\r
59 from nominatim.tokenizer.sanitizers.base import ProcessInfo
\r
60 from nominatim.data.place_name import PlaceName
\r
61 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
\r
63 class _TagSanitizer:
\r
65 def __init__(self, config: SanitizerConfig) -> None:
\r
66 self.type = config.get('type', 'name')
\r
67 self.filter_kind = config.get_filter('filter-kind')
\r
68 self.country_codes = config.get_string_list('country_code', [])
\r
69 self.filter_suffix = config.get_filter('suffix')
\r
70 self.filter_name = config.get_filter('name')
\r
71 self.allowed_ranks = self._set_allowed_ranks(
\r
72 config.get_string_list("rank_address", ["0-30"])
\r
75 self.has_country_code = config.get('country_code', None) is not None
\r
78 def __call__(self, obj: ProcessInfo) -> None:
\r
79 tags = obj.names if self.type == 'name' else obj.address
\r
82 or not self.allowed_ranks[obj.place.rank_address] \
\r
83 or self.has_country_code \
\r
84 and obj.place.country_code not in self.country_codes:
\r
87 filtered_tags: List[PlaceName] = []
\r
91 if not self.filter_kind(tag.kind) \
\r
92 or not self.filter_suffix(tag.suffix or '') \
\r
93 or not self.filter_name(tag.name):
\r
94 filtered_tags.append(tag)
\r
97 if self.type == 'name':
\r
98 obj.names = filtered_tags
\r
100 obj.address = filtered_tags
\r
103 def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
\r
104 """ Returns a tuple of 31 boolean values corresponding to the
\r
105 address ranks 0-30. Value at index 'i' is True if rank 'i'
\r
106 is present in the ranks or lies in the range of any of the
\r
107 ranks provided in the sanitizer configuration, otherwise
\r
108 the value is False.
\r
110 allowed_ranks = [False] * 31
\r
113 intvl = [int(x) for x in rank.split('-')]
\r
115 start, end = intvl[0], intvl[0] if len(intvl) == 1 else intvl[1]
\r
117 for i in range(start, end + 1):
\r
118 allowed_ranks[i] = True
\r
121 return tuple(allowed_ranks)
\r
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Build the sanitizer which removes tags according to `config`.

        The returned object is called with a ProcessInfo and modifies
        it in place.
    """
    sanitizer = _TagSanitizer(config)
    return sanitizer
\r