1 # SPDX-License-Identifier: GPL-2.0-only
\r
3 # This file is part of Nominatim. (https://nominatim.org)
\r
5 # Copyright (C) 2023 by the Nominatim developer community.
\r
6 # For a full list of authors see the git log.
\r
8 Sanitizer which prevents certain tags from getting into the search index.
\r
It removes tags which match all of the properties given below.
\r
13 type: Define which type of tags should be considered for removal.
\r
14 There are two types of tags 'name' and 'address' tags.
\r
15 Takes a string 'name' or 'address'. (default: 'name')
\r
17 filter-kind: Define which 'kind' of tags should be removed.
\r
18 Takes a string or list of strings where each
\r
19 string is a regular expression. A tag is considered
\r
20 to be a candidate for removal if its 'kind' property
\r
21 fully matches any of the given regular expressions.
\r
22 Note that by default all 'kind' of tags are considered.
\r
24 suffix: Define the 'suffix' property of the tags which should be
\r
25 removed. Takes a string or list of strings where each
\r
26 string is a regular expression. A tag is considered to be a
\r
27 candidate for removal if its 'suffix' property fully
\r
28 matches any of the given regular expressions. Note that by
\r
29 default tags with any suffix value are considered including
\r
30 those which don't have a suffix at all.
\r
    name: Define the 'name' property corresponding to the 'kind' property
          of the tag. Takes a string or list of strings where each string
          is a regular expression. A tag is considered to be a candidate
          for removal if its name fully matches any of the given regular
          expressions. Note that by default tags with any 'name' are
          considered.
\r
    country_code: Define the country code of places whose tags should be
                  considered for removal. Takes a string or list of strings
                  where each string is a two-letter lower-case country code.
                  Note that by default tags of places with any country code
                  are considered including those which don't have a country
                  code at all.
\r
    rank_address: Define the address rank of places whose tags should be
                  considered for removal. Takes a string or list of strings
                  where each string is a number or a range of numbers of
                  the form <from>-<to>.
                  Note that the default is '0-30', which means that tags of
                  all places are considered.
\r
52 See https://nominatim.org/release-docs/latest/customize/Ranking/#address-rank
\r
53 to learn more about address rank.
\r
57 from typing import Callable, List, Optional, Pattern, Tuple, Sequence
\r
60 from nominatim.tokenizer.sanitizers.base import ProcessInfo
\r
61 from nominatim.data.place_name import PlaceName
\r
62 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
\r
class _TagSanitizer:
    """ Callable sanitizer which removes name or address tags that match
        all of the configured filter criteria.
    """

    def __init__(self, config: SanitizerConfig) -> None:
        """ Read the filter criteria from the sanitizer configuration.

            The following configuration keys are read: 'type',
            'country_code', 'rank_address', 'suffix' and 'name'. The
            kind filter is obtained from `config.get_filter_kind()`.
        """
        self.type = config.get('type', 'name')
        self.filter_kind = config.get_filter_kind()
        self.country_codes = config.get_string_list('country_code', [])
        self.allowed_ranks = self._set_allowed_ranks(
            config.get_string_list('rank_address', ['0-30']))

        # Remember whether a country filter was configured at all, so that
        # an unset 'country_code' does not restrict the places processed.
        self.has_country_code = config.get('country_code', None) is not None

        # The default pattern fully matches any string including the empty
        # one, so tags without a suffix are candidates by default.
        suffixregexps = config.get_string_list('suffix', [r'[\s\S]*'])
        self.suffix_regexp = [re.compile(r) for r in suffixregexps]

        nameregexps = config.get_string_list('name', [r'[\s\S]*'])
        self.name_regexp = [re.compile(r) for r in nameregexps]


    def __call__(self, obj: ProcessInfo) -> None:
        """ Remove all matching tags from the names or the address of
            the given object, depending on the configured 'type'.

            A tag is removed only if its kind, suffix and name all match
            the configured filters. The object is left untouched when the
            place is excluded by the country code or address rank filter.
        """
        tags = obj.names if self.type == 'name' else obj.address

        # NOTE(review): the opening of this condition and the early return
        # were lost in the damaged listing and have been restored from
        # context - confirm against the upstream file.
        if (not tags
                or (self.has_country_code
                    and obj.place.country_code not in self.country_codes)
                or not self.allowed_ranks[obj.place.rank_address]):
            return

        filtered_tags: List[PlaceName] = []

        for tag in tags:
            # Keep the tag as soon as one of the criteria does not match.
            if (not self.filter_kind(tag.kind)
                    or not self._matches(tag.suffix, self.suffix_regexp)
                    or not self._matches(tag.name, self.name_regexp)):
                filtered_tags.append(tag)

        if self.type == 'name':
            obj.names = filtered_tags
        else:
            obj.address = filtered_tags


    def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
        """ Returns a tuple of 31 boolean values corresponding to the
            address ranks 0-30. Value at index 'i' is True if rank 'i'
            is present in the ranks or lies in the range of any of the
            ranks provided in the sanitizer configuration, otherwise
            the value is False.

            Each entry of 'ranks' is either a single number or a range
            of the form '<from>-<to>' with both bounds included. An entry
            which is not numeric raises a ValueError, a rank above 30
            raises an IndexError.
        """
        allowed_ranks = [False] * 31

        for rank in ranks:
            intvl = [int(x) for x in rank.split('-')]

            # A single number is handled as a range of length one.
            start, end = (intvl[0], intvl[0]) if len(intvl) == 1 else (intvl[0], intvl[1])

            for i in range(start, end + 1):
                allowed_ranks[i] = True

        return tuple(allowed_ranks)


    def _matches(self, value: Optional[str], patterns: List[Pattern[str]]) -> bool:
        """ Returns True if the given value fully matches any of the regular
            expression pattern in the list. Otherwise, returns False.

            Note that if the value is None, it is taken as an empty string.
        """
        target = '' if value is None else value
        return any(r.fullmatch(target) is not None for r in patterns)
\r
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a function to process removal of certain tags.

        The returned callable is a `_TagSanitizer` instance which has
        been set up from the given sanitizer configuration.
    """
    return _TagSanitizer(config)
\r