nominatim/tokenizer/sanitizers/delete_tags.py

   1 # SPDX-License-Identifier: GPL-2.0-only\r
   2 #\r
   3 # This file is part of Nominatim. (https://nominatim.org)\r
   4 #\r
   5 # Copyright (C) 2023 by the Nominatim developer community.\r
   6 # For a full list of authors see the git log.\r
   7 """\r
   8 Sanitizer which prevents certain tags from getting into the search index.\r
   9 It removes tags which match all properties given below.\r
  10 \r
  11 \r
  12 Arguments:\r
  13     type: Define which type of tags should be considered for removal.\r
  14           There are two types of tags 'name' and 'address' tags.\r
  15           Takes a string 'name' or 'address'. (default: 'name')\r
  16 \r
  17     filter-kind: Define which 'kind' of tags should be removed.\r
  18                  Takes a string or list of strings where each\r
  19                  string is a regular expression. A tag is considered\r
  20                  to be a candidate for removal if its 'kind' property\r
  21                  fully matches any of the given regular expressions.\r
  22                  Note that by default all 'kind' of tags are considered.\r
  23 \r
  24     suffix: Define the 'suffix' property of the tags which should be\r
  25             removed. Takes a string or list of strings where each\r
  26             string is a regular expression. A tag is considered to be a\r
  27             candidate for removal if its 'suffix' property fully\r
  28             matches any of the given regular expressions. Note that by\r
  29             default tags with any suffix value are considered including\r
  30             those which don't have a suffix at all.\r
  31 \r
  32     name: Define the 'name' property corresponding to the 'kind' property\r
  33           of the tag. Takes a string or list of strings where each string\r
  34           is a regular expression. A tag is considered to be a candidate\r
  35           for removal if its name fully matches any of the given regular\r
  36           expressions. Note that by default tags with any 'name' are\r
  37           considered.\r
  38 \r
  39     country_code: Define the country code of places whose tags should be\r
  40                   considered for removal. Takes a string or list of strings\r
  41                   where each string is a two-letter lower-case country code.\r
  42                   Note that by default tags of places with any country code\r
  43                   are considered including those which don't have a country\r
  44                   code at all.\r
  45 \r
  46     rank_address: Define the address rank of places whose tags should be\r
  47                   considered for removal. Takes a string or list of strings\r
  48                   where each string is a number or a range of numbers of the\r
  49                   form <from>-<to>.\r
  50                   Note that default is '0-30', which means that tags of all\r
  51                   places are considered.\r
  52                   See https://nominatim.org/release-docs/latest/customize/Ranking/#address-rank\r
  53                   to learn more about address rank.\r
  54 \r
  55 \r
  56 """\r
  57 from typing import Callable, List, Optional, Pattern, Tuple, Sequence\r
  58 import re\r
  59 \r
  60 from nominatim.tokenizer.sanitizers.base import ProcessInfo\r
  61 from nominatim.data.place_name import PlaceName\r
  62 from nominatim.tokenizer.sanitizers.config import SanitizerConfig\r
  63 \r
  64 class _TagSanitizer:\r
  65 \r
  66     def __init__(self, config: SanitizerConfig) -> None:\r
  67         self.type = config.get('type', 'name')\r
  68         self.filter_kind = config.get_filter_kind()\r
  69         self.country_codes = config.get_string_list('country_code', [])\r
  70         self.allowed_ranks = self._set_allowed_ranks( \\r
  71                                             config.get_string_list('rank_address', ['0-30']))\r
  72 \r
  73         self.has_country_code = config.get('country_code', None) is not None\r
  74 \r
  75         suffixregexps = config.get_string_list('suffix', [r'[\s\S]*'])\r
  76         self.suffix_regexp = [re.compile(r) for r in suffixregexps]\r
  77 \r
  78         nameregexps = config.get_string_list('name', [r'[\s\S]*'])\r
  79         self.name_regexp = [re.compile(r) for r in nameregexps]\r
  80 \r
  81 \r
  82 \r
  83     def __call__(self, obj: ProcessInfo) -> None:\r
  84         tags = obj.names if self.type == 'name' else obj.address\r
  85 \r
  86         if (not tags or\r
  87              self.has_country_code and\r
  88               obj.place.country_code not in self.country_codes or\r
  89                not self.allowed_ranks[obj.place.rank_address]):\r
  90             return\r
  91 \r
  92         filtered_tags: List[PlaceName] = []\r
  93 \r
  94         for tag in tags:\r
  95 \r
  96             if (not self.filter_kind(tag.kind) or\r
  97                   not self._matches(tag.suffix, self.suffix_regexp) or\r
  98                     not self._matches(tag.name, self.name_regexp)):\r
  99                 filtered_tags.append(tag)\r
 100 \r
 101 \r
 102         if self.type == 'name':\r
 103             obj.names = filtered_tags\r
 104         else:\r
 105             obj.address = filtered_tags\r
 106 \r
 107 \r
 108     def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:\r
 109         """ Returns a tuple of 31 boolean values corresponding to the\r
 110             address ranks 0-30. Value at index 'i' is True if rank 'i'\r
 111             is present in the ranks or lies in the range of any of the\r
 112             ranks provided in the sanitizer configuration, otherwise\r
 113             the value is False.\r
 114         """\r
 115         allowed_ranks = [False] * 31\r
 116 \r
 117         for rank in ranks:\r
 118             intvl = [int(x) for x in rank.split('-')]\r
 119 \r
 120             start, end = (intvl[0], intvl[0]) if len(intvl) == 1 else (intvl[0], intvl[1])\r
 121 \r
 122             for i in range(start, end + 1):\r
 123                 allowed_ranks[i] = True\r
 124 \r
 125 \r
 126         return tuple(allowed_ranks)\r
 127 \r
 128 \r
 129     def _matches(self, value: Optional[str], patterns: List[Pattern[str]]) -> bool:\r
 130         """ Returns True if the given value fully matches any of the regular\r
 131             expression pattern in the list. Otherwise, returns False.\r
 132 \r
 133             Note that if the value is None, it is taken as an empty string.\r
 134         """\r
 135         target = '' if value is None else value\r
 136         return any(r.fullmatch(target) is not None for r in patterns)\r
 137 \r
 138 \r
 139 \r
 140 def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:\r
 141     """ Create a function to process removal of certain tags.\r
 142     """\r
 143 \r
 144     return _TagSanitizer(config)\r