# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
 
"""
Sanitizer that filters postcodes by their officially allowed pattern.

Arguments:
    convert-to-address: If set to 'yes' (the default), then postcodes that do
                        not conform with their country-specific pattern are
                        converted to an address component. That means that
                        the postcode does not take part when computing the
                        postcode centroids of a country but is still searchable.
                        When set to 'no', non-conforming postcodes are not
                        searchable either.
    default-pattern:    Pattern to use, when there is none available for the
                        country in question. Warning: will not be used for
                        objects that have no country assigned. These are always
                        assumed to have no postcode.
"""
 
  23 from typing import Callable, Optional, Tuple
 
  25 from nominatim.data.postcode_format import PostcodeFormatter
 
  26 from nominatim.tokenizer.sanitizers.base import ProcessInfo
 
  27 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 
  29 class _PostcodeSanitizer:
 
  31     def __init__(self, config: SanitizerConfig) -> None:
 
  32         self.convert_to_address = config.get_bool('convert-to-address', True)
 
  33         self.matcher = PostcodeFormatter()
 
  35         default_pattern = config.get('default-pattern')
 
  36         if default_pattern is not None and isinstance(default_pattern, str):
 
  37             self.matcher.set_default_pattern(default_pattern)
 
  40     def __call__(self, obj: ProcessInfo) -> None:
 
  44         postcodes = ((i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode')
 
  46         for pos, postcode in postcodes:
 
  47             formatted = self.scan(postcode.name, obj.place.country_code)
 
  50                 if self.convert_to_address:
 
  51                     postcode.kind = 'unofficial_postcode'
 
  55                 postcode.name = formatted[0]
 
  56                 postcode.set_attr('variant', formatted[1])
 
  59     def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
 
  60         """ Check the postcode for correct formatting and return the
 
  61             normalized version. Returns None if the postcode does not
 
  62             correspond to the official format of the given country.
 
  64         match = self.matcher.match(country, postcode)
 
  68         assert country is not None
 
  70         return self.matcher.normalize(country, match),\
 
  71                ' '.join(filter(lambda p: p is not None, match.groups()))
 
  76 def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
 
  77     """ Create a function that filters postcodes by their officially allowed pattern.
 
  80     return _PostcodeSanitizer(config)