1 # SPDX-License-Identifier: GPL-2.0-only
 
   3 # This file is part of Nominatim. (https://nominatim.org)
 
   5 # Copyright (C) 2022 by the Nominatim developer community.
 
   6 # For a full list of authors see the git log.
 
   8 Sanitizer that filters postcodes by their officially allowed pattern.
 
  11     convert-to-address: If set to 'yes' (the default), then postcodes that do
 
  12                         not conform with their country-specific pattern are
 
  13                         converted to an address component. That means that
 
  14                         the postcode does not take part when computing the
 
  15                         postcode centroids of a country but is still searchable.
 
  16                         When set to 'no', non-conforming postcodes are not
 
  21 from nominatim.errors import UsageError
 
  22 from nominatim.tools import country_info
 
  24 class _PostcodeMatcher:
 
  25     """ Matches and formats a postcode according to the format definition.
 
  27     def __init__(self, country_code, config):
 
  28         if 'pattern' not in config:
 
  29             raise UsageError("Field 'pattern' required for 'postcode' "
 
  30                              f"for country '{country_code}'")
 
  32         pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
 
  34         self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
 
  35         self.pattern = re.compile(pc_pattern)
 
  37         self.output = config.get('output', r'\g<0>')
 
  40     def match(self, postcode):
 
  41         """ Match the given postcode against the postcode pattern for this
 
  42             matcher. Returns a `re.Match` object if the match was successful
 
  45         # Upper-case, strip spaces and leading country code.
 
  46         normalized = self.norm_pattern.fullmatch(postcode.upper())
 
  49             return self.pattern.fullmatch(normalized.group(1))
 
  54     def normalize(self, match):
 
  55         """ Return the default format of the postcode for the given match.
 
  56             `match` must be a `re.Match` object previously returned by
 
  59         return match.expand(self.output)
 
  62 class _PostcodeSanitizer:
 
  64     def __init__(self, config):
 
  65         self.convert_to_address = config.get_bool('convert-to-address', True)
 
  66         # Objects without a country code can't have a postcode per definition.
 
  67         self.country_without_postcode = {None}
 
  68         self.country_matcher = {}
 
  70         for ccode, prop in country_info.iterate('postcode'):
 
  72                 self.country_without_postcode.add(ccode)
 
  73             elif isinstance(prop, dict):
 
  74                 self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
 
  76                 raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
 
  79     def __call__(self, obj):
 
  83         postcodes = ((i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode')
 
  85         for pos, postcode in postcodes:
 
  86             formatted = self.scan(postcode.name, obj.place.country_code)
 
  89                 if self.convert_to_address:
 
  90                     postcode.kind = 'unofficial_postcode'
 
  94                 postcode.name = formatted[0]
 
  95                 postcode.set_attr('lookup', formatted[1])
 
  98     def scan(self, postcode, country):
 
  99         """ Check the postcode for correct formatting and return the
 
 100             normalized version. Returns None if the postcode does not
 
 101             correspond to the oficial format of the given country.
 
 103         if country in self.country_without_postcode:
 
 106         matcher = self.country_matcher.get(country)
 
 107         if matcher is not None:
 
 108             match = matcher.match(postcode)
 
 111             return matcher.normalize(match), ' '.join(match.groups())
 
 113         return postcode.upper(), ''
 
  118     """ Create a postcode processing function.
 
 121     return _PostcodeSanitizer(config)