X-Git-Url: https://git.openstreetmap.org/nominatim.git/blobdiff_plain/18864afa8aee710a5aa7fe65565711119ca7a663..9889c72c55dae60bafe90b605c2c790d655cdad2:/nominatim/tokenizer/sanitizers/clean_postcodes.py diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py index c6292a29..5eaea391 100644 --- a/nominatim/tokenizer/sanitizers/clean_postcodes.py +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -15,74 +15,29 @@ Arguments: postcode centroids of a country but is still searchable. When set to 'no', non-conforming postcodes are not searchable either. + default-pattern: Pattern to use, when there is none available for the + country in question. Warning: will not be used for + objects that have no country assigned. These are always + assumed to have no postcode. """ -import re - -from nominatim.errors import UsageError -from nominatim.tools import country_info - -class _PostcodeMatcher: - """ Matches and formats a postcode according to the format definition. - """ - def __init__(self, country_code, config): - if 'pattern' not in config: - raise UsageError("Field 'pattern' required for 'postcode' " - f"for country '{country_code}'") - - pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]') - - self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*') - self.pattern = re.compile(pc_pattern) - - self.output = config.get('output', r'\g<0>') - - - def match(self, postcode): - """ Match the given postcode against the postcode pattern for this - matcher. Returns a `re.Match` object if the match was successful - and None otherwise. - """ - # Upper-case, strip spaces and leading country code. - normalized = self.norm_pattern.fullmatch(postcode.upper()) - - if normalized: - return self.pattern.fullmatch(normalized.group(1)) - - return None - - - def normalize(self, match): - """ Return the default format of the postcode for the given match. - `match` must be a `re.Match` object previously returned by - `match()` - """ - return match.expand(self.output) +from typing import Callable, Optional, Tuple +from nominatim.data.postcode_format import PostcodeFormatter +from nominatim.tokenizer.sanitizers.base import ProcessInfo +from nominatim.tokenizer.sanitizers.config import SanitizerConfig class _PostcodeSanitizer: - def __init__(self, config): + def __init__(self, config: SanitizerConfig) -> None: self.convert_to_address = config.get_bool('convert-to-address', True) - # Objects without a country code can't have a postcode per definition. - self.country_without_postcode = {None} - self.country_matcher = {} - - for ccode, prop in country_info.iterate('postcode'): - if prop is False: - self.country_without_postcode.add(ccode) - elif isinstance(prop, dict): - self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop) - else: - raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'") + self.matcher = PostcodeFormatter() default_pattern = config.get('default-pattern') if default_pattern is not None and isinstance(default_pattern, str): - self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern}) - else: - self.default_matcher = None + self.matcher.set_default_pattern(default_pattern) - def __call__(self, obj): + def __call__(self, obj: ProcessInfo) -> None: if not obj.address: return @@ -98,32 +53,28 @@ class _PostcodeSanitizer: obj.address.pop(pos) else: postcode.name = formatted[0] - postcode.set_attr('lookup', formatted[1]) + postcode.set_attr('variant', formatted[1]) - def scan(self, postcode, country): + def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]: """ Check the postcode for correct formatting and return the normalized version. Returns None if the postcode does not - correspond to the oficial format of the given country. + correspond to the official format of the given country. """ - if country in self.country_without_postcode: - return None - - matcher = self.country_matcher.get(country, self.default_matcher) - if matcher is None: - return postcode.upper(), '' - - match = matcher.match(postcode) + match = self.matcher.match(country, postcode) if match is None: return None - return matcher.normalize(match), ' '.join(match.groups()) + assert country is not None + + return self.matcher.normalize(country, match),\ + ' '.join(filter(lambda p: p is not None, match.groups())) -def create(config): - """ Create a housenumber processing function. +def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]: + """ Create a function that filters postcodes by their officially allowed pattern. """ return _PostcodeSanitizer(config)