]> git.openstreetmap.org Git - nominatim.git/blob - nominatim/tokenizer/sanitizers/clean_postcodes.py
c6292a2942b217ae866c8b43ded81a5aa75ff089
[nominatim.git] / nominatim / tokenizer / sanitizers / clean_postcodes.py
1 # SPDX-License-Identifier: GPL-2.0-only
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Sanitizer that filters postcodes by their officially allowed pattern.
9
10 Arguments:
11     convert-to-address: If set to 'yes' (the default), then postcodes that do
12                         not conform with their country-specific pattern are
13                         converted to an address component. That means that
14                         the postcode does not take part when computing the
15                         postcode centroids of a country but is still searchable.
16                         When set to 'no', non-conforming postcodes are not
17                         searchable either.
18 """
19 import re
20
21 from nominatim.errors import UsageError
22 from nominatim.tools import country_info
23
class _PostcodeMatcher:
    """ Matches and formats a postcode according to the format definition.

        The format definition is a dictionary with a mandatory 'pattern'
        entry and an optional 'output' entry. Every 'd' in the pattern is
        replaced with the character class for an ASCII digit and every 'l'
        with the class for an upper-case ASCII letter; the result is
        compiled as a regular expression. 'output' is a template for
        `re.Match.expand()` and defaults to the complete match.
    """
    def __init__(self, country_code, config):
        """ Compile the matching expressions for `country_code`.

            Raises a UsageError when `config` has no 'pattern' entry.
        """
        if 'pattern' not in config:
            raise UsageError("Field 'pattern' required for 'postcode' "
                             f"for country '{country_code}'")

        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')

        # The postcode part must be captured lazily. A greedy capture would
        # swallow trailing whitespace, leaving nothing for the final `\s*`,
        # and the padded postcode would then fail the strict pattern below.
        # The country code is escaped so that it is always taken literally.
        self.norm_pattern = re.compile(
            f'\\s*(?:{re.escape(country_code.upper())}[ -]?)?(.*?)\\s*')
        self.pattern = re.compile(pc_pattern)

        self.output = config.get('output', r'\g<0>')


    def match(self, postcode):
        """ Match the given postcode against the postcode pattern for this
            matcher. Returns a `re.Match` object if the match was successful
            and None otherwise.
        """
        # Upper-case, strip surrounding spaces and a leading country code.
        normalized = self.norm_pattern.fullmatch(postcode.upper())

        if normalized is None:
            return None

        return self.pattern.fullmatch(normalized.group(1))


    def normalize(self, match):
        """ Return the default format of the postcode for the given match.
            `match` must be a `re.Match` object previously returned by
            `match()`
        """
        return match.expand(self.output)
60
61
class _PostcodeSanitizer:
    """ Callable that checks the postcode entries in the address of an
        object against the postcode format of the object's country.

        Conforming postcodes are replaced by their normalized form and
        receive a 'lookup' attribute. Non-conforming ones are either
        changed to kind 'unofficial_postcode' or removed from the address,
        depending on the 'convert-to-address' setting.
    """

    def __init__(self, config):
        """ Set up the per-country matchers from the country information
            and read the sanitizer options from `config`.

            Raises a UsageError when the 'postcode' entry of a country is
            neither False nor a dictionary.
        """
        self.convert_to_address = config.get_bool('convert-to-address', True)
        # Objects without a country code can't have a postcode per definition.
        self.country_without_postcode = {None}
        self.country_matcher = {}

        for ccode, prop in country_info.iterate('postcode'):
            if prop is False:
                self.country_without_postcode.add(ccode)
            elif isinstance(prop, dict):
                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
            else:
                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")

        # Fallback matcher for countries without a matcher of their own.
        # It is only created when a string pattern was configured.
        default_pattern = config.get('default-pattern')
        if isinstance(default_pattern, str):
            self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern})
        else:
            self.default_matcher = None


    def __call__(self, obj):
        """ Sanitize all address entries of kind 'postcode' of `obj`
            in place.
        """
        if not obj.address:
            return

        # Collect the entries to keep instead of popping from the list
        # while iterating over it: removing an entry mid-iteration makes
        # the iterator skip the entry that follows the removed one.
        retained = []
        for item in obj.address:
            if item.kind == 'postcode':
                formatted = self.scan(item.name, obj.place.country_code)

                if formatted is None:
                    if not self.convert_to_address:
                        # Non-conforming and not wanted as address part.
                        continue
                    item.kind = 'unofficial_postcode'
                else:
                    item.name = formatted[0]
                    item.set_attr('lookup', formatted[1])

            retained.append(item)

        if len(retained) != len(obj.address):
            # Replace the content in place so that the list object itself
            # remains the same for other holders of a reference to it.
            obj.address[:] = retained


    def scan(self, postcode, country):
        """ Check the postcode for correct formatting and return the
            normalized version. Returns None if the postcode does not
            correspond to the official format of the given country.

            On success the result is a tuple of the normalized postcode
            and the lookup string built from the groups of the match.
            When no matcher is available for the country, the upper-cased
            postcode with an empty lookup string is returned.
        """
        if country in self.country_without_postcode:
            return None

        matcher = self.country_matcher.get(country, self.default_matcher)
        if matcher is None:
            return postcode.upper(), ''

        match = matcher.match(postcode)
        if match is None:
            return None

        return matcher.normalize(match), ' '.join(match.groups())
121
122
123
124
def create(config):
    """ Create the sanitizer function that checks postcodes against the
        postcode format of their country, set up from `config`.
    """
    sanitizer = _PostcodeSanitizer(config)
    return sanitizer