From: Sarah Hoffmann Date: Fri, 24 Jun 2022 19:09:41 +0000 (+0200) Subject: Merge pull request #2757 from lonvia/filter-postcodes X-Git-Tag: v4.1.0~22 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/3bf3b894eaddd4f17f7e92353af6b2aa6200ab20?hp=0cd3a1b9bd8af6541c65f58608d8be7ad3674607 Merge pull request #2757 from lonvia/filter-postcodes Add filtering, normalisation and variants for postcodes --- diff --git a/.pylintrc b/.pylintrc index fef53872..52d9fcf9 100644 --- a/.pylintrc +++ b/.pylintrc @@ -13,4 +13,4 @@ ignored-classes=NominatimArgs,closing # 'too-many-ancestors' is triggered already by deriving from UserDict disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use -good-names=i,x,y,fd,db +good-names=i,x,y,fd,db,cc diff --git a/docs/customize/Country-Settings.md b/docs/customize/Country-Settings.md new file mode 100644 index 00000000..6f8f2a9f --- /dev/null +++ b/docs/customize/Country-Settings.md @@ -0,0 +1,149 @@ +# Customizing Per-Country Data + +Whenever an OSM object is imported into Nominatim, the object is first assigned +a country. Nominatim can use this information to adapt various aspects of +the address computation to the local customs of the country. This section +explains how country assignment works and the principal per-country +localizations. + +## Country assignment + +Countries are assigned on the basis of country data from the OpenStreetMap +input data itself. Countries are expected to be tagged according to the +[administrative boundary schema](https://wiki.openstreetmap.org/wiki/Tag:boundary%3Dadministrative): +an OSM relation with `boundary=administrative` and `admin_level=2`. Nominatim +uses the country code to distinguish the countries. + +If there is no country data available for a point, then Nominatim uses the +fallback data imported from `data/country_osm_grid.sql.gz`. This was computed +from OSM data as well but is guaranteed to cover all countries. 
+ +Some OSM objects may also be located outside any country, for example a buoy +in the middle of the ocean. These objects do not get any country assigned and +get a default treatment when it comes to localized handling of data. + +## Per-country settings + +### Global country settings + +The main place to configure settings per country is the file +`settings/country_settings.yaml`. This file has one section per country that +is recognised by Nominatim. Each section is tagged with the country code +(in lower case) and contains the different localization information. Only +countries which are listed in this file are taken into account for computations. + +For example, the section for Andorra looks like this: + +``` + partition: 35 + languages: ca + names: !include country-names/ad.yaml + postcode: + pattern: "(ddd)" + output: AD\1 +``` + +The individual settings are described below. + +#### `partition` + +Nominatim internally splits the data into multiple tables to improve +performance. The partition number tells Nominatim into which table to put +the country. This is purely internal management and has no effect on the +output data. + +The default is to have one partition per country. + +#### `languages` + +A comma-separated list of ISO-639 language codes of default languages in the +country. These are the languages used in name tags without a language suffix. +Note that this is not necessarily the same as the list of official languages +in the country. There may be officially recognised languages in a country +which are only ever used in name tags with the appropriate language suffixes. +Conversely, a non-official language may appear a lot in the name tags, for +example when used as an unofficial Lingua Franca. + +List the languages in order of frequency of appearance with the most frequently +used language first. It is not recommended to add languages when there are only +very few occurrences. 
+ +If only one language is listed, then Nominatim will 'auto-complete' the +language of names without an explicit language-suffix. + +#### `names` + +List of names of the country and its translations. These names are used as +a baseline. It is always possible to search countries by the given names, no +matter what other names are in the OSM data. They are also used as a fallback +when a needed translation is not available. + +!!! Note + The list of names per country is currently fairly large because Nominatim + supports translations in many languages per default. That is why the + name lists have been separated out into extra files. You can find the + name lists in the file `settings/country-names/<cc>.yaml`. + The names section in the main country settings file only refers to these + files via the special `!include` directive. + +#### `postcode` + +Describes the format of the postcode that is in use in the country. + +When a country has no official postcodes, set this to no. Example: + +``` +ae: + postcode: no +``` + +When a country has a postcode, you need to state the postcode pattern and +the default output format. Example: + +``` +bm: + postcode: + pattern: "(ll)[ -]?(dd)" + output: \1 \2 +``` + +The **pattern** is a regular expression that describes the possible formats +accepted as a postcode. The pattern follows the standard syntax for +[regular expressions in Python](https://docs.python.org/3/library/re.html#regular-expression-syntax) +with two extra shortcuts: `d` is a shortcut for a single digit ([0-9]) +and `l` for a single ASCII letter ([A-Z]). + +Use match groups to indicate groups in the postcode that may optionally be +separated with a space or a hyphen. + +For example, the postcode for Bermuda above always consists of two letters +and two digits. They may optionally be separated by a space or hyphen. That +means that Nominatim will consider `AB56`, `AB 56` and `AB-56` spelling variants +for one and the same postcode. 
+ +Never add the country code in front of the postcode pattern. Nominatim will +automatically accept variants with a country code prefix for all postcodes. + +The **output** field is an optional field that describes what the canonical +spelling of the postcode should be. The format is the +[regular expression expand syntax](https://docs.python.org/3/library/re.html#re.Match.expand) referring back to the bracket groups in the pattern. + +Most simple postcodes only have one spelling variant. In that case, the +**output** can be omitted. The postcode will simply be used as is. + +In the Bermuda example above, the canonical spelling would be to have a space +between letters and digits. + +!!! Warning + When your postcode pattern covers multiple variants of the postcode, then + you must explicitly state the canonical output or Nominatim will not + handle the variations correctly. + +### Other country-specific configuration + +There are some other configuration files where you can set localized settings +according to the assigned country. These are: + + * [Place ranking configuration](Ranking.md) + +Please see the linked documentation sections for more information. diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md index 19d867dd..c563b201 100644 --- a/docs/customize/Tokenizers.md +++ b/docs/customize/Tokenizers.md @@ -205,6 +205,14 @@ The following is a list of sanitizers that are shipped with Nominatim. rendering: heading_level: 6 +##### clean-postcodes + +::: nominatim.tokenizer.sanitizers.clean_postcodes + selection: + members: False + rendering: + heading_level: 6 + #### Token Analysis @@ -222,8 +230,12 @@ by a sanitizer (see for example the The token-analysis section contains the list of configured analyzers. Each analyzer must have an `id` parameter that uniquely identifies the analyzer. The only exception is the default analyzer that is used when no special -analyzer was selected. There is one special id '@housenumber'. 
If an analyzer -with that name is present, it is used for normalization of house numbers. +analyzer was selected. There are analysers with special ids: + + * '@housenumber'. If an analyzer with that name is present, it is used + for normalization of house numbers. + * '@postcode'. If an analyzer with that name is present, it is used + for normalization of postcodes. Different analyzer implementations may exist. To select the implementation, the `analyzer` parameter must be set. The different implementations are @@ -356,6 +368,14 @@ house numbers of the form '3 a', '3A', '3-A' etc. are all considered equivalent. The analyzer cannot be customized. +##### Postcode token analyzer + +The analyzer `postcodes` is purpose-made to analyze postcodes. It supports +a 'lookup' variant of the token, which produces variants with optional +spaces. Use together with the clean-postcodes sanitizer. + +The analyzer cannot be customized. + ### Reconfiguration Changing the configuration after the import is currently not possible, although diff --git a/docs/develop/Tokenizers.md b/docs/develop/Tokenizers.md index 2b4da005..5fe4e38d 100644 --- a/docs/develop/Tokenizers.md +++ b/docs/develop/Tokenizers.md @@ -245,11 +245,11 @@ Currently, tokenizers are encouraged to make sure that matching works against both the search token list and the match token list. ```sql -FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT +FUNCTION token_get_postcode(info JSONB) RETURNS TEXT ``` -Return the normalized version of the given postcode. This function must return -the same value as the Python function `AbstractAnalyzer->normalize_postcode()`. +Return the postcode for the object, if any exists. The postcode must be in +the form that should also be presented to the end-user. 
```sql FUNCTION token_strip_info(info JSONB) RETURNS JSONB diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index c25ae0ad..a3860cba 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -28,6 +28,7 @@ pages: - 'Overview': 'customize/Overview.md' - 'Import Styles': 'customize/Import-Styles.md' - 'Configuration Settings': 'customize/Settings.md' + - 'Per-Country Data': 'customize/Country-Settings.md' - 'Place Ranking' : 'customize/Ranking.md' - 'Tokenizers' : 'customize/Tokenizers.md' - 'Special Phrases': 'customize/Special-Phrases.md' diff --git a/lib-php/TokenPostcode.php b/lib-php/TokenPostcode.php index f0dbd457..0ff92929 100644 --- a/lib-php/TokenPostcode.php +++ b/lib-php/TokenPostcode.php @@ -25,7 +25,12 @@ class Postcode public function __construct($iId, $sPostcode, $sCountryCode = '') { $this->iId = $iId; - $this->sPostcode = $sPostcode; + $iSplitPos = strpos($sPostcode, '@'); + if ($iSplitPos === false) { + $this->sPostcode = $sPostcode; + } else { + $this->sPostcode = substr($sPostcode, 0, $iSplitPos); + } $this->sCountryCode = empty($sCountryCode) ? 
'' : $sCountryCode; } diff --git a/lib-php/tokenizer/icu_tokenizer.php b/lib-php/tokenizer/icu_tokenizer.php index ccce99ca..e45d0765 100644 --- a/lib-php/tokenizer/icu_tokenizer.php +++ b/lib-php/tokenizer/icu_tokenizer.php @@ -190,13 +190,17 @@ class Tokenizer if ($aWord['word'] !== null && pg_escape_string($aWord['word']) == $aWord['word'] ) { - $sNormPostcode = $this->normalizeString($aWord['word']); - if (strpos($sNormQuery, $sNormPostcode) !== false) { - $oValidTokens->addToken( - $sTok, - new Token\Postcode($iId, $aWord['word'], null) - ); + $iSplitPos = strpos($aWord['word'], '@'); + if ($iSplitPos === false) { + $sPostcode = $aWord['word']; + } else { + $sPostcode = substr($aWord['word'], 0, $iSplitPos); } + + $oValidTokens->addToken( + $sTok, + new Token\Postcode($iId, $sPostcode, null) + ); } break; case 'S': // tokens for classification terms (special phrases) diff --git a/lib-sql/functions/address_lookup.sql b/lib-sql/functions/address_lookup.sql index 0eada698..2bbfcd5c 100644 --- a/lib-sql/functions/address_lookup.sql +++ b/lib-sql/functions/address_lookup.sql @@ -320,6 +320,11 @@ BEGIN location := ROW(null, null, null, hstore('ref', place.postcode), 'place', 'postcode', null, null, false, true, 5, 0)::addressline; RETURN NEXT location; + ELSEIF place.address is not null and place.address ? 
'postcode' + and not place.address->'postcode' SIMILAR TO '%(,|;)%' THEN + location := ROW(null, null, null, hstore('ref', place.address->'postcode'), 'place', + 'postcode', null, null, false, true, 5, 0)::addressline; + RETURN NEXT location; END IF; RETURN; diff --git a/lib-sql/functions/interpolation.sql b/lib-sql/functions/interpolation.sql index c8cfbcc6..3a994711 100644 --- a/lib-sql/functions/interpolation.sql +++ b/lib-sql/functions/interpolation.sql @@ -156,7 +156,6 @@ DECLARE linegeo GEOMETRY; splitline GEOMETRY; sectiongeo GEOMETRY; - interpol_postcode TEXT; postcode TEXT; stepmod SMALLINT; BEGIN @@ -174,8 +173,6 @@ BEGIN ST_PointOnSurface(NEW.linegeo), NEW.linegeo); - interpol_postcode := token_normalized_postcode(NEW.address->'postcode'); - NEW.token_info := token_strip_info(NEW.token_info); IF NEW.address ? '_inherited' THEN NEW.address := hstore('interpolation', NEW.address->'interpolation'); @@ -207,6 +204,11 @@ BEGIN FOR nextnode IN SELECT DISTINCT ON (nodeidpos) osm_id, address, geometry, + -- Take the postcode from the node only if it has a housenumber itself. + -- Note that there is a corner-case where the node has a wrongly + -- formatted postcode and therefore 'postcode' contains a derived + -- variant. + CASE WHEN address ? 
'postcode' THEN placex.postcode ELSE NULL::text END as postcode, substring(address->'housenumber','[0-9]+')::integer as hnr FROM placex, generate_series(1, array_upper(waynodes, 1)) nodeidpos WHERE osm_type = 'N' and osm_id = waynodes[nodeidpos]::BIGINT @@ -260,13 +262,10 @@ BEGIN endnumber := newend; -- determine postcode - postcode := coalesce(interpol_postcode, - token_normalized_postcode(prevnode.address->'postcode'), - token_normalized_postcode(nextnode.address->'postcode'), - postcode); - IF postcode is NULL THEN - SELECT token_normalized_postcode(placex.postcode) - FROM placex WHERE place_id = NEW.parent_place_id INTO postcode; + postcode := coalesce(prevnode.postcode, nextnode.postcode, postcode); + IF postcode is NULL and NEW.parent_place_id > 0 THEN + SELECT placex.postcode FROM placex + WHERE place_id = NEW.parent_place_id INTO postcode; END IF; IF postcode is NULL THEN postcode := get_nearest_postcode(NEW.country_code, nextnode.geometry); diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index 6143a1ed..1f7e6dc6 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -992,7 +992,7 @@ BEGIN {% if debug %}RAISE WARNING 'Got parent details from search name';{% endif %} -- determine postcode - NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'), + NEW.postcode := coalesce(token_get_postcode(NEW.token_info), location.postcode, get_nearest_postcode(NEW.country_code, NEW.centroid)); @@ -1150,8 +1150,7 @@ BEGIN {% if debug %}RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector;{% endif %} - NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'), - NEW.postcode); + NEW.postcode := coalesce(token_get_postcode(NEW.token_info), NEW.postcode); -- if we have a name add this to the name search table IF NEW.name IS NOT NULL THEN diff --git a/lib-sql/tokenizer/icu_tokenizer.sql 
b/lib-sql/tokenizer/icu_tokenizer.sql index a3dac8dd..599d0eb0 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -97,10 +97,10 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT) +CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB) RETURNS TEXT AS $$ - SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END; + SELECT info->>'postcode'; $$ LANGUAGE SQL IMMUTABLE STRICT; @@ -223,3 +223,26 @@ BEGIN END; $$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION create_postcode_word(postcode TEXT, lookup_terms TEXT[]) + RETURNS BOOLEAN + AS $$ +DECLARE + existing INTEGER; +BEGIN + SELECT count(*) INTO existing + FROM word WHERE word = postcode and type = 'P'; + + IF existing > 0 THEN + RETURN TRUE; + END IF; + + -- postcodes don't need word ids + INSERT INTO word (word_token, type, word) + SELECT lookup_term, 'P', postcode FROM unnest(lookup_terms) as lookup_term; + + RETURN FALSE; +END; +$$ +LANGUAGE plpgsql; + diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql index 64453d4e..5826f74a 100644 --- a/lib-sql/tokenizer/legacy_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_tokenizer.sql @@ -97,10 +97,10 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT) +CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB) RETURNS TEXT AS $$ - SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END; + SELECT info->>'postcode'; $$ LANGUAGE SQL IMMUTABLE STRICT; diff --git a/nominatim/data/__init__.py b/nominatim/data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nominatim/data/postcode_format.py b/nominatim/data/postcode_format.py new file mode 100644 index 00000000..6ae43b7d --- /dev/null +++ b/nominatim/data/postcode_format.py @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This 
file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Functions for formatting postcodes according to their country-specific +format. +""" +import re + +from nominatim.errors import UsageError +from nominatim.tools import country_info + +class CountryPostcodeMatcher: + """ Matches and formats a postcode according to a format definition + of the given country. + """ + def __init__(self, country_code, config): + if 'pattern' not in config: + raise UsageError("Field 'pattern' required for 'postcode' " + f"for country '{country_code}'") + + pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]') + + self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*') + self.pattern = re.compile(pc_pattern) + + self.output = config.get('output', r'\g<0>') + + + def match(self, postcode): + """ Match the given postcode against the postcode pattern for this + matcher. Returns a `re.Match` object if the match was successful + and None otherwise. + """ + # Upper-case, strip spaces and leading country code. + normalized = self.norm_pattern.fullmatch(postcode.upper()) + + if normalized: + return self.pattern.fullmatch(normalized.group(1)) + + return None + + + def normalize(self, match): + """ Return the default format of the postcode for the given match. + `match` must be a `re.Match` object previously returned by + `match()` + """ + return match.expand(self.output) + + +class PostcodeFormatter: + """ Container for different postcode formats of the world and + access functions. + """ + def __init__(self): + # Objects without a country code can't have a postcode per definition. 
+ self.country_without_postcode = {None} + self.country_matcher = {} + self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'}) + + for ccode, prop in country_info.iterate('postcode'): + if prop is False: + self.country_without_postcode.add(ccode) + elif isinstance(prop, dict): + self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop) + else: + raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'") + + + def set_default_pattern(self, pattern): + """ Set the postcode match pattern to use, when a country does not + have a specific pattern or is marked as country without postcode. + """ + self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern}) + + + def get_matcher(self, country_code): + """ Return the CountryPostcodeMatcher for the given country. + Returns None if the country doesn't have a postcode and the + default matcher if there is no specific matcher configured for + the country. + """ + if country_code in self.country_without_postcode: + return None + + return self.country_matcher.get(country_code, self.default_matcher) + + + def match(self, country_code, postcode): + """ Match the given postcode against the postcode pattern for this + matcher. Returns a `re.Match` object if the country has a pattern + and the match was successful or None if the match failed. + """ + if country_code in self.country_without_postcode: + return None + + return self.country_matcher.get(country_code, self.default_matcher).match(postcode) + + + def normalize(self, country_code, match): + """ Return the default format of the postcode for the given match. 
+ `match` must be a `re.Match` object previously returned by + `match()` + """ + return self.country_matcher.get(country_code, self.default_matcher).normalize(match) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 4678af66..a6ff08a4 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -11,7 +11,6 @@ libICU instead of the PostgreSQL module. import itertools import json import logging -import re from textwrap import dedent from nominatim.db.connection import connect @@ -291,33 +290,72 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): """ Update postcode tokens in the word table from the location_postcode table. """ - to_delete = [] + analyzer = self.token_analysis.analysis.get('@postcode') + with self.conn.cursor() as cur: - # This finds us the rows in location_postcode and word that are - # missing in the other table. - cur.execute("""SELECT * FROM - (SELECT pc, word FROM - (SELECT distinct(postcode) as pc FROM location_postcode) p - FULL JOIN - (SELECT word FROM word WHERE type = 'P') w - ON pc = word) x - WHERE pc is null or word is null""") - - with CopyBuffer() as copystr: - for postcode, word in cur: - if postcode is None: - to_delete.append(word) - else: - copystr.add(self._search_normalized(postcode), - 'P', postcode) - - if to_delete: - cur.execute("""DELETE FROM WORD - WHERE type ='P' and word = any(%s) - """, (to_delete, )) - - copystr.copy_out(cur, 'word', - columns=['word_token', 'type', 'word']) + # First get all postcode names currently in the word table. + cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'") + word_entries = set((entry[0] for entry in cur)) + + # Then compute the required postcode names from the postcode table. 
+ needed_entries = set() + cur.execute("SELECT country_code, postcode FROM location_postcode") + for cc, postcode in cur: + info = PlaceInfo({'country_code': cc, + 'class': 'place', 'type': 'postcode', + 'address': {'postcode': postcode}}) + address = self.sanitizer.process_names(info)[1] + for place in address: + if place.kind == 'postcode': + if analyzer is None: + postcode_name = place.name.strip().upper() + variant_base = None + else: + postcode_name = analyzer.normalize(place.name) + variant_base = place.get_attr("variant") + + if variant_base: + needed_entries.add(f'{postcode_name}@{variant_base}') + else: + needed_entries.add(postcode_name) + break + + # Now update the word table. + self._delete_unused_postcode_words(word_entries - needed_entries) + self._add_missing_postcode_words(needed_entries - word_entries) + + def _delete_unused_postcode_words(self, tokens): + if tokens: + with self.conn.cursor() as cur: + cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)", + (list(tokens), )) + + def _add_missing_postcode_words(self, tokens): + if not tokens: + return + + analyzer = self.token_analysis.analysis.get('@postcode') + terms = [] + + for postcode_name in tokens: + if '@' in postcode_name: + term, variant = postcode_name.split('@', 2) + term = self._search_normalized(term) + variants = {term} + if analyzer is not None: + variants.update(analyzer.get_variants_ascii(variant)) + variants = list(variants) + else: + variants = [self._search_normalized(postcode_name)] + terms.append((postcode_name, variants)) + + if terms: + with self.conn.cursor() as cur: + cur.execute_values("""SELECT create_postcode_word(pc, var) + FROM (VALUES %s) AS v(pc, var)""", + terms) + + def update_special_phrases(self, phrases, should_replace): @@ -473,7 +511,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def _process_place_address(self, token_info, address): for item in address: if item.kind == 'postcode': - self._add_postcode(item.name) + 
token_info.set_postcode(self._add_postcode(item)) elif item.kind == 'housenumber': token_info.add_housenumber(*self._compute_housenumber_token(item)) elif item.kind == 'street': @@ -605,26 +643,38 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): return full_tokens, partial_tokens - def _add_postcode(self, postcode): + def _add_postcode(self, item): """ Make sure the normalized postcode is present in the word table. """ - if re.search(r'[:,;]', postcode) is None: - postcode = self.normalize_postcode(postcode) + analyzer = self.token_analysis.analysis.get('@postcode') - if postcode not in self._cache.postcodes: - term = self._search_normalized(postcode) - if not term: - return + if analyzer is None: + postcode_name = item.name.strip().upper() + variant_base = None + else: + postcode_name = analyzer.normalize(item.name) + variant_base = item.get_attr("variant") - with self.conn.cursor() as cur: - # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word_token, type, word) - (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc) - WHERE NOT EXISTS - (SELECT * FROM word - WHERE type = 'P' and word = pc)) - """, (term, postcode)) - self._cache.postcodes.add(postcode) + if variant_base: + postcode = f'{postcode_name}@{variant_base}' + else: + postcode = postcode_name + + if postcode not in self._cache.postcodes: + term = self._search_normalized(postcode_name) + if not term: + return None + + variants = {term} + if analyzer is not None and variant_base: + variants.update(analyzer.get_variants_ascii(variant_base)) + + with self.conn.cursor() as cur: + cur.execute("SELECT create_postcode_word(%s, %s)", + (postcode, list(variants))) + self._cache.postcodes.add(postcode) + + return postcode_name class _TokenInfo: @@ -637,6 +687,7 @@ class _TokenInfo: self.street_tokens = set() self.place_tokens = set() self.address_tokens = {} + self.postcode = None @staticmethod @@ -665,6 +716,9 @@ class _TokenInfo: if self.address_tokens: out['addr'] = self.address_tokens + if 
self.postcode: + out['postcode'] = self.postcode + return out @@ -701,6 +755,11 @@ class _TokenInfo: if partials: self.address_tokens[key] = self._mk_array(partials) + def set_postcode(self, postcode): + """ Set the postcode to the given one. + """ + self.postcode = postcode + class _TokenCache: """ Cache for token information to avoid repeated database queries. diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index a292b180..36fd5722 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -467,8 +467,9 @@ class LegacyNameAnalyzer(AbstractAnalyzer): if key == 'postcode': # Make sure the normalized postcode is present in the word table. if re.search(r'[:,;]', value) is None: - self._cache.add_postcode(self.conn, - self.normalize_postcode(value)) + norm_pc = self.normalize_postcode(value) + token_info.set_postcode(norm_pc) + self._cache.add_postcode(self.conn, norm_pc) elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): hnrs.append(value) elif key == 'street': @@ -527,6 +528,11 @@ class _TokenInfo: self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone() + def set_postcode(self, postcode): + """ Set or replace the postcode token with the given value. + """ + self.data['postcode'] = postcode + def add_street(self, conn, street): """ Add addr:street match terms. """ diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py new file mode 100644 index 00000000..05e90ca1 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Sanitizer that filters postcodes by their officially allowed pattern. 
+ +Arguments: + convert-to-address: If set to 'yes' (the default), then postcodes that do + not conform with their country-specific pattern are + converted to an address component. That means that + the postcode does not take part when computing the + postcode centroids of a country but is still searchable. + When set to 'no', non-conforming postcodes are not + searchable either. + default-pattern: Pattern to use, when there is none available for the + country in question. Warning: will not be used for + objects that have no country assigned. These are always + assumed to have no postcode. +""" +from nominatim.data.postcode_format import PostcodeFormatter + +class _PostcodeSanitizer: + + def __init__(self, config): + self.convert_to_address = config.get_bool('convert-to-address', True) + self.matcher = PostcodeFormatter() + + default_pattern = config.get('default-pattern') + if default_pattern is not None and isinstance(default_pattern, str): + self.matcher.set_default_pattern(default_pattern) + + + def __call__(self, obj): + if not obj.address: + return + + postcodes = ((i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode') + + for pos, postcode in postcodes: + formatted = self.scan(postcode.name, obj.place.country_code) + + if formatted is None: + if self.convert_to_address: + postcode.kind = 'unofficial_postcode' + else: + obj.address.pop(pos) + else: + postcode.name = formatted[0] + postcode.set_attr('variant', formatted[1]) + + + def scan(self, postcode, country): + """ Check the postcode for correct formatting and return the + normalized version. Returns None if the postcode does not + correspond to the oficial format of the given country. + """ + match = self.matcher.match(country, postcode) + if match is None: + return None + + return self.matcher.normalize(country, match),\ + ' '.join(filter(lambda p: p is not None, match.groups())) + + + + +def create(config): + """ Create a housenumber processing function. 
+ """ + + return _PostcodeSanitizer(config) diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py index ecfcacbe..ce5ce1eb 100644 --- a/nominatim/tokenizer/sanitizers/config.py +++ b/nominatim/tokenizer/sanitizers/config.py @@ -44,6 +44,20 @@ class SanitizerConfig(UserDict): return values + def get_bool(self, param, default=None): + """ Extract a configuration parameter as a boolean. + The parameter must be one of the yaml boolean values or an + user error will be raised. If `default` is given, then the parameter + may also be missing or empty. + """ + value = self.data.get(param, default) + + if not isinstance(value, bool): + raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no'.") + + return value + + def get_delimiter(self, default=',;'): """ Return the 'delimiter' parameter in the configuration as a compiled regular expression that can be used to split the names on the diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py index 7898b1c6..9a99d127 100644 --- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py +++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py @@ -48,8 +48,7 @@ class _AnalyzerByLanguage: self.deflangs = {} if use_defaults in ('mono', 'all'): - for ccode, prop in country_info.iterate(): - clangs = prop['languages'] + for ccode, clangs in country_info.iterate('languages'): if len(clangs) == 1 or use_defaults == 'all': if self.whitelist: self.deflangs[ccode] = [l for l in clangs if l in self.whitelist] diff --git a/nominatim/tokenizer/token_analysis/postcodes.py b/nominatim/tokenizer/token_analysis/postcodes.py new file mode 100644 index 00000000..18fc2a8d --- /dev/null +++ b/nominatim/tokenizer/token_analysis/postcodes.py @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. 
(https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Specialized processor for postcodes. Supports a 'lookup' variant of the +token, which produces variants with optional spaces. +""" + +from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator + +### Configuration section + +def configure(rules, normalization_rules): # pylint: disable=W0613 + """ All behaviour is currently hard-coded. + """ + return None + +### Analysis section + +def create(normalizer, transliterator, config): # pylint: disable=W0613 + """ Create a new token analysis instance for this module. + """ + return PostcodeTokenAnalysis(normalizer, transliterator) + + +class PostcodeTokenAnalysis: + """ Special normalization and variant generation for postcodes. + + This analyser must not be used with anything but postcodes as + it follows some special rules: `normalize` doesn't necessarily + need to return a standard form as per normalization rules. It + needs to return the canonical form of the postcode that is also + used for output. `get_variants_ascii` then needs to ensure that + the generated variants once more follow the standard normalization + and transliteration, so that postcodes are correctly recognised by + the search algorithm. + """ + def __init__(self, norm, trans): + self.norm = norm + self.trans = trans + + self.mutator = MutationVariantGenerator(' ', (' ', '')) + + + def normalize(self, name): + """ Return the standard form of the postcode. + """ + return name.strip().upper() + + + def get_variants_ascii(self, norm_name): + """ Compute the spelling variants for the given normalized postcode. + + Takes the canonical form of the postcode, normalizes it using the + standard rules and then creates variants of the result where + all spaces are optional. + """ + # Postcodes follow their own transliteration rules. 
+ # Make sure at this point, that the terms are normalized in a way + # that they are searchable with the standard transliteration rules. + return [self.trans.transliterate(term) for term in + self.mutator.generate([self.norm.transliterate(norm_name)]) if term] diff --git a/nominatim/tools/country_info.py b/nominatim/tools/country_info.py index 0ad00171..d754b4dd 100644 --- a/nominatim/tools/country_info.py +++ b/nominatim/tools/country_info.py @@ -84,10 +84,20 @@ def setup_country_config(config): _COUNTRY_INFO.load(config) -def iterate(): +def iterate(prop=None): """ Iterate over country code and properties. + + When `prop` is None, all countries are returned with their complete + set of properties. + + If `prop` is given, then only countries are returned where the + given property is set. The second item of the tuple contains only + the content of the given property. """ - return _COUNTRY_INFO.items() + if prop is None: + return _COUNTRY_INFO.items() + + return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p) def setup_country_tables(dsn, sql_dir, ignore_partitions=False): diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py index 2b7027e7..9c66719b 100644 --- a/nominatim/tools/postcodes.py +++ b/nominatim/tools/postcodes.py @@ -8,6 +8,7 @@ Functions for importing, updating and otherwise maintaining the table of artificial postcode centroids. """ +from collections import defaultdict import csv import gzip import logging @@ -16,6 +17,8 @@ from math import isfinite from psycopg2 import sql as pysql from nominatim.db.connection import connect +from nominatim.utils.centroid import PointsCentroid +from nominatim.data.postcode_format import PostcodeFormatter LOG = logging.getLogger() @@ -30,20 +33,31 @@ def _to_float(num, max_value): return num -class _CountryPostcodesCollector: +class _PostcodeCollector: """ Collector for postcodes of a single country. 
""" - def __init__(self, country): + def __init__(self, country, matcher): self.country = country - self.collected = {} + self.matcher = matcher + self.collected = defaultdict(PointsCentroid) + self.normalization_cache = None def add(self, postcode, x, y): """ Add the given postcode to the collection cache. If the postcode already existed, it is overwritten with the new centroid. """ - self.collected[postcode] = (x, y) + if self.matcher is not None: + if self.normalization_cache and self.normalization_cache[0] == postcode: + normalized = self.normalization_cache[1] + else: + match = self.matcher.match(postcode) + normalized = self.matcher.normalize(match) if match else None + self.normalization_cache = (postcode, normalized) + + if normalized: + self.collected[normalized] += (x, y) def commit(self, conn, analyzer, project_dir): @@ -93,16 +107,16 @@ class _CountryPostcodesCollector: WHERE country_code = %s""", (self.country, )) for postcode, x, y in cur: - newx, newy = self.collected.pop(postcode, (None, None)) - if newx is not None: - dist = (x - newx)**2 + (y - newy)**2 - if dist > 0.0000001: + pcobj = self.collected.pop(postcode, None) + if pcobj: + newx, newy = pcobj.centroid() + if (x - newx) > 0.0000001 or (y - newy) > 0.0000001: to_update.append((postcode, newx, newy)) else: to_delete.append(postcode) - to_add = [(k, v[0], v[1]) for k, v in self.collected.items()] - self.collected = [] + to_add = [(k, *v.centroid()) for k, v in self.collected.items()] + self.collected = None return to_add, to_delete, to_update @@ -125,8 +139,10 @@ class _CountryPostcodesCollector: postcode = analyzer.normalize_postcode(row['postcode']) if postcode not in self.collected: try: - self.collected[postcode] = (_to_float(row['lon'], 180), - _to_float(row['lat'], 90)) + # Do float conversation separately, it might throw + centroid = (_to_float(row['lon'], 180), + _to_float(row['lat'], 90)) + self.collected[postcode] += centroid except ValueError: LOG.warning("Bad coordinates %s, %s 
in %s country postcode file.", row['lat'], row['lon'], self.country) @@ -158,6 +174,7 @@ def update_postcodes(dsn, project_dir, tokenizer): potentially enhances it with external data and then updates the postcodes in the table 'location_postcode'. """ + matcher = PostcodeFormatter() with tokenizer.name_analyzer() as analyzer: with connect(dsn) as conn: # First get the list of countries that currently have postcodes. @@ -169,19 +186,17 @@ def update_postcodes(dsn, project_dir, tokenizer): # Recompute the list of valid postcodes from placex. with conn.cursor(name="placex_postcodes") as cur: cur.execute(""" - SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid) + SELECT cc, pc, ST_X(centroid), ST_Y(centroid) FROM (SELECT COALESCE(plx.country_code, get_country_code(ST_Centroid(pl.geometry))) as cc, - token_normalized_postcode(pl.address->'postcode') as pc, - ST_Centroid(ST_Collect(COALESCE(plx.centroid, - ST_Centroid(pl.geometry)))) as centroid + pl.address->'postcode' as pc, + COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid FROM place AS pl LEFT OUTER JOIN placex AS plx ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type - WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null - GROUP BY cc, pc) xx + WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx WHERE pc IS NOT null AND cc IS NOT null - ORDER BY country_code, pc""") + ORDER BY cc, pc""") collector = None @@ -189,7 +204,7 @@ def update_postcodes(dsn, project_dir, tokenizer): if collector is None or country != collector.country: if collector is not None: collector.commit(conn, analyzer, project_dir) - collector = _CountryPostcodesCollector(country) + collector = _PostcodeCollector(country, matcher.get_matcher(country)) todo_countries.discard(country) collector.add(postcode, x, y) @@ -198,7 +213,8 @@ def update_postcodes(dsn, project_dir, tokenizer): # Now handle any countries that are only in the postcode table. 
for country in todo_countries: - _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir) + fmt = matcher.get_matcher(country) + _PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir) conn.commit() diff --git a/nominatim/utils/__init__.py b/nominatim/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nominatim/utils/centroid.py b/nominatim/utils/centroid.py new file mode 100644 index 00000000..c2bd6192 --- /dev/null +++ b/nominatim/utils/centroid.py @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Functions for computation of centroids. +""" +from collections.abc import Collection + +class PointsCentroid: + """ Centroid computation from single points using an online algorithm. + More points may be added at any time. + + Coordinates are internally treated as a 7-digit fixed-point float + (i.e. in OSM style). + """ + + def __init__(self): + self.sum_x = 0 + self.sum_y = 0 + self.count = 0 + + def centroid(self): + """ Return the centroid of all points collected so far. 
+ """ + if self.count == 0: + raise ValueError("No points available for centroid.") + + return (float(self.sum_x/self.count)/10000000, + float(self.sum_y/self.count)/10000000) + + + def __len__(self): + return self.count + + + def __iadd__(self, other): + if isinstance(other, Collection) and len(other) == 2: + if all(isinstance(p, (float, int)) for p in other): + x, y = other + self.sum_x += int(x * 10000000) + self.sum_y += int(y * 10000000) + self.count += 1 + return self + + raise ValueError("Can only add 2-element tuples to centroid.") diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml index 643acbee..b0bacdfc 100644 --- a/settings/country_settings.yaml +++ b/settings/country_settings.yaml @@ -3,6 +3,9 @@ ad: partition: 35 languages: ca names: !include country-names/ad.yaml + postcode: + pattern: "(ddd)" + output: AD\1 # United Arab Emirates (الإمارات العربية المتحدة) @@ -10,6 +13,7 @@ ae: partition: 83 languages: ar names: !include country-names/ae.yaml + postcode: no # Afghanistan (افغانستان) @@ -17,6 +21,8 @@ af: partition: 30 languages: fa, ps names: !include country-names/af.yaml + postcode: + pattern: "dddd" # Antigua and Barbuda (Antigua and Barbuda) @@ -24,6 +30,7 @@ ag: partition: 205 languages: en names: !include country-names/ag.yaml + postcode: no # Anguilla (Anguilla) @@ -31,6 +38,9 @@ ai: partition: 175 languages: en names: !include country-names/ai.yaml + postcode: + pattern: "2640" + output: AI-2640 # Albania (Shqipëria) @@ -38,6 +48,8 @@ al: partition: 9 languages: sq names: !include country-names/al.yaml + postcode: + pattern: "dddd" # Armenia (Հայաստան) @@ -45,6 +57,8 @@ am: partition: 33 languages: hy names: !include country-names/am.yaml + postcode: + pattern: "dddd" # Netherlands Antilles (De Nederlandse Antillen) @@ -59,6 +73,7 @@ ao: partition: 85 languages: pt names: !include country-names/ao.yaml + postcode: no # (Antarctica) @@ -66,6 +81,7 @@ aq: partition: 181 languages: en, es, fr, ru names: !include 
country-names/aq.yaml + postcode: no # Argentina (Argentina) @@ -73,6 +89,8 @@ ar: partition: 39 languages: es names: !include country-names/ar.yaml + postcode: + pattern: "l?dddd(?:lll)?" # (American Samoa) @@ -87,6 +105,8 @@ at: partition: 245 languages: de names: !include country-names/at.yaml + postcode: + pattern: "dddd" # Australia (Australia) @@ -94,6 +114,8 @@ au: partition: 139 languages: en names: !include country-names/au.yaml + postcode: + pattern: "dddd" # (Aruba) @@ -101,6 +123,7 @@ aw: partition: 183 languages: nl, pap names: !include country-names/aw.yaml + postcode: no # (Aland Islands) @@ -115,6 +138,8 @@ az: partition: 119 languages: az names: !include country-names/az.yaml + postcode: + pattern: "dddd" # Bosnia and Herzegovina (Bosna i Hercegovina / Босна и Херцеговина) @@ -122,6 +147,8 @@ ba: partition: 6 languages: bs, hr, sr names: !include country-names/ba.yaml + postcode: + pattern: "ddddd" # Barbados (Barbados) @@ -129,6 +156,9 @@ bb: partition: 206 languages: en names: !include country-names/bb.yaml + postcode: + pattern: "(ddddd)" + output: BB\1 # Bangladesh (Bangladesh) @@ -136,6 +166,8 @@ bd: partition: 158 languages: bn names: !include country-names/bd.yaml + postcode: + pattern: "dddd" # Belgium (België / Belgique / Belgien) @@ -143,6 +175,8 @@ be: partition: 15 languages: nl, fr, de names: !include country-names/be.yaml + postcode: + pattern: "dddd" # Burkina Faso (Burkina Faso) @@ -150,6 +184,7 @@ bf: partition: 225 languages: fr names: !include country-names/bf.yaml + postcode: no # Bulgaria (Бългaрия) @@ -157,6 +192,8 @@ bg: partition: 140 languages: bg names: !include country-names/bg.yaml + postcode: + pattern: "dddd" # Bahrain (البحرين) @@ -164,6 +201,8 @@ bh: partition: 62 languages: ar names: !include country-names/bh.yaml + postcode: + pattern: "d?ddd" # Burundi (Burundi) @@ -171,6 +210,7 @@ bi: partition: 61 languages: fr names: !include country-names/bi.yaml + postcode: no # Benin (Bénin) @@ -178,6 +218,7 @@ bj: 
partition: 224 languages: fr names: !include country-names/bj.yaml + postcode: no # (Saint Barthélemy) @@ -192,6 +233,9 @@ bm: partition: 176 languages: en names: !include country-names/bm.yaml + postcode: + pattern: "(ll)[ -]?(dd)" + output: \1 \2 # Brunei (Brunei) @@ -199,6 +243,9 @@ bn: partition: 86 languages: ms names: !include country-names/bn.yaml + postcode: + pattern: "(ll) ?(dddd)" + output: \1\2 # Bolivia (Bolivia) @@ -206,6 +253,7 @@ bo: partition: 120 languages: es, qu, gn, ay names: !include country-names/bo.yaml + postcode: no # Caribbean Netherlands (Caribisch Nederland) @@ -220,6 +268,9 @@ br: partition: 121 languages: pt names: !include country-names/br.yaml + postcode: + pattern: "(ddddd)-?(ddd)" + output: \1-\2 # The Bahamas (The Bahamas) @@ -227,6 +278,7 @@ bs: partition: 207 languages: en names: !include country-names/bs.yaml + postcode: no # Bhutan (འབྲུག་ཡུལ་) @@ -234,6 +286,8 @@ bt: partition: 87 languages: dz names: !include country-names/bt.yaml + postcode: + pattern: "ddddd" # (Bouvet Island) @@ -248,6 +302,7 @@ bw: partition: 122 languages: en, tn names: !include country-names/bw.yaml + postcode: no # Belarus (Беларусь) @@ -255,6 +310,8 @@ by: partition: 40 languages: be, ru names: !include country-names/by.yaml + postcode: + pattern: "dddddd" # Belize (Belize) @@ -262,6 +319,7 @@ bz: partition: 208 languages: en names: !include country-names/bz.yaml + postcode: no # Canada (Canada) @@ -269,6 +327,9 @@ ca: partition: 244 languages: en, fr names: !include country-names/ca.yaml + postcode: + pattern: "(ldl) ?(dld)" + output: \1 \2 # Cocos (Keeling) Islands (Cocos (Keeling) Islands) @@ -283,6 +344,7 @@ cd: partition: 229 languages: fr names: !include country-names/cd.yaml + postcode: no # Central African Republic (Ködörösêse tî Bêafrîka - République Centrafricaine) @@ -290,6 +352,7 @@ cf: partition: 227 languages: fr, sg names: !include country-names/cf.yaml + postcode: no # Congo-Brazzaville (Congo) @@ -297,6 +360,7 @@ cg: partition: 230 
languages: fr names: !include country-names/cg.yaml + postcode: no # Switzerland (Schweiz/Suisse/Svizzera/Svizra) @@ -304,6 +368,8 @@ ch: partition: 5 languages: de, fr, it, rm names: !include country-names/ch.yaml + postcode: + pattern: "dddd" # Côte d'Ivoire (Côte d’Ivoire) @@ -311,6 +377,7 @@ ci: partition: 228 languages: fr names: !include country-names/ci.yaml + postcode: no # Cook Islands (Kūki 'Āirani) @@ -318,6 +385,7 @@ ck: partition: 41 languages: en, rar names: !include country-names/ck.yaml + postcode: no # Chile (Chile) @@ -325,6 +393,8 @@ cl: partition: 88 languages: es names: !include country-names/cl.yaml + postcode: + pattern: "ddddddd" # Cameroon (Cameroun) @@ -332,6 +402,7 @@ cm: partition: 141 languages: fr, en names: !include country-names/cm.yaml + postcode: no # China (中国) @@ -339,6 +410,8 @@ cn: partition: 117 languages: zh names: !include country-names/cn.yaml + postcode: + pattern: "dddddd" # Colombia (Colombia) @@ -346,6 +419,8 @@ co: partition: 133 languages: es names: !include country-names/co.yaml + postcode: + pattern: "dddddd" # Costa Rica (Costa Rica) @@ -353,6 +428,8 @@ cr: partition: 64 languages: es names: !include country-names/cr.yaml + postcode: + pattern: "ddddd" # Cuba (Cuba) @@ -360,6 +437,8 @@ cu: partition: 42 languages: es names: !include country-names/cu.yaml + postcode: + pattern: "ddddd" # Cape Verde (Cabo Verde) @@ -367,6 +446,8 @@ cv: partition: 89 languages: pt names: !include country-names/cv.yaml + postcode: + pattern: "dddd" # Curaçao (Curaçao) @@ -388,6 +469,8 @@ cy: partition: 114 languages: el, tr names: !include country-names/cy.yaml + postcode: + pattern: "(?:99|d)ddd" # Czechia (Česko) @@ -395,6 +478,9 @@ cz: partition: 124 languages: cs names: !include country-names/cz.yaml + postcode: + pattern: "(ddd) ?(dd)" + output: \1 \2 # Germany (Deutschland) @@ -402,6 +488,8 @@ de: partition: 3 languages: de names: !include country-names/de.yaml + postcode: + pattern: "ddddd" # Djibouti (Djibouti جيبوتي) @@ -409,6 
+497,7 @@ dj: partition: 43 languages: fr, ar, so, aa names: !include country-names/dj.yaml + postcode: no # Denmark (Danmark) @@ -416,6 +505,8 @@ dk: partition: 160 languages: da names: !include country-names/dk.yaml + postcode: + pattern: "dddd" # Dominica (Dominica) @@ -423,6 +514,7 @@ dm: partition: 209 languages: en names: !include country-names/dm.yaml + postcode: no # Dominican Republic (República Dominicana) @@ -430,6 +522,8 @@ do: partition: 37 languages: es names: !include country-names/do.yaml + postcode: + pattern: "ddddd" # Algeria (Algérie / ⵍⵣⵣⴰⵢⴻⵔ / الجزائر) @@ -437,6 +531,8 @@ dz: partition: 19 languages: ar, ber, fr names: !include country-names/dz.yaml + postcode: + pattern: "ddddd" # Ecuador (Ecuador) @@ -444,6 +540,8 @@ ec: partition: 78 languages: es names: !include country-names/ec.yaml + postcode: + pattern: "dddddd" # Estonia (Eesti) @@ -451,6 +549,8 @@ ee: partition: 125 languages: et names: !include country-names/ee.yaml + postcode: + pattern: "ddddd" # Egypt (مصر) @@ -458,6 +558,8 @@ eg: partition: 16 languages: ar names: !include country-names/eg.yaml + postcode: + pattern: "ddddd" # Sahrawi Arab Democratic Republic (الجمهورية العربية الصحراوية الديمقراطية) @@ -472,6 +574,7 @@ er: partition: 142 languages: ti, ar, en names: !include country-names/er.yaml + postcode: no # Spain (España) @@ -479,6 +582,8 @@ es: partition: 31 languages: es, ast, ca, eu, gl names: !include country-names/es.yaml + postcode: + pattern: "ddddd" # Ethiopia (ኢትዮጵያ) @@ -486,6 +591,8 @@ et: partition: 90 languages: am, om names: !include country-names/et.yaml + postcode: + pattern: "dddd" # Finland (Suomi) @@ -493,6 +600,8 @@ fi: partition: 20 languages: fi, sv, se names: !include country-names/fi.yaml + postcode: + pattern: "ddddd" # Fiji (Viti) @@ -500,6 +609,7 @@ fj: partition: 210 languages: en names: !include country-names/fj.yaml + postcode: no # Falkland Islands (Falkland Islands) @@ -507,6 +617,8 @@ fk: partition: 91 languages: en names: !include 
country-names/fk.yaml + postcode: + pattern: "FIQQ 1ZZ" # Federated States of Micronesia (Micronesia) @@ -514,6 +626,8 @@ fm: partition: 217 languages: en names: !include country-names/fm.yaml + postcode: + pattern: "ddddd" # Faroe Islands (Føroyar) @@ -521,6 +635,8 @@ fo: partition: 10 languages: fo, da names: !include country-names/fo.yaml + postcode: + pattern: "ddd" # France (France) @@ -528,6 +644,8 @@ fr: partition: 4 languages: fr names: !include country-names/fr.yaml + postcode: + pattern: "ddddd" # Gabon (Gabon) @@ -535,6 +653,7 @@ ga: partition: 239 languages: fr names: !include country-names/ga.yaml + postcode: no # United Kingdom (United Kingdom) @@ -542,6 +661,9 @@ gb: partition: 1 languages: en names: !include country-names/gb.yaml + postcode: + pattern: "(l?ld[A-Z0-9]?) ?(dll)" + output: \1 \2 # Grenada (Grenada) @@ -549,6 +671,7 @@ gd: partition: 143 languages: en names: !include country-names/gd.yaml + postcode: no # Georgia (საქართველო) @@ -556,6 +679,8 @@ ge: partition: 21 languages: ka names: !include country-names/ge.yaml + postcode: + pattern: "dddd" # French Guiana (Guyane Française) @@ -570,6 +695,9 @@ gg: partition: 77 languages: en names: !include country-names/gg.yaml + postcode: + pattern: "(GYdd?) 
?(dll)" + output: \1 \2 # Ghana (Ghana) @@ -577,6 +705,8 @@ gh: partition: 211 languages: en names: !include country-names/gh.yaml + postcode: + pattern: "ll-d?ddd-dddd" # Gibraltar (Gibraltar) @@ -584,6 +714,9 @@ gi: partition: 138 languages: en names: !include country-names/gi.yaml + postcode: + pattern: "(GX11) ?(1AA)" + output: GX11 1AA # Greenland (Kalaallit Nunaat) @@ -591,6 +724,8 @@ gl: partition: 111 languages: kl, da names: !include country-names/gl.yaml + postcode: + pattern: "dddd" # The Gambia (Gambia) @@ -598,6 +733,7 @@ gm: partition: 212 languages: en names: !include country-names/gm.yaml + postcode: no # Guinea (Guinée) @@ -605,6 +741,8 @@ gn: partition: 240 languages: fr names: !include country-names/gn.yaml + postcode: + pattern: "ddd" # Guadeloupe (Guadeloupe) @@ -619,6 +757,7 @@ gq: partition: 12 languages: es, fr, pt names: !include country-names/gq.yaml + postcode: no # Greece (Ελλάς) @@ -626,6 +765,9 @@ gr: partition: 22 languages: el names: !include country-names/gr.yaml + postcode: + pattern: "(ddd) ?(dd)" + output: \1 \2 # South Georgia and the South Sandwich Islands (South Georgia and the South Sandwich Islands) @@ -633,6 +775,9 @@ gs: partition: 44 languages: en names: !include country-names/gs.yaml + postcode: + pattern: "(SIQQ) ?(1ZZ)" + output: \1 \2 # Guatemala (Guatemala) @@ -640,6 +785,8 @@ gt: partition: 57 languages: es names: !include country-names/gt.yaml + postcode: + pattern: "ddddd" # Guam (Guam) @@ -654,6 +801,8 @@ gw: partition: 8 languages: pt names: !include country-names/gw.yaml + postcode: + pattern: "dddd" # Guyana (Guyana) @@ -661,6 +810,7 @@ gy: partition: 213 languages: en names: !include country-names/gy.yaml + postcode: no # (Hong Kong) @@ -682,6 +832,8 @@ hn: partition: 56 languages: es names: !include country-names/hn.yaml + postcode: + pattern: "ddddd" # Croatia (Hrvatska) @@ -689,6 +841,8 @@ hr: partition: 92 languages: hr names: !include country-names/hr.yaml + postcode: + pattern: "ddddd" # Haiti (Ayiti) 
@@ -696,6 +850,8 @@ ht: partition: 29 languages: fr, ht names: !include country-names/ht.yaml + postcode: + pattern: "dddd" # Hungary (Magyarország) @@ -703,6 +859,8 @@ hu: partition: 45 languages: hu names: !include country-names/hu.yaml + postcode: + pattern: "dddd" # Indonesia (Indonesia) @@ -710,6 +868,8 @@ id: partition: 110 languages: id names: !include country-names/id.yaml + postcode: + pattern: "ddddd" # Ireland (Éire / Ireland) @@ -717,6 +877,9 @@ ie: partition: 46 languages: en, ga names: !include country-names/ie.yaml + postcode: + pattern: "(ldd) ?([0123456789ACDEFHKNPRTVWXY]{4})" + output: \1 \2 # Israel (ישראל) @@ -724,6 +887,8 @@ il: partition: 65 languages: he names: !include country-names/il.yaml + postcode: + pattern: "ddddddd" # Isle of Man (Isle of Man) @@ -731,6 +896,9 @@ im: partition: 190 languages: en names: !include country-names/im.yaml + postcode: + pattern: "(IMdd?) ?(dll)" + output: \1 \2 # India (India) @@ -738,6 +906,9 @@ in: partition: 128 languages: hi, en names: !include country-names/in.yaml + postcode: + pattern: "(ddd) ?(ddd)" + output: \1\2 # British Indian Ocean Territory (British Indian Ocean Territory) @@ -745,6 +916,9 @@ io: partition: 13 languages: en names: !include country-names/io.yaml + postcode: + pattern: "(BBND) ?(1ZZ)" + output: \1 \2 # Iraq (العراق) @@ -752,6 +926,8 @@ iq: partition: 144 languages: ar, ku names: !include country-names/iq.yaml + postcode: + pattern: "ddddd" # Iran (ایران) @@ -759,6 +935,9 @@ ir: partition: 80 languages: fa names: !include country-names/ir.yaml + postcode: + pattern: "(ddddd)[-_ ]?(ddddd)" + output: \1-\2 # Iceland (Ísland) @@ -766,6 +945,8 @@ is: partition: 134 languages: is names: !include country-names/is.yaml + postcode: + pattern: "ddd" # Italy (Italia) @@ -773,6 +954,8 @@ it: partition: 28 languages: it, de, fr names: !include country-names/it.yaml + postcode: + pattern: "ddddd" # Jersey (Jersey) @@ -780,6 +963,9 @@ je: partition: 123 languages: en names: !include 
country-names/je.yaml + postcode: + pattern: "(JEdd?) ?(dll)" + output: \1 \2 # Jamaica (Jamaica) @@ -787,6 +973,7 @@ jm: partition: 214 languages: en names: !include country-names/jm.yaml + postcode: no # Jordan (الأردن) @@ -794,6 +981,8 @@ jo: partition: 17 languages: ar names: !include country-names/jo.yaml + postcode: + pattern: "ddddd" # Japan (日本) @@ -801,6 +990,9 @@ jp: partition: 11 languages: ja names: !include country-names/jp.yaml + postcode: + pattern: "(ddd)-?(dddd)" + output: \1-\2 # Kenya (Kenya) @@ -808,6 +1000,8 @@ ke: partition: 126 languages: sw, en names: !include country-names/ke.yaml + postcode: + pattern: "ddddd" # Kyrgyzstan (Кыргызстан) @@ -815,6 +1009,8 @@ kg: partition: 93 languages: ky, ru names: !include country-names/kg.yaml + postcode: + pattern: "dddddd" # Cambodia (ព្រះរាជាណាចក្រ​កម្ពុជា) @@ -822,6 +1018,8 @@ kh: partition: 159 languages: km names: !include country-names/kh.yaml + postcode: + pattern: "dddddd" # Kiribati (Kiribati) @@ -829,6 +1027,7 @@ ki: partition: 215 languages: en names: !include country-names/ki.yaml + postcode: no # Comoros (Comores Komori جزر القمر) @@ -836,6 +1035,7 @@ km: partition: 47 languages: ar, fr, sw names: !include country-names/km.yaml + postcode: no # Saint Kitts and Nevis (Saint Kitts and Nevis) @@ -843,6 +1043,8 @@ kn: partition: 84 languages: en names: !include country-names/kn.yaml + postcode: + pattern: "dddd" # North Korea (조선민주주의인민공화국) @@ -850,6 +1052,7 @@ kp: partition: 48 languages: ko names: !include country-names/kp.yaml + postcode: no # South Korea (대한민국) @@ -857,6 +1060,8 @@ kr: partition: 49 languages: ko, en names: !include country-names/kr.yaml + postcode: + pattern: "ddddd" # Kuwait (الكويت) @@ -864,6 +1069,8 @@ kw: partition: 127 languages: ar names: !include country-names/kw.yaml + postcode: + pattern: "ddddd" # Cayman Islands (Cayman Islands) @@ -871,6 +1078,9 @@ ky: partition: 38 languages: en names: !include country-names/ky.yaml + postcode: + pattern: "(d)-(dddd)" + output: 
KY\1-\2 # Kazakhstan (Қазақстан) @@ -878,6 +1088,8 @@ kz: partition: 94 languages: kk, ru names: !include country-names/kz.yaml + postcode: + pattern: "(?:lddldld|dddddd)" # Laos (ປະເທດລາວ) @@ -885,6 +1097,8 @@ la: partition: 145 languages: lo names: !include country-names/la.yaml + postcode: + pattern: "ddddd" # Lebanon (لبنان) @@ -892,6 +1106,8 @@ lb: partition: 66 languages: ar, fr names: !include country-names/lb.yaml + postcode: + pattern: "(dddd)(?: ?dddd)?" # Saint Lucia (Saint Lucia) @@ -899,6 +1115,9 @@ lc: partition: 146 languages: en names: !include country-names/lc.yaml + postcode: + pattern: "(dd) ?(ddd)" + output: LC\1 \2 # Liechtenstein (Liechtenstein) @@ -906,6 +1125,8 @@ li: partition: 246 languages: de names: !include country-names/li.yaml + postcode: + pattern: "dddd" # Sri Lanka (ශ්‍රී ලංකාව இலங்கை) @@ -913,6 +1134,8 @@ lk: partition: 95 languages: si, ta names: !include country-names/lk.yaml + postcode: + pattern: "ddddd" # Liberia (Liberia) @@ -920,6 +1143,8 @@ lr: partition: 216 languages: en names: !include country-names/lr.yaml + postcode: + pattern: "dddd" # Lesotho (Lesotho) @@ -927,6 +1152,8 @@ ls: partition: 136 languages: en, st names: !include country-names/ls.yaml + postcode: + pattern: "ddd" # Lithuania (Lietuva) @@ -934,6 +1161,8 @@ lt: partition: 67 languages: lt names: !include country-names/lt.yaml + postcode: + pattern: "ddddd" # Luxembourg (Lëtzebuerg) @@ -941,6 +1170,8 @@ lu: partition: 74 languages: lb, fr, de names: !include country-names/lu.yaml + postcode: + pattern: "dddd" # Latvia (Latvija) @@ -948,6 +1179,9 @@ lv: partition: 162 languages: lv names: !include country-names/lv.yaml + postcode: + pattern: "(dddd)" + output: LV-\1 # Libya (ليبيا) @@ -955,6 +1189,7 @@ ly: partition: 163 languages: ar names: !include country-names/ly.yaml + postcode: no # Morocco (Maroc ⵍⵎⵖⵔⵉⴱ المغرب) @@ -962,6 +1197,8 @@ ma: partition: 23 languages: fr, zgh, ar names: !include country-names/ma.yaml + postcode: + pattern: "ddddd" # Monaco 
(Monaco) @@ -969,6 +1206,8 @@ mc: partition: 242 languages: fr names: !include country-names/mc.yaml + postcode: + pattern: "980dd" # Moldova (Moldova) @@ -976,6 +1215,9 @@ md: partition: 147 languages: ro, ru, uk names: !include country-names/md.yaml + postcode: + pattern: "(dddd)" + output: MD-\1 # Montenegro (Crna Gora / Црна Гора) @@ -983,6 +1225,8 @@ me: partition: 180 languages: srp, sr, hr, bs, sq names: !include country-names/me.yaml + postcode: + pattern: "ddddd" # Saint Martin (Saint Martin) @@ -997,6 +1241,8 @@ mg: partition: 164 languages: mg, fr names: !include country-names/mg.yaml + postcode: + pattern: "ddd" # Marshall Islands (Ṃajeḷ) @@ -1004,6 +1250,8 @@ mh: partition: 105 languages: en, mh names: !include country-names/mh.yaml + postcode: + pattern: "ddddd" # North Macedonia (Северна Македонија) @@ -1011,6 +1259,8 @@ mk: partition: 69 languages: mk names: !include country-names/mk.yaml + postcode: + pattern: "dddd" # Mali (Mali) @@ -1018,6 +1268,7 @@ ml: partition: 241 languages: fr names: !include country-names/ml.yaml + postcode: no # Myanmar (မြန်မာ) @@ -1025,6 +1276,8 @@ mm: partition: 148 languages: my names: !include country-names/mm.yaml + postcode: + pattern: "ddddd" # Mongolia (Монгол улс ᠮᠤᠩᠭᠤᠯ ᠤᠯᠤᠰ) @@ -1032,6 +1285,8 @@ mn: partition: 167 languages: mn names: !include country-names/mn.yaml + postcode: + pattern: "ddddd" # Macao (Macao) @@ -1039,6 +1294,7 @@ mo: partition: 191 languages: zh-hant, pt names: !include country-names/mo.yaml + postcode: no # Northern Mariana Islands (Northern Mariana Islands) @@ -1060,6 +1316,7 @@ mr: partition: 149 languages: ar, fr names: !include country-names/mr.yaml + postcode: no # Montserrat (Montserrat) @@ -1074,6 +1331,9 @@ mt: partition: 165 languages: mt, en names: !include country-names/mt.yaml + postcode: + pattern: "(lll) ?(dddd)" + output: \1 \2 # Mauritius (Mauritius) @@ -1081,6 +1341,8 @@ mu: partition: 150 languages: mfe, fr, en names: !include country-names/mu.yaml + postcode: + pattern: 
"ddddd" # Maldives (ދިވެހިރާއްޖެ) @@ -1088,6 +1350,8 @@ mv: partition: 96 languages: dv names: !include country-names/mv.yaml + postcode: + pattern: "ddddd" # Malawi (Malawi) @@ -1095,6 +1359,7 @@ mw: partition: 97 languages: en, ny names: !include country-names/mw.yaml + postcode: no # Mexico (México) @@ -1102,6 +1367,8 @@ mx: partition: 166 languages: es names: !include country-names/mx.yaml + postcode: + pattern: "ddddd" # Malaysia (Malaysia) @@ -1109,6 +1376,8 @@ my: partition: 7 languages: ms names: !include country-names/my.yaml + postcode: + pattern: "ddddd" # Mozambique (Moçambique) @@ -1116,6 +1385,8 @@ mz: partition: 98 languages: pt names: !include country-names/mz.yaml + postcode: + pattern: "(dddd)(?:-dd)?" # Namibia (Namibia) @@ -1123,6 +1394,8 @@ na: partition: 99 languages: en, sf, de names: !include country-names/na.yaml + postcode: + pattern: "ddddd" # New Caledonia (Nouvelle-Calédonie) @@ -1137,6 +1410,8 @@ ne: partition: 226 languages: fr names: !include country-names/ne.yaml + postcode: + pattern: "dddd" # Norfolk Island (Norfolk Island) @@ -1151,6 +1426,8 @@ ng: partition: 218 languages: en names: !include country-names/ng.yaml + postcode: + pattern: "dddddd" # Nicaragua (Nicaragua) @@ -1158,6 +1435,8 @@ ni: partition: 151 languages: es names: !include country-names/ni.yaml + postcode: + pattern: "ddddd" # Netherlands (Nederland) @@ -1165,6 +1444,9 @@ nl: partition: 63 languages: nl names: !include country-names/nl.yaml + postcode: + pattern: "(dddd) ?(ll)" + output: \1 \2 # Norway (Norge) @@ -1172,6 +1454,8 @@ nl: partition: 60 languages: nb, nn, no, se names: !include country-names/no.yaml + postcode: + pattern: "dddd" # Nepal (Nepal) @@ -1179,6 +1463,8 @@ np: partition: 50 languages: ne names: !include country-names/np.yaml + postcode: + pattern: "ddddd" # Nauru (Naoero) @@ -1186,6 +1472,7 @@ nr: partition: 70 languages: na, en names: !include country-names/nr.yaml + postcode: no # Niue (Niuē) @@ -1193,6 +1480,7 @@ nu: partition: 178 
languages: niu, en names: !include country-names/nu.yaml + postcode: no # New Zealand (New Zealand / Aotearoa) @@ -1200,6 +1488,8 @@ nz: partition: 27 languages: mi, en names: !include country-names/nz.yaml + postcode: + pattern: "dddd" # Oman (عمان) @@ -1207,6 +1497,8 @@ om: partition: 137 languages: ar names: !include country-names/om.yaml + postcode: + pattern: "ddd" # Panama (Panamá) @@ -1214,6 +1506,8 @@ pa: partition: 152 languages: es names: !include country-names/pa.yaml + postcode: + pattern: "dddd" # Peru (Perú) @@ -1221,6 +1515,8 @@ pe: partition: 51 languages: es names: !include country-names/pe.yaml + postcode: + pattern: "ddddd" # French Polynesia (Polynésie française) @@ -1235,6 +1531,8 @@ pg: partition: 71 languages: en, tpi, ho names: !include country-names/pg.yaml + postcode: + pattern: "ddd" # Philippines (Philippines) @@ -1242,6 +1540,8 @@ ph: partition: 26 languages: en, tl names: !include country-names/ph.yaml + postcode: + pattern: "dddd" # Pakistan (پاکستان) @@ -1249,6 +1549,8 @@ pk: partition: 14 languages: en, ur, pnb, sd, ps, bal names: !include country-names/pk.yaml + postcode: + pattern: "ddddd" # Poland (Polska) @@ -1256,6 +1558,9 @@ pl: partition: 168 languages: pl names: !include country-names/pl.yaml + postcode: + pattern: "(dd)[ -]?(ddd)" + output: \1-\2 # Saint Pierre and Miquelon (Saint-Pierre-et-Miquelon) @@ -1270,6 +1575,9 @@ pn: partition: 113 languages: en, pih names: !include country-names/pn.yaml + postcode: + pattern: "(PCRN) ?(1ZZ)" + output: \1 \2 # Puerto Rico (Puerto Rico) @@ -1284,6 +1592,8 @@ ps: partition: 194 languages: ar, he names: !include country-names/ps.yaml + postcode: + pattern: "ddd" # Portugal (Portugal) @@ -1291,6 +1601,8 @@ pt: partition: 34 languages: pt names: !include country-names/pt.yaml + postcode: + pattern: "dddd(?:-ddd)?" 
# Palau (Belau) @@ -1298,6 +1610,8 @@ pw: partition: 195 languages: en, pau, ja, sov, tox names: !include country-names/pw.yaml + postcode: + pattern: "969(39|40)" # Paraguay (Paraguay) @@ -1305,6 +1619,8 @@ py: partition: 101 languages: es, gn names: !include country-names/py.yaml + postcode: + pattern: "dddddd" # Qatar (قطر) @@ -1312,6 +1628,7 @@ qa: partition: 169 languages: ar names: !include country-names/qa.yaml + postcode: no # (Réunion) @@ -1326,6 +1643,8 @@ ro: partition: 170 languages: ro names: !include country-names/ro.yaml + postcode: + pattern: "dddddd" # Serbia (Србија) @@ -1333,6 +1652,8 @@ rs: partition: 59 languages: sr names: !include country-names/rs.yaml + postcode: + pattern: "ddddd" # Russia (Россия) @@ -1340,6 +1661,8 @@ ru: partition: 135 languages: ru names: !include country-names/ru.yaml + postcode: + pattern: "dddddd" # Rwanda (Rwanda) @@ -1347,6 +1670,7 @@ rw: partition: 102 languages: rw, fr, en names: !include country-names/rw.yaml + postcode: no # Saudi Arabia (السعودية) @@ -1354,6 +1678,8 @@ sa: partition: 52 languages: ar names: !include country-names/sa.yaml + postcode: + pattern: "ddddd(?:-dddd)?" 
# Solomon Islands (Solomon Islands) @@ -1361,6 +1687,7 @@ sb: partition: 201 languages: en names: !include country-names/sb.yaml + postcode: no # Seychelles (Sesel) @@ -1368,6 +1695,7 @@ sc: partition: 79 languages: fr, en, crs names: !include country-names/sc.yaml + postcode: no # Sudan (السودان) @@ -1375,6 +1703,8 @@ sd: partition: 72 languages: ar, en names: !include country-names/sd.yaml + postcode: + pattern: "ddddd" # Sweden (Sverige) @@ -1382,6 +1712,9 @@ se: partition: 112 languages: sv names: !include country-names/se.yaml + postcode: + pattern: "(ddd) ?(dd)" + output: \1 \2 # Singapore (Singapore) @@ -1389,6 +1722,8 @@ sg: partition: 115 languages: zh-hans, en, ms, ta names: !include country-names/sg.yaml + postcode: + pattern: "dddddd" # Saint Helena, Ascension and Tristan da Cunha (Saint Helena, Ascension and Tristan da Cunha) @@ -1396,6 +1731,9 @@ sh: partition: 196 languages: en names: !include country-names/sh.yaml + postcode: + pattern: "(ASCN|STHL|TDCU) ?(1ZZ)" + output: \1 \2 # Slovenia (Slovenija) @@ -1403,6 +1741,8 @@ si: partition: 36 languages: sl names: !include country-names/si.yaml + postcode: + pattern: "dddd" # (Svalbard and Jan Mayen) @@ -1417,6 +1757,9 @@ sk: partition: 172 languages: sk names: !include country-names/sk.yaml + postcode: + pattern: "(ddd) ?(dd)" + output: \1 \2 # Sierra Leone (Sierra Leone) @@ -1424,6 +1767,7 @@ sl: partition: 219 languages: en names: !include country-names/sl.yaml + postcode: no # San Marino (San Marino) @@ -1431,6 +1775,8 @@ sm: partition: 153 languages: it names: !include country-names/sm.yaml + postcode: + pattern: "4789d" # Senegal (Sénégal) @@ -1438,6 +1784,8 @@ sn: partition: 237 languages: fr names: !include country-names/sn.yaml + postcode: + pattern: "ddddd" # Somalia (Soomaaliya الصومال) @@ -1445,6 +1793,9 @@ so: partition: 154 languages: so, ar names: !include country-names/so.yaml + postcode: + pattern: "(ll) ?(ddddd)" + output: \1 \2 # Suriname (Suriname) @@ -1452,6 +1803,7 @@ sr: 
partition: 24 languages: nl names: !include country-names/sr.yaml + postcode: no # South Sudan (South Sudan) @@ -1459,6 +1811,7 @@ ss: partition: 247 languages: en names: !include country-names/ss.yaml + postcode: no # São Tomé and Príncipe (São Tomé e Príncipe) @@ -1466,6 +1819,7 @@ st: partition: 53 languages: pt names: !include country-names/st.yaml + postcode: no # El Salvador (El Salvador) @@ -1473,6 +1827,8 @@ sv: partition: 103 languages: es names: !include country-names/sv.yaml + postcode: + pattern: "dddd" # (Sint Maarten) @@ -1487,6 +1843,7 @@ sy: partition: 104 languages: ar names: !include country-names/sy.yaml + postcode: no # Eswatini (eSwatini) @@ -1494,6 +1851,8 @@ sz: partition: 82 languages: en, ss names: !include country-names/sz.yaml + postcode: + pattern: "lddd" # Turks and Caicos Islands (Turks and Caicos Islands) @@ -1501,6 +1860,9 @@ tc: partition: 106 languages: en names: !include country-names/tc.yaml + postcode: + pattern: "(TKCA) ?(1ZZ)" + output: \1 \2 # Chad (Tchad تشاد) @@ -1508,6 +1870,7 @@ td: partition: 68 languages: fr, ar names: !include country-names/td.yaml + postcode: no # French Southern Lands (Terres australes et antarctiques françaises) @@ -1522,6 +1885,7 @@ tg: partition: 243 languages: fr names: !include country-names/tg.yaml + postcode: no # Thailand (ประเทศไทย) @@ -1529,6 +1893,8 @@ th: partition: 32 languages: th names: !include country-names/th.yaml + postcode: + pattern: "ddddd" # Tajikistan (Тоҷикистон) @@ -1536,6 +1902,8 @@ tj: partition: 129 languages: tg, ru names: !include country-names/tj.yaml + postcode: + pattern: "dddddd" # Tokelau (Tokelau) @@ -1543,6 +1911,7 @@ tk: partition: 179 languages: tkl, en, sm names: !include country-names/tk.yaml + postcode: no # East Timor (Timór Lorosa'e) @@ -1550,6 +1919,7 @@ tl: partition: 161 languages: pt, tet names: !include country-names/tl.yaml + postcode: no # Turkmenistan (Türkmenistan) @@ -1557,6 +1927,8 @@ tm: partition: 54 languages: tk names: !include 
country-names/tm.yaml + postcode: + pattern: "dddddd" # Tunisia (تونس) @@ -1564,6 +1936,8 @@ tn: partition: 18 languages: ar, fr names: !include country-names/tn.yaml + postcode: + pattern: "dddd" # Tonga (Tonga) @@ -1571,6 +1945,7 @@ to: partition: 220 languages: en names: !include country-names/to.yaml + postcode: no # Turkey (Türkiye) @@ -1578,6 +1953,8 @@ tr: partition: 81 languages: tr names: !include country-names/tr.yaml + postcode: + pattern: "ddddd" # Trinidad and Tobago (Trinidad and Tobago) @@ -1585,6 +1962,8 @@ tt: partition: 221 languages: en names: !include country-names/tt.yaml + postcode: + pattern: "dddddd" # Tuvalu (Tuvalu) @@ -1592,6 +1971,7 @@ tv: partition: 156 languages: en names: !include country-names/tv.yaml + postcode: no # Taiwan (臺灣) @@ -1599,6 +1979,8 @@ tw: partition: 25 languages: zh-hant names: !include country-names/tw.yaml + postcode: + pattern: "ddd(?:ddd?)?" # Tanzania (Tanzania) @@ -1606,6 +1988,8 @@ tz: partition: 130 languages: sw, en names: !include country-names/tz.yaml + postcode: + pattern: "ddddd" # Ukraine (Україна) @@ -1613,6 +1997,8 @@ ua: partition: 173 languages: uk names: !include country-names/ua.yaml + postcode: + pattern: "d?ddddd" # Uganda (Uganda) @@ -1620,6 +2006,7 @@ ug: partition: 155 languages: en, sw names: !include country-names/ug.yaml + postcode: no # (United States Minor Outlying Islands) @@ -1627,6 +2014,8 @@ um: partition: 198 languages: en names: !include country-names/um.yaml + postcode: + pattern: "96898" # United States (United States) @@ -1634,6 +2023,8 @@ us: partition: 2 languages: en names: !include country-names/us.yaml + postcode: + pattern: "ddddd" # Uruguay (Uruguay) @@ -1641,6 +2032,8 @@ uy: partition: 174 languages: es names: !include country-names/uy.yaml + postcode: + pattern: "ddddd" # Uzbekistan (Oʻzbekiston) @@ -1648,6 +2041,8 @@ uz: partition: 157 languages: uz, kaa names: !include country-names/uz.yaml + postcode: + pattern: "dddddd" # Vatican City (Civitas Vaticana) @@ -1655,6 
+2050,8 @@ va: partition: 107 languages: it names: !include country-names/va.yaml + postcode: + pattern: "00120" # Saint Vincent and the Grenadines (Saint Vincent and the Grenadines) @@ -1662,6 +2059,9 @@ vc: partition: 171 languages: en names: !include country-names/vc.yaml + postcode: + pattern: "(dddd)" + output: VC\1 # Venezuela (Venezuela) @@ -1669,6 +2069,8 @@ ve: partition: 108 languages: es names: !include country-names/ve.yaml + postcode: + pattern: "dddd" # British Virgin Islands (British Virgin Islands) @@ -1676,6 +2078,9 @@ vg: partition: 109 languages: en names: !include country-names/vg.yaml + postcode: + pattern: "(dddd)" + output: VG\1 # (United States Virgin Islands) @@ -1690,6 +2095,8 @@ vn: partition: 75 languages: vi names: !include country-names/vn.yaml + postcode: + pattern: "ddddd" # Vanuatu (Vanuatu) @@ -1697,6 +2104,7 @@ vu: partition: 116 languages: bi, en, fr names: !include country-names/vu.yaml + postcode: no # Wallis and Futuna Islands (Wallis-et-Futuna) @@ -1718,6 +2126,8 @@ xk: partition: 59 languages: sq, sr names: !include country-names/xk.yaml + postcode: + pattern: "ddddd" # Yemen (اليمن) @@ -1725,6 +2135,7 @@ ye: partition: 55 languages: ar names: !include country-names/ye.yaml + postcode: no # Mayotte (Mayotte) @@ -1739,6 +2150,8 @@ za: partition: 76 languages: en, af, st, tn, xh, zu names: !include country-names/za.yaml + postcode: + pattern: "dddd" # Zambia (Zambia) @@ -1746,6 +2159,8 @@ zm: partition: 222 languages: en names: !include country-names/zm.yaml + postcode: + pattern: "dddd" # Zimbabwe (Zimbabwe) @@ -1753,4 +2168,4 @@ zw: partition: 223 languages: en, sn, nd names: !include country-names/zw.yaml - + postcode: no diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index cd9c0d6d..212fdcb9 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -32,6 +32,9 @@ sanitizers: - streetnumber convert-to-name: - (\A|.*,)[^\d,]{3,}(,.*|\Z) + - step: clean-postcodes + 
convert-to-address: yes + default-pattern: "[A-Z0-9- ]{3,12}" - step: split-name-list - step: strip-brace-terms - step: tag-analyzer-by-language @@ -43,6 +46,8 @@ token-analysis: - analyzer: generic - id: "@housenumber" analyzer: housenumbers + - id: "@postcode" + analyzer: postcodes - id: bg analyzer: generic mode: variant-only diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature index 15beab57..4d146d18 100644 --- a/test/bdd/db/import/postcodes.feature +++ b/test/bdd/db/import/postcodes.feature @@ -163,25 +163,8 @@ Feature: Import of postcodes | de | 01982 | country:de | And there are word tokens for postcodes 01982 - Scenario: Different postcodes with the same normalization can both be found - Given the places - | osm | class | type | addr+postcode | addr+housenumber | geometry | - | N34 | place | house | EH4 7EA | 111 | country:gb | - | N35 | place | house | E4 7EA | 111 | country:gb | - When importing - Then location_postcode contains exactly - | country | postcode | geometry | - | gb | EH4 7EA | country:gb | - | gb | E4 7EA | country:gb | - When sending search query "EH4 7EA" - Then results contain - | type | display_name | - | postcode | EH4 7EA | - When sending search query "E4 7EA" - Then results contain - | type | display_name | - | postcode | E4 7EA | + @Fail Scenario: search and address ranks for GB post codes correctly assigned Given the places | osm | class | type | postcode | geometry | @@ -195,55 +178,19 @@ Feature: Import of postcodes | E45 2 | gb | 23 | 5 | | Y45 | gb | 21 | 5 | - Scenario: wrongly formatted GB postcodes are down-ranked + @fail-legacy + Scenario: Postcodes outside all countries are not added to the postcode and word table Given the places - | osm | class | type | postcode | geometry | - | N1 | place | postcode | EA452CD | country:gb | - | N2 | place | postcode | E45 23 | country:gb | - When importing - Then location_postcode contains exactly - | postcode | country | rank_search | rank_address | 
- | EA452CD | gb | 30 | 30 | - | E45 23 | gb | 30 | 30 | - - Scenario: search and address rank for DE postcodes correctly assigned - Given the places - | osm | class | type | postcode | geometry | - | N1 | place | postcode | 56427 | country:de | - | N2 | place | postcode | 5642 | country:de | - | N3 | place | postcode | 5642A | country:de | - | N4 | place | postcode | 564276 | country:de | - When importing - Then location_postcode contains exactly - | postcode | country | rank_search | rank_address | - | 56427 | de | 21 | 11 | - | 5642 | de | 30 | 30 | - | 5642A | de | 30 | 30 | - | 564276 | de | 30 | 30 | - - Scenario: search and address rank for other postcodes are correctly assigned - Given the places - | osm | class | type | postcode | geometry | - | N1 | place | postcode | 1 | country:ca | - | N2 | place | postcode | X3 | country:ca | - | N3 | place | postcode | 543 | country:ca | - | N4 | place | postcode | 54dc | country:ca | - | N5 | place | postcode | 12345 | country:ca | - | N6 | place | postcode | 55TT667 | country:ca | - | N7 | place | postcode | 123-65 | country:ca | - | N8 | place | postcode | 12 445 4 | country:ca | - | N9 | place | postcode | A1:bc10 | country:ca | + | osm | class | type | addr+postcode | addr+housenumber | addr+place | geometry | + | N34 | place | house | 01982 | 111 | Null Island | 0 0.00001 | + And the places + | osm | class | type | name | geometry | + | N1 | place | hamlet | Null Island | 0 0 | When importing Then location_postcode contains exactly - | postcode | country | rank_search | rank_address | - | 1 | ca | 21 | 11 | - | X3 | ca | 21 | 11 | - | 543 | ca | 21 | 11 | - | 54DC | ca | 21 | 11 | - | 12345 | ca | 21 | 11 | - | 55TT667 | ca | 21 | 11 | - | 123-65 | ca | 25 | 11 | - | 12 445 4 | ca | 25 | 11 | - | A1:BC10 | ca | 25 | 11 | - - + | country | postcode | geometry | + And there are no word tokens for postcodes 01982 + When sending search query "111, 01982 Null Island" + Then results contain + | osm | display_name | + 
| N34 | 111, Null Island, 01982 | diff --git a/test/bdd/db/query/normalization.feature b/test/bdd/db/query/normalization.feature index f91c0050..e5a7a592 100644 --- a/test/bdd/db/query/normalization.feature +++ b/test/bdd/db/query/normalization.feature @@ -168,14 +168,6 @@ Feature: Import and search of names | ID | osm | | 0 | R1 | - Scenario: Unprintable characters in postcodes are ignored - Given the named places - | osm | class | type | address | geometry | - | N234 | amenity | prison | 'postcode' : u'1234\u200e' | country:de | - When importing - And sending search query "1234" - Then result 0 has not attributes osm_type - Scenario Outline: Housenumbers with special characters are found Given the grid | 1 | | | | 2 | diff --git a/test/bdd/db/query/postcodes.feature b/test/bdd/db/query/postcodes.feature new file mode 100644 index 00000000..a3ca7035 --- /dev/null +++ b/test/bdd/db/query/postcodes.feature @@ -0,0 +1,97 @@ +@DB +Feature: Querying for postcode variants + + Scenario: Postcodes in Singapore (6-digit postcode) + Given the grid with origin SG + | 10 | | | | 11 | + And the places + | osm | class | type | name | addr+postcode | geometry | + | W1 | highway | path | Lorang | 399174 | 10,11 | + When importing + When sending search query "399174" + Then results contain + | ID | type | display_name | + | 0 | postcode | 399174 | + + + @fail-legacy + Scenario Outline: Postcodes in the Netherlands (mixed postcode with spaces) + Given the grid with origin NL + | 10 | | | | 11 | + And the places + | osm | class | type | name | addr+postcode | geometry | + | W1 | highway | path | De Weide | 3993 DX | 10,11 | + When importing + When sending search query "3993 DX" + Then results contain + | ID | type | display_name | + | 0 | postcode | 3993 DX | + When sending search query "3993dx" + Then results contain + | ID | type | display_name | + | 0 | postcode | 3993 DX | + + Examples: + | postcode | + | 3993 DX | + | 3993DX | + | 3993 dx | + + + @fail-legacy + Scenario: 
Postcodes in Singapore (6-digit postcode) + Given the grid with origin SG + | 10 | | | | 11 | + And the places + | osm | class | type | name | addr+postcode | geometry | + | W1 | highway | path | Lorang | 399174 | 10,11 | + When importing + When sending search query "399174" + Then results contain + | ID | type | display_name | + | 0 | postcode | 399174 | + + + @fail-legacy + Scenario Outline: Postcodes in Andorra (with country code) + Given the grid with origin AD + | 10 | | | | 11 | + And the places + | osm | class | type | name | addr+postcode | geometry | + | W1 | highway | path | Lorang | <postcode> | 10,11 | + When importing + When sending search query "675" + Then results contain + | ID | type | display_name | + | 0 | postcode | AD675 | + When sending search query "AD675" + Then results contain + | ID | type | display_name | + | 0 | postcode | AD675 | + + Examples: + | postcode | + | 675 | + | AD 675 | + | AD675 | + + + Scenario: Different postcodes with the same normalization can both be found + Given the places + | osm | class | type | addr+postcode | addr+housenumber | geometry | + | N34 | place | house | EH4 7EA | 111 | country:gb | + | N35 | place | house | E4 7EA | 111 | country:gb | + When importing + Then location_postcode contains exactly + | country | postcode | geometry | + | gb | EH4 7EA | country:gb | + | gb | E4 7EA | country:gb | + When sending search query "EH4 7EA" + Then results contain + | type | display_name | + | postcode | EH4 7EA | + When sending search query "E4 7EA" + Then results contain + | type | display_name | + | postcode | E4 7EA | + diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py index 44c82b01..8fd918f8 100644 --- a/test/bdd/steps/steps_db_ops.py +++ b/test/bdd/steps/steps_db_ops.py @@ -18,13 +18,19 @@ from nominatim.tokenizer import factory as tokenizer_factory def check_database_integrity(context): """ Check some generic constraints on the tables. 
""" - # place_addressline should not have duplicate (place_id, address_place_id) - cur = context.db.cursor() - cur.execute("""SELECT count(*) FROM - (SELECT place_id, address_place_id, count(*) as c - FROM place_addressline GROUP BY place_id, address_place_id) x - WHERE c > 1""") - assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline" + with context.db.cursor() as cur: + # place_addressline should not have duplicate (place_id, address_place_id) + cur.execute("""SELECT count(*) FROM + (SELECT place_id, address_place_id, count(*) as c + FROM place_addressline GROUP BY place_id, address_place_id) x + WHERE c > 1""") + assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline" + + # word table must not have empty word_tokens + if context.nominatim.tokenizer != 'legacy': + cur.execute("SELECT count(*) FROM word WHERE word_token = ''") + assert cur.fetchone()[0] == 0, "Empty word tokens found in word table" + ################################ GIVEN ################################## diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py new file mode 100644 index 00000000..44376196 --- /dev/null +++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for the sanitizer that normalizes postcodes. 
+""" +import pytest + +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer +from nominatim.indexer.place_info import PlaceInfo +from nominatim.tools import country_info + +@pytest.fixture +def sanitize(def_config, request): + country_info.setup_country_config(def_config) + sanitizer_args = {'step': 'clean-postcodes'} + for mark in request.node.iter_markers(name="sanitizer_params"): + sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()}) + + def _run(country=None, **kwargs): + pi = {'address': kwargs} + if country is not None: + pi['country_code'] = country + + _, address = PlaceSanitizer([sanitizer_args]).process_names(PlaceInfo(pi)) + + return sorted([(p.kind, p.name) for p in address]) + + return _run + + +@pytest.mark.parametrize("country", (None, 'ae')) +def test_postcode_no_country(sanitize, country): + assert sanitize(country=country, postcode='23231') == [('unofficial_postcode', '23231')] + + +@pytest.mark.parametrize("country", (None, 'ae')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_no_country_drop(sanitize, country): + assert sanitize(country=country, postcode='23231') == [] + + +@pytest.mark.parametrize("postcode", ('12345', ' 12345 ', 'de 12345', + 'DE12345', 'DE 12345', 'DE-12345')) +def test_postcode_pass_good_format(sanitize, postcode): + assert sanitize(country='de', postcode=postcode) == [('postcode', '12345')] + + +@pytest.mark.parametrize("postcode", ('123456', '', ' ', '.....', + 'DE 12345', 'DEF12345', 'CH 12345')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_drop_bad_format(sanitize, postcode): + assert sanitize(country='de', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('1234', '9435', '99000')) +def test_postcode_cyprus_pass(sanitize, postcode): + assert sanitize(country='cy', postcode=postcode) == [('postcode', postcode)] + + +@pytest.mark.parametrize("postcode", ('91234', '99a45', '567')) 
+@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_cyprus_fail(sanitize, postcode): + assert sanitize(country='cy', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('123456', 'A33F2G7')) +def test_postcode_kazakhstan_pass(sanitize, postcode): + assert sanitize(country='kz', postcode=postcode) == [('postcode', postcode)] + + +@pytest.mark.parametrize("postcode", ('V34T6Y923456', '99345')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_kazakhstan_fail(sanitize, postcode): + assert sanitize(country='kz', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('675 34', '67534', 'SE-675 34', 'SE67534')) +def test_postcode_sweden_pass(sanitize, postcode): + assert sanitize(country='se', postcode=postcode) == [('postcode', '675 34')] + + +@pytest.mark.parametrize("postcode", ('67 345', '671123')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_sweden_fail(sanitize, postcode): + assert sanitize(country='se', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('AB1', '123-456-7890', '1 as 44')) +@pytest.mark.sanitizer_params(default_pattern='[A-Z0-9- ]{3,12}') +def test_postcode_default_pattern_pass(sanitize, postcode): + assert sanitize(country='an', postcode=postcode) == [('postcode', postcode.upper())] + + +@pytest.mark.parametrize("postcode", ('C', '12', 'ABC123DEF 456', '1234,5678', '11223;11224')) +@pytest.mark.sanitizer_params(convert_to_address=False, default_pattern='[A-Z0-9- ]{3,12}') +def test_postcode_default_pattern_fail(sanitize, postcode): + assert sanitize(country='an', postcode=postcode) == [] + diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index d85a5b65..b9de97bc 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -72,7 +72,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch, def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), 
trans=(':: upper()',), variants=('~gasse -> gasse', 'street => st', ), - sanitizers=[], with_housenumber=False): + sanitizers=[], with_housenumber=False, + with_postcode=False): cfgstr = {'normalization': list(norm), 'sanitizers': sanitizers, 'transliteration': list(trans), @@ -81,6 +82,9 @@ def analyzer(tokenizer_factory, test_config, monkeypatch, if with_housenumber: cfgstr['token-analysis'].append({'id': '@housenumber', 'analyzer': 'housenumbers'}) + if with_postcode: + cfgstr['token-analysis'].append({'id': '@postcode', + 'analyzer': 'postcodes'}) (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr)) tok.loader = nominatim.tokenizer.icu_rule_loader.ICURuleLoader(test_config) @@ -246,28 +250,69 @@ def test_normalize_postcode(analyzer): anl.normalize_postcode('38 Б') == '38 Б' -def test_update_postcodes_from_db_empty(analyzer, table_factory, word_table): - table_factory('location_postcode', 'postcode TEXT', - content=(('1234',), ('12 34',), ('AB23',), ('1234',))) +class TestPostcodes: - with analyzer() as anl: - anl.update_postcodes_from_db() + @pytest.fixture(autouse=True) + def setup(self, analyzer, sql_functions): + sanitizers = [{'step': 'clean-postcodes'}] + with analyzer(sanitizers=sanitizers, with_postcode=True) as anl: + self.analyzer = anl + yield anl - assert word_table.count() == 3 - assert word_table.get_postcodes() == {'1234', '12 34', 'AB23'} + def process_postcode(self, cc, postcode): + return self.analyzer.process_place(PlaceInfo({'country_code': cc, + 'address': {'postcode': postcode}})) -def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_table): - table_factory('location_postcode', 'postcode TEXT', - content=(('1234',), ('45BC', ), ('XX45', ))) - word_table.add_postcode(' 1234', '1234') - word_table.add_postcode(' 5678', '5678') - with analyzer() as anl: - anl.update_postcodes_from_db() + def test_update_postcodes_from_db_empty(self, table_factory, word_table): + 
table_factory('location_postcode', 'country_code TEXT, postcode TEXT', + content=(('de', '12345'), ('se', '132 34'), + ('bm', 'AB23'), ('fr', '12345'))) + + self.analyzer.update_postcodes_from_db() + + assert word_table.count() == 5 + assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'} + + + def test_update_postcodes_from_db_ambigious(self, table_factory, word_table): + table_factory('location_postcode', 'country_code TEXT, postcode TEXT', + content=(('in', '123456'), ('sg', '123456'))) + + self.analyzer.update_postcodes_from_db() + + assert word_table.count() == 3 + assert word_table.get_postcodes() == {'123456', '123456@123 456'} + + + def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table): + table_factory('location_postcode', 'country_code TEXT, postcode TEXT', + content=(('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45'))) + word_table.add_postcode(' 1234', '1234') + word_table.add_postcode(' 5678', '5678') + + self.analyzer.update_postcodes_from_db() + + assert word_table.count() == 5 + assert word_table.get_postcodes() == {'1234', 'BC 45@BC 45', 'XX 45@XX 45'} + + + def test_process_place_postcode_simple(self, word_table): + info = self.process_postcode('de', '12345') + + assert info['postcode'] == '12345' + + assert word_table.get_postcodes() == {'12345', } + + + def test_process_place_postcode_with_space(self, word_table): + info = self.process_postcode('in', '123 567') + + assert info['postcode'] == '123567' + + assert word_table.get_postcodes() == {'123567@123 567', } - assert word_table.count() == 3 - assert word_table.get_postcodes() == {'1234', '45BC', 'XX45'} def test_update_special_phrase_empty_table(analyzer, word_table): @@ -437,13 +482,6 @@ class TestPlaceAddress: assert word_table.get_postcodes() == {pcode, } - @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836']) - def test_process_place_bad_postcode(self, word_table, pcode): - self.process_address(postcode=pcode) - - assert not 
word_table.get_postcodes() - - @pytest.mark.parametrize('hnr', ['123a', '1', '101']) def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id): info = self.process_address(housenumber=hnr) diff --git a/test/python/tokenizer/token_analysis/test_analysis_postcodes.py b/test/python/tokenizer/token_analysis/test_analysis_postcodes.py new file mode 100644 index 00000000..623bed54 --- /dev/null +++ b/test/python/tokenizer/token_analysis/test_analysis_postcodes.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for special postcode analysis and variant generation. +""" +import pytest + +from icu import Transliterator + +import nominatim.tokenizer.token_analysis.postcodes as module +from nominatim.errors import UsageError + +DEFAULT_NORMALIZATION = """ :: NFD (); + '🜳' > ' '; + [[:Nonspacing Mark:] [:Cf:]] >; + :: lower (); + [[:Punctuation:][:Space:]]+ > ' '; + :: NFC (); + """ + +DEFAULT_TRANSLITERATION = """ :: Latin (); + '🜵' > ' '; + """ + +@pytest.fixture +def analyser(): + rules = { 'analyzer': 'postcodes'} + config = module.configure(rules, DEFAULT_NORMALIZATION) + + trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION) + norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) + + return module.create(norm, trans, config) + + +def get_normalized_variants(proc, name): + norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) + return proc.get_variants_ascii(norm.transliterate(name).strip()) + + +@pytest.mark.parametrize('name,norm', [('12', '12'), + ('A 34 ', 'A 34'), + ('34-av', '34-AV')]) +def test_normalize(analyser, name, norm): + assert analyser.normalize(name) == norm + + +@pytest.mark.parametrize('postcode,variants', [('12345', {'12345'}), + ('AB-998', {'ab 998', 'ab998'}), + ('23 FGH D3', {'23 
fgh d3', '23fgh d3', + '23 fghd3', '23fghd3'})]) +def test_get_variants_ascii(analyser, postcode, variants): + out = analyser.get_variants_ascii(postcode) + + assert len(out) == len(set(out)) + assert set(out) == variants diff --git a/test/python/tools/test_postcodes.py b/test/python/tools/test_postcodes.py index bdfe3094..0c4b93fc 100644 --- a/test/python/tools/test_postcodes.py +++ b/test/python/tools/test_postcodes.py @@ -11,7 +11,7 @@ import subprocess import pytest -from nominatim.tools import postcodes +from nominatim.tools import postcodes, country_info import dummy_tokenizer class MockPostcodeTable: @@ -64,11 +64,26 @@ class MockPostcodeTable: def tokenizer(): return dummy_tokenizer.DummyTokenizer(None, None) + @pytest.fixture -def postcode_table(temp_db_conn, placex_table): +def postcode_table(def_config, temp_db_conn, placex_table): + country_info.setup_country_config(def_config) return MockPostcodeTable(temp_db_conn) +@pytest.fixture +def insert_implicit_postcode(placex_table, place_row): + """ + Inserts data into the placex and place table + which can then be used to compute one postcode. + """ + def _insert_implicit_postcode(osm_id, country, geometry, address): + placex_table.add(osm_id=osm_id, country=country, geom=geometry) + place_row(osm_id=osm_id, geom='SRID=4326;'+geometry, address=address) + + return _insert_implicit_postcode + + def test_postcodes_empty(dsn, postcode_table, place_table, tmp_path, tokenizer): postcodes.update_postcodes(dsn, tmp_path, tokenizer) @@ -193,27 +208,30 @@ def test_can_compute(dsn, table_factory): table_factory('place') assert postcodes.can_compute(dsn) + def test_no_placex_entry(dsn, tmp_path, temp_db_cursor, place_row, postcode_table, tokenizer): #Rewrite the get_country_code function to verify its execution. 
temp_db_cursor.execute(""" CREATE OR REPLACE FUNCTION get_country_code(place geometry) RETURNS TEXT AS $$ BEGIN - RETURN 'fr'; + RETURN 'yy'; END; $$ LANGUAGE plpgsql; """) place_row(geom='SRID=4326;POINT(10 12)', address=dict(postcode='AB 4511')) postcodes.update_postcodes(dsn, tmp_path, tokenizer) - assert postcode_table.row_set == {('fr', 'AB 4511', 10, 12)} + assert postcode_table.row_set == {('yy', 'AB 4511', 10, 12)} -@pytest.fixture -def insert_implicit_postcode(placex_table, place_row): - """ - Inserts data into the placex and place table - which can then be used to compute one postcode. - """ - def _insert_implicit_postcode(osm_id, country, geometry, address): - placex_table.add(osm_id=osm_id, country=country, geom=geometry) - place_row(osm_id=osm_id, geom='SRID=4326;'+geometry, address=address) - return _insert_implicit_postcode +def test_discard_badly_formatted_postcodes(dsn, tmp_path, temp_db_cursor, place_row, postcode_table, tokenizer): + #Rewrite the get_country_code function to verify its execution. + temp_db_cursor.execute(""" + CREATE OR REPLACE FUNCTION get_country_code(place geometry) + RETURNS TEXT AS $$ BEGIN + RETURN 'fr'; + END; $$ LANGUAGE plpgsql; + """) + place_row(geom='SRID=4326;POINT(10 12)', address=dict(postcode='AB 4511')) + postcodes.update_postcodes(dsn, tmp_path, tokenizer) + + assert not postcode_table.row_set diff --git a/test/python/utils/test_centroid.py b/test/python/utils/test_centroid.py new file mode 100644 index 00000000..63d967e7 --- /dev/null +++ b/test/python/utils/test_centroid.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for centroid computation. 
+""" +import pytest + +from nominatim.utils.centroid import PointsCentroid + +def test_empty_set(): + c = PointsCentroid() + + with pytest.raises(ValueError, match='No points'): + c.centroid() + + +@pytest.mark.parametrize("centroid", [(0,0), (-1, 3), [0.0000032, 88.4938]]) +def test_one_point_centroid(centroid): + c = PointsCentroid() + + c += centroid + + assert len(c.centroid()) == 2 + assert c.centroid() == (pytest.approx(centroid[0]), pytest.approx(centroid[1])) + + +def test_multipoint_centroid(): + c = PointsCentroid() + + c += (20.0, -10.0) + assert c.centroid() == (pytest.approx(20.0), pytest.approx(-10.0)) + c += (20.2, -9.0) + assert c.centroid() == (pytest.approx(20.1), pytest.approx(-9.5)) + c += (20.2, -9.0) + assert c.centroid() == (pytest.approx(20.13333), pytest.approx(-9.333333)) + + +def test_manypoint_centroid(): + c = PointsCentroid() + + for _ in range(10000): + c += (4.564732, -0.000034) + + assert c.centroid() == (pytest.approx(4.564732), pytest.approx(-0.000034)) + + +@pytest.mark.parametrize("param", ["aa", None, 5, [1, 2, 3], (3, None), ("a", 3.9)]) +def test_add_non_tuple(param): + c = PointsCentroid() + + with pytest.raises(ValueError, match='2-element tuples'): + c += param