]> git.openstreetmap.org Git - nominatim.git/commitdiff
Merge pull request #2757 from lonvia/filter-postcodes
authorSarah Hoffmann <lonvia@denofr.de>
Fri, 24 Jun 2022 19:09:41 +0000 (21:09 +0200)
committerGitHub <noreply@github.com>
Fri, 24 Jun 2022 19:09:41 +0000 (21:09 +0200)
Add filtering, normalisation and variants for postcodes

35 files changed:
.pylintrc
docs/customize/Country-Settings.md [new file with mode: 0644]
docs/customize/Tokenizers.md
docs/develop/Tokenizers.md
docs/mkdocs.yml
lib-php/TokenPostcode.php
lib-php/tokenizer/icu_tokenizer.php
lib-sql/functions/address_lookup.sql
lib-sql/functions/interpolation.sql
lib-sql/functions/placex_triggers.sql
lib-sql/tokenizer/icu_tokenizer.sql
lib-sql/tokenizer/legacy_tokenizer.sql
nominatim/data/__init__.py [new file with mode: 0644]
nominatim/data/postcode_format.py [new file with mode: 0644]
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/legacy_tokenizer.py
nominatim/tokenizer/sanitizers/clean_postcodes.py [new file with mode: 0644]
nominatim/tokenizer/sanitizers/config.py
nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
nominatim/tokenizer/token_analysis/postcodes.py [new file with mode: 0644]
nominatim/tools/country_info.py
nominatim/tools/postcodes.py
nominatim/utils/__init__.py [new file with mode: 0644]
nominatim/utils/centroid.py [new file with mode: 0644]
settings/country_settings.yaml
settings/icu_tokenizer.yaml
test/bdd/db/import/postcodes.feature
test/bdd/db/query/normalization.feature
test/bdd/db/query/postcodes.feature [new file with mode: 0644]
test/bdd/steps/steps_db_ops.py
test/python/tokenizer/sanitizers/test_clean_postcodes.py [new file with mode: 0644]
test/python/tokenizer/test_icu.py
test/python/tokenizer/token_analysis/test_analysis_postcodes.py [new file with mode: 0644]
test/python/tools/test_postcodes.py
test/python/utils/test_centroid.py [new file with mode: 0644]

index fef53872118c6a034286b6490afcd939f889ef11..52d9fcf9e623b2b709841efe35d1a9995cb5a9fe 100644 (file)
--- a/.pylintrc
+++ b/.pylintrc
@@ -13,4 +13,4 @@ ignored-classes=NominatimArgs,closing
 # 'too-many-ancestors' is triggered already by deriving from UserDict
 disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use
 
-good-names=i,x,y,fd,db
+good-names=i,x,y,fd,db,cc
diff --git a/docs/customize/Country-Settings.md b/docs/customize/Country-Settings.md
new file mode 100644 (file)
index 0000000..6f8f2a9
--- /dev/null
@@ -0,0 +1,149 @@
+# Customizing Per-Country Data
+
+Whenever an OSM object is imported into Nominatim, the object is first assigned
+a country. Nominatim can use this information to adapt various aspects of
+the address computation to the local customs of the country. This section
+explains how country assignment works and the principal per-country
+localizations.
+
+## Country assignment
+
+Countries are assigned on the basis of country data from the OpenStreetMap
+input data itself. Countries are expected to be tagged according to the
+[administrative boundary schema](https://wiki.openstreetmap.org/wiki/Tag:boundary%3Dadministrative):
+an OSM relation with `boundary=administrative` and `admin_level=2`. Nominatim
+uses the country code to distinguish the countries.
+
+If there is no country data available for a point, then Nominatim uses the
+fallback data imported from `data/country_osm_grid.sql.gz`. This was computed
+from OSM data as well but is guaranteed to cover all countries.
+
+Some OSM objects may also be located outside any country, for example a buoy
+in the middle of the ocean. These objects do not get any country assigned and
+get a default treatment when it comes to localized handling of data.
+
+## Per-country settings
+
+### Global country settings
+
+The main place to configure settings per country is the file
+`settings/country_settings.yaml`. This file has one section per country that
+is recognised by Nominatim. Each section is tagged with the country code
+(in lower case) and contains the different localization information. Only
+countries which are listed in this file are taken into account for computations.
+
+For example, the section for Andorra looks like this:
+
+```
+    partition: 35
+    languages: ca
+    names: !include country-names/ad.yaml
+    postcode:
+      pattern: "(ddd)"
+      output: AD\1
+```
+
+The individual settings are described below.
+
+#### `partition`
+
+Nominatim internally splits the data into multiple tables to improve
+performance. The partition number tells Nominatim into which table to put
+the country. This is purely internal management and has no effect on the
+output data.
+
+The default is to have one partition per country.
+
+#### `languages`
+
+A comma-separated list of ISO-639 language codes of default languages in the
+country. These are the languages used in name tags without a language suffix.
+Note that this is not necessarily the same as the list of official languages
+in the country. There may be officially recognised languages in a country
+which are only ever used in name tags with the appropriate language suffixes.
+Conversely, a non-official language may appear a lot in the name tags, for
+example when used as an unofficial Lingua Franca.
+
+List the languages in order of frequency of appearance with the most frequently
+used language first. It is not recommended to add languages when there are only
+very few occurrences.
+
+If only one language is listed, then Nominatim will 'auto-complete' the
+language of names without an explicit language-suffix.
+
+#### `names`
+
+List of names of the country and its translations. These names are used as
+a baseline. It is always possible to search countries by the given names, no
+matter what other names are in the OSM data. They are also used as a fallback
+when a needed translation is not available.
+
+!!! Note
+    The list of names per country is currently fairly large because Nominatim
+    supports translations in many languages per default. That is why the
+    name lists have been separated out into extra files. You can find the
+    name lists in the file `settings/country-names/<country code>.yaml`.
+    The names section in the main country settings file only refers to these
+    files via the special `!include` directive.
+
+#### `postcode`
+
+Describes the format of the postcode that is in use in the country.
+
+When a country has no official postcodes, set this to no. Example:
+
+```
+ae:
+    postcode: no
+```
+
+When a country has a postcode, you need to state the postcode pattern and
+the default output format. Example:
+
+```
+bm:
+    postcode:
+      pattern: "(ll)[ -]?(dd)"
+      output: \1 \2
+```
+
+The **pattern** is a regular expression that describes the possible formats
+accepted as a postcode. The pattern follows the standard syntax for
+[regular expressions in Python](https://docs.python.org/3/library/re.html#regular-expression-syntax)
+with two extra shortcuts: `d` is a shortcut for a single digit ([0-9])
+and `l` for a single ASCII letter ([A-Z]).
+
+Use match groups to indicate groups in the postcode that may optionally be
+separated with a space or a hyphen.
+
+For example, the postcode for Bermuda above always consists of two letters
+and two digits. They may optionally be separated by a space or hyphen. That
+means that Nominatim will consider `AB56`, `AB 56` and `AB-56` spelling variants
+for one and the same postcode.
+
+Never add the country code in front of the postcode pattern. Nominatim will
+automatically accept variants with a country code prefix for all postcodes.
+
+The **output** field is an optional field that describes what the canonical
+spelling of the postcode should be. The format is the
+[regular expression expand syntax](https://docs.python.org/3/library/re.html#re.Match.expand) referring back to the bracket groups in the pattern.
+
+Most simple postcodes only have one spelling variant. In that case, the
+**output** can be omitted. The postcode will simply be used as is.
+
+In the Bermuda example above, the canonical spelling would be to have a space
+between letters and digits.
+
+!!! Warning
+    When your postcode pattern covers multiple variants of the postcode, then
+    you must explicitly state the canonical output or Nominatim will not
+    handle the variations correctly.
+
+### Other country-specific configuration
+
+There are some other configuration files where you can set localized settings
+according to the assigned country. These are:
+
+ * [Place ranking configuration](Ranking.md)
+
+Please see the linked documentation sections for more information.
index 19d867ddd800063494d72ad6ac078025d7ce2347..c563b20105160e27cb7176ff8b597f0c063c7fab 100644 (file)
@@ -205,6 +205,14 @@ The following is a list of sanitizers that are shipped with Nominatim.
     rendering:
         heading_level: 6
 
+##### clean-postcodes
+
+::: nominatim.tokenizer.sanitizers.clean_postcodes
+    selection:
+        members: False
+    rendering:
+        heading_level: 6
+
 
 #### Token Analysis
 
@@ -222,8 +230,12 @@ by a sanitizer (see for example the
 The token-analysis section contains the list of configured analyzers. Each
 analyzer must have an `id` parameter that uniquely identifies the analyzer.
 The only exception is the default analyzer that is used when no special
-analyzer was selected. There is one special id '@housenumber'. If an analyzer
-with that name is present, it is used for normalization of house numbers.
+analyzer was selected. There are analyzers with special ids:
+
+ * '@housenumber'. If an analyzer with that name is present, it is used
+   for normalization of house numbers.
+ * '@postcode'. If an analyzer with that name is present, it is used
+   for normalization of postcodes.
 
 Different analyzer implementations may exist. To select the implementation,
 the `analyzer` parameter must be set. The different implementations are
@@ -356,6 +368,14 @@ house numbers of the form '3 a', '3A', '3-A' etc. are all considered equivalent.
 
 The analyzer cannot be customized.
 
+##### Postcode token analyzer
+
+The analyzer `postcodes` is purpose-made to analyze postcodes. It supports
+a 'lookup' variant of the token, which produces variants with optional
+spaces. Use together with the clean-postcodes sanitizer.
+
+The analyzer cannot be customized.
+
 ### Reconfiguration
 
 Changing the configuration after the import is currently not possible, although
index 2b4da005090ab28ec4f6c41215d32f3dc5c87ace..5fe4e38d436b2978ce334a258fdaa80c3a9a9e58 100644 (file)
@@ -245,11 +245,11 @@ Currently, tokenizers are encouraged to make sure that matching works against
 both the search token list and the match token list.
 
 ```sql
-FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
+FUNCTION token_get_postcode(info JSONB) RETURNS TEXT
 ```
 
-Return the normalized version of the given postcode. This function must return
-the same value as the Python function `AbstractAnalyzer->normalize_postcode()`.
+Return the postcode for the object, if any exists. The postcode must be in
+the form that should also be presented to the end-user.
 
 ```sql
 FUNCTION token_strip_info(info JSONB) RETURNS JSONB
index c25ae0ad321711d4ff48044ab73323d9ab845aa5..a3860cbaa2ab737ffc2a8d0d5f92476d8dea6cd9 100644 (file)
@@ -28,6 +28,7 @@ pages:
         - 'Overview': 'customize/Overview.md'
         - 'Import Styles': 'customize/Import-Styles.md'
         - 'Configuration Settings': 'customize/Settings.md'
+        - 'Per-Country Data': 'customize/Country-Settings.md'
         - 'Place Ranking' : 'customize/Ranking.md'
         - 'Tokenizers' : 'customize/Tokenizers.md'
         - 'Special Phrases': 'customize/Special-Phrases.md'
index f0dbd4571676ac59b0030b91a5bd9700ad9b3860..0ff92929cb58f2b496275fe23d016ac0d4dfdef2 100644 (file)
@@ -25,7 +25,12 @@ class Postcode
     public function __construct($iId, $sPostcode, $sCountryCode = '')
     {
         $this->iId = $iId;
-        $this->sPostcode = $sPostcode;
+        $iSplitPos = strpos($sPostcode, '@');
+        if ($iSplitPos === false) {
+            $this->sPostcode = $sPostcode;
+        } else {
+            $this->sPostcode = substr($sPostcode, 0, $iSplitPos);
+        }
         $this->sCountryCode = empty($sCountryCode) ? '' : $sCountryCode;
     }
 
index ccce99ca1330d7a42a6976d7fb7c9eaf3d8a84d7..e45d07654843b9c1c6127f40f3d064441ea4bc79 100644 (file)
@@ -190,13 +190,17 @@ class Tokenizer
                     if ($aWord['word'] !== null
                         && pg_escape_string($aWord['word']) == $aWord['word']
                     ) {
-                        $sNormPostcode = $this->normalizeString($aWord['word']);
-                        if (strpos($sNormQuery, $sNormPostcode) !== false) {
-                            $oValidTokens->addToken(
-                                $sTok,
-                                new Token\Postcode($iId, $aWord['word'], null)
-                            );
+                        $iSplitPos = strpos($aWord['word'], '@');
+                        if ($iSplitPos === false) {
+                            $sPostcode = $aWord['word'];
+                        } else {
+                            $sPostcode = substr($aWord['word'], 0, $iSplitPos);
                         }
+
+                        $oValidTokens->addToken(
+                            $sTok,
+                            new Token\Postcode($iId, $sPostcode, null)
+                        );
                     }
                     break;
                 case 'S':  // tokens for classification terms (special phrases)
index 0eada6987e7db285f9418642ccc5280bd649811a..2bbfcd5c03c6296ff06191a00571c7b11f5da25a 100644 (file)
@@ -320,6 +320,11 @@ BEGIN
     location := ROW(null, null, null, hstore('ref', place.postcode), 'place',
                     'postcode', null, null, false, true, 5, 0)::addressline;
     RETURN NEXT location;
+  ELSEIF place.address is not null and place.address ? 'postcode'
+         and not place.address->'postcode' SIMILAR TO '%(,|;)%' THEN
+    location := ROW(null, null, null, hstore('ref', place.address->'postcode'), 'place',
+                    'postcode', null, null, false, true, 5, 0)::addressline;
+    RETURN NEXT location;
   END IF;
 
   RETURN;
index c8cfbcc68c53dece2e3f84ea40c12f99dc77da86..3a99471101d0c9140967934812494bbed48135ed 100644 (file)
@@ -156,7 +156,6 @@ DECLARE
   linegeo GEOMETRY;
   splitline GEOMETRY;
   sectiongeo GEOMETRY;
-  interpol_postcode TEXT;
   postcode TEXT;
   stepmod SMALLINT;
 BEGIN
@@ -174,8 +173,6 @@ BEGIN
                                                  ST_PointOnSurface(NEW.linegeo),
                                                  NEW.linegeo);
 
-  interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
-
   NEW.token_info := token_strip_info(NEW.token_info);
   IF NEW.address ? '_inherited' THEN
     NEW.address := hstore('interpolation', NEW.address->'interpolation');
@@ -207,6 +204,11 @@ BEGIN
     FOR nextnode IN
       SELECT DISTINCT ON (nodeidpos)
           osm_id, address, geometry,
+          -- Take the postcode from the node only if it has a housenumber itself.
+          -- Note that there is a corner-case where the node has a wrongly
+          -- formatted postcode and therefore 'postcode' contains a derived
+          -- variant.
+          CASE WHEN address ? 'postcode' THEN placex.postcode ELSE NULL::text END as postcode,
           substring(address->'housenumber','[0-9]+')::integer as hnr
         FROM placex, generate_series(1, array_upper(waynodes, 1)) nodeidpos
         WHERE osm_type = 'N' and osm_id = waynodes[nodeidpos]::BIGINT
@@ -260,13 +262,10 @@ BEGIN
         endnumber := newend;
 
         -- determine postcode
-        postcode := coalesce(interpol_postcode,
-                             token_normalized_postcode(prevnode.address->'postcode'),
-                             token_normalized_postcode(nextnode.address->'postcode'),
-                             postcode);
-        IF postcode is NULL THEN
-            SELECT token_normalized_postcode(placex.postcode)
-              FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
+        postcode := coalesce(prevnode.postcode, nextnode.postcode, postcode);
+        IF postcode is NULL and NEW.parent_place_id > 0 THEN
+            SELECT placex.postcode FROM placex
+              WHERE place_id = NEW.parent_place_id INTO postcode;
         END IF;
         IF postcode is NULL THEN
             postcode := get_nearest_postcode(NEW.country_code, nextnode.geometry);
index 6143a1edae6b78c2052cd23447880115fe97ad79..1f7e6dc61a0e99fce95aa31c7aad24707df409fe 100644 (file)
@@ -992,7 +992,7 @@ BEGIN
       {% if debug %}RAISE WARNING 'Got parent details from search name';{% endif %}
 
       -- determine postcode
-      NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
+      NEW.postcode := coalesce(token_get_postcode(NEW.token_info),
                                location.postcode,
                                get_nearest_postcode(NEW.country_code, NEW.centroid));
 
@@ -1150,8 +1150,7 @@ BEGIN
 
   {% if debug %}RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector;{% endif %}
 
-  NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
-                           NEW.postcode);
+  NEW.postcode := coalesce(token_get_postcode(NEW.token_info), NEW.postcode);
 
   -- if we have a name add this to the name search table
   IF NEW.name IS NOT NULL THEN
index a3dac8ddcbe82eb5fd6057bd81bb9b823befa159..599d0eb089eaeff6b5be795734e3a9b139b97117 100644 (file)
@@ -97,10 +97,10 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB)
   RETURNS TEXT
 AS $$
-  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END;
+  SELECT info->>'postcode';
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
@@ -223,3 +223,26 @@ BEGIN
 END;
 $$
 LANGUAGE plpgsql;
+
+-- Make sure that the word table contains postcode entries (type 'P') for
+-- the given postcode. When there is no entry for the postcode yet, one row
+-- is inserted for each of the given lookup terms.
+--
+-- Returns TRUE when the postcode was already present and nothing was
+-- inserted and FALSE when new rows have been created.
+CREATE OR REPLACE FUNCTION create_postcode_word(postcode TEXT, lookup_terms TEXT[])
+  RETURNS BOOLEAN
+  AS $$
+DECLARE
+  existing INTEGER;
+BEGIN
+  SELECT count(*) INTO existing
+    FROM word WHERE word = postcode and type = 'P';
+
+  IF existing > 0 THEN
+    RETURN TRUE;
+  END IF;
+
+  -- postcodes don't need word ids
+  INSERT INTO word (word_token, type, word)
+    SELECT lookup_term, 'P', postcode FROM unnest(lookup_terms) as lookup_term;
+
+  RETURN FALSE;
+END;
+$$
+LANGUAGE plpgsql;
+
index 64453d4e5909888d7f6acd2ee130aa1adcfb8c00..5826f74ac25392b3bff857c1c71f054028839ba2 100644 (file)
@@ -97,10 +97,10 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB)
   RETURNS TEXT
 AS $$
-  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END;
+  SELECT info->>'postcode';
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
diff --git a/nominatim/data/__init__.py b/nominatim/data/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/nominatim/data/postcode_format.py b/nominatim/data/postcode_format.py
new file mode 100644 (file)
index 0000000..6ae43b7
--- /dev/null
@@ -0,0 +1,109 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Functions for formatting postcodes according to their country-specific
+format.
+"""
+import re
+
+from nominatim.errors import UsageError
+from nominatim.tools import country_info
+
+class CountryPostcodeMatcher:
+    """ Matches and formats a postcode according to a format definition
+        of the given country.
+
+        `config` is the 'postcode' configuration of the country. It must
+        contain the field 'pattern' and may contain the field 'output' with
+        the format (in `re.Match.expand()` syntax) used for normalization.
+        Raises a UsageError when the pattern is missing.
+    """
+    def __init__(self, country_code, config):
+        if 'pattern' not in config:
+            raise UsageError("Field 'pattern' required for 'postcode' "
+                             f"for country '{country_code}'")
+
+        # Expand the shortcuts 'd' (single digit) and 'l' (single letter).
+        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
+
+        # The postcode group needs to be non-greedy. Otherwise it swallows
+        # trailing whitespace and the postcode is later rejected by the
+        # country pattern.
+        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*?)\\s*')
+        self.pattern = re.compile(pc_pattern)
+
+        # Without an explicit output format the matched postcode is used as is.
+        self.output = config.get('output', r'\g<0>')
+
+
+    def match(self, postcode):
+        """ Match the given postcode against the postcode pattern for this
+            matcher. Returns a `re.Match` object if the match was successful
+            and None otherwise.
+        """
+        # Upper-case, strip surrounding spaces and leading country code.
+        normalized = self.norm_pattern.fullmatch(postcode.upper())
+
+        if normalized:
+            return self.pattern.fullmatch(normalized.group(1))
+
+        return None
+
+
+    def normalize(self, match):
+        """ Return the default format of the postcode for the given match.
+            `match` must be a `re.Match` object previously returned by
+            `match()`
+        """
+        return match.expand(self.output)
+
+
+class PostcodeFormatter:
+    """ Collection of the postcode formats of the countries of the world
+        together with functions to match and normalize postcodes.
+    """
+    def __init__(self):
+        self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})
+        self.country_matcher = {}
+        # None stands for objects without a country code. They can't have
+        # a postcode per definition.
+        self.country_without_postcode = {None}
+
+        for ccode, prop in country_info.iterate('postcode'):
+            if prop is False:
+                self.country_without_postcode.add(ccode)
+                continue
+
+            if not isinstance(prop, dict):
+                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+
+            self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)
+
+
+    def set_default_pattern(self, pattern):
+        """ Replace the match pattern of the default matcher, i.e. the one
+            used for countries that have no specific matcher configured.
+        """
+        self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
+
+
+    def get_matcher(self, country_code):
+        """ Return the CountryPostcodeMatcher responsible for the given
+            country: None for countries without postcodes, the country's
+            own matcher if one is configured and the default matcher
+            in all other cases.
+        """
+        if country_code in self.country_without_postcode:
+            return None
+
+        if country_code in self.country_matcher:
+            return self.country_matcher[country_code]
+
+        return self.default_matcher
+
+
+    def match(self, country_code, postcode):
+        """ Match the postcode against the pattern for the given country.
+            Returns a `re.Match` object on success. Returns None when the
+            match fails or the country does not have postcodes.
+        """
+        matcher = self.get_matcher(country_code)
+
+        return None if matcher is None else matcher.match(postcode)
+
+
+    def normalize(self, country_code, match):
+        """ Return the postcode in the default format of the country.
+            `match` must be a `re.Match` object previously returned by
+            `match()`
+        """
+        matcher = self.country_matcher.get(country_code, self.default_matcher)
+
+        return matcher.normalize(match)
index 4678af66eb08d019b30e38bb8280da108083bd13..a6ff08a40774736c363965bada9a23101f2ccd86 100644 (file)
@@ -11,7 +11,6 @@ libICU instead of the PostgreSQL module.
 import itertools
 import json
 import logging
-import re
 from textwrap import dedent
 
 from nominatim.db.connection import connect
@@ -291,33 +290,72 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         """ Update postcode tokens in the word table from the location_postcode
             table.
         """
-        to_delete = []
+        analyzer = self.token_analysis.analysis.get('@postcode')
+
         with self.conn.cursor() as cur:
-            # This finds us the rows in location_postcode and word that are
-            # missing in the other table.
-            cur.execute("""SELECT * FROM
-                            (SELECT pc, word FROM
-                              (SELECT distinct(postcode) as pc FROM location_postcode) p
-                              FULL JOIN
-                              (SELECT word FROM word WHERE type = 'P') w
-                              ON pc = word) x
-                           WHERE pc is null or word is null""")
-
-            with CopyBuffer() as copystr:
-                for postcode, word in cur:
-                    if postcode is None:
-                        to_delete.append(word)
-                    else:
-                        copystr.add(self._search_normalized(postcode),
-                                    'P', postcode)
-
-                if to_delete:
-                    cur.execute("""DELETE FROM WORD
-                                   WHERE type ='P' and word = any(%s)
-                                """, (to_delete, ))
-
-                copystr.copy_out(cur, 'word',
-                                 columns=['word_token', 'type', 'word'])
+            # First get all postcode names currently in the word table.
+            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
+            word_entries = set((entry[0] for entry in cur))
+
+            # Then compute the required postcode names from the postcode table.
+            needed_entries = set()
+            cur.execute("SELECT country_code, postcode FROM location_postcode")
+            for cc, postcode in cur:
+                info = PlaceInfo({'country_code': cc,
+                                  'class': 'place', 'type': 'postcode',
+                                  'address': {'postcode': postcode}})
+                address = self.sanitizer.process_names(info)[1]
+                for place in address:
+                    if place.kind == 'postcode':
+                        if analyzer is None:
+                            postcode_name = place.name.strip().upper()
+                            variant_base = None
+                        else:
+                            postcode_name = analyzer.normalize(place.name)
+                            variant_base = place.get_attr("variant")
+
+                        if variant_base:
+                            needed_entries.add(f'{postcode_name}@{variant_base}')
+                        else:
+                            needed_entries.add(postcode_name)
+                        break
+
+        # Now update the word table.
+        self._delete_unused_postcode_words(word_entries - needed_entries)
+        self._add_missing_postcode_words(needed_entries - word_entries)
+
+    def _delete_unused_postcode_words(self, tokens):
+        """ Remove the postcode entries with the given names from the
+            word table. Nothing is done when `tokens` is empty.
+        """
+        if not tokens:
+            return
+
+        obsolete = list(tokens)
+        with self.conn.cursor() as cur:
+            cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
+                        (obsolete, ))
+
+    def _add_missing_postcode_words(self, tokens):
+        """ Create entries in the word table for the given postcode names.
+
+            A name is either a plain postcode or has the form
+            '<postcode>@<variant base>'. In the latter case, the lookup
+            terms are extended with the variants that the '@postcode'
+            analyzer (if configured) computes from the variant base.
+        """
+        if not tokens:
+            return
+
+        analyzer = self.token_analysis.analysis.get('@postcode')
+        terms = []
+
+        for postcode_name in tokens:
+            if '@' in postcode_name:
+                # Split at the first '@' only, so that the unpacking cannot
+                # fail when the variant part contains another '@'.
+                term, variant = postcode_name.split('@', 1)
+                variants = {self._search_normalized(term)}
+                if analyzer is not None:
+                    variants.update(analyzer.get_variants_ascii(variant))
+                # Always hand in a list, also when no analyzer is configured:
+                # create_postcode_word() expects an array of lookup terms.
+                variants = list(variants)
+            else:
+                variants = [self._search_normalized(postcode_name)]
+            terms.append((postcode_name, variants))
+
+        if terms:
+            with self.conn.cursor() as cur:
+                cur.execute_values("""SELECT create_postcode_word(pc, var)
+                                      FROM (VALUES %s) AS v(pc, var)""",
+                                   terms)
+
+
 
 
     def update_special_phrases(self, phrases, should_replace):
@@ -473,7 +511,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _process_place_address(self, token_info, address):
         for item in address:
             if item.kind == 'postcode':
-                self._add_postcode(item.name)
+                token_info.set_postcode(self._add_postcode(item))
             elif item.kind == 'housenumber':
                 token_info.add_housenumber(*self._compute_housenumber_token(item))
             elif item.kind == 'street':
@@ -605,26 +643,38 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return full_tokens, partial_tokens
 
 
-    def _add_postcode(self, postcode):
+    def _add_postcode(self, item):
         """ Make sure the normalized postcode is present in the word table.
         """
-        if re.search(r'[:,;]', postcode) is None:
-            postcode = self.normalize_postcode(postcode)
+        analyzer = self.token_analysis.analysis.get('@postcode')
 
-            if postcode not in self._cache.postcodes:
-                term = self._search_normalized(postcode)
-                if not term:
-                    return
+        if analyzer is None:
+            postcode_name = item.name.strip().upper()
+            variant_base = None
+        else:
+            postcode_name = analyzer.normalize(item.name)
+            variant_base = item.get_attr("variant")
 
-                with self.conn.cursor() as cur:
-                    # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word_token, type, word)
-                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
-                                    WHERE NOT EXISTS
-                                     (SELECT * FROM word
-                                      WHERE type = 'P' and word = pc))
-                                """, (term, postcode))
-                self._cache.postcodes.add(postcode)
+        if variant_base:
+            postcode = f'{postcode_name}@{variant_base}'
+        else:
+            postcode = postcode_name
+
+        if postcode not in self._cache.postcodes:
+            term = self._search_normalized(postcode_name)
+            if not term:
+                return None
+
+            variants = {term}
+            if analyzer is not None and variant_base:
+                variants.update(analyzer.get_variants_ascii(variant_base))
+
+            with self.conn.cursor() as cur:
+                cur.execute("SELECT create_postcode_word(%s, %s)",
+                            (postcode, list(variants)))
+            self._cache.postcodes.add(postcode)
+
+        return postcode_name
 
 
 class _TokenInfo:
@@ -637,6 +687,7 @@ class _TokenInfo:
         self.street_tokens = set()
         self.place_tokens = set()
         self.address_tokens = {}
+        self.postcode = None
 
 
     @staticmethod
@@ -665,6 +716,9 @@ class _TokenInfo:
         if self.address_tokens:
             out['addr'] = self.address_tokens
 
+        if self.postcode:
+            out['postcode'] = self.postcode
+
         return out
 
 
@@ -701,6 +755,11 @@ class _TokenInfo:
         if partials:
             self.address_tokens[key] = self._mk_array(partials)
 
+    def set_postcode(self, postcode):
+        """ Set the postcode to the given one. A previously set postcode
+            is overwritten.
+        """
+        self.postcode = postcode
+
 
 class _TokenCache:
     """ Cache for token information to avoid repeated database queries.
index a292b180b8d5b153496c4641fdd5fc2de139f899..36fd5722441a12e92d24434d5dc1317497dd27bc 100644 (file)
@@ -467,8 +467,9 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
             if key == 'postcode':
                 # Make sure the normalized postcode is present in the word table.
                 if re.search(r'[:,;]', value) is None:
-                    self._cache.add_postcode(self.conn,
-                                             self.normalize_postcode(value))
+                    norm_pc = self.normalize_postcode(value)
+                    token_info.set_postcode(norm_pc)
+                    self._cache.add_postcode(self.conn, norm_pc)
             elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                 hnrs.append(value)
             elif key == 'street':
@@ -527,6 +528,11 @@ class _TokenInfo:
             self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
 
 
+    def set_postcode(self, postcode):
+        """ Set or replace the postcode token with the given value.
+        """
+        self.data['postcode'] = postcode
+
     def add_street(self, conn, street):
         """ Add addr:street match terms.
         """
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
new file mode 100644 (file)
index 0000000..05e90ca
--- /dev/null
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Sanitizer that filters postcodes by their officially allowed pattern.
+
+Arguments:
+    convert-to-address: If set to 'yes' (the default), then postcodes that do
+                        not conform with their country-specific pattern are
+                        converted to an address component. That means that
+                        the postcode does not take part when computing the
+                        postcode centroids of a country but is still searchable.
+                        When set to 'no', non-conforming postcodes are not
+                        searchable either.
+    default-pattern:    Pattern to use, when there is none available for the
+                        country in question. Warning: will not be used for
+                        objects that have no country assigned. These are always
+                        assumed to have no postcode.
+"""
+from nominatim.data.postcode_format import PostcodeFormatter
+
+class _PostcodeSanitizer:
+
+    def __init__(self, config):
+        self.convert_to_address = config.get_bool('convert-to-address', True)
+        self.matcher = PostcodeFormatter()
+
+        default_pattern = config.get('default-pattern')
+        if default_pattern is not None and isinstance(default_pattern, str):
+            self.matcher.set_default_pattern(default_pattern)
+
+
+    def __call__(self, obj):
+        if not obj.address:
+            return
+
+        postcodes = ((i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode')
+
+        for pos, postcode in postcodes:
+            formatted = self.scan(postcode.name, obj.place.country_code)
+
+            if formatted is None:
+                if self.convert_to_address:
+                    postcode.kind = 'unofficial_postcode'
+                else:
+                    obj.address.pop(pos)
+            else:
+                postcode.name = formatted[0]
+                postcode.set_attr('variant', formatted[1])
+
+
+    def scan(self, postcode, country):
+        """ Check the postcode for correct formatting and return the
+            normalized version. Returns None if the postcode does not
+            correspond to the official format of the given country.
+        """
+        match = self.matcher.match(country, postcode)
+        if match is None:
+            return None
+
+        return self.matcher.normalize(country, match),\
+               ' '.join(filter(lambda p: p is not None, match.groups()))
+
+
+
+
+def create(config):
+    """ Create a postcode sanitizing function.
+    """
+
+    return _PostcodeSanitizer(config)
index ecfcacbe551e7c0747e20b1e14e30458c3b858bc..ce5ce1eb8b5606dd702efb2b582facf1a48a0626 100644 (file)
@@ -44,6 +44,20 @@ class SanitizerConfig(UserDict):
         return values
 
 
+    def get_bool(self, param, default=None):
+        """ Extract a configuration parameter as a boolean.
+            The parameter must be one of the yaml boolean values or a
+            user error will be raised. If `default` is given, then the parameter
+            may also be missing or empty.
+        """
+        value = self.data.get(param, default)
+
+        if not isinstance(value, bool):
+            raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no'.")
+
+        return value
+
+
     def get_delimiter(self, default=',;'):
         """ Return the 'delimiter' parameter in the configuration as a
             compiled regular expression that can be used to split the names on the
index 7898b1c68525dd59d8362a83e258db8ced173a59..9a99d127728290264c7762f7c76fefb7177f3267 100644 (file)
@@ -48,8 +48,7 @@ class _AnalyzerByLanguage:
         self.deflangs = {}
 
         if use_defaults in ('mono', 'all'):
-            for ccode, prop in country_info.iterate():
-                clangs = prop['languages']
+            for ccode, clangs in country_info.iterate('languages'):
                 if len(clangs) == 1 or use_defaults == 'all':
                     if self.whitelist:
                         self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
diff --git a/nominatim/tokenizer/token_analysis/postcodes.py b/nominatim/tokenizer/token_analysis/postcodes.py
new file mode 100644 (file)
index 0000000..18fc2a8
--- /dev/null
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Specialized processor for postcodes. Supports a 'lookup' variant of the
+token, which produces variants with optional spaces.
+"""
+
+from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
+
+### Configuration section
+
+def configure(rules, normalization_rules): # pylint: disable=W0613
+    """ All behaviour is currently hard-coded.
+    """
+    return None
+
+### Analysis section
+
+def create(normalizer, transliterator, config): # pylint: disable=W0613
+    """ Create a new token analysis instance for this module.
+    """
+    return PostcodeTokenAnalysis(normalizer, transliterator)
+
+
+class PostcodeTokenAnalysis:
+    """ Special normalization and variant generation for postcodes.
+
+        This analyser must not be used with anything but postcodes as
+        it follows some special rules: `normalize` doesn't necessarily
+        need to return a standard form as per normalization rules. It
+        needs to return the canonical form of the postcode that is also
+        used for output. `get_variants_ascii` then needs to ensure that
+        the generated variants once more follow the standard normalization
+        and transliteration, so that postcodes are correctly recognised by
+        the search algorithm.
+    """
+    def __init__(self, norm, trans):
+        self.norm = norm
+        self.trans = trans
+
+        self.mutator = MutationVariantGenerator(' ', (' ', ''))
+
+
+    def normalize(self, name):
+        """ Return the standard form of the postcode.
+        """
+        return name.strip().upper()
+
+
+    def get_variants_ascii(self, norm_name):
+        """ Compute the spelling variants for the given normalized postcode.
+
+            Takes the canonical form of the postcode, normalizes it using the
+            standard rules and then creates variants of the result where
+            all spaces are optional.
+        """
+        # Postcodes follow their own transliteration rules.
+        # Make sure at this point, that the terms are normalized in a way
+        # that they are searchable with the standard transliteration rules.
+        return [self.trans.transliterate(term) for term in
+                self.mutator.generate([self.norm.transliterate(norm_name)]) if term]
index 0ad001719e164f110afbf063f69f57711a78b42c..d754b4ddb029365b22d2cc7a77ccaeefc49a2719 100644 (file)
@@ -84,10 +84,20 @@ def setup_country_config(config):
     _COUNTRY_INFO.load(config)
 
 
-def iterate():
+def iterate(prop=None):
     """ Iterate over country code and properties.
+
+        When `prop` is None, all countries are returned with their complete
+        set of properties.
+
+        If `prop` is given, then only countries are returned where the
+        given property is set. The second item of the tuple contains only
+        the content of the given property.
     """
-    return _COUNTRY_INFO.items()
+    if prop is None:
+        return _COUNTRY_INFO.items()
+
+    return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p)
 
 
 def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
index 2b7027e721b04cd3775e5495459920fecebbc5c6..9c66719b5fe1ce55573985f8a653876c093102c6 100644 (file)
@@ -8,6 +8,7 @@
 Functions for importing, updating and otherwise maintaining the table
 of artificial postcode centroids.
 """
+from collections import defaultdict
 import csv
 import gzip
 import logging
@@ -16,6 +17,8 @@ from math import isfinite
 from psycopg2 import sql as pysql
 
 from nominatim.db.connection import connect
+from nominatim.utils.centroid import PointsCentroid
+from nominatim.data.postcode_format import PostcodeFormatter
 
 LOG = logging.getLogger()
 
@@ -30,20 +33,31 @@ def _to_float(num, max_value):
 
     return num
 
-class _CountryPostcodesCollector:
+class _PostcodeCollector:
     """ Collector for postcodes of a single country.
     """
 
-    def __init__(self, country):
+    def __init__(self, country, matcher):
         self.country = country
-        self.collected = {}
+        self.matcher = matcher
+        self.collected = defaultdict(PointsCentroid)
+        self.normalization_cache = None
 
 
     def add(self, postcode, x, y):
         """ Add the given postcode to the collection cache. If the postcode
             already existed, it is overwritten with the new centroid.
         """
-        self.collected[postcode] = (x, y)
+        if self.matcher is not None:
+            if self.normalization_cache and self.normalization_cache[0] == postcode:
+                normalized = self.normalization_cache[1]
+            else:
+                match = self.matcher.match(postcode)
+                normalized = self.matcher.normalize(match) if match else None
+                self.normalization_cache = (postcode, normalized)
+
+            if normalized:
+                self.collected[normalized] += (x, y)
 
 
     def commit(self, conn, analyzer, project_dir):
@@ -93,16 +107,16 @@ class _CountryPostcodesCollector:
                            WHERE country_code = %s""",
                         (self.country, ))
             for postcode, x, y in cur:
-                newx, newy = self.collected.pop(postcode, (None, None))
-                if newx is not None:
-                    dist = (x - newx)**2 + (y - newy)**2
-                    if dist > 0.0000001:
+                pcobj = self.collected.pop(postcode, None)
+                if pcobj:
+                    newx, newy = pcobj.centroid()
+                    if (x - newx) > 0.0000001 or (y - newy) > 0.0000001:
                         to_update.append((postcode, newx, newy))
                 else:
                     to_delete.append(postcode)
 
-        to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
-        self.collected = []
+        to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
+        self.collected = None
 
         return to_add, to_delete, to_update
 
@@ -125,8 +139,10 @@ class _CountryPostcodesCollector:
                 postcode = analyzer.normalize_postcode(row['postcode'])
                 if postcode not in self.collected:
                     try:
-                        self.collected[postcode] = (_to_float(row['lon'], 180),
-                                                    _to_float(row['lat'], 90))
+                        # Do float conversion separately, it might throw
+                        centroid = (_to_float(row['lon'], 180),
+                                    _to_float(row['lat'], 90))
+                        self.collected[postcode] += centroid
                     except ValueError:
                         LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
                                     row['lat'], row['lon'], self.country)
@@ -158,6 +174,7 @@ def update_postcodes(dsn, project_dir, tokenizer):
         potentially enhances it with external data and then updates the
         postcodes in the table 'location_postcode'.
     """
+    matcher = PostcodeFormatter()
     with tokenizer.name_analyzer() as analyzer:
         with connect(dsn) as conn:
             # First get the list of countries that currently have postcodes.
@@ -169,19 +186,17 @@ def update_postcodes(dsn, project_dir, tokenizer):
             # Recompute the list of valid postcodes from placex.
             with conn.cursor(name="placex_postcodes") as cur:
                 cur.execute("""
-                SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
+                SELECT cc, pc, ST_X(centroid), ST_Y(centroid)
                 FROM (SELECT
                         COALESCE(plx.country_code,
                                  get_country_code(ST_Centroid(pl.geometry))) as cc,
-                        token_normalized_postcode(pl.address->'postcode') as pc,
-                        ST_Centroid(ST_Collect(COALESCE(plx.centroid,
-                                                        ST_Centroid(pl.geometry)))) as centroid
+                        pl.address->'postcode' as pc,
+                        COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
                       FROM place AS pl LEFT OUTER JOIN placex AS plx
                              ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
-                    WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null
-                    GROUP BY cc, pc) xx
+                    WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
                 WHERE pc IS NOT null AND cc IS NOT null
-                ORDER BY country_code, pc""")
+                ORDER BY cc, pc""")
 
                 collector = None
 
@@ -189,7 +204,7 @@ def update_postcodes(dsn, project_dir, tokenizer):
                     if collector is None or country != collector.country:
                         if collector is not None:
                             collector.commit(conn, analyzer, project_dir)
-                        collector = _CountryPostcodesCollector(country)
+                        collector = _PostcodeCollector(country, matcher.get_matcher(country))
                         todo_countries.discard(country)
                     collector.add(postcode, x, y)
 
@@ -198,7 +213,8 @@ def update_postcodes(dsn, project_dir, tokenizer):
 
             # Now handle any countries that are only in the postcode table.
             for country in todo_countries:
-                _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
+                fmt = matcher.get_matcher(country)
+                _PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir)
 
             conn.commit()
 
diff --git a/nominatim/utils/__init__.py b/nominatim/utils/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/nominatim/utils/centroid.py b/nominatim/utils/centroid.py
new file mode 100644 (file)
index 0000000..c2bd619
--- /dev/null
@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Functions for computation of centroids.
+"""
+from collections.abc import Collection
+
+class PointsCentroid:
+    """ Centroid computation from single points using an online algorithm.
+        More points may be added at any time.
+
+        Coordinates are internally treated as a 7-digit fixed-point float
+        (i.e. in OSM style).
+    """
+
+    def __init__(self):
+        self.sum_x = 0
+        self.sum_y = 0
+        self.count = 0
+
+    def centroid(self):
+        """ Return the centroid of all points collected so far.
+        """
+        if self.count == 0:
+            raise ValueError("No points available for centroid.")
+
+        return (float(self.sum_x/self.count)/10000000,
+                float(self.sum_y/self.count)/10000000)
+
+
+    def __len__(self):
+        return self.count
+
+
+    def __iadd__(self, other):
+        if isinstance(other, Collection) and len(other) == 2:
+            if all(isinstance(p, (float, int)) for p in other):
+                x, y = other
+                self.sum_x += int(x * 10000000)
+                self.sum_y += int(y * 10000000)
+                self.count += 1
+                return self
+
+        raise ValueError("Can only add 2-element tuples to centroid.")
index 643acbee3a0204a44eb2f6ea1768a2fc7e98b322..b0bacdfcc5d4dc60c944d4a14cfaf9e455a897d8 100644 (file)
@@ -3,6 +3,9 @@ ad:
     partition: 35
     languages: ca
     names: !include country-names/ad.yaml
+    postcode:
+      pattern: "(ddd)"
+      output: AD\1
 
 
 # United Arab Emirates (الإمارات العربية المتحدة)
@@ -10,6 +13,7 @@ ae:
     partition: 83
     languages: ar
     names: !include country-names/ae.yaml
+    postcode: no
 
 
 # Afghanistan (افغانستان)
@@ -17,6 +21,8 @@ af:
     partition: 30
     languages: fa, ps
     names: !include country-names/af.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Antigua and Barbuda (Antigua and Barbuda)
@@ -24,6 +30,7 @@ ag:
     partition: 205
     languages: en
     names: !include country-names/ag.yaml
+    postcode: no
 
 
 # Anguilla (Anguilla)
@@ -31,6 +38,9 @@ ai:
     partition: 175
     languages: en
     names: !include country-names/ai.yaml
+    postcode:
+      pattern: "2640"
+      output: AI-2640
 
 
 # Albania (Shqipëria)
@@ -38,6 +48,8 @@ al:
     partition: 9
     languages: sq
     names: !include country-names/al.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Armenia (Հայաստան)
@@ -45,6 +57,8 @@ am:
     partition: 33
     languages: hy
     names: !include country-names/am.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Netherlands Antilles (De Nederlandse Antillen)
@@ -59,6 +73,7 @@ ao:
     partition: 85
     languages: pt
     names: !include country-names/ao.yaml
+    postcode: no
 
 
 #  (Antarctica)
@@ -66,6 +81,7 @@ aq:
     partition: 181
     languages: en, es, fr, ru
     names: !include country-names/aq.yaml
+    postcode: no
 
 
 # Argentina (Argentina)
@@ -73,6 +89,8 @@ ar:
     partition: 39
     languages: es
     names: !include country-names/ar.yaml
+    postcode:
+      pattern: "l?dddd(?:lll)?"
 
 
 #  (American Samoa)
@@ -87,6 +105,8 @@ at:
     partition: 245
     languages: de
     names: !include country-names/at.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Australia (Australia)
@@ -94,6 +114,8 @@ au:
     partition: 139
     languages: en
     names: !include country-names/au.yaml
+    postcode:
+      pattern: "dddd"
 
 
 #  (Aruba)
@@ -101,6 +123,7 @@ aw:
     partition: 183
     languages: nl, pap
     names: !include country-names/aw.yaml
+    postcode: no
 
 
 #  (Aland Islands)
@@ -115,6 +138,8 @@ az:
     partition: 119
     languages: az
     names: !include country-names/az.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Bosnia and Herzegovina (Bosna i Hercegovina / Босна и Херцеговина)
@@ -122,6 +147,8 @@ ba:
     partition: 6
     languages: bs, hr, sr
     names: !include country-names/ba.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Barbados (Barbados)
@@ -129,6 +156,9 @@ bb:
     partition: 206
     languages: en
     names: !include country-names/bb.yaml
+    postcode:
+      pattern: "(ddddd)"
+      output: BB\1
 
 
 # Bangladesh (Bangladesh)
@@ -136,6 +166,8 @@ bd:
     partition: 158
     languages: bn
     names: !include country-names/bd.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Belgium (België / Belgique / Belgien)
@@ -143,6 +175,8 @@ be:
     partition: 15
     languages: nl, fr, de
     names: !include country-names/be.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Burkina Faso (Burkina Faso)
@@ -150,6 +184,7 @@ bf:
     partition: 225
     languages: fr
     names: !include country-names/bf.yaml
+    postcode: no
 
 
 # Bulgaria (Бългaрия)
@@ -157,6 +192,8 @@ bg:
     partition: 140
     languages: bg
     names: !include country-names/bg.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Bahrain (البحرين)
@@ -164,6 +201,8 @@ bh:
     partition: 62
     languages: ar
     names: !include country-names/bh.yaml
+    postcode:
+      pattern: "d?ddd"
 
 
 # Burundi (Burundi)
@@ -171,6 +210,7 @@ bi:
     partition: 61
     languages: fr
     names: !include country-names/bi.yaml
+    postcode: no
 
 
 # Benin (Bénin)
@@ -178,6 +218,7 @@ bj:
     partition: 224
     languages: fr
     names: !include country-names/bj.yaml
+    postcode: no
 
 
 #  (Saint Barthélemy)
@@ -192,6 +233,9 @@ bm:
     partition: 176
     languages: en
     names: !include country-names/bm.yaml
+    postcode:
+      pattern: "(ll)[ -]?(dd)"
+      output: \1 \2
 
 
 # Brunei (Brunei)
@@ -199,6 +243,9 @@ bn:
     partition: 86
     languages: ms
     names: !include country-names/bn.yaml
+    postcode:
+      pattern: "(ll) ?(dddd)"
+      output: \1\2
 
 
 # Bolivia (Bolivia)
@@ -206,6 +253,7 @@ bo:
     partition: 120
     languages: es, qu, gn, ay
     names: !include country-names/bo.yaml
+    postcode: no
 
 
 # Caribbean Netherlands (Caribisch Nederland)
@@ -220,6 +268,9 @@ br:
     partition: 121
     languages: pt
     names: !include country-names/br.yaml
+    postcode:
+      pattern: "(ddddd)-?(ddd)"
+      output: \1-\2
 
 
 # The Bahamas (The Bahamas)
@@ -227,6 +278,7 @@ bs:
     partition: 207
     languages: en
     names: !include country-names/bs.yaml
+    postcode: no
 
 
 # Bhutan (འབྲུག་ཡུལ་)
@@ -234,6 +286,8 @@ bt:
     partition: 87
     languages: dz
     names: !include country-names/bt.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 #  (Bouvet Island)
@@ -248,6 +302,7 @@ bw:
     partition: 122
     languages: en, tn
     names: !include country-names/bw.yaml
+    postcode: no
 
 
 # Belarus (Беларусь)
@@ -255,6 +310,8 @@ by:
     partition: 40
     languages: be, ru
     names: !include country-names/by.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Belize (Belize)
@@ -262,6 +319,7 @@ bz:
     partition: 208
     languages: en
     names: !include country-names/bz.yaml
+    postcode: no
 
 
 # Canada (Canada)
@@ -269,6 +327,9 @@ ca:
     partition: 244
     languages: en, fr
     names: !include country-names/ca.yaml
+    postcode:
+      pattern: "(ldl) ?(dld)"
+      output: \1 \2
 
 
 # Cocos (Keeling) Islands (Cocos (Keeling) Islands)
@@ -283,6 +344,7 @@ cd:
     partition: 229
     languages: fr
     names: !include country-names/cd.yaml
+    postcode: no
 
 
 # Central African Republic (Ködörösêse tî Bêafrîka - République Centrafricaine)
@@ -290,6 +352,7 @@ cf:
     partition: 227
     languages: fr, sg
     names: !include country-names/cf.yaml
+    postcode: no
 
 
 # Congo-Brazzaville (Congo)
@@ -297,6 +360,7 @@ cg:
     partition: 230
     languages: fr
     names: !include country-names/cg.yaml
+    postcode: no
 
 
 # Switzerland (Schweiz/Suisse/Svizzera/Svizra)
@@ -304,6 +368,8 @@ ch:
     partition: 5
     languages: de, fr, it, rm
     names: !include country-names/ch.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Côte d'Ivoire (Côte d’Ivoire)
@@ -311,6 +377,7 @@ ci:
     partition: 228
     languages: fr
     names: !include country-names/ci.yaml
+    postcode: no
 
 
 # Cook Islands (Kūki 'Āirani)
@@ -318,6 +385,7 @@ ck:
     partition: 41
     languages: en, rar
     names: !include country-names/ck.yaml
+    postcode: no
 
 
 # Chile (Chile)
@@ -325,6 +393,8 @@ cl:
     partition: 88
     languages: es
     names: !include country-names/cl.yaml
+    postcode:
+      pattern: "ddddddd"
 
 
 # Cameroon (Cameroun)
@@ -332,6 +402,7 @@ cm:
     partition: 141
     languages: fr, en
     names: !include country-names/cm.yaml
+    postcode: no
 
 
 # China (中国)
@@ -339,6 +410,8 @@ cn:
     partition: 117
     languages: zh
     names: !include country-names/cn.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Colombia (Colombia)
@@ -346,6 +419,8 @@ co:
     partition: 133
     languages: es
     names: !include country-names/co.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Costa Rica (Costa Rica)
@@ -353,6 +428,8 @@ cr:
     partition: 64
     languages: es
     names: !include country-names/cr.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Cuba (Cuba)
@@ -360,6 +437,8 @@ cu:
     partition: 42
     languages: es
     names: !include country-names/cu.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Cape Verde (Cabo Verde)
@@ -367,6 +446,8 @@ cv:
     partition: 89
     languages: pt
     names: !include country-names/cv.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Curaçao (Curaçao)
@@ -388,6 +469,8 @@ cy:
     partition: 114
     languages: el, tr
     names: !include country-names/cy.yaml
+    postcode:
+      pattern: "(?:99|d)ddd"
 
 
 # Czechia (Česko)
@@ -395,6 +478,9 @@ cz:
     partition: 124
     languages: cs
     names: !include country-names/cz.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
 
 
 # Germany (Deutschland)
@@ -402,6 +488,8 @@ de:
     partition: 3
     languages: de
     names: !include country-names/de.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Djibouti (Djibouti جيبوتي)
@@ -409,6 +497,7 @@ dj:
     partition: 43
     languages: fr, ar, so, aa
     names: !include country-names/dj.yaml
+    postcode: no
 
 
 # Denmark (Danmark)
@@ -416,6 +505,8 @@ dk:
     partition: 160
     languages: da
     names: !include country-names/dk.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Dominica (Dominica)
@@ -423,6 +514,7 @@ dm:
     partition: 209
     languages: en
     names: !include country-names/dm.yaml
+    postcode: no
 
 
 # Dominican Republic (República Dominicana)
@@ -430,6 +522,8 @@ do:
     partition: 37
     languages: es
     names: !include country-names/do.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Algeria (Algérie / ⵍⵣⵣⴰⵢⴻⵔ / الجزائر)
@@ -437,6 +531,8 @@ dz:
     partition: 19
     languages: ar, ber, fr
     names: !include country-names/dz.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Ecuador (Ecuador)
@@ -444,6 +540,8 @@ ec:
     partition: 78
     languages: es
     names: !include country-names/ec.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Estonia (Eesti)
@@ -451,6 +549,8 @@ ee:
     partition: 125
     languages: et
     names: !include country-names/ee.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Egypt (مصر)
@@ -458,6 +558,8 @@ eg:
     partition: 16
     languages: ar
     names: !include country-names/eg.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Sahrawi Arab Democratic Republic (الجمهورية العربية الصحراوية الديمقراطية)
@@ -472,6 +574,7 @@ er:
     partition: 142
     languages: ti, ar, en
     names: !include country-names/er.yaml
+    postcode: no
 
 
 # Spain (España)
@@ -479,6 +582,8 @@ es:
     partition: 31
     languages: es, ast, ca, eu, gl
     names: !include country-names/es.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Ethiopia (ኢትዮጵያ)
@@ -486,6 +591,8 @@ et:
     partition: 90
     languages: am, om
     names: !include country-names/et.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Finland (Suomi)
@@ -493,6 +600,8 @@ fi:
     partition: 20
     languages: fi, sv, se
     names: !include country-names/fi.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Fiji (Viti)
@@ -500,6 +609,7 @@ fj:
     partition: 210
     languages: en
     names: !include country-names/fj.yaml
+    postcode: no
 
 
 # Falkland Islands (Falkland Islands)
@@ -507,6 +617,8 @@ fk:
     partition: 91
     languages: en
     names: !include country-names/fk.yaml
+    postcode:
+      pattern: "FIQQ 1ZZ"
 
 
 # Federated States of Micronesia (Micronesia)
@@ -514,6 +626,8 @@ fm:
     partition: 217
     languages: en
     names: !include country-names/fm.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Faroe Islands (Føroyar)
@@ -521,6 +635,8 @@ fo:
     partition: 10
     languages: fo, da
     names: !include country-names/fo.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # France (France)
@@ -528,6 +644,8 @@ fr:
     partition: 4
     languages: fr
     names: !include country-names/fr.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Gabon (Gabon)
@@ -535,6 +653,7 @@ ga:
     partition: 239
     languages: fr
     names: !include country-names/ga.yaml
+    postcode: no
 
 
 # United Kingdom (United Kingdom)
@@ -542,6 +661,9 @@ gb:
     partition: 1
     languages: en
     names: !include country-names/gb.yaml
+    postcode:
+      pattern: "(l?ld[A-Z0-9]?) ?(dll)"
+      output: \1 \2
 
 
 # Grenada (Grenada)
@@ -549,6 +671,7 @@ gd:
     partition: 143
     languages: en
     names: !include country-names/gd.yaml
+    postcode: no
 
 
 # Georgia (საქართველო)
@@ -556,6 +679,8 @@ ge:
     partition: 21
     languages: ka
     names: !include country-names/ge.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # French Guiana (Guyane Française)
@@ -570,6 +695,9 @@ gg:
     partition: 77
     languages: en
     names: !include country-names/gg.yaml
+    postcode:
+      pattern: "(GYdd?) ?(dll)"
+      output: \1 \2
 
 
 # Ghana (Ghana)
@@ -577,6 +705,8 @@ gh:
     partition: 211
     languages: en
     names: !include country-names/gh.yaml
+    postcode:
+      pattern: "ll-d?ddd-dddd"
 
 
 # Gibraltar (Gibraltar)
@@ -584,6 +714,9 @@ gi:
     partition: 138
     languages: en
     names: !include country-names/gi.yaml
+    postcode:
+      pattern: "(GX11) ?(1AA)"
+      output: GX11 1AA
 
 
 # Greenland (Kalaallit Nunaat)
@@ -591,6 +724,8 @@ gl:
     partition: 111
     languages: kl, da
     names: !include country-names/gl.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # The Gambia (Gambia)
@@ -598,6 +733,7 @@ gm:
     partition: 212
     languages: en
     names: !include country-names/gm.yaml
+    postcode: no
 
 
 # Guinea (Guinée)
@@ -605,6 +741,8 @@ gn:
     partition: 240
     languages: fr
     names: !include country-names/gn.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Guadeloupe (Guadeloupe)
@@ -619,6 +757,7 @@ gq:
     partition: 12
     languages: es, fr, pt
     names: !include country-names/gq.yaml
+    postcode: no
 
 
 # Greece (Ελλάς)
@@ -626,6 +765,9 @@ gr:
     partition: 22
     languages: el
     names: !include country-names/gr.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
 
 
 # South Georgia and the South Sandwich Islands (South Georgia and the South Sandwich Islands)
@@ -633,6 +775,9 @@ gs:
     partition: 44
     languages: en
     names: !include country-names/gs.yaml
+    postcode:
+      pattern: "(SIQQ) ?(1ZZ)"
+      output: \1 \2
 
 
 # Guatemala (Guatemala)
@@ -640,6 +785,8 @@ gt:
     partition: 57
     languages: es
     names: !include country-names/gt.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Guam (Guam)
@@ -654,6 +801,8 @@ gw:
     partition: 8
     languages: pt
     names: !include country-names/gw.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Guyana (Guyana)
@@ -661,6 +810,7 @@ gy:
     partition: 213
     languages: en
     names: !include country-names/gy.yaml
+    postcode: no
 
 
 #  (Hong Kong)
@@ -682,6 +832,8 @@ hn:
     partition: 56
     languages: es
     names: !include country-names/hn.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Croatia (Hrvatska)
@@ -689,6 +841,8 @@ hr:
     partition: 92
     languages: hr
     names: !include country-names/hr.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Haiti (Ayiti)
@@ -696,6 +850,8 @@ ht:
     partition: 29
     languages: fr, ht
     names: !include country-names/ht.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Hungary (Magyarország)
@@ -703,6 +859,8 @@ hu:
     partition: 45
     languages: hu
     names: !include country-names/hu.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Indonesia (Indonesia)
@@ -710,6 +868,8 @@ id:
     partition: 110
     languages: id
     names: !include country-names/id.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Ireland (Éire / Ireland)
@@ -717,6 +877,9 @@ ie:
     partition: 46
     languages: en, ga
     names: !include country-names/ie.yaml
+    postcode:
+      pattern: "(ldd) ?([0123456789ACDEFHKNPRTVWXY]{4})"
+      output: \1 \2
 
 
 # Israel (ישראל)
@@ -724,6 +887,8 @@ il:
     partition: 65
     languages: he
     names: !include country-names/il.yaml
+    postcode:
+      pattern: "ddddddd"
 
 
 # Isle of Man (Isle of Man)
@@ -731,6 +896,9 @@ im:
     partition: 190
     languages: en
     names: !include country-names/im.yaml
+    postcode:
+      pattern: "(IMdd?) ?(dll)"
+      output: \1 \2
 
 
 # India (India)
@@ -738,6 +906,9 @@ in:
     partition: 128
     languages: hi, en
     names: !include country-names/in.yaml
+    postcode:
+      pattern: "(ddd) ?(ddd)"
+      output: \1\2
 
 
 # British Indian Ocean Territory (British Indian Ocean Territory)
@@ -745,6 +916,9 @@ io:
     partition: 13
     languages: en
     names: !include country-names/io.yaml
+    postcode:
+      pattern: "(BBND) ?(1ZZ)"
+      output: \1 \2
 
 
 # Iraq (العراق)
@@ -752,6 +926,8 @@ iq:
     partition: 144
     languages: ar, ku
     names: !include country-names/iq.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Iran (ایران)
@@ -759,6 +935,9 @@ ir:
     partition: 80
     languages: fa
     names: !include country-names/ir.yaml
+    postcode:
+      pattern: "(ddddd)[-_ ]?(ddddd)"
+      output: \1-\2
 
 
 # Iceland (Ísland)
@@ -766,6 +945,8 @@ is:
     partition: 134
     languages: is
     names: !include country-names/is.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Italy (Italia)
@@ -773,6 +954,8 @@ it:
     partition: 28
     languages: it, de, fr
     names: !include country-names/it.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Jersey (Jersey)
@@ -780,6 +963,9 @@ je:
     partition: 123
     languages: en
     names: !include country-names/je.yaml
+    postcode:
+      pattern: "(JEdd?) ?(dll)"
+      output: \1 \2
 
 
 # Jamaica (Jamaica)
@@ -787,6 +973,7 @@ jm:
     partition: 214
     languages: en
     names: !include country-names/jm.yaml
+    postcode: no
 
 
 # Jordan (الأردن)
@@ -794,6 +981,8 @@ jo:
     partition: 17
     languages: ar
     names: !include country-names/jo.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Japan (日本)
@@ -801,6 +990,9 @@ jp:
     partition: 11
     languages: ja
     names: !include country-names/jp.yaml
+    postcode:
+      pattern: "(ddd)-?(dddd)"
+      output: \1-\2
 
 
 # Kenya (Kenya)
@@ -808,6 +1000,8 @@ ke:
     partition: 126
     languages: sw, en
     names: !include country-names/ke.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Kyrgyzstan (Кыргызстан)
@@ -815,6 +1009,8 @@ kg:
     partition: 93
     languages: ky, ru
     names: !include country-names/kg.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Cambodia (ព្រះរាជាណាចក្រ​កម្ពុជា)
@@ -822,6 +1018,8 @@ kh:
     partition: 159
     languages: km
     names: !include country-names/kh.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Kiribati (Kiribati)
@@ -829,6 +1027,7 @@ ki:
     partition: 215
     languages: en
     names: !include country-names/ki.yaml
+    postcode: no
 
 
 # Comoros (Comores Komori جزر القمر)
@@ -836,6 +1035,7 @@ km:
     partition: 47
     languages: ar, fr, sw
     names: !include country-names/km.yaml
+    postcode: no
 
 
 # Saint Kitts and Nevis (Saint Kitts and Nevis)
@@ -843,6 +1043,8 @@ kn:
     partition: 84
     languages: en
     names: !include country-names/kn.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # North Korea (조선민주주의인민공화국)
@@ -850,6 +1052,7 @@ kp:
     partition: 48
     languages: ko
     names: !include country-names/kp.yaml
+    postcode: no
 
 
 # South Korea (대한민국)
@@ -857,6 +1060,8 @@ kr:
     partition: 49
     languages: ko, en
     names: !include country-names/kr.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Kuwait (الكويت)
@@ -864,6 +1069,8 @@ kw:
     partition: 127
     languages: ar
     names: !include country-names/kw.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Cayman Islands (Cayman Islands)
@@ -871,6 +1078,9 @@ ky:
     partition: 38
     languages: en
     names: !include country-names/ky.yaml
+    postcode:
+      pattern: "(d)-(dddd)"
+      output: KY\1-\2
 
 
 # Kazakhstan (Қазақстан)
@@ -878,6 +1088,8 @@ kz:
     partition: 94
     languages: kk, ru
     names: !include country-names/kz.yaml
+    postcode:
+      pattern: "(?:lddldld|dddddd)"
 
 
 # Laos (ປະເທດລາວ)
@@ -885,6 +1097,8 @@ la:
     partition: 145
     languages: lo
     names: !include country-names/la.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Lebanon (لبنان)
@@ -892,6 +1106,8 @@ lb:
     partition: 66
     languages: ar, fr
     names: !include country-names/lb.yaml
+    postcode:
+      pattern: "(dddd)(?: ?dddd)?"
 
 
 # Saint Lucia (Saint Lucia)
@@ -899,6 +1115,9 @@ lc:
     partition: 146
     languages: en
     names: !include country-names/lc.yaml
+    postcode:
+      pattern: "(dd) ?(ddd)"
+      output: LC\1 \2
 
 
 # Liechtenstein (Liechtenstein)
@@ -906,6 +1125,8 @@ li:
     partition: 246
     languages: de
     names: !include country-names/li.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Sri Lanka (ශ්‍රී ලංකාව இலங்கை)
@@ -913,6 +1134,8 @@ lk:
     partition: 95
     languages: si, ta
     names: !include country-names/lk.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Liberia (Liberia)
@@ -920,6 +1143,8 @@ lr:
     partition: 216
     languages: en
     names: !include country-names/lr.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Lesotho (Lesotho)
@@ -927,6 +1152,8 @@ ls:
     partition: 136
     languages: en, st
     names: !include country-names/ls.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Lithuania (Lietuva)
@@ -934,6 +1161,8 @@ lt:
     partition: 67
     languages: lt
     names: !include country-names/lt.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Luxembourg (Lëtzebuerg)
@@ -941,6 +1170,8 @@ lu:
     partition: 74
     languages: lb, fr, de
     names: !include country-names/lu.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Latvia (Latvija)
@@ -948,6 +1179,9 @@ lv:
     partition: 162
     languages: lv
     names: !include country-names/lv.yaml
+    postcode:
+      pattern: "(dddd)"
+      output: LV-\1
 
 
 # Libya (ليبيا)
@@ -955,6 +1189,7 @@ ly:
     partition: 163
     languages: ar
     names: !include country-names/ly.yaml
+    postcode: no
 
 
 # Morocco (Maroc ⵍⵎⵖⵔⵉⴱ المغرب)
@@ -962,6 +1197,8 @@ ma:
     partition: 23
     languages: fr, zgh, ar
     names: !include country-names/ma.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Monaco (Monaco)
@@ -969,6 +1206,8 @@ mc:
     partition: 242
     languages: fr
     names: !include country-names/mc.yaml
+    postcode:
+      pattern: "980dd"
 
 
 # Moldova (Moldova)
@@ -976,6 +1215,9 @@ md:
     partition: 147
     languages: ro, ru, uk
     names: !include country-names/md.yaml
+    postcode:
+      pattern: "(dddd)"
+      output: MD-\1
 
 
 # Montenegro (Crna Gora / Црна Гора)
@@ -983,6 +1225,8 @@ me:
     partition: 180
     languages: srp, sr, hr, bs, sq
     names: !include country-names/me.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Saint Martin (Saint Martin)
@@ -997,6 +1241,8 @@ mg:
     partition: 164
     languages: mg, fr
     names: !include country-names/mg.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Marshall Islands (Ṃajeḷ)
@@ -1004,6 +1250,8 @@ mh:
     partition: 105
     languages: en, mh
     names: !include country-names/mh.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # North Macedonia (Северна Македонија)
@@ -1011,6 +1259,8 @@ mk:
     partition: 69
     languages: mk
     names: !include country-names/mk.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Mali (Mali)
@@ -1018,6 +1268,7 @@ ml:
     partition: 241
     languages: fr
     names: !include country-names/ml.yaml
+    postcode: no
 
 
 # Myanmar (မြန်မာ)
@@ -1025,6 +1276,8 @@ mm:
     partition: 148
     languages: my
     names: !include country-names/mm.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Mongolia (Монгол улс ᠮᠤᠩᠭᠤᠯ ᠤᠯᠤᠰ)
@@ -1032,6 +1285,8 @@ mn:
     partition: 167
     languages: mn
     names: !include country-names/mn.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Macao (Macao)
@@ -1039,6 +1294,7 @@ mo:
     partition: 191
     languages: zh-hant, pt
     names: !include country-names/mo.yaml
+    postcode: no
 
 
 # Northern Mariana Islands (Northern Mariana Islands)
@@ -1060,6 +1316,7 @@ mr:
     partition: 149
     languages: ar, fr
     names: !include country-names/mr.yaml
+    postcode: no
 
 
 # Montserrat (Montserrat)
@@ -1074,6 +1331,9 @@ mt:
     partition: 165
     languages: mt, en
     names: !include country-names/mt.yaml
+    postcode:
+      pattern: "(lll) ?(dddd)"
+      output: \1 \2
 
 
 # Mauritius (Mauritius)
@@ -1081,6 +1341,8 @@ mu:
     partition: 150
     languages: mfe, fr, en
     names: !include country-names/mu.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Maldives (ދިވެހިރާއްޖެ)
@@ -1088,6 +1350,8 @@ mv:
     partition: 96
     languages: dv
     names: !include country-names/mv.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Malawi (Malawi)
@@ -1095,6 +1359,7 @@ mw:
     partition: 97
     languages: en, ny
     names: !include country-names/mw.yaml
+    postcode: no
 
 
 # Mexico (México)
@@ -1102,6 +1367,8 @@ mx:
     partition: 166
     languages: es
     names: !include country-names/mx.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Malaysia (Malaysia)
@@ -1109,6 +1376,8 @@ my:
     partition: 7
     languages: ms
     names: !include country-names/my.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Mozambique (Moçambique)
@@ -1116,6 +1385,8 @@ mz:
     partition: 98
     languages: pt
     names: !include country-names/mz.yaml
+    postcode:
+      pattern: "(dddd)(?:-dd)?"
 
 
 # Namibia (Namibia)
@@ -1123,6 +1394,8 @@ na:
     partition: 99
     languages: en, sf, de
     names: !include country-names/na.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # New Caledonia (Nouvelle-Calédonie)
@@ -1137,6 +1410,8 @@ ne:
     partition: 226
     languages: fr
     names: !include country-names/ne.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Norfolk Island (Norfolk Island)
@@ -1151,6 +1426,8 @@ ng:
     partition: 218
     languages: en
     names: !include country-names/ng.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Nicaragua (Nicaragua)
@@ -1158,6 +1435,8 @@ ni:
     partition: 151
     languages: es
     names: !include country-names/ni.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Netherlands (Nederland)
@@ -1165,6 +1444,9 @@ nl:
     partition: 63
     languages: nl
     names: !include country-names/nl.yaml
+    postcode:
+      pattern: "(dddd) ?(ll)"
+      output: \1 \2
 
 
 # Norway (Norge)
@@ -1172,6 +1454,8 @@ nl:
     partition: 60
     languages: nb, nn, no, se
     names: !include country-names/no.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Nepal (Nepal)
@@ -1179,6 +1463,8 @@ np:
     partition: 50
     languages: ne
     names: !include country-names/np.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Nauru (Naoero)
@@ -1186,6 +1472,7 @@ nr:
     partition: 70
     languages: na, en
     names: !include country-names/nr.yaml
+    postcode: no
 
 
 # Niue (Niuē)
@@ -1193,6 +1480,7 @@ nu:
     partition: 178
     languages: niu, en
     names: !include country-names/nu.yaml
+    postcode: no
 
 
 # New Zealand (New Zealand / Aotearoa)
@@ -1200,6 +1488,8 @@ nz:
     partition: 27
     languages: mi, en
     names: !include country-names/nz.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Oman (عمان)
@@ -1207,6 +1497,8 @@ om:
     partition: 137
     languages: ar
     names: !include country-names/om.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Panama (Panamá)
@@ -1214,6 +1506,8 @@ pa:
     partition: 152
     languages: es
     names: !include country-names/pa.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Peru (Perú)
@@ -1221,6 +1515,8 @@ pe:
     partition: 51
     languages: es
     names: !include country-names/pe.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # French Polynesia (Polynésie française)
@@ -1235,6 +1531,8 @@ pg:
     partition: 71
     languages: en, tpi, ho
     names: !include country-names/pg.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Philippines (Philippines)
@@ -1242,6 +1540,8 @@ ph:
     partition: 26
     languages: en, tl
     names: !include country-names/ph.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Pakistan (پاکستان)
@@ -1249,6 +1549,8 @@ pk:
     partition: 14
     languages: en, ur, pnb, sd, ps, bal
     names: !include country-names/pk.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Poland (Polska)
@@ -1256,6 +1558,9 @@ pl:
     partition: 168
     languages: pl
     names: !include country-names/pl.yaml
+    postcode:
+      pattern: "(dd)[ -]?(ddd)"
+      output: \1-\2
 
 
 # Saint Pierre and Miquelon (Saint-Pierre-et-Miquelon)
@@ -1270,6 +1575,9 @@ pn:
     partition: 113
     languages: en, pih
     names: !include country-names/pn.yaml
+    postcode:
+      pattern: "(PCRN) ?(1ZZ)"
+      output: \1 \2
 
 
 # Puerto Rico (Puerto Rico)
@@ -1284,6 +1592,8 @@ ps:
     partition: 194
     languages: ar, he
     names: !include country-names/ps.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Portugal (Portugal)
@@ -1291,6 +1601,8 @@ pt:
     partition: 34
     languages: pt
     names: !include country-names/pt.yaml
+    postcode:
+      pattern: "dddd(?:-ddd)?"
 
 
 # Palau (Belau)
@@ -1298,6 +1610,8 @@ pw:
     partition: 195
     languages: en, pau, ja, sov, tox
     names: !include country-names/pw.yaml
+    postcode:
+      pattern: "969(39|40)"
 
 
 # Paraguay (Paraguay)
@@ -1305,6 +1619,8 @@ py:
     partition: 101
     languages: es, gn
     names: !include country-names/py.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Qatar (قطر)
@@ -1312,6 +1628,7 @@ qa:
     partition: 169
     languages: ar
     names: !include country-names/qa.yaml
+    postcode: no
 
 
 #  (Réunion)
@@ -1326,6 +1643,8 @@ ro:
     partition: 170
     languages: ro
     names: !include country-names/ro.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Serbia (Србија)
@@ -1333,6 +1652,8 @@ rs:
     partition: 59
     languages: sr
     names: !include country-names/rs.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Russia (Россия)
@@ -1340,6 +1661,8 @@ ru:
     partition: 135
     languages: ru
     names: !include country-names/ru.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Rwanda (Rwanda)
@@ -1347,6 +1670,7 @@ rw:
     partition: 102
     languages: rw, fr, en
     names: !include country-names/rw.yaml
+    postcode: no
 
 
 # Saudi Arabia (السعودية)
@@ -1354,6 +1678,8 @@ sa:
     partition: 52
     languages: ar
     names: !include country-names/sa.yaml
+    postcode:
+      pattern: "ddddd(?:-dddd)?"
 
 
 # Solomon Islands (Solomon Islands)
@@ -1361,6 +1687,7 @@ sb:
     partition: 201
     languages: en
     names: !include country-names/sb.yaml
+    postcode: no
 
 
 # Seychelles (Sesel)
@@ -1368,6 +1695,7 @@ sc:
     partition: 79
     languages: fr, en, crs
     names: !include country-names/sc.yaml
+    postcode: no
 
 
 # Sudan (السودان)
@@ -1375,6 +1703,8 @@ sd:
     partition: 72
     languages: ar, en
     names: !include country-names/sd.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Sweden (Sverige)
@@ -1382,6 +1712,9 @@ se:
     partition: 112
     languages: sv
     names: !include country-names/se.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
 
 
 # Singapore (Singapore)
@@ -1389,6 +1722,8 @@ sg:
     partition: 115
     languages: zh-hans, en, ms, ta
     names: !include country-names/sg.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Saint Helena, Ascension and Tristan da Cunha (Saint Helena, Ascension and Tristan da Cunha)
@@ -1396,6 +1731,9 @@ sh:
     partition: 196
     languages: en
     names: !include country-names/sh.yaml
+    postcode:
+      pattern: "(ASCN|STHL|TDCU) ?(1ZZ)"
+      output: \1 \2
 
 
 # Slovenia (Slovenija)
@@ -1403,6 +1741,8 @@ si:
     partition: 36
     languages: sl
     names: !include country-names/si.yaml
+    postcode:
+      pattern: "dddd"
 
 
 #  (Svalbard and Jan Mayen)
@@ -1417,6 +1757,9 @@ sk:
     partition: 172
     languages: sk
     names: !include country-names/sk.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
 
 
 # Sierra Leone (Sierra Leone)
@@ -1424,6 +1767,7 @@ sl:
     partition: 219
     languages: en
     names: !include country-names/sl.yaml
+    postcode: no
 
 
 # San Marino (San Marino)
@@ -1431,6 +1775,8 @@ sm:
     partition: 153
     languages: it
     names: !include country-names/sm.yaml
+    postcode:
+      pattern: "4789d"
 
 
 # Senegal (Sénégal)
@@ -1438,6 +1784,8 @@ sn:
     partition: 237
     languages: fr
     names: !include country-names/sn.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Somalia (Soomaaliya الصومال)
@@ -1445,6 +1793,9 @@ so:
     partition: 154
     languages: so, ar
     names: !include country-names/so.yaml
+    postcode:
+      pattern: "(ll) ?(ddddd)"
+      output: \1 \2
 
 
 # Suriname (Suriname)
@@ -1452,6 +1803,7 @@ sr:
     partition: 24
     languages: nl
     names: !include country-names/sr.yaml
+    postcode: no
 
 
 # South Sudan (South Sudan)
@@ -1459,6 +1811,7 @@ ss:
     partition: 247
     languages: en
     names: !include country-names/ss.yaml
+    postcode: no
 
 
 # São Tomé and Príncipe (São Tomé e Príncipe)
@@ -1466,6 +1819,7 @@ st:
     partition: 53
     languages: pt
     names: !include country-names/st.yaml
+    postcode: no
 
 
 # El Salvador (El Salvador)
@@ -1473,6 +1827,8 @@ sv:
     partition: 103
     languages: es
     names: !include country-names/sv.yaml
+    postcode:
+      pattern: "dddd"
 
 
 #  (Sint Maarten)
@@ -1487,6 +1843,7 @@ sy:
     partition: 104
     languages: ar
     names: !include country-names/sy.yaml
+    postcode: no
 
 
 # Eswatini (eSwatini)
@@ -1494,6 +1851,8 @@ sz:
     partition: 82
     languages: en, ss
     names: !include country-names/sz.yaml
+    postcode:
+      pattern: "lddd"
 
 
 # Turks and Caicos Islands (Turks and Caicos Islands)
@@ -1501,6 +1860,9 @@ tc:
     partition: 106
     languages: en
     names: !include country-names/tc.yaml
+    postcode:
+      pattern: "(TKCA) ?(1ZZ)"
+      output: \1 \2
 
 
 # Chad (Tchad تشاد)
@@ -1508,6 +1870,7 @@ td:
     partition: 68
     languages: fr, ar
     names: !include country-names/td.yaml
+    postcode: no
 
 
 # French Southern Lands (Terres australes et antarctiques françaises)
@@ -1522,6 +1885,7 @@ tg:
     partition: 243
     languages: fr
     names: !include country-names/tg.yaml
+    postcode: no
 
 
 # Thailand (ประเทศไทย)
@@ -1529,6 +1893,8 @@ th:
     partition: 32
     languages: th
     names: !include country-names/th.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Tajikistan (Тоҷикистон)
@@ -1536,6 +1902,8 @@ tj:
     partition: 129
     languages: tg, ru
     names: !include country-names/tj.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Tokelau (Tokelau)
@@ -1543,6 +1911,7 @@ tk:
     partition: 179
     languages: tkl, en, sm
     names: !include country-names/tk.yaml
+    postcode: no
 
 
 # East Timor (Timór Lorosa'e)
@@ -1550,6 +1919,7 @@ tl:
     partition: 161
     languages: pt, tet
     names: !include country-names/tl.yaml
+    postcode: no
 
 
 # Turkmenistan (Türkmenistan)
@@ -1557,6 +1927,8 @@ tm:
     partition: 54
     languages: tk
     names: !include country-names/tm.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Tunisia (تونس)
@@ -1564,6 +1936,8 @@ tn:
     partition: 18
     languages: ar, fr
     names: !include country-names/tn.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Tonga (Tonga)
@@ -1571,6 +1945,7 @@ to:
     partition: 220
     languages: en
     names: !include country-names/to.yaml
+    postcode: no
 
 
 # Turkey (Türkiye)
@@ -1578,6 +1953,8 @@ tr:
     partition: 81
     languages: tr
     names: !include country-names/tr.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Trinidad and Tobago (Trinidad and Tobago)
@@ -1585,6 +1962,8 @@ tt:
     partition: 221
     languages: en
     names: !include country-names/tt.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Tuvalu (Tuvalu)
@@ -1592,6 +1971,7 @@ tv:
     partition: 156
     languages: en
     names: !include country-names/tv.yaml
+    postcode: no
 
 
 # Taiwan (臺灣)
@@ -1599,6 +1979,8 @@ tw:
     partition: 25
     languages: zh-hant
     names: !include country-names/tw.yaml
+    postcode:
+      pattern: "ddd(?:ddd?)?"
 
 
 # Tanzania (Tanzania)
@@ -1606,6 +1988,8 @@ tz:
     partition: 130
     languages: sw, en
     names: !include country-names/tz.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Ukraine (Україна)
@@ -1613,6 +1997,8 @@ ua:
     partition: 173
     languages: uk
     names: !include country-names/ua.yaml
+    postcode:
+      pattern: "d?ddddd"
 
 
 # Uganda (Uganda)
@@ -1620,6 +2006,7 @@ ug:
     partition: 155
     languages: en, sw
     names: !include country-names/ug.yaml
+    postcode: no
 
 
 #  (United States Minor Outlying Islands)
@@ -1627,6 +2014,8 @@ um:
     partition: 198
     languages: en
     names: !include country-names/um.yaml
+    postcode:
+      pattern: "96898"
 
 
 # United States (United States)
@@ -1634,6 +2023,8 @@ us:
     partition: 2
     languages: en
     names: !include country-names/us.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Uruguay (Uruguay)
@@ -1641,6 +2032,8 @@ uy:
     partition: 174
     languages: es
     names: !include country-names/uy.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Uzbekistan (Oʻzbekiston)
@@ -1648,6 +2041,8 @@ uz:
     partition: 157
     languages: uz, kaa
     names: !include country-names/uz.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Vatican City (Civitas Vaticana)
@@ -1655,6 +2050,8 @@ va:
     partition: 107
     languages: it
     names: !include country-names/va.yaml
+    postcode:
+      pattern: "00120"
 
 
 # Saint Vincent and the Grenadines (Saint Vincent and the Grenadines)
@@ -1662,6 +2059,9 @@ vc:
     partition: 171
     languages: en
     names: !include country-names/vc.yaml
+    postcode:
+      pattern: "(dddd)"
+      output: VC\1
 
 
 # Venezuela (Venezuela)
@@ -1669,6 +2069,8 @@ ve:
     partition: 108
     languages: es
     names: !include country-names/ve.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # British Virgin Islands (British Virgin Islands)
@@ -1676,6 +2078,9 @@ vg:
     partition: 109
     languages: en
     names: !include country-names/vg.yaml
+    postcode:
+      pattern: "(dddd)"
+      output: VG\1
 
 
 #  (United States Virgin Islands)
@@ -1690,6 +2095,8 @@ vn:
     partition: 75
     languages: vi
     names: !include country-names/vn.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Vanuatu (Vanuatu)
@@ -1697,6 +2104,7 @@ vu:
     partition: 116
     languages: bi, en, fr
     names: !include country-names/vu.yaml
+    postcode: no
 
 
 # Wallis and Futuna Islands (Wallis-et-Futuna)
@@ -1718,6 +2126,8 @@ xk:
     partition: 59
     languages: sq, sr
     names: !include country-names/xk.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Yemen (اليمن)
@@ -1725,6 +2135,7 @@ ye:
     partition: 55
     languages: ar
     names: !include country-names/ye.yaml
+    postcode: no
 
 
 # Mayotte (Mayotte)
@@ -1739,6 +2150,8 @@ za:
     partition: 76
     languages: en, af, st, tn, xh, zu
     names: !include country-names/za.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Zambia (Zambia)
@@ -1746,6 +2159,8 @@ zm:
     partition: 222
     languages: en
     names: !include country-names/zm.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Zimbabwe (Zimbabwe)
@@ -1753,4 +2168,4 @@ zw:
     partition: 223
     languages: en, sn, nd
     names: !include country-names/zw.yaml
-
+    postcode: no
index cd9c0d6dd56974888c9e12fdf834b51a5b55b22e..212fdcb9e2f7d29cac379c0a58e9041e2819912d 100644 (file)
@@ -32,6 +32,9 @@ sanitizers:
         - streetnumber
       convert-to-name:
         - (\A|.*,)[^\d,]{3,}(,.*|\Z)
+    - step: clean-postcodes
+      convert-to-address: yes
+      default-pattern: "[A-Z0-9- ]{3,12}"
     - step: split-name-list
     - step: strip-brace-terms
     - step: tag-analyzer-by-language
@@ -43,6 +46,8 @@ token-analysis:
     - analyzer: generic
     - id: "@housenumber"
       analyzer: housenumbers
+    - id: "@postcode"
+      analyzer: postcodes
     - id: bg
       analyzer: generic
       mode: variant-only
index 15beab57827e31b4f411ed05dee028636626385f..4d146d18c191e95a4cad025bf5c823818072b7f6 100644 (file)
@@ -163,25 +163,8 @@ Feature: Import of postcodes
            | de      | 01982    | country:de |
         And there are word tokens for postcodes 01982
 
-    Scenario: Different postcodes with the same normalization can both be found
-        Given the places
-           | osm | class | type  | addr+postcode | addr+housenumber | geometry |
-           | N34 | place | house | EH4 7EA       | 111              | country:gb |
-           | N35 | place | house | E4 7EA        | 111              | country:gb |
-        When importing
-        Then location_postcode contains exactly
-           | country | postcode | geometry |
-           | gb      | EH4 7EA  | country:gb |
-           | gb      | E4 7EA   | country:gb |
-        When sending search query "EH4 7EA"
-        Then results contain
-           | type     | display_name |
-           | postcode | EH4 7EA      |
-        When sending search query "E4 7EA"
-        Then results contain
-           | type     | display_name |
-           | postcode | E4 7EA       |
 
+    @Fail
     Scenario: search and address ranks for GB post codes correctly assigned
         Given the places
          | osm  | class | type     | postcode | geometry |
@@ -195,55 +178,19 @@ Feature: Import of postcodes
          | E45 2    | gb      | 23          | 5 |
          | Y45      | gb      | 21          | 5 |
 
-    Scenario: wrongly formatted GB postcodes are down-ranked
+    @fail-legacy
+    Scenario: Postcodes outside all countries are not added to the postcode and word table
         Given the places
-         | osm  | class | type     | postcode | geometry |
-         | N1   | place | postcode | EA452CD  | country:gb |
-         | N2   | place | postcode | E45 23   | country:gb |
-        When importing
-        Then location_postcode contains exactly
-         | postcode | country | rank_search | rank_address |
-         | EA452CD  | gb      | 30          | 30 |
-         | E45 23   | gb      | 30          | 30 |
-
-    Scenario: search and address rank for DE postcodes correctly assigned
-        Given the places
-         | osm | class | type     | postcode | geometry |
-         | N1  | place | postcode | 56427    | country:de |
-         | N2  | place | postcode | 5642     | country:de |
-         | N3  | place | postcode | 5642A    | country:de |
-         | N4  | place | postcode | 564276   | country:de |
-        When importing
-        Then location_postcode contains exactly
-         | postcode | country | rank_search | rank_address |
-         | 56427    | de      | 21          | 11 |
-         | 5642     | de      | 30          | 30 |
-         | 5642A    | de      | 30          | 30 |
-         | 564276   | de      | 30          | 30 |
-
-    Scenario: search and address rank for other postcodes are correctly assigned
-        Given the places
-         | osm | class | type     | postcode | geometry |
-         | N1  | place | postcode | 1        | country:ca |
-         | N2  | place | postcode | X3       | country:ca |
-         | N3  | place | postcode | 543      | country:ca |
-         | N4  | place | postcode | 54dc     | country:ca |
-         | N5  | place | postcode | 12345    | country:ca |
-         | N6  | place | postcode | 55TT667  | country:ca |
-         | N7  | place | postcode | 123-65   | country:ca |
-         | N8  | place | postcode | 12 445 4 | country:ca |
-         | N9  | place | postcode | A1:bc10  | country:ca |
+            | osm | class | type  | addr+postcode | addr+housenumber | addr+place  | geometry  |
+            | N34 | place | house | 01982         | 111              | Null Island | 0 0.00001 |
+        And the places
+            | osm | class | type   | name        | geometry |
+            | N1  | place | hamlet | Null Island | 0 0      |
         When importing
         Then location_postcode contains exactly
-         | postcode | country | rank_search | rank_address |
-         | 1        | ca      | 21          | 11 |
-         | X3       | ca      | 21          | 11 |
-         | 543      | ca      | 21          | 11 |
-         | 54DC     | ca      | 21          | 11 |
-         | 12345    | ca      | 21          | 11 |
-         | 55TT667  | ca      | 21          | 11 |
-         | 123-65   | ca      | 25          | 11 |
-         | 12 445 4 | ca      | 25          | 11 |
-         | A1:BC10  | ca      | 25          | 11 |
-
-
+            | country | postcode | geometry |
+        And there are no word tokens for postcodes 01982
+        When sending search query "111, 01982 Null Island"
+        Then results contain
+            | osm | display_name |
+            | N34 | 111, Null Island, 01982 |
index f91c005043df5189b47db57cc9e9163a845cfd40..e5a7a5922b7f18db1abc4fb7a497b01348747938 100644 (file)
@@ -168,14 +168,6 @@ Feature: Import and search of names
          | ID | osm |
          | 0  | R1 |
 
-    Scenario: Unprintable characters in postcodes are ignored
-        Given the named places
-            | osm  | class   | type   | address                    | geometry   |
-            | N234 | amenity | prison | 'postcode' : u'1234\u200e' | country:de |
-        When importing
-        And sending search query "1234"
-        Then result 0 has not attributes osm_type
-
     Scenario Outline: Housenumbers with special characters are found
         Given the grid
             | 1 |  |   |  | 2 |
diff --git a/test/bdd/db/query/postcodes.feature b/test/bdd/db/query/postcodes.feature
new file mode 100644 (file)
index 0000000..a3ca703
--- /dev/null
@@ -0,0 +1,97 @@
+@DB
+Feature: Querying for postcode variants
+
+    Scenario: Postcodes in Singapore (6-digit postcode)
+        Given the grid with origin SG
+            | 10 |   |   |   | 11 |
+        And the places
+            | osm | class   | type | name   | addr+postcode | geometry |
+            | W1  | highway | path | Lorang | 399174        | 10,11    |
+        When importing
+        When sending search query "399174"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | 399174       |
+
+
+    @fail-legacy
+    Scenario Outline: Postcodes in the Netherlands (mixed postcode with spaces)
+        Given the grid with origin NL
+            | 10 |   |   |   | 11 |
+        And the places
+            | osm | class   | type | name     | addr+postcode | geometry |
+            | W1  | highway | path | De Weide | <postcode>    | 10,11    |
+        When importing
+        When sending search query "3993 DX"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | 3993 DX      |
+        When sending search query "3993dx"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | 3993 DX      |
+
+        Examples:
+            | postcode |
+            | 3993 DX  |
+            | 3993DX   |
+            | 3993 dx  |
+
+
+    @fail-legacy
+    Scenario: Postcodes in Singapore (6-digit postcode)
+        Given the grid with origin SG
+            | 10 |   |   |   | 11 |
+        And the places
+            | osm | class   | type | name   | addr+postcode | geometry |
+            | W1  | highway | path | Lorang | 399174        | 10,11    |
+        When importing
+        When sending search query "399174"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | 399174       |
+
+
+    @fail-legacy
+    Scenario Outline: Postcodes in Andorra (with country code)
+        Given the grid with origin AD
+            | 10 |   |   |   | 11 |
+        And the places
+            | osm | class   | type | name   | addr+postcode | geometry |
+            | W1  | highway | path | Lorang | <postcode>    | 10,11    |
+        When importing
+        When sending search query "675"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | AD675        |
+        When sending search query "AD675"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | AD675        |
+
+        Examples:
+            | postcode |
+            | 675      |
+            | AD 675   |
+            | AD675    |
+
+
+    Scenario: Different postcodes with the same normalization can both be found
+        Given the places
+           | osm | class | type  | addr+postcode | addr+housenumber | geometry |
+           | N34 | place | house | EH4 7EA       | 111              | country:gb |
+           | N35 | place | house | E4 7EA        | 111              | country:gb |
+        When importing
+        Then location_postcode contains exactly
+           | country | postcode | geometry |
+           | gb      | EH4 7EA  | country:gb |
+           | gb      | E4 7EA   | country:gb |
+        When sending search query "EH4 7EA"
+        Then results contain
+           | type     | display_name |
+           | postcode | EH4 7EA      |
+        When sending search query "E4 7EA"
+        Then results contain
+           | type     | display_name |
+           | postcode | E4 7EA       |
+
index 44c82b017c04e687432a933234a5e852a0b96bec..8fd918f88fe7a3f70083be63650726738390aa9f 100644 (file)
@@ -18,13 +18,19 @@ from nominatim.tokenizer import factory as tokenizer_factory
 def check_database_integrity(context):
     """ Check some generic constraints on the tables.
     """
-    # place_addressline should not have duplicate (place_id, address_place_id)
-    cur = context.db.cursor()
-    cur.execute("""SELECT count(*) FROM
-                    (SELECT place_id, address_place_id, count(*) as c
-                     FROM place_addressline GROUP BY place_id, address_place_id) x
-                   WHERE c > 1""")
-    assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline"
+    with context.db.cursor() as cur:
+        # place_addressline should not have duplicate (place_id, address_place_id)
+        cur.execute("""SELECT count(*) FROM
+                        (SELECT place_id, address_place_id, count(*) as c
+                         FROM place_addressline GROUP BY place_id, address_place_id) x
+                       WHERE c > 1""")
+        assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline"
+
+        # word table must not have empty word_tokens
+        if context.nominatim.tokenizer != 'legacy':
+            cur.execute("SELECT count(*) FROM word WHERE word_token = ''")
+            assert cur.fetchone()[0] == 0, "Empty word tokens found in word table"
+
 
 
 ################################ GIVEN ##################################
diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
new file mode 100644 (file)
index 0000000..4437619
--- /dev/null
@@ -0,0 +1,102 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for the sanitizer that normalizes postcodes.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+from nominatim.tools import country_info
+
+@pytest.fixture
+def sanitize(def_config, request):
+    country_info.setup_country_config(def_config)
+    sanitizer_args = {'step': 'clean-postcodes'}
+    for mark in request.node.iter_markers(name="sanitizer_params"):
+        sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()})
+
+    def _run(country=None, **kwargs):
+        pi = {'address': kwargs}
+        if country is not None:
+            pi['country_code'] = country
+
+        _, address = PlaceSanitizer([sanitizer_args]).process_names(PlaceInfo(pi))
+
+        return sorted([(p.kind, p.name) for p in address])
+
+    return _run
+
+
+@pytest.mark.parametrize("country", (None, 'ae'))
+def test_postcode_no_country(sanitize, country):
+    assert sanitize(country=country, postcode='23231') == [('unofficial_postcode', '23231')]
+
+
+@pytest.mark.parametrize("country", (None, 'ae'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_no_country_drop(sanitize, country):
+    assert sanitize(country=country, postcode='23231') == []
+
+
+@pytest.mark.parametrize("postcode", ('12345', '  12345  ', 'de 12345',
+                                      'DE12345', 'DE 12345', 'DE-12345'))
+def test_postcode_pass_good_format(sanitize, postcode):
+    assert sanitize(country='de', postcode=postcode) == [('postcode', '12345')]
+
+
+@pytest.mark.parametrize("postcode", ('123456', '', '   ', '.....',
+                                      'DE  12345', 'DEF12345', 'CH 12345'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_drop_bad_format(sanitize, postcode):
+    assert sanitize(country='de', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('1234', '9435', '99000'))
+def test_postcode_cyprus_pass(sanitize, postcode):
+    assert sanitize(country='cy', postcode=postcode) == [('postcode', postcode)]
+
+
+@pytest.mark.parametrize("postcode", ('91234', '99a45', '567'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_cyprus_fail(sanitize, postcode):
+    assert sanitize(country='cy', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('123456', 'A33F2G7'))
+def test_postcode_kazakhstan_pass(sanitize, postcode):
+    assert sanitize(country='kz', postcode=postcode) == [('postcode', postcode)]
+
+
+@pytest.mark.parametrize("postcode", ('V34T6Y923456', '99345'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_kazakhstan_fail(sanitize, postcode):
+    assert sanitize(country='kz', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('675 34', '67534', 'SE-675 34', 'SE67534'))
+def test_postcode_sweden_pass(sanitize, postcode):
+    assert sanitize(country='se', postcode=postcode) == [('postcode', '675 34')]
+
+
+@pytest.mark.parametrize("postcode", ('67 345', '671123'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_sweden_fail(sanitize, postcode):
+    assert sanitize(country='se', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('AB1', '123-456-7890', '1 as 44'))
+@pytest.mark.sanitizer_params(default_pattern='[A-Z0-9- ]{3,12}')
+def test_postcode_default_pattern_pass(sanitize, postcode):
+    assert sanitize(country='an', postcode=postcode) == [('postcode', postcode.upper())]
+
+
+@pytest.mark.parametrize("postcode", ('C', '12', 'ABC123DEF 456', '1234,5678', '11223;11224'))
+@pytest.mark.sanitizer_params(convert_to_address=False, default_pattern='[A-Z0-9- ]{3,12}')
+def test_postcode_default_pattern_fail(sanitize, postcode):
+    assert sanitize(country='an', postcode=postcode) == []
+
index d85a5b65e565d83187b2688839afe72c1f175fcb..b9de97bcc2b11f7ff94ef4d6dd00ce6b6c54b7a7 100644 (file)
@@ -72,7 +72,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
 
     def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
                      variants=('~gasse -> gasse', 'street => st', ),
-                     sanitizers=[], with_housenumber=False):
+                     sanitizers=[], with_housenumber=False,
+                     with_postcode=False):
         cfgstr = {'normalization': list(norm),
                   'sanitizers': sanitizers,
                   'transliteration': list(trans),
@@ -81,6 +82,9 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
         if with_housenumber:
             cfgstr['token-analysis'].append({'id': '@housenumber',
                                              'analyzer': 'housenumbers'})
+        if with_postcode:
+            cfgstr['token-analysis'].append({'id': '@postcode',
+                                             'analyzer': 'postcodes'})
         (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
         tok.loader = nominatim.tokenizer.icu_rule_loader.ICURuleLoader(test_config)
 
@@ -246,28 +250,69 @@ def test_normalize_postcode(analyzer):
         anl.normalize_postcode('38 Б') == '38 Б'
 
 
-def test_update_postcodes_from_db_empty(analyzer, table_factory, word_table):
-    table_factory('location_postcode', 'postcode TEXT',
-                  content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
+class TestPostcodes:
 
-    with analyzer() as anl:
-        anl.update_postcodes_from_db()
+    @pytest.fixture(autouse=True)
+    def setup(self, analyzer, sql_functions):
+        sanitizers = [{'step': 'clean-postcodes'}]
+        with analyzer(sanitizers=sanitizers, with_postcode=True) as anl:
+            self.analyzer = anl
+            yield anl
 
-    assert word_table.count() == 3
-    assert word_table.get_postcodes() == {'1234', '12 34', 'AB23'}
 
+    def process_postcode(self, cc, postcode):
+        return self.analyzer.process_place(PlaceInfo({'country_code': cc,
+                                                      'address': {'postcode': postcode}}))
 
-def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_table):
-    table_factory('location_postcode', 'postcode TEXT',
-                  content=(('1234',), ('45BC', ), ('XX45', )))
-    word_table.add_postcode(' 1234', '1234')
-    word_table.add_postcode(' 5678', '5678')
 
-    with analyzer() as anl:
-        anl.update_postcodes_from_db()
+    def test_update_postcodes_from_db_empty(self, table_factory, word_table):
+        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
+                      content=(('de', '12345'), ('se', '132 34'),
+                               ('bm', 'AB23'), ('fr', '12345')))
+
+        self.analyzer.update_postcodes_from_db()
+
+        assert word_table.count() == 5
+        assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'}
+
+
+    def test_update_postcodes_from_db_ambigious(self, table_factory, word_table):
+        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
+                      content=(('in', '123456'), ('sg', '123456')))
+
+        self.analyzer.update_postcodes_from_db()
+
+        assert word_table.count() == 3
+        assert word_table.get_postcodes() == {'123456', '123456@123 456'}
+
+
+    def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table):
+        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
+                      content=(('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45')))
+        word_table.add_postcode(' 1234', '1234')
+        word_table.add_postcode(' 5678', '5678')
+
+        self.analyzer.update_postcodes_from_db()
+
+        assert word_table.count() == 5
+        assert word_table.get_postcodes() == {'1234', 'BC 45@BC 45', 'XX 45@XX 45'}
+
+
+    def test_process_place_postcode_simple(self, word_table):
+        info = self.process_postcode('de', '12345')
+
+        assert info['postcode'] == '12345'
+
+        assert word_table.get_postcodes() == {'12345', }
+
+
+    def test_process_place_postcode_with_space(self, word_table):
+        info = self.process_postcode('in', '123 567')
+
+        assert info['postcode'] == '123567'
+
+        assert word_table.get_postcodes() == {'123567@123 567', }
 
-    assert word_table.count() == 3
-    assert word_table.get_postcodes() == {'1234', '45BC', 'XX45'}
 
 
 def test_update_special_phrase_empty_table(analyzer, word_table):
@@ -437,13 +482,6 @@ class TestPlaceAddress:
         assert word_table.get_postcodes() == {pcode, }
 
 
-    @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
-    def test_process_place_bad_postcode(self, word_table, pcode):
-        self.process_address(postcode=pcode)
-
-        assert not word_table.get_postcodes()
-
-
     @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
     def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
         info = self.process_address(housenumber=hnr)
diff --git a/test/python/tokenizer/token_analysis/test_analysis_postcodes.py b/test/python/tokenizer/token_analysis/test_analysis_postcodes.py
new file mode 100644 (file)
index 0000000..623bed5
--- /dev/null
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for special postcode analysis and variant generation.
+"""
+import pytest
+
+from icu import Transliterator
+
+import nominatim.tokenizer.token_analysis.postcodes as module
+from nominatim.errors import UsageError
+
+DEFAULT_NORMALIZATION = """ :: NFD ();
+                            '🜳' > ' ';
+                            [[:Nonspacing Mark:] [:Cf:]] >;
+                            :: lower ();
+                            [[:Punctuation:][:Space:]]+ > ' ';
+                            :: NFC ();
+                        """
+
+DEFAULT_TRANSLITERATION = """ ::  Latin ();
+                              '🜵' > ' ';
+                          """
+
+@pytest.fixture
+def analyser():
+    rules = { 'analyzer': 'postcodes'}
+    config = module.configure(rules, DEFAULT_NORMALIZATION)
+
+    trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+
+    return module.create(norm, trans, config)
+
+
+def get_normalized_variants(proc, name):
+    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+    return proc.get_variants_ascii(norm.transliterate(name).strip())
+
+
+@pytest.mark.parametrize('name,norm', [('12', '12'),
+                                       ('A 34 ', 'A 34'),
+                                       ('34-av', '34-AV')])
+def test_normalize(analyser, name, norm):
+    assert analyser.normalize(name) == norm
+
+
+@pytest.mark.parametrize('postcode,variants', [('12345', {'12345'}),
+                                               ('AB-998', {'ab 998', 'ab998'}),
+                                               ('23 FGH D3', {'23 fgh d3', '23fgh d3',
+                                                              '23 fghd3', '23fghd3'})])
+def test_get_variants_ascii(analyser, postcode, variants):
+    out = analyser.get_variants_ascii(postcode)
+
+    assert len(out) == len(set(out))
+    assert set(out) == variants
index bdfe309471f0995188c9fdd32cc13815e9cac9ee..0c4b93fcac2e8e85cc7065d07505cf87f5616402 100644 (file)
@@ -11,7 +11,7 @@ import subprocess
 
 import pytest
 
-from nominatim.tools import postcodes
+from nominatim.tools import postcodes, country_info
 import dummy_tokenizer
 
 class MockPostcodeTable:
@@ -64,11 +64,26 @@ class MockPostcodeTable:
 def tokenizer():
     return dummy_tokenizer.DummyTokenizer(None, None)
 
+
 @pytest.fixture
-def postcode_table(temp_db_conn, placex_table):
+def postcode_table(def_config, temp_db_conn, placex_table):
+    country_info.setup_country_config(def_config)
     return MockPostcodeTable(temp_db_conn)
 
 
+@pytest.fixture
+def insert_implicit_postcode(placex_table, place_row):
+    """
+        Inserts data into the placex and place table
+        which can then be used to compute one postcode.
+    """
+    def _insert_implicit_postcode(osm_id, country, geometry, address):
+        placex_table.add(osm_id=osm_id, country=country, geom=geometry)
+        place_row(osm_id=osm_id, geom='SRID=4326;'+geometry, address=address)
+
+    return _insert_implicit_postcode
+
+
 def test_postcodes_empty(dsn, postcode_table, place_table,
                          tmp_path, tokenizer):
     postcodes.update_postcodes(dsn, tmp_path, tokenizer)
@@ -193,27 +208,30 @@ def test_can_compute(dsn, table_factory):
     table_factory('place')
     assert postcodes.can_compute(dsn)
 
+
 def test_no_placex_entry(dsn, tmp_path, temp_db_cursor, place_row, postcode_table, tokenizer):
     #Rewrite the get_country_code function to verify its execution.
     temp_db_cursor.execute("""
         CREATE OR REPLACE FUNCTION get_country_code(place geometry)
         RETURNS TEXT AS $$ BEGIN 
-        RETURN 'fr';
+        RETURN 'yy';
         END; $$ LANGUAGE plpgsql;
     """)
     place_row(geom='SRID=4326;POINT(10 12)', address=dict(postcode='AB 4511'))
     postcodes.update_postcodes(dsn, tmp_path, tokenizer)
 
-    assert postcode_table.row_set == {('fr', 'AB 4511', 10, 12)}
+    assert postcode_table.row_set == {('yy', 'AB 4511', 10, 12)}
 
-@pytest.fixture
-def insert_implicit_postcode(placex_table, place_row):
-    """
-        Inserts data into the placex and place table
-        which can then be used to compute one postcode.
-    """
-    def _insert_implicit_postcode(osm_id, country, geometry, address):
-        placex_table.add(osm_id=osm_id, country=country, geom=geometry)
-        place_row(osm_id=osm_id, geom='SRID=4326;'+geometry, address=address)
 
-    return _insert_implicit_postcode
+def test_discard_badly_formatted_postcodes(dsn, tmp_path, temp_db_cursor, place_row, postcode_table, tokenizer):
+    # Make get_country_code() always return 'fr' so that the postcode below is checked against that country's format.
+    temp_db_cursor.execute("""
+        CREATE OR REPLACE FUNCTION get_country_code(place geometry)
+        RETURNS TEXT AS $$ BEGIN 
+        RETURN 'fr';
+        END; $$ LANGUAGE plpgsql;
+    """)
+    place_row(geom='SRID=4326;POINT(10 12)', address=dict(postcode='AB 4511'))
+    postcodes.update_postcodes(dsn, tmp_path, tokenizer)
+
+    assert not postcode_table.row_set
diff --git a/test/python/utils/test_centroid.py b/test/python/utils/test_centroid.py
new file mode 100644 (file)
index 0000000..63d967e
--- /dev/null
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for centroid computation.
+"""
+import pytest
+
+from nominatim.utils.centroid import PointsCentroid
+
+def test_empty_set():
+    c = PointsCentroid()
+
+    with pytest.raises(ValueError, match='No points'):
+        c.centroid()
+
+
+@pytest.mark.parametrize("centroid", [(0,0), (-1, 3), [0.0000032, 88.4938]])
+def test_one_point_centroid(centroid):
+    c = PointsCentroid()
+
+    c += centroid
+
+    assert len(c.centroid()) == 2
+    assert c.centroid() == (pytest.approx(centroid[0]), pytest.approx(centroid[1]))
+
+
+def test_multipoint_centroid():
+    c = PointsCentroid()
+
+    c += (20.0, -10.0)
+    assert c.centroid() == (pytest.approx(20.0), pytest.approx(-10.0))
+    c += (20.2, -9.0)
+    assert c.centroid() == (pytest.approx(20.1), pytest.approx(-9.5))
+    c += (20.2, -9.0)
+    assert c.centroid() == (pytest.approx(20.13333), pytest.approx(-9.333333))
+
+
+def test_manypoint_centroid():
+    c = PointsCentroid()
+
+    for _ in range(10000):
+        c += (4.564732, -0.000034)
+
+    assert c.centroid() == (pytest.approx(4.564732), pytest.approx(-0.000034))
+
+
+@pytest.mark.parametrize("param", ["aa", None, 5, [1, 2, 3], (3, None), ("a", 3.9)])
+def test_add_non_tuple(param):
+    c = PointsCentroid()
+
+    with pytest.raises(ValueError, match='2-element tuples'):
+        c += param