git.openstreetmap.org Git - nominatim.git/commitdiff
introduce and use analyzer for postcodes
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 24 May 2022 19:45:06 +0000 (21:45 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
lib-sql/tokenizer/icu_tokenizer.sql
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/sanitizers/clean_postcodes.py
nominatim/tokenizer/token_analysis/postcodes.py [new file with mode: 0644]
settings/icu_tokenizer.yaml

diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql
index a3dac8ddcbe82eb5fd6057bd81bb9b823befa159..f323334b88c5e4c65c38c998d1bbaac839bcec61 100644 (file)
--- a/lib-sql/tokenizer/icu_tokenizer.sql
+++ b/lib-sql/tokenizer/icu_tokenizer.sql
@@ -223,3 +223,26 @@ BEGIN
 END;
 $$
 LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION create_postcode_word(postcode TEXT, lookup_terms TEXT[])
+  RETURNS BOOLEAN
+  AS $$
+DECLARE
+  existing INTEGER;
+BEGIN
+  SELECT count(*) INTO existing
+    FROM word WHERE word = postcode and type = 'P';
+
+  IF existing > 0 THEN
+    RETURN TRUE;
+  END IF;
+
+  -- postcodes don't need word ids
+  INSERT INTO word (word_token, type, word)
+    SELECT lookup_term, 'P', postcode FROM unnest(lookup_terms) as lookup_term;
+
+  RETURN FALSE;
+END;
+$$
+LANGUAGE plpgsql;
+
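
For orientation, the function above is meant to be called once per postcode cache miss from the Python analyzer further down, with all transliterated lookup terms passed as one array. A minimal sketch of such a call, assuming a psycopg2 connection to the Nominatim database (connection parameters are placeholders; psycopg2 adapts the Python list to the TEXT[] argument):

    import psycopg2

    conn = psycopg2.connect(dbname='nominatim')  # placeholder DSN

    postcode = 'EC1R 3HF'
    lookup_terms = ['EC1R 3HF', 'EC1R3HF']  # transliterated spelling variants

    with conn.cursor() as cur:
        # Returns TRUE if the postcode was already in the word table,
        # FALSE if the lookup terms were inserted just now.
        cur.execute('SELECT create_postcode_word(%s, %s)',
                    (postcode, lookup_terms))
        already_known = cur.fetchone()[0]
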
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 4678af66eb08d019b30e38bb8280da108083bd13..e9812ba0430338e6647d459a0c162eadad0d467c 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -11,7 +11,6 @@ libICU instead of the PostgreSQL module.
 import itertools
 import json
 import logging
-import re
 from textwrap import dedent
 
 from nominatim.db.connection import connect
@@ -473,7 +472,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _process_place_address(self, token_info, address):
         for item in address:
             if item.kind == 'postcode':
-                self._add_postcode(item.name)
+                token_info.set_postcode(self._add_postcode(item))
             elif item.kind == 'housenumber':
                 token_info.add_housenumber(*self._compute_housenumber_token(item))
             elif item.kind == 'street':
@@ -605,26 +604,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return full_tokens, partial_tokens
 
 
-    def _add_postcode(self, postcode):
+    def _add_postcode(self, item):
         """ Make sure the normalized postcode is present in the word table.
         """
-        if re.search(r'[:,;]', postcode) is None:
-            postcode = self.normalize_postcode(postcode)
+        analyzer = self.token_analysis.get_analyzer('@postcode')
 
-            if postcode not in self._cache.postcodes:
-                term = self._search_normalized(postcode)
-                if not term:
-                    return
+        if analyzer is None:
+            postcode_name = item.name.strip().upper()
+            variant_base = None
+        else:
+            postcode_name = analyzer.normalize(item.name)
+            variant_base = item.get_attr("variant")
 
-                with self.conn.cursor() as cur:
-                    # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word_token, type, word)
-                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
-                                    WHERE NOT EXISTS
-                                     (SELECT * FROM word
-                                      WHERE type = 'P' and word = pc))
-                                """, (term, postcode))
-                self._cache.postcodes.add(postcode)
+        if variant_base is not None:
+            postcode = f'{postcode_name}@{variant_base}'
+        else:
+            postcode = postcode_name
+
+        if postcode not in self._cache.postcodes:
+            term = self._search_normalized(postcode_name)
+            if not term:
+                return
+
+            variants = {term}
+            if analyzer is not None and variant_base is not None:
+                variants.update(analyzer.get_variants_ascii(variant_base))
+
+            with self.conn.cursor() as cur:
+                cur.execute("SELECT create_postcode_word(%s, %s)",
+                            (postcode, list(variants)))
+            self._cache.postcodes.add(postcode)
 
 
 class _TokenInfo:
@@ -637,6 +646,7 @@ class _TokenInfo:
         self.street_tokens = set()
         self.place_tokens = set()
         self.address_tokens = {}
+        self.postcode = None
 
 
     @staticmethod
@@ -701,6 +711,11 @@ class _TokenInfo:
         if partials:
             self.address_tokens[key] = self._mk_array(partials)
 
+    def set_postcode(self, postcode):
+        """ Set the postcode to the given one.
+        """
+        self.postcode = postcode
+
 
 class _TokenCache:
     """ Cache for token information to avoid repeated database queries.
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index c6292a2942b217ae866c8b43ded81a5aa75ff089..d1edc60d1e5e8c109a446323a7832f3970e3c3a9 100644 (file)
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -98,7 +98,7 @@ class _PostcodeSanitizer:
                     obj.address.pop(pos)
             else:
                 postcode.name = formatted[0]
-                postcode.set_attr('lookup', formatted[1])
+                postcode.set_attr('variant', formatted[1])
 
 
     def scan(self, postcode, country):
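
The renamed attribute is the hand-over point between sanitizer and analyzer: clean-postcodes stores the secondary lookup form under 'variant', and _add_postcode above reads it back with item.get_attr('variant'). A schematic of that handshake, using a stripped-down stand-in for the sanitizer's PlaceName object (values are illustrative):

    class PlaceName:
        # Minimal stand-in for the sanitizer's real PlaceName class.
        def __init__(self, name):
            self.name = name
            self._attr = {}

        def set_attr(self, key, value):
            self._attr[key] = value

        def get_attr(self, key):
            return self._attr.get(key)

    item = PlaceName('EC1R 3HF')
    item.set_attr('variant', 'ec1r 3hf')       # sanitizer side
    variant_base = item.get_attr('variant')    # analyzer side
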
diff --git a/nominatim/tokenizer/token_analysis/postcodes.py b/nominatim/tokenizer/token_analysis/postcodes.py
new file mode 100644 (file)
index 0000000..e105b13
--- /dev/null
+++ b/nominatim/tokenizer/token_analysis/postcodes.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Specialized processor for postcodes. Supports a 'lookup' variant of the
+token, which produces variants with optional spaces.
+"""
+
+from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
+
+### Configuration section
+
+def configure(rules, normalization_rules): # pylint: disable=W0613
+    """ All behaviour is currently hard-coded.
+    """
+    return None
+
+### Analysis section
+
+def create(normalizer, transliterator, config): # pylint: disable=W0613
+    """ Create a new token analysis instance for this module.
+    """
+    return PostcodeTokenAnalysis(normalizer, transliterator)
+
+class PostcodeTokenAnalysis:
+    """ Detects common housenumber patterns and normalizes them.
+    """
+    def __init__(self, norm, trans):
+        self.norm = norm
+        self.trans = trans
+
+        self.mutator = MutationVariantGenerator(' ', (' ', ''))
+
+
+    def normalize(self, name):
+        """ Return the standard form of the postcode.
+        """
+        return name.strip().upper()
+
+
+    def get_variants_ascii(self, norm_name):
+        """ Compute the spelling variants for the given normalized postcode.
+
+            The official form creates one variant. If a 'lookup version' is
+            given, then it will create variants with optional spaces.
+        """
+        # Postcodes follow their own transliteration rules.
+        # Make sure at this point, that the terms are normalized in a way
+        # that they are searchable with the standard transliteration rules.
+        return [self.trans.transliterate(term) for term in
+                self.mutator.generate([self.norm.transliterate(norm_name)])]
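
The mutator configured in the constructor replaces every space with either a space or nothing, so a postcode containing n spaces expands to 2^n spelling variants. A rough usage sketch, with identity stand-ins for the ICU normalizer and transliterator that the tokenizer configuration would normally supply:

    from nominatim.tokenizer.token_analysis.postcodes import PostcodeTokenAnalysis

    class _Identity:
        # Stand-in for the ICU transliteration objects.
        def transliterate(self, text):
            return text

    analysis = PostcodeTokenAnalysis(_Identity(), _Identity())

    print(analysis.normalize(' ec1r 3hf '))         # 'EC1R 3HF'
    print(analysis.get_variants_ascii('EC1R 3HF'))  # ['EC1R 3HF', 'EC1R3HF']
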
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index f682bbcdf8ad4c38a4e891e55f5360e70a7a28d0..212fdcb9e2f7d29cac379c0a58e9041e2819912d 100644 (file)
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -34,7 +34,7 @@ sanitizers:
         - (\A|.*,)[^\d,]{3,}(,.*|\Z)
     - step: clean-postcodes
       convert-to-address: yes
-      default-pattern: [A-Z0-9- ]{3,12}
+      default-pattern: "[A-Z0-9- ]{3,12}"
     - step: split-name-list
     - step: strip-brace-terms
     - step: tag-analyzer-by-language
@@ -46,6 +46,8 @@ token-analysis:
     - analyzer: generic
     - id: "@housenumber"
       analyzer: housenumbers
+    - id: "@postcode"
+      analyzer: postcodes
     - id: bg
       analyzer: generic
       mode: variant-only
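
A side fix rides along in the settings file: unquoted, the default-pattern value starts with '[', which YAML takes as the opening of a flow sequence rather than a regex string. A quick way to see the difference, assuming PyYAML (which Nominatim uses to read these settings):

    import yaml

    # Quoted, the value survives as the intended regex string.
    print(yaml.safe_load('default-pattern: "[A-Z0-9- ]{3,12}"'))
    # {'default-pattern': '[A-Z0-9- ]{3,12}'}

    try:
        # Unquoted, the leading '[' opens a flow sequence and the
        # trailing '{3,12}' no longer parses as part of the value.
        yaml.safe_load('default-pattern: [A-Z0-9- ]{3,12}')
    except yaml.YAMLError as exc:
        print('unquoted form rejected:', type(exc).__name__)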