add framework for analysing housenumbers

author Sarah Hoffmann <lonvia@denofr.de>

Wed, 16 Feb 2022 10:15:43 +0000 (11:15 +0100)

committer Sarah Hoffmann <lonvia@denofr.de>

Tue, 1 Mar 2022 08:34:32 +0000 (09:34 +0100)
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 16 Feb 2022 10:15:43 +0000 (11:15 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Tue, 1 Mar 2022 08:34:32 +0000 (09:34 +0100)
diff --git a/lib-php/tokenizer/icu_tokenizer.php b/lib-php/tokenizer/icu_tokenizer.php

index cbbf240a27a2c4f95fd1180bb0842dd28c613ee2..ccce99ca1330d7a42a6976d7fb7c9eaf3d8a84d7 100644 (file)
--- a/lib-php/tokenizer/icu_tokenizer.php
+++ b/lib-php/tokenizer/icu_tokenizer.php
@@ -157,7 +157,8 @@ class Tokenizer
          $sSQL = 'SELECT word_id, word_token, type, word,';
          $sSQL .= "      info->>'op' as operator,";
          $sSQL .= "      info->>'class' as class, info->>'type' as ctype,";
-        $sSQL .= "      info->>'count' as count";
+        $sSQL .= "      info->>'count' as count,";
+        $sSQL .= "      info->>'lookup' as lookup";
          $sSQL .= ' FROM word WHERE word_token in (';
          $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
  
@@ -179,7 +180,8 @@ class Tokenizer
                      }
                      break;
                  case 'H':  // house number tokens
-                    $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token']));
+                    $sLookup = $aWord['lookup'] ?? $aWord['word_token'];
+                    $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $sLookup));
                      break;
                  case 'P':  // postcode tokens
                      // Postcodes are not normalized, so they may have content
diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql

index 03408b4ac1b8d8dba868bbb7cb60e3e9febe5e56..a3dac8ddcbe82eb5fd6057bd81bb9b823befa159 100644 (file)
--- a/lib-sql/tokenizer/icu_tokenizer.sql
+++ b/lib-sql/tokenizer/icu_tokenizer.sql
@@ -200,3 +200,26 @@ BEGIN
  END;
  $$
  LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION create_analyzed_hnr_id(norm_term TEXT, lookup_terms TEXT[])
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  return_id INTEGER;
+BEGIN
+  SELECT min(word_id) INTO return_id
+    FROM word WHERE word = norm_term and type = 'H';
+
+  IF return_id IS NULL THEN
+    return_id := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, type, word, info)
+      SELECT return_id, lookup_term, 'H', norm_term,
+             json_build_object('lookup', lookup_terms[1])
+        FROM unnest(lookup_terms) as lookup_term;
+  END IF;
+
+  RETURN return_id;
+END;
+$$
+LANGUAGE plpgsql;
diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql

index 58965b57fcb25478021cc8a0c27ccb8ddeaa7d85..509f6f65d9d6e5dd659c3d95ad2e3220b4209bb5 100644 (file)
--- a/lib-sql/tokenizer/icu_tokenizer_tables.sql
+++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql
@@ -28,6 +28,10 @@ CREATE INDEX idx_word_postcodes ON word
  CREATE INDEX idx_word_full_word ON word
      USING btree(word) {{db.tablespace.address_index}}
      WHERE type = 'W';
+-- Used when inserting analyzed housenumbers (exclude old-style entries).
+CREATE INDEX idx_word_housenumbers ON word
+    USING btree(word) {{db.tablespace.address_index}}
+    WHERE type = 'H' and word is not null;
  
  GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
  
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index 3ce4895b901f7253744e4900f3072997f202507b..7bc4720ef56ed82b4d8fe45f484ea8a386ade422 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -485,18 +485,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          """ Normalize the housenumber and return the word token and the
              canonical form.
          """
-        norm_name = self._search_normalized(hnr.name)
-        if not norm_name:
-            return None, None
+        analyzer = self.token_analysis.analysis.get('@housenumber')
+        result = None, None
  
-        token = self._cache.housenumbers.get(norm_name)
-        if token is None:
-            with self.conn.cursor() as cur:
-                cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
-                token = cur.fetchone()[0]
-                self._cache.housenumbers[norm_name] = token
-
-        return token, norm_name
+        if analyzer is None:
+            # When no custom analyzer is set, simply normalize and transliterate
+            norm_name = self._search_normalized(hnr.name)
+            if norm_name:
+                result = self._cache.housenumbers.get(norm_name, result)
+                if result[0] is None:
+                    with self.conn.cursor() as cur:
+                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+                        result = cur.fetchone()[0], norm_name
+                        self._cache.housenumbers[norm_name] = result
+        else:
+            # Otherwise use the analyzer to determine the canonical name.
+            # Per convention we use the first variant as the 'lookup name', the
+            # name that gets saved in the housenumber field of the place.
+            norm_name = analyzer.normalize(hnr.name)
+            if norm_name:
+                result = self._cache.housenumbers.get(norm_name, result)
+                if result[0] is None:
+                    variants = analyzer.get_variants_ascii(norm_name)
+                    if variants:
+                        with self.conn.cursor() as cur:
+                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
+                                        (norm_name, list(variants)))
+                            result = cur.fetchone()[0], variants[0]
+                            self._cache.housenumbers[norm_name] = result
+
+        return result
  
  
      def _compute_partial_tokens(self, name):
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 16 Feb 2022 10:15:43 +0000 (11:15 +0100)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Tue, 1 Mar 2022 08:34:32 +0000 (09:34 +0100)
lib-php/tokenizer/icu_tokenizer.php		patch \| blob \| history
lib-sql/tokenizer/icu_tokenizer.sql		patch \| blob \| history
lib-sql/tokenizer/icu_tokenizer_tables.sql		patch \| blob \| history
nominatim/tokenizer/icu_tokenizer.py		patch \| blob \| history