]> git.openstreetmap.org Git - nominatim.git/commitdiff
add framework for analysing housenumbers
authorSarah Hoffmann <lonvia@denofr.de>
Wed, 16 Feb 2022 10:15:43 +0000 (11:15 +0100)
committerSarah Hoffmann <lonvia@denofr.de>
Tue, 1 Mar 2022 08:34:32 +0000 (09:34 +0100)
This lays the groundwork for adding variants for housenumbers.
When analysis is enabled, the 'word' field in the word table
is used as usual, so that variants can be created. Only one
analyzer is allowed, and it must have the fixed name
'@housenumber'.

lib-php/tokenizer/icu_tokenizer.php
lib-sql/tokenizer/icu_tokenizer.sql
lib-sql/tokenizer/icu_tokenizer_tables.sql
nominatim/tokenizer/icu_tokenizer.py

index cbbf240a27a2c4f95fd1180bb0842dd28c613ee2..ccce99ca1330d7a42a6976d7fb7c9eaf3d8a84d7 100644 (file)
@@ -157,7 +157,8 @@ class Tokenizer
         $sSQL = 'SELECT word_id, word_token, type, word,';
         $sSQL .= "      info->>'op' as operator,";
         $sSQL .= "      info->>'class' as class, info->>'type' as ctype,";
-        $sSQL .= "      info->>'count' as count";
+        $sSQL .= "      info->>'count' as count,";
+        $sSQL .= "      info->>'lookup' as lookup";
         $sSQL .= ' FROM word WHERE word_token in (';
         $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
 
@@ -179,7 +180,8 @@ class Tokenizer
                     }
                     break;
                 case 'H':  // house number tokens
-                    $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token']));
+                    $sLookup = $aWord['lookup'] ?? $aWord['word_token'];
+                    $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $sLookup));
                     break;
                 case 'P':  // postcode tokens
                     // Postcodes are not normalized, so they may have content
index 03408b4ac1b8d8dba868bbb7cb60e3e9febe5e56..a3dac8ddcbe82eb5fd6057bd81bb9b823befa159 100644 (file)
@@ -200,3 +200,26 @@ BEGIN
 END;
 $$
 LANGUAGE plpgsql;
+
+
+-- Return the word ID for an analyzed housenumber, creating entries on demand.
+--
+-- norm_term is matched against the 'word' column of rows with type 'H'.
+-- When no such row exists, one row per element of lookup_terms is inserted,
+-- all sharing a single new word_id taken from the sequence seq_word.
+CREATE OR REPLACE FUNCTION create_analyzed_hnr_id(norm_term TEXT, lookup_terms TEXT[])
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  return_id INTEGER;
+BEGIN
+  -- Reuse the existing entry for this normalized term if there is one.
+  SELECT min(word_id) INTO return_id
+    FROM word WHERE word = norm_term and type = 'H';
+
+  IF return_id IS NULL THEN
+    return_id := nextval('seq_word');
+    -- Each lookup term becomes the word_token of its own row. The first
+    -- array element (PostgreSQL arrays are 1-based) is stored under the
+    -- 'lookup' key of info on every inserted row.
+    INSERT INTO word (word_id, word_token, type, word, info)
+      SELECT return_id, lookup_term, 'H', norm_term,
+             json_build_object('lookup', lookup_terms[1])
+        FROM unnest(lookup_terms) as lookup_term;
+  END IF;
+
+  RETURN return_id;
+END;
+$$
+LANGUAGE plpgsql;
index 58965b57fcb25478021cc8a0c27ccb8ddeaa7d85..509f6f65d9d6e5dd659c3d95ad2e3220b4209bb5 100644 (file)
@@ -28,6 +28,10 @@ CREATE INDEX idx_word_postcodes ON word
 CREATE INDEX idx_word_full_word ON word
     USING btree(word) {{db.tablespace.address_index}}
     WHERE type = 'W';
+-- Used when inserting analyzed housenumbers (exclude old-style entries).
+CREATE INDEX idx_word_housenumbers ON word
+    USING btree(word) {{db.tablespace.address_index}}
+    WHERE type = 'H' and word is not null;
 
 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
 
index 3ce4895b901f7253744e4900f3072997f202507b..7bc4720ef56ed82b4d8fe45f484ea8a386ade422 100644 (file)
@@ -485,18 +485,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         """ Normalize the housenumber and return the word token and the
             canonical form.
         """
-        norm_name = self._search_normalized(hnr.name)
-        if not norm_name:
-            return None, None
+        analyzer = self.token_analysis.analysis.get('@housenumber')
+        result = None, None
 
-        token = self._cache.housenumbers.get(norm_name)
-        if token is None:
-            with self.conn.cursor() as cur:
-                cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
-                token = cur.fetchone()[0]
-                self._cache.housenumbers[norm_name] = token
-
-        return token, norm_name
+        if analyzer is None:
+            # When no custom analyzer is set, simply normalize and transliterate
+            norm_name = self._search_normalized(hnr.name)
+            if norm_name:
+                result = self._cache.housenumbers.get(norm_name, result)
+                if result[0] is None:
+                    with self.conn.cursor() as cur:
+                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+                        result = cur.fetchone()[0], norm_name
+                        self._cache.housenumbers[norm_name] = result
+        else:
+            # Otherwise use the analyzer to determine the canonical name.
+            # Per convention we use the first variant as the 'lookup name', the
+            # name that gets saved in the housenumber field of the place.
+            norm_name = analyzer.normalize(hnr.name)
+            if norm_name:
+                result = self._cache.housenumbers.get(norm_name, result)
+                if result[0] is None:
+                    variants = analyzer.get_variants_ascii(norm_name)
+                    if variants:
+                        with self.conn.cursor() as cur:
+                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
+                                        (norm_name, list(variants)))
+                            result = cur.fetchone()[0], variants[0]
+                            self._cache.housenumbers[norm_name] = result
+
+        return result
 
 
     def _compute_partial_tokens(self, name):