From 5ab0a63fd6881f1b7273363e7f20562bc5e8dc39 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Tue, 20 Jul 2021 11:36:20 +0200
Subject: [PATCH] switch housenumber tokens to new word table layout

---
 lib-php/tokenizer/legacy_icu_tokenizer.php  |  9 ++++++---
 lib-sql/tokenizer/legacy_icu_tokenizer.sql  | 10 ++++------
 nominatim/tokenizer/legacy_icu_tokenizer.py |  3 ++-
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php
index ea445f23..b2fc27c7 100644
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -156,6 +156,8 @@ class Tokenizer
         $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');
 
         foreach ($aDBWords as $aWord) {
+            $iId = (int) $aWord['word_id'];
+
             switch ($aWord['type']) {
                 'C':  // country name tokens
                     if ($aWord['country'] === null
@@ -166,12 +168,13 @@ class Tokenizer
                     }
                     $oToken = new Token\Country($iId, $aWord['country'])
                     break;
+                'H':  // house number tokens
+                    $oToken = new Token\HouseNumber($iId, $aWord['word_token']);
+                    break;
                 default:
                     continue;
             }
-/*            $iId = (int) $aWord['word_id'];
-
-            if ($aWord['class']) {
+/*          if ($aWord['class']) {
                 // Special terms need to appear in their normalized form.
                 // (postcodes are not normalized in the word table)
                 $sNormWord = $this->normalizeString($aWord['word']);
diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
index 686137de..e9dcf4bc 100644
--- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
@@ -140,15 +140,13 @@ CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
 DECLARE
   return_id INTEGER;
 BEGIN
-  SELECT min(word_id) INTO return_id
-    FROM word
-    WHERE word_token = '  '  || lookup_term
-          and class = 'place' and type = 'house';
+  SELECT min(word_id) INTO return_id FROM word
+    WHERE word_token = lookup_term and type = 'H';
 
   IF return_id IS NULL THEN
     return_id := nextval('seq_word');
-    INSERT INTO word (word_id, word_token, class, type, search_name_count)
-      VALUES (return_id, ' ' || lookup_term, 'place', 'house', 0);
+    INSERT INTO word (word_id, word_token, type)
+      VALUES (return_id, lookup_term, 'H');
   END IF;
 
   RETURN return_id;
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 32dd6535..9fbb9bb0 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -601,7 +601,8 @@ class _TokenCache:
 
     def get_hnr_tokens(self, conn, terms):
         """ Get token ids for a list of housenumbers, looking them up in the
-            database if necessary.
+            database if necessary. `terms` is an iterable of normalized
+            housenumbers.
         """
         tokens = []
         askdb = []
-- 
2.39.5