From 5ab0a63fd6881f1b7273363e7f20562bc5e8dc39 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 20 Jul 2021 11:36:20 +0200 Subject: [PATCH] switch housenumber tokens to new word table layout --- lib-php/tokenizer/legacy_icu_tokenizer.php | 9 ++++++--- lib-sql/tokenizer/legacy_icu_tokenizer.sql | 10 ++++------ nominatim/tokenizer/legacy_icu_tokenizer.py | 3 ++- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index ea445f23..b2fc27c7 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -156,6 +156,8 @@ class Tokenizer $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.'); foreach ($aDBWords as $aWord) { + $iId = (int) $aWord['word_id']; + switch ($aWord['type']) { 'C': // country name tokens if ($aWord['country'] === null @@ -166,12 +168,13 @@ class Tokenizer } $oToken = new Token\Country($iId, $aWord['country']) break; + 'H': // house number tokens + $oToken = new Token\HouseNumber($iId, $aWord['word_token']); + break; default: continue; } -/* $iId = (int) $aWord['word_id']; - - if ($aWord['class']) { +/* if ($aWord['class']) { // Special terms need to appear in their normalized form. // (postcodes are not normalized in the word table) $sNormWord = $this->normalizeString($aWord['word']); diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql index 686137de..e9dcf4bc 100644 --- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql @@ -140,15 +140,13 @@ CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT) DECLARE return_id INTEGER; BEGIN - SELECT min(word_id) INTO return_id - FROM word - WHERE word_token = ' ' || lookup_term - and class = 'place' and type = 'house'; + SELECT min(word_id) INTO return_id FROM word + WHERE word_token = lookup_term and type = 'H'; IF return_id IS NULL THEN return_id := nextval('seq_word'); - INSERT INTO word (word_id, word_token, class, type, search_name_count) - VALUES (return_id, ' ' || lookup_term, 'place', 'house', 0); + INSERT INTO word (word_id, word_token, type) + VALUES (return_id, lookup_term, 'H'); END IF; RETURN return_id; diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 32dd6535..9fbb9bb0 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -601,7 +601,8 @@ class _TokenCache: def get_hnr_tokens(self, conn, terms): """ Get token ids for a list of housenumbers, looking them up in the - database if necessary. + database if necessary. `terms` is an iterable of normalized + housenumbers. """ tokens = [] askdb = [] -- 2.43.2