From b9fbfeff67b420905a4176f4f5e9312746d0c42e Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Sat, 26 Jun 2021 11:57:09 +0200
Subject: [PATCH] only consider partials in multi-words for initial count

This ensures that it is less likely that we exclude meaningful
words like 'hauptstrasse' just because they are frequent.
---
 nominatim/tokenizer/legacy_icu_tokenizer.py | 3 ++-
 test/python/test_tokenizer_legacy_icu.py    | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 5f83b73d..6bf409cc 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -168,7 +168,8 @@ class LegacyICUTokenizer:
                 for name, cnt in cur:
                     terms = set()
                     for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                        terms.update(word.split())
+                        if ' ' in word:
+                            terms.update(word.split())
                     for term in terms:
                         words[term] += cnt
 
diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_legacy_icu.py
index 56c08e5a..39fc9fb4 100644
--- a/test/python/test_tokenizer_legacy_icu.py
+++ b/test/python/test_tokenizer_legacy_icu.py
@@ -150,9 +150,8 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
 
-    assert word_table.get_partial_words() == {('test', 1), ('52', 1),
+    assert word_table.get_partial_words() == {('test', 1),
                                               ('no', 1), ('area', 2),
-                                              ('holzstrasse', 1), ('holzstr', 1),
                                               ('holz', 1), ('strasse', 1),
                                               ('str', 1)}
 
-- 
2.39.5