From b9fbfeff67b420905a4176f4f5e9312746d0c42e Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Sat, 26 Jun 2021 11:57:09 +0200
Subject: [PATCH] only consider partials in multi-words for initial count

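Name variants that consist of a single word are no longer counted
when computing the initial partial-word frequencies; only the parts
of multi-word variants are. This makes it less likely that meaningful
words like 'hauptstrasse' are excluded just because they are frequent.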
---
 nominatim/tokenizer/legacy_icu_tokenizer.py | 3 ++-
 test/python/test_tokenizer_legacy_icu.py    | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 5f83b73d..6bf409cc 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -168,7 +168,8 @@ class LegacyICUTokenizer:
                 for name, cnt in cur:
                     terms = set()
                     for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                        terms.update(word.split())
+                        if ' ' in word:
+                            terms.update(word.split())
                     for term in terms:
                         words[term] += cnt
 
diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_legacy_icu.py
index 56c08e5a..39fc9fb4 100644
--- a/test/python/test_tokenizer_legacy_icu.py
+++ b/test/python/test_tokenizer_legacy_icu.py
@@ -150,9 +150,8 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
 
-    assert word_table.get_partial_words() == {('test', 1), ('52', 1),
+    assert word_table.get_partial_words() == {('test', 1),
                                               ('no', 1), ('area', 2),
-                                              ('holzstrasse', 1), ('holzstr', 1),
                                               ('holz', 1), ('strasse', 1),
                                               ('str', 1)}
 
-- 
2.39.5