From c32551b4e0978d2bd26b3fe6997a722562b3565b Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Fri, 2 Jul 2021 15:05:17 +0200
Subject: [PATCH] restrict partial word counting to names of reasoanble length

The partial word count does not split names to save a bit of time.
The result is that it might enounter unreasonably long names
which in truth consist of multiple words. No accurate statistics
are needed so simply restrict the count to words shorter than
75 characters.
---
 nominatim/tokenizer/legacy_icu_tokenizer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 6bf409cc..c585c5af 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -163,7 +163,9 @@ class LegacyICUTokenizer:
             words = Counter()
             name_proc = ICUNameProcessor(self.naming_rules)
             with conn.cursor(name="words") as cur:
-                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+                cur.execute(""" SELECT v, count(*) FROM
+                                  (SELECT svals(name) as v FROM place)x
+                                WHERE length(v) < 75 GROUP BY v""")
 
                 for name, cnt in cur:
                     terms = set()
-- 
2.39.5