From: Sarah Hoffmann Date: Fri, 2 Jul 2021 13:05:17 +0000 (+0200) Subject: restrict partial word counting to names of reasonable length X-Git-Tag: v4.0.0~58^2~2 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/c32551b4e0978d2bd26b3fe6997a722562b3565b?ds=sidebyside restrict partial word counting to names of reasonable length The partial word count does not split names to save a bit of time. The result is that it might encounter unreasonably long names which in truth consist of multiple words. No accurate statistics are needed, so simply restrict the count to words shorter than 75 characters. --- diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 6bf409cc..c585c5af 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -163,7 +163,9 @@ class LegacyICUTokenizer: words = Counter() name_proc = ICUNameProcessor(self.naming_rules) with conn.cursor(name="words") as cur: - cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v") + cur.execute(""" SELECT v, count(*) FROM + (SELECT svals(name) as v FROM place)x + WHERE length(v) < 75 GROUP BY v""") for name, cnt in cur: terms = set()