git.openstreetmap.org Git - nominatim.git/commitdiff
restrict partial word counting to names of reasonable length
author Sarah Hoffmann <lonvia@denofr.de>
Fri, 2 Jul 2021 13:05:17 +0000 (15:05 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Sun, 4 Jul 2021 08:28:28 +0000 (10:28 +0200)
The partial word count does not split names, to save a bit of time.
As a result, it might encounter unreasonably long names
which in truth consist of multiple words. No accurate statistics
are needed, so simply restrict the count to words shorter than
75 characters.

nominatim/tokenizer/legacy_icu_tokenizer.py

index 6bf409cca3ab3674b41605b06e8dfe49eda40e41..c585c5afe0bf28bfa24590ed05cb165f6fd2dd01 100644 (file)
@@ -163,7 +163,9 @@ class LegacyICUTokenizer:
             words = Counter()
             name_proc = ICUNameProcessor(self.naming_rules)
             with conn.cursor(name="words") as cur:
-                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+                cur.execute(""" SELECT v, count(*) FROM
+                                  (SELECT svals(name) as v FROM place)x
+                                WHERE length(v) < 75 GROUP BY v""")
 
                 for name, cnt in cur:
                     terms = set()