From 62d5984b1bf4f121159ae49c88ac6d3d9a2ea619 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Fri, 2 Jul 2021 16:42:13 +0200
Subject: [PATCH] limit the number of variants that can be produced

---
 nominatim/tokenizer/icu_name_processor.py        | 5 +++++
 test/python/test_tokenizer_icu_name_processor.py | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py
index 6ead712e..28719df1 100644
--- a/nominatim/tokenizer/icu_name_processor.py
+++ b/nominatim/tokenizer/icu_name_processor.py
@@ -105,6 +105,11 @@ class ICUNameProcessor:
                 partials = [v + done + r
                             for v, r in itertools.product(partials, repl)
                             if not force_space or r.startswith(' ')]
+                if len(partials) > 128:
+                    # If too many variants are produced, they are unlikely
+                    # to be helpful. Only use the original term.
+                    startpos = 0
+                    break
                 startpos = pos + len(full)
                 if full[-1] == ' ':
                     startpos -= 1
diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/test_tokenizer_icu_name_processor.py
index 553d25c5..cc103116 100644
--- a/test/python/test_tokenizer_icu_name_processor.py
+++ b/test/python/test_tokenizer_icu_name_processor.py
@@ -78,6 +78,8 @@ VARIANT_TESTS = [
     (('river$ -> r',), "Bent River", {'bent river', 'bent r'}),
     (('^north => n',), "North 2nd Street", {'n 2nd street'}),
     (('^north => n',), "Airport North", {'airport north'}),
+    (('am -> a',), "am am am am am am am am", {'am am am am am am am am'}),
+    (('am => a',), "am am am am am am am am", {'a a a a a a a a'})
 ]
 
 @pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS)
-- 
2.43.2