git.openstreetmap.org Git - nominatim.git/commitdiff
improve penalty for token-split words
author Sarah Hoffmann <lonvia@denofr.de>
Sat, 12 Aug 2023 09:26:02 +0000 (11:26 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Sat, 12 Aug 2023 09:26:02 +0000 (11:26 +0200)
The rematch penalty for partial words created by the transliteration
needs to take into account that they are rematched against the full word.
That means that a missing beginning or end should not get a significant
penalty.

nominatim/api/search/icu_tokenizer.py

index f259995db112bbbe537aaa3855f2d4d78e36f5e2..7bf516e3aa25c12775d0c3ffae7271076f6d0032 100644 (file)
@@ -83,7 +83,7 @@ class ICUToken(qmod.Token):
         seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
         distance = 0
         for tag, afrom, ato, bfrom, bto in seq.get_opcodes():
-            if tag == 'delete' and (afrom == 0 or ato == len(self.lookup_word)):
+            if tag in ('delete', 'insert') and (afrom == 0 or ato == len(self.lookup_word)):
                 distance += 1
             elif tag == 'replace':
                 distance += max((ato-afrom), (bto-bfrom))