make compound decomposition pure import feature

author Sarah Hoffmann <lonvia@denofr.de>

Fri, 11 Jun 2021 08:03:31 +0000 (10:03 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Sun, 4 Jul 2021 08:28:20 +0000 (10:28 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Fri, 11 Jun 2021 08:03:31 +0000 (10:03 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Sun, 4 Jul 2021 08:28:20 +0000 (10:28 +0200)
diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py

index 269faed981abbbb9ffc530bd32d6b38ae0c30df4..6bf23201cf953545bc182cf0c3d12b9333827e93 100644 (file)
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -31,23 +31,13 @@ class ICURuleLoader:
  
      def get_search_rules(self):
          """ Return the ICU rules to be used during search.
-            The rules combine normalization, compound decomposition (including
-            abbreviated compounds) and transliteration.
+            The rules combine normalization and transliteration.
          """
          # First apply the normalization rules.
          rules = io.StringIO()
          rules.write(self.normalization_rules)
  
-        # For all compound suffixes: add them in their full and any abbreviated form.
-        suffixes = set()
-        for suffix in self.compound_suffixes:
-            suffixes.add(suffix)
-            suffixes.update(self.abbreviations.get(suffix, []))
-
-        for suffix in sorted(suffixes, key=len, reverse=True):
-            rules.write("'{0} ' > ' {0} ';".format(suffix))
-
-        # Finally add transliteration.
+        # Then add transliteration.
          rules.write(self.transliteration_rules)
          return rules.getvalue()
  
@@ -69,6 +59,12 @@ class ICURuleLoader:
          """
          synonyms = defaultdict(set)
  
+        # First add entries for compound decomposition.
+        for suffix in self.compound_suffixes:
+            variants = (suffix + ' ', ' ' + suffix + ' ')
+            for key in variants:
+                synonyms[key].update(variants)
+
          for full, abbr in self.abbreviations.items():
              key = ' ' + full + ' '
              # Entries in the abbreviation list always apply to full words:
@@ -76,15 +72,15 @@ class ICURuleLoader:
              # Replacements are optional, so add a noop
              synonyms[key].add(key)
  
-        # Entries in the compound list expand to themselves and to
-        # abbreviations.
-        for suffix in self.compound_suffixes:
-            keyset = synonyms[suffix + ' ']
-            keyset.add(' ' + suffix + ' ')
-            keyset.update((' ' + a + ' ' for a in self.abbreviations.get(suffix, [])))
-            # The terms the entries are shortended to, need to be decompunded as well.
-            for abbr in self.abbreviations.get(suffix, []):
-                synonyms[abbr + ' '].add(' ' + abbr + ' ')
+            if full in self.compound_suffixes:
+                # Full word abbreviating to compounded version.
+                synonyms[key].update((a + ' ' for a in abbr))
+
+                key = full + ' '
+                # Uncompounded suffix abbreviating to decompounded version.
+                synonyms[key].update((' ' + a + ' ' for a in abbr))
+                # Uncompounded suffix abbreviating to compounded version.
+                synonyms[key].update((a + ' ' for a in abbr))
  
          # sort the resulting list by descending length (longer matches are prefered).
          sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
diff --git a/test/bdd/db/query/normalization.feature b/test/bdd/db/query/normalization.feature

index 350455891495f470a92954b0545a260c8c7531dd..b8a760f99bd0bc03e127c14ae60de5f93fdf0290 100644 (file)
--- a/test/bdd/db/query/normalization.feature
+++ b/test/bdd/db/query/normalization.feature
@@ -53,7 +53,7 @@ Feature: Import and search of names
      Scenario: Special characters in name
          Given the places
            | osm | class | type      | name |
-          | N1  | place | locality  | Jim-Knopf-Str |
+          | N1  | place | locality  | Jim-Knopf-Straße |
            | N2  | place | locality  | Smith/Weston |
            | N3  | place | locality  | space mountain |
            | N4  | place | locality  | space |
diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/test_tokenizer_icu_name_processor.py

index 73636f933e83e3c85ce80e59906e85b6b36713c1..48a2fbd91847bafdff5a7e246dd899776ddad7b1 100644 (file)
--- a/test/python/test_tokenizer_icu_name_processor.py
+++ b/test/python/test_tokenizer_icu_name_processor.py
@@ -48,9 +48,10 @@ def test_simple_variants(cfgfile):
      proc = ICUNameProcessor(rules)
  
      assert set(get_normalized_variants(proc, "Bauwegstraße")) \
-            == {'bauweg straße', 'bauweg str'}
-    assert get_normalized_variants(proc, "Bauwegstr") == ['bauweg str']
-    assert get_normalized_variants(proc, "holzweg") == ['holz weg']
+            == {'bauweg straße', 'bauweg str', 'bauwegstraße', 'bauwegstr'}
+    assert get_normalized_variants(proc, "Bauwegstr") == ['bauwegstr']
+    assert set(get_normalized_variants(proc, "holzweg")) \
+            == {'holz weg', 'holzweg'}
      assert get_normalized_variants(proc, "hallo") == ['hallo']
  
  
@@ -82,6 +83,6 @@ def test_search_normalized(cfgfile):
      rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
      proc = ICUNameProcessor(rules)
  
-    assert proc.get_search_normalized('Master Street') == 'master  street'
-    assert proc.get_search_normalized('Earnes St') == 'earne s  st'
-    assert proc.get_search_normalized('Nostreet') == 'no street'
+    assert proc.get_search_normalized('Master Street') == 'master street'
+    assert proc.get_search_normalized('Earnes St') == 'earnes st'
+    assert proc.get_search_normalized('Nostreet') == 'nostreet'
diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py

index 51927eaacf420e0745f21ff2f76b82c29ed7f3dc..53c0b0d059737b9d433edddf0788543738636808 100644 (file)
--- a/test/python/test_tokenizer_icu_rule_loader.py
+++ b/test/python/test_tokenizer_icu_rule_loader.py
@@ -91,10 +91,10 @@ def test_get_search_rules(cfgfile):
      trans = Transliterator.createFromRules("test", rules)
  
      assert trans.transliterate(" Baum straße ") == " baum straße "
-    assert trans.transliterate(" Baumstraße ") == " baum straße "
-    assert trans.transliterate(" Baumstrasse ") == " baum strasse "
-    assert trans.transliterate(" Baumstr ") == " baum str "
-    assert trans.transliterate(" Baumwegstr ") == " baumweg str "
+    assert trans.transliterate(" Baumstraße ") == " baumstraße "
+    assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
+    assert trans.transliterate(" Baumstr ") == " baumstr "
+    assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
      assert trans.transliterate(" Αθήνα ") == " athēna "
      assert trans.transliterate(" проспект ") == " prospekt "
  
@@ -128,11 +128,10 @@ def test_get_replacement_pairs_multi_to(cfgfile):
      repl = ICURuleLoader(fpath).get_replacement_pairs()
  
      assert [(a, sorted(b)) for a, b in repl] == \
-             [(' strasse ', [' st ', ' str ', ' strasse ']),
-              ('strasse ', [' st ', ' str ', ' strasse ']),
-              ('pfad ', [' pfad ']),
-              ('str ' , [' str ']),
-              ('st ' , [' st '])]
+             [(' strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']),
+              ('strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']),
+              (' pfad ', [' pfad ', 'pfad ']),
+              ('pfad ', [' pfad ', 'pfad '])]
  
  
  def test_get_replacement_pairs_multi_from(cfgfile):
diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_legacy_icu.py

index b687d0e4203f182263070c380170f03354ec8d41..b86925ee5dd22b134bd66b7419db8b7c7b45f9d1 100644 (file)
--- a/test/python/test_tokenizer_legacy_icu.py
+++ b/test/python/test_tokenizer_legacy_icu.py
@@ -151,8 +151,9 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
      tok = tokenizer_factory()
      tok.init_new_db(test_config)
  
-    assert word_table.get_partial_words() == {('te', 1), ('st', 1), ('52', 1),
+    assert word_table.get_partial_words() == {('test', 1), ('52', 1),
                                                ('no', 1), ('area', 2),
+                                              ('holzstrasse', 1), ('holzstr', 1),
                                                ('holz', 1), ('strasse', 1),
                                                ('str', 1)}
author	Sarah Hoffmann <lonvia@denofr.de>
	Fri, 11 Jun 2021 08:03:31 +0000 (10:03 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Sun, 4 Jul 2021 08:28:20 +0000 (10:28 +0200)
nominatim/tokenizer/icu_rule_loader.py		patch \| blob \| history
test/bdd/db/query/normalization.feature		patch \| blob \| history
test/python/test_tokenizer_icu_name_processor.py		patch \| blob \| history
test/python/test_tokenizer_icu_rule_loader.py		patch \| blob \| history
test/python/test_tokenizer_legacy_icu.py		patch \| blob \| history