From f70930b1a04629a33241047586bda54bc0176dc7 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Fri, 11 Jun 2021 10:03:31 +0200
Subject: [PATCH] make compound decomposition a pure import feature

Compound decomposition now creates a full name variant on import,
just like abbreviations. This simplifies query-time normalization
and opens a path for changing the abbreviation and compound
decomposition lists for an existing database.
---
 nominatim/tokenizer/icu_rule_loader.py        | 38 +++++++++----------
 test/bdd/db/query/normalization.feature       |  2 +-
 .../test_tokenizer_icu_name_processor.py      | 13 ++++---
 test/python/test_tokenizer_icu_rule_loader.py | 17 ++++-----
 test/python/test_tokenizer_legacy_icu.py      |  3 +-
 5 files changed, 35 insertions(+), 38 deletions(-)

diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
index 269faed9..6bf23201 100644
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -31,23 +31,13 @@ class ICURuleLoader:
     def get_search_rules(self):
         """ Return the ICU rules to be used during search.
 
-            The rules combine normalization, compound decomposition (including
-            abbreviated compounds) and transliteration.
+            The rules combine normalization and transliteration.
         """
         # First apply the normalization rules.
         rules = io.StringIO()
         rules.write(self.normalization_rules)
 
-        # For all compound suffixes: add them in their full and any abbreviated form.
-        suffixes = set()
-        for suffix in self.compound_suffixes:
-            suffixes.add(suffix)
-            suffixes.update(self.abbreviations.get(suffix, []))
-
-        for suffix in sorted(suffixes, key=len, reverse=True):
-            rules.write("'{0} ' > ' {0} ';".format(suffix))
-
-        # Finally add transliteration.
+        # Then add transliteration.
         rules.write(self.transliteration_rules)
         return rules.getvalue()
 
@@ -69,6 +59,12 @@ class ICURuleLoader:
         """
         synonyms = defaultdict(set)
 
+        # First add entries for compound decomposition.
+        for suffix in self.compound_suffixes:
+            variants = (suffix + ' ', ' ' + suffix + ' ')
+            for key in variants:
+                synonyms[key].update(variants)
+
         for full, abbr in self.abbreviations.items():
             key = ' ' + full + ' '
             # Entries in the abbreviation list always apply to full words:
@@ -76,15 +72,15 @@
             # Replacements are optional, so add a noop
             synonyms[key].add(key)
 
-        # Entries in the compound list expand to themselves and to
-        # abbreviations.
-        for suffix in self.compound_suffixes:
-            keyset = synonyms[suffix + ' ']
-            keyset.add(' ' + suffix + ' ')
-            keyset.update((' ' + a + ' ' for a in self.abbreviations.get(suffix, [])))
-            # The terms the entries are shortended to, need to be decompunded as well.
-            for abbr in self.abbreviations.get(suffix, []):
-                synonyms[abbr + ' '].add(' ' + abbr + ' ')
+            if full in self.compound_suffixes:
+                # Full word abbreviating to compounded version.
+                synonyms[key].update((a + ' ' for a in abbr))
+
+                key = full + ' '
+                # Uncompounded suffix abbreviating to decompounded version.
+                synonyms[key].update((' ' + a + ' ' for a in abbr))
+                # Uncompounded suffix abbreviating to compounded version.
+                synonyms[key].update((a + ' ' for a in abbr))
 
         # sort the resulting list by descending length (longer matches are prefered).
         sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
diff --git a/test/bdd/db/query/normalization.feature b/test/bdd/db/query/normalization.feature
index 35045589..b8a760f9 100644
--- a/test/bdd/db/query/normalization.feature
+++ b/test/bdd/db/query/normalization.feature
@@ -53,7 +53,7 @@ Feature: Import and search of names
     Scenario: Special characters in name
         Given the places
          | osm | class | type     | name             |
-         | N1  | place | locality | Jim-Knopf-Str    |
+         | N1  | place | locality | Jim-Knopf-Straße |
          | N2  | place | locality | Smith/Weston     |
          | N3  | place | locality | space mountain   |
          | N4  | place | locality | space            |
diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/test_tokenizer_icu_name_processor.py
index 73636f93..48a2fbd9 100644
--- a/test/python/test_tokenizer_icu_name_processor.py
+++ b/test/python/test_tokenizer_icu_name_processor.py
@@ -48,9 +48,10 @@ def test_simple_variants(cfgfile):
     proc = ICUNameProcessor(rules)
 
     assert set(get_normalized_variants(proc, "Bauwegstraße")) \
-           == {'bauweg straße', 'bauweg str'}
-    assert get_normalized_variants(proc, "Bauwegstr") == ['bauweg str']
-    assert get_normalized_variants(proc, "holzweg") == ['holz weg']
+           == {'bauweg straße', 'bauweg str', 'bauwegstraße', 'bauwegstr'}
+    assert get_normalized_variants(proc, "Bauwegstr") == ['bauwegstr']
+    assert set(get_normalized_variants(proc, "holzweg")) \
+           == {'holz weg', 'holzweg'}
     assert get_normalized_variants(proc, "hallo") == ['hallo']
 
 
@@ -82,6 +83,6 @@
     rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
     proc = ICUNameProcessor(rules)
 
-    assert proc.get_search_normalized('Master Street') == 'master street'
-    assert proc.get_search_normalized('Earnes St') == 'earne s st'
-    assert proc.get_search_normalized('Nostreet') == 'no street'
+    assert proc.get_search_normalized('Master Street') == 'master street'
+    assert proc.get_search_normalized('Earnes St') == 'earnes st'
+    assert proc.get_search_normalized('Nostreet') == 'nostreet'
diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py
index 51927eaa..53c0b0d0 100644
--- a/test/python/test_tokenizer_icu_rule_loader.py
+++ b/test/python/test_tokenizer_icu_rule_loader.py
@@ -91,10 +91,10 @@
     trans = Transliterator.createFromRules("test", rules)
 
     assert trans.transliterate(" Baum straße ") == " baum straße "
-    assert trans.transliterate(" Baumstraße ") == " baum straße "
-    assert trans.transliterate(" Baumstrasse ") == " baum strasse "
-    assert trans.transliterate(" Baumstr ") == " baum str "
-    assert trans.transliterate(" Baumwegstr ") == " baumweg str "
+    assert trans.transliterate(" Baumstraße ") == " baumstraße "
+    assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
+    assert trans.transliterate(" Baumstr ") == " baumstr "
+    assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
     assert trans.transliterate(" Αθήνα ") == " athēna "
     assert trans.transliterate(" проспект ") == " prospekt "
 
@@ -128,11 +128,10 @@
     repl = ICURuleLoader(fpath).get_replacement_pairs()
 
     assert [(a, sorted(b)) for a, b in repl] == \
-           [(' strasse ', [' st ', ' str ', ' strasse ']),
-            ('strasse ', [' st ', ' str ', ' strasse ']),
-            ('pfad ', [' pfad ']),
-            ('str ' , [' str ']),
-            ('st ' , [' st '])]
+           [(' strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']),
+            ('strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']),
+            (' pfad ', [' pfad ', 'pfad ']),
+            ('pfad ', [' pfad ', 'pfad '])]
 
 
 def test_get_replacement_pairs_multi_from(cfgfile):
diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_legacy_icu.py
index b687d0e4..b86925ee 100644
--- a/test/python/test_tokenizer_legacy_icu.py
+++ b/test/python/test_tokenizer_legacy_icu.py
@@ -151,8 +151,9 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
 
-    assert word_table.get_partial_words() == {('te', 1), ('st', 1), ('52', 1),
+    assert word_table.get_partial_words() == {('test', 1), ('52', 1),
                                               ('no', 1), ('area', 2),
+                                              ('holzstrasse', 1), ('holzstr', 1),
                                               ('holz', 1), ('strasse', 1), ('str', 1)}
 
 
-- 
2.43.2
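
A note for reviewers (not part of the patch): the snippet below is a minimal,
self-contained sketch of the variant expansion that the patched
get_replacement_pairs() performs at import time. The sample suffix and
abbreviation data, and the exact handling of plain (non-compound)
abbreviations, are assumptions for illustration only.

    from collections import defaultdict

    # Illustrative configuration; real values come from the ICU rule files.
    compound_suffixes = {'strasse'}
    abbreviations = {'strasse': ['str', 'st'], 'north': ['n']}

    synonyms = defaultdict(set)

    # Compound decomposition: every suffix maps to both its attached
    # ('strasse ') and detached (' strasse ') spelling, so the full name
    # variant is already created on import.
    for suffix in compound_suffixes:
        variants = (suffix + ' ', ' ' + suffix + ' ')
        for key in variants:
            synonyms[key].update(variants)

    for full, abbr in abbreviations.items():
        key = ' ' + full + ' '
        # Full words expand to their abbreviations; keep the original as a noop.
        synonyms[key].update(' ' + a + ' ' for a in abbr)
        synonyms[key].add(key)

        if full in compound_suffixes:
            # The full word also abbreviates to the attached forms.
            synonyms[key].update(a + ' ' for a in abbr)

            key = full + ' '
            # The attached suffix abbreviates to detached and attached forms.
            synonyms[key].update(' ' + a + ' ' for a in abbr)
            synonyms[key].update(a + ' ' for a in abbr)

    # Longer keys take precedence, as in the loader.
    for key in sorted(synonyms, key=len, reverse=True):
        print(repr(key), sorted(synonyms[key]))

Running this prints, among others, ' strasse ' mapping to
[' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse '], which is exactly
the pair that the updated test_get_replacement_pairs_multi_to expects.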