git.openstreetmap.org Git - nominatim.git/commitdiff
move generation of normalized token form to analyzer
author    Sarah Hoffmann <lonvia@denofr.de>
          Tue, 15 Feb 2022 11:15:18 +0000 (12:15 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
          Tue, 1 Mar 2022 08:34:32 +0000 (09:34 +0100)
This gives the analyzer more flexibility in choosing the normalized
form. In particular, an analyzer creating different variants can choose
the variant that will be used as the canonical form.
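
For illustration, a custom analysis module under the new interface might
look like the sketch below. Only the create(normalizer, transliterator,
config) signature and the normalize() method are prescribed by this
commit; the class name and the variant-picking logic are invented:

    # Hypothetical analysis module following the new interface.
    def create(normalizer, transliterator, config):
        """ Entry point; now receives the normalizer as well. """
        return ExampleAnalysis(normalizer, transliterator, config)


    class ExampleAnalysis:

        def __init__(self, norm, to_ascii, config):
            self.norm = norm
            self.to_ascii = to_ascii
            # Invented config key mapping names to a preferred variant.
            self.preferred = config.get('preferred', {})

        def normalize(self, name):
            # The analyzer may substitute its preferred variant before
            # applying the normalization rules, making that variant
            # the canonical form.
            name = self.preferred.get(name, name)
            return self.norm.transliterate(name).strip()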

nominatim/tokenizer/icu_token_analysis.py
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/token_analysis/generic.py
test/python/tokenizer/token_analysis/test_generic.py
test/python/tokenizer/token_analysis/test_generic_mutation.py

nominatim/tokenizer/icu_token_analysis.py
index 1d319b32edd7556dd80004a77f3faa058d82b727..ee3144a8eeec107e2dce41203efc36c1ac8e80dd 100644 (file)
@@ -25,5 +25,5 @@ class ICUTokenAnalysis:
         self.search = Transliterator.createFromRules("icu_search",
                                                      norm_rules + trans_rules)
 
-        self.analysis = {name: arules.create(self.to_ascii, arules.config)
+        self.analysis = {name: arules.create(self.normalizer, self.to_ascii, arules.config)
                          for name, arules in analysis_rules.items()}
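
The normalizer passed to create() is the transliterator built from the
normalization rules earlier in the same constructor. A condensed sketch
of the surrounding setup for context (the lines outside the hunk are
assumptions, not part of this commit):

    from icu import Transliterator

    class ICUTokenAnalysis:

        def __init__(self, norm_rules, trans_rules, analysis_rules):
            # Assumed: transliterators built from the rule strings.
            self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                             norm_rules)
            self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                                           trans_rules)
            self.search = Transliterator.createFromRules("icu_search",
                                                         norm_rules + trans_rules)

            # Changed here: each analysis module also gets the normalizer.
            self.analysis = {name: arules.create(self.normalizer, self.to_ascii,
                                                 arules.config)
                             for name, arules in analysis_rules.items()}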
nominatim/tokenizer/icu_tokenizer.py
index 9c25b6d7940fc145a2565a326d239463e32227cc..b89180aefe11f681033b367362cb5e7517e7e83c 100644 (file)
@@ -561,7 +561,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
         for name in names:
             analyzer_id = name.get_attr('analyzer')
-            norm_name = self._normalized(name.name)
+            analyzer = self.token_analysis.analysis[analyzer_id]
+            norm_name = analyzer.normalize(name.name)
             if analyzer_id is None:
                 token_id = norm_name
             else:
@@ -569,7 +570,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
             full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
-                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
+                variants = analyzer.get_variants_ascii(norm_name)
                 if not variants:
                     continue
 
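With both hunks applied, the analyzer is looked up once per name and then
used for both normalization and variant generation. Stripped of the
token-id handling and caching, the loop body reduces to:

    for name in names:
        analyzer_id = name.get_attr('analyzer')
        analyzer = self.token_analysis.analysis[analyzer_id]
        # Normalization is now delegated to the name's own analyzer ...
        norm_name = analyzer.normalize(name.name)
        # ... which also derives the variants from that canonical form.
        variants = analyzer.get_variants_ascii(norm_name)
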
nominatim/tokenizer/token_analysis/generic.py
index d4eae312d9dfc4f3407cb915988a41256614565b..3de915ba5254e1859976dd7e9842247df5a58b98 100644 (file)
@@ -47,10 +47,10 @@ def configure(rules, normalization_rules):
 
 ### Analysis section
 
-def create(transliterator, config):
+def create(normalizer, transliterator, config):
     """ Create a new token analysis instance for this module.
     """
-    return GenericTokenAnalysis(transliterator, config)
+    return GenericTokenAnalysis(normalizer, transliterator, config)
 
 
 class GenericTokenAnalysis:
@@ -58,7 +58,8 @@ class GenericTokenAnalysis:
         and provides the functions to apply the transformations.
     """
 
-    def __init__(self, to_ascii, config):
+    def __init__(self, norm, to_ascii, config):
+        self.norm = norm
         self.to_ascii = to_ascii
         self.variant_only = config['variant_only']
 
@@ -74,6 +75,13 @@ class GenericTokenAnalysis:
         self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
 
 
+    def normalize(self, name):
+        """ Return the normalized form of the name. This is the standard form
+            from which possible variants for the name can be derived.
+        """
+        return self.norm.transliterate(name).strip()
+
+
     def get_variants_ascii(self, norm_name):
         """ Compute the spelling variants for the given normalized name
             and transliterate the result.
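
The intended call order is to derive the canonical form first and feed it
to the variant generation. A minimal usage sketch, assuming PyICU and
placeholder rule strings (NORMALIZATION_RULES and TRANSLITERATION_RULES
stand in for the project's actual ICU rules):

    from icu import Transliterator

    norm = Transliterator.createFromRules("norm", NORMALIZATION_RULES)
    trans = Transliterator.createFromRules("trans", TRANSLITERATION_RULES)
    analysis = create(norm, trans,
                      configure({'analyzer': 'generic'}, NORMALIZATION_RULES))

    norm_name = analysis.normalize(' Grand-Rue ')      # canonical form
    variants = analysis.get_variants_ascii(norm_name)  # spelling variants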
test/python/tokenizer/token_analysis/test_generic.py
index 9b008cc5ee47042a90fe2b3ba54cb13915d4e8b6..afbd5e9bf813590ff6537f4893fd8325b48f1d09 100644 (file)
@@ -32,8 +32,9 @@ def make_analyser(*variants, variant_only=False):
         rules['mode'] = 'variant-only'
     config = module.configure(rules, DEFAULT_NORMALIZATION)
     trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
 
-    return module.create(trans, config)
+    return module.create(norm, trans, config)
 
 
 def get_normalized_variants(proc, name):
@@ -45,8 +46,9 @@ def test_no_variants():
     rules = { 'analyzer': 'generic' }
     config = module.configure(rules, DEFAULT_NORMALIZATION)
     trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
 
-    proc = module.create(trans, config)
+    proc = module.create(norm, trans, config)
 
     assert get_normalized_variants(proc, '大德!') == ['dà dé']
 
test/python/tokenizer/token_analysis/test_generic_mutation.py
index 757f03112d47c1ff8a73f31c8eed696636238e9e..abe31f6d468ac631f86dbd1a1dc8d25205bbcdcc 100644 (file)
@@ -33,8 +33,9 @@ class TestMutationNoVariants:
                 }
         config = module.configure(rules, DEFAULT_NORMALIZATION)
         trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+        norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
 
-        self.analysis = module.create(trans, config)
+        self.analysis = module.create(norm, trans, config)
 
 
     def variants(self, name):
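
Not part of this commit, but the three-line transliterator setup now
repeated in each test module could be collected into a shared helper,
e.g. (hypothetical):

    def make_default_analysis(rules):
        config = module.configure(rules, DEFAULT_NORMALIZATION)
        trans = Transliterator.createFromRules("test_trans",
                                               DEFAULT_TRANSLITERATION)
        norm = Transliterator.createFromRules("test_norm",
                                              DEFAULT_NORMALIZATION)
        return module.create(norm, trans, config)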