git.openstreetmap.org Git - nominatim.git/commitdiff
harmonize interface of token analysis module
author Sarah Hoffmann <lonvia@denofr.de>
Fri, 29 Jul 2022 08:43:07 +0000 (10:43 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Fri, 29 Jul 2022 08:43:07 +0000 (10:43 +0200)
The configure() function now receives a Transliterator object instead
of the raw ICU rules. This harmonizes its parameters with those of the
create() function.
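
In practice, callers now compile the ICU rules once and hand the resulting
Transliterator objects to configure(), exactly as they already do for
create(). A minimal sketch of the new calling convention (the rule strings
are illustrative; only the function signatures come from this commit):

    from icu import Transliterator
    from nominatim.tokenizer.token_analysis import generic as module

    # Illustrative rule strings; the real ones come from the tokenizer config.
    norm = Transliterator.createFromRules("norm", ":: Lower;")
    trans = Transliterator.createFromRules("trans", ":: Any-Latin; :: Latin-ASCII;")

    # Before: module.configure(rules, normalization_rules)  # raw rule string
    # After:  both hooks receive the same compiled objects.
    config = module.configure({'analyzer': 'generic'}, norm, trans)
    analyser = module.create(norm, trans, config)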

nominatim/tokenizer/icu_rule_loader.py
nominatim/tokenizer/token_analysis/base.py
nominatim/tokenizer/token_analysis/config_variants.py
nominatim/tokenizer/token_analysis/generic.py
nominatim/tokenizer/token_analysis/housenumbers.py
nominatim/tokenizer/token_analysis/postcodes.py
test/python/tokenizer/token_analysis/test_generic.py
test/python/tokenizer/token_analysis/test_generic_mutation.py

diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
index f461a1f11d8eafc24f56d058dc2430bdf554270a..aeb8a3234788b7169bb58e134ba3d53e199d53cb 100644
@@ -12,6 +12,8 @@ import io
 import json
 import logging
 
+from icu import Transliterator
+
 from nominatim.config import flatten_config_list, Configuration
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.connection import Connection
@@ -135,6 +137,11 @@ class ICURuleLoader:
         if not isinstance(self.analysis_rules, list):
             raise UsageError("Configuration section 'token-analysis' must be a list.")
 
+        norm = Transliterator.createFromRules("rule_loader_normalization",
+                                              self.normalization_rules)
+        trans = Transliterator.createFromRules("rule_loader_transliteration",
+                                              self.transliteration_rules)
+
         for section in self.analysis_rules:
             name = section.get('id', None)
             if name in self.analysis:
@@ -144,8 +151,7 @@ class ICURuleLoader:
                     LOG.fatal("ICU tokenizer configuration has two token "
                               "analyzers with id '%s'.", name)
                 raise UsageError("Syntax error in ICU tokenizer config.")
-            self.analysis[name] = TokenAnalyzerRule(section,
-                                                    self.normalization_rules,
+            self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
                                                     self.config)
 
 
@@ -170,7 +176,8 @@ class TokenAnalyzerRule:
         and creates a new token analyzer on request.
     """
 
-    def __init__(self, rules: Mapping[str, Any], normalization_rules: str,
+    def __init__(self, rules: Mapping[str, Any],
+                 normalizer: Any, transliterator: Any,
                  config: Configuration) -> None:
         analyzer_name = _get_section(rules, 'analyzer')
         if not analyzer_name or not isinstance(analyzer_name, str):
@@ -179,7 +186,8 @@ class TokenAnalyzerRule:
         self._analysis_mod: AnalysisModule = \
             config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis')
 
-        self.config = self._analysis_mod.configure(rules, normalization_rules)
+        self.config = self._analysis_mod.configure(rules, normalizer,
+                                                   transliterator)
 
 
     def create(self, normalizer: Any, transliterator: Any) -> Analyser:
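
The loader now compiles both rule sets once and shares the resulting objects
with every TokenAnalyzerRule, instead of each analysis module compiling its
own normalizer. As a reminder of what the shared objects provide, a PyICU
Transliterator built with createFromRules() exposes transliterate() (a quick
sketch with a made-up rule string):

    from icu import Transliterator

    norm = Transliterator.createFromRules("example_norm", ":: Lower; :: NFC;")
    # Rule-based transliterators apply their rules to arbitrary input text.
    assert norm.transliterate("Straße 12") == "straße 12"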
diff --git a/nominatim/tokenizer/token_analysis/base.py b/nominatim/tokenizer/token_analysis/base.py
index 53264b949b440e89842e735e3a5808eab027e465..d17a626c316c6b5581d79396c165ff094e7ca07e 100644
@@ -30,7 +30,8 @@ class AnalysisModule(Protocol):
     """ Protocol for analysis modules.
     """
 
-    def configure(self, rules: Mapping[str, Any], normalization_rules: str) -> Any:
+    def configure(self, rules: Mapping[str, Any],
+                  normalizer: Any, transliterator: Any) -> Any:
         """ Prepare the configuration of the analysis module.
             This function should prepare all data that can be shared
             between instances of this analyser.
@@ -38,8 +39,10 @@ class AnalysisModule(Protocol):
             Arguments:
                 rules: A dictionary with the additional configuration options
                        as specified in the tokenizer configuration.
-                normalization_rules: ICU rules for normalization as a string
-                                     that can be used with createFromRules().
+                normalizer: an ICU Transliterator with the compiled normalization
+                            rules.
+                transliterator: an ICU Transliterator with the compiled
+                                transliteration rules.
 
             Returns:
                 A data object with the configuration that was set up. May be
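
A module satisfying the updated protocol takes the compiled objects in both
hooks. A minimal conforming sketch (the _Analyser class is a placeholder;
the analyser interface itself is not part of this diff):

    from typing import Any, Mapping

    class _Analyser:
        # Placeholder analyser; the real Analyser protocol is defined elsewhere.
        def __init__(self, norm: Any, trans: Any, config: Any) -> None:
            self.norm = norm
            self.trans = trans
            self.config = config

    def configure(rules: Mapping[str, Any],
                  normalizer: Any, transliterator: Any) -> Any:
        # Prepare data that is shared between all analyser instances.
        return {'variant_only': rules.get('mode', '') == 'variant-only'}

    def create(normalizer: Any, transliterator: Any, config: Any) -> Any:
        return _Analyser(normalizer, transliterator, config)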
diff --git a/nominatim/tokenizer/token_analysis/config_variants.py b/nominatim/tokenizer/token_analysis/config_variants.py
index d86d8072a2a18c95096c6fb6f9591b64dff355fe..1258373eea9230ff3552e243ae726f4c0a4b2b2b 100644
@@ -12,8 +12,6 @@ from collections import defaultdict
 import itertools
 import re
 
-from icu import Transliterator
-
 from nominatim.config import flatten_config_list
 from nominatim.errors import UsageError
 
@@ -25,7 +23,7 @@ class ICUVariant(NamedTuple):
 
 
 def get_variant_config(in_rules: Any,
-                       normalization_rules: str) -> Tuple[List[Tuple[str, List[str]]], str]:
+                       normalizer: Any) -> Tuple[List[Tuple[str, List[str]]], str]:
     """ Convert the variant definition from the configuration into
         replacement sets.
 
@@ -39,7 +37,7 @@ def get_variant_config(in_rules: Any,
         vset: Set[ICUVariant] = set()
         rules = flatten_config_list(in_rules, 'variants')
 
-        vmaker = _VariantMaker(normalization_rules)
+        vmaker = _VariantMaker(normalizer)
 
         for section in rules:
             for rule in (section.get('words') or []):
@@ -63,9 +61,8 @@ class _VariantMaker:
         All text in rules is normalized to make sure the variants match later.
     """
 
-    def __init__(self, norm_rules: Any) -> None:
-        self.norm = Transliterator.createFromRules("rule_loader_normalization",
-                                                   norm_rules)
+    def __init__(self, normalizer: Any) -> None:
+        self.norm = normalizer
 
 
     def compute(self, rule: Any) -> Iterator[ICUVariant]:
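
With the compiled normalizer passed in, _VariantMaker simply stores it, and
get_variant_config() can be driven directly by a caller or test, e.g. (a
sketch; the variant rule string is made up):

    from icu import Transliterator
    from nominatim.tokenizer.token_analysis.config_variants import get_variant_config

    norm = Transliterator.createFromRules("norm", ":: Lower;")
    # All rule text is run through the shared normalizer before being
    # compiled into replacement sets.
    replacements, chars = get_variant_config([{'words': ['Street => St']}], norm)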
diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py
index e14f844c5d3ff969502e014d41a67ef35ef0378c..28cd0d94113df536b30613e267cd144e9a69eefa 100644
@@ -18,13 +18,13 @@ from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantG
 
 ### Configuration section
 
-def configure(rules: Mapping[str, Any], normalization_rules: str) -> Dict[str, Any]:
+def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
     """ Extract and preprocess the configuration for this module.
     """
     config: Dict[str, Any] = {}
 
     config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
-                                                                 normalization_rules)
+                                                                 normalizer)
     config['variant_only'] = rules.get('mode', '') == 'variant-only'
 
     # parse mutation rules
diff --git a/nominatim/tokenizer/token_analysis/housenumbers.py b/nominatim/tokenizer/token_analysis/housenumbers.py
index a0f4214d55fee1b6862541409b7e2f6bab434b26..e3048a0912e5c20b0f71fca83472da55b4424b3f 100644
@@ -8,7 +8,7 @@
 Specialized processor for housenumbers. Analyses common housenumber patterns
 and creates variants for them.
 """
-from typing import Mapping, Any, List, cast
+from typing import Any, List, cast
 import re
 
 from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
@@ -20,7 +20,7 @@ RE_NAMED_PART = re.compile(r'[a-z]{4}')
 
 ### Configuration section
 
-def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613
+def configure(*_: Any) -> None:
     """ All behaviour is currently hard-coded.
     """
     return None
diff --git a/nominatim/tokenizer/token_analysis/postcodes.py b/nominatim/tokenizer/token_analysis/postcodes.py
index 15b20bf915b3f48ba462e55c0441acf18038ceb5..f5b5b9c41fdac0477c82906b9a86bd2f80b1b57e 100644
@@ -8,13 +8,13 @@
 Specialized processor for postcodes. Supports a 'lookup' variant of the
 token, which produces variants with optional spaces.
 """
-from typing import Mapping, Any, List
+from typing import Any, List
 
 from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
 
 ### Configuration section
 
-def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613
+def configure(*_: Any) -> None:
     """ All behaviour is currently hard-coded.
     """
     return None
diff --git a/test/python/tokenizer/token_analysis/test_generic.py b/test/python/tokenizer/token_analysis/test_generic.py
index afbd5e9bf813590ff6537f4893fd8325b48f1d09..18ed109b3d1759f3661c9f47055003b187f69f0a 100644
@@ -30,9 +30,9 @@ def make_analyser(*variants, variant_only=False):
     rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
     if variant_only:
         rules['mode'] = 'variant-only'
-    config = module.configure(rules, DEFAULT_NORMALIZATION)
     trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
     norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+    config = module.configure(rules, norm, trans)
 
     return module.create(norm, trans, config)
 
@@ -44,9 +44,9 @@ def get_normalized_variants(proc, name):
 
 def test_no_variants():
     rules = { 'analyzer': 'generic' }
-    config = module.configure(rules, DEFAULT_NORMALIZATION)
     trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
     norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+    config = module.configure(rules, norm, trans)
 
     proc = module.create(norm, trans, config)
 
@@ -123,7 +123,9 @@ class TestGetReplacements:
     @staticmethod
     def configure_rules(*variants):
         rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
-        return module.configure(rules, DEFAULT_NORMALIZATION)
+        trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+        norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+        return module.configure(rules, norm, trans)
 
 
     def get_replacements(self, *variants):
diff --git a/test/python/tokenizer/token_analysis/test_generic_mutation.py b/test/python/tokenizer/token_analysis/test_generic_mutation.py
index abe31f6d468ac631f86dbd1a1dc8d25205bbcdcc..ee842355cc95724df2da3156e7e9b54e0eee638b 100644
@@ -31,9 +31,9 @@ class TestMutationNoVariants:
                   'mutations': [ {'pattern': m[0], 'replacements': m[1]}
                                  for m in mutations]
                 }
-        config = module.configure(rules, DEFAULT_NORMALIZATION)
         trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
         norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+        config = module.configure(rules, norm, trans)
 
         self.analysis = module.create(norm, trans, config)