git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/icu_rule_loader.py
penalize name token splitting when phrases are used
[nominatim.git] / nominatim / tokenizer / icu_rule_loader.py
index f461a1f11d8eafc24f56d058dc2430bdf554270a..4c36282ca54bfbd3526d24ead471a3e9fe9dbc33 100644 (file)
@@ -12,13 +12,15 @@ import io
 import json
 import logging
 
+from icu import Transliterator
+
 from nominatim.config import flatten_config_list, Configuration
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.connection import Connection
 from nominatim.errors import UsageError
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
-from nominatim.tokenizer.token_analysis.base import AnalysisModule, Analyser
+from nominatim.tokenizer.token_analysis.base import AnalysisModule, Analyzer
 import nominatim.data.country_info
 
 LOG = logging.getLogger()
@@ -135,6 +137,11 @@ class ICURuleLoader:
         if not isinstance(self.analysis_rules, list):
             raise UsageError("Configuration section 'token-analysis' must be a list.")
 
+        norm = Transliterator.createFromRules("rule_loader_normalization",
+                                              self.normalization_rules)
+        trans = Transliterator.createFromRules("rule_loader_transliteration",
+                                              self.transliteration_rules)
+
         for section in self.analysis_rules:
             name = section.get('id', None)
             if name in self.analysis:
@@ -144,8 +151,7 @@ class ICURuleLoader:
                     LOG.fatal("ICU tokenizer configuration has two token "
                               "analyzers with id '%s'.", name)
                 raise UsageError("Syntax error in ICU tokenizer config.")
-            self.analysis[name] = TokenAnalyzerRule(section,
-                                                    self.normalization_rules,
+            self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
                                                     self.config)
 
 
@@ -170,7 +176,8 @@ class TokenAnalyzerRule:
         and creates a new token analyzer on request.
     """
 
-    def __init__(self, rules: Mapping[str, Any], normalization_rules: str,
+    def __init__(self, rules: Mapping[str, Any],
+                 normalizer: Any, transliterator: Any,
                  config: Configuration) -> None:
         analyzer_name = _get_section(rules, 'analyzer')
         if not analyzer_name or not isinstance(analyzer_name, str):
@@ -179,10 +186,11 @@ class TokenAnalyzerRule:
         self._analysis_mod: AnalysisModule = \
             config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis')
 
-        self.config = self._analysis_mod.configure(rules, normalization_rules)
+        self.config = self._analysis_mod.configure(rules, normalizer,
+                                                   transliterator)
 
 
-    def create(self, normalizer: Any, transliterator: Any) -> Analyser:
+    def create(self, normalizer: Any, transliterator: Any) -> Analyzer:
         """ Create a new analyser instance for the given rule.
         """
         return self._analysis_mod.create(normalizer, transliterator, self.config)