git.openstreetmap.org Git - nominatim.git/commitdiff
extend ICU config to accommodate multiple analysers
author    Sarah Hoffmann <lonvia@denofr.de>
          Mon, 4 Oct 2021 14:40:28 +0000 (16:40 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
          Mon, 4 Oct 2021 14:40:28 +0000 (16:40 +0200)
Adds parsing of multiple variant lists from the configuration.
Every entry except one must carry a unique 'id' parameter that
distinguishes the entries. The entry without an id is considered
the default. Currently only the default list (the one without an
id) is used for analysis.
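
For illustration, a 'token-analysis' section following this scheme might
look like the sketch below. The named 'de' entry and its rules are
hypothetical; the variant syntax matches the test cases further down:

    token-analysis:
        - variants:               # no 'id': this is the default analyser
              - words:
                    - ~gasse -> gasse
        - id: de                  # hypothetical named analyser, parsed but not yet used
          variants:
              - words:
                    - street => st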

nominatim/tokenizer/icu_rule_loader.py
settings/icu_tokenizer.yaml
test/python/test_tokenizer_icu.py
test/python/test_tokenizer_icu_name_processor.py
test/python/test_tokenizer_icu_rule_loader.py

nominatim/tokenizer/icu_rule_loader.py
index 7719f211c0dcc7bedc80dd492da92105d6b3f762..cf72520953456e9318576f51d9fc7acc280d668e 100644
@@ -43,12 +43,10 @@ class ICURuleLoader:
         rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                               config='TOKENIZER_CONFIG')
 
-        self.variants = set()
-
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self.analysis_rules = self._get_section(rules, 'variants')
-        self._parse_variant_list()
+        self.analysis_rules = self._get_section(rules, 'token-analysis')
+        self._setup_analysis()
 
         # Load optional sanitizer rule set.
         self.sanitizer_rules = rules.get('sanitizers', [])
@@ -61,7 +59,7 @@ class ICURuleLoader:
         self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
         self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
         self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
-        self._parse_variant_list()
+        self._setup_analysis()
 
 
     def save_config_to_db(self, conn):
@@ -82,9 +80,8 @@ class ICURuleLoader:
     def make_token_analysis(self):
         """ Create a token analyser from the reviouly loaded rules.
         """
-        return ICUNameProcessor(self.normalization_rules,
-                                self.transliteration_rules,
-                                self.variants)
+        return self.analysis[None].create(self.normalization_rules,
+                                          self.transliteration_rules)
 
 
     def get_search_rules(self):
@@ -99,23 +96,37 @@ class ICURuleLoader:
         rules.write(self.transliteration_rules)
         return rules.getvalue()
 
+
     def get_normalization_rules(self):
         """ Return rules for normalisation of a term.
         """
         return self.normalization_rules
 
+
     def get_transliteration_rules(self):
         """ Return the rules for converting a string into its asciii representation.
         """
         return self.transliteration_rules
 
-    def get_replacement_pairs(self):
-        """ Return the list of possible compound decompositions with
-            application of abbreviations included.
-            The result is a list of pairs: the first item is the sequence to
-            replace, the second is a list of replacements.
+
+    def _setup_analysis(self):
+        """ Process the rules used for creating the various token analyzers.
         """
-        return self.variants
+        self.analysis = {}
+
+        if not isinstance(self.analysis_rules, list):
+            raise UsageError("Configuration section 'token-analysis' must be a list.")
+
+        for section in self.analysis_rules:
+            name = section.get('id', None)
+            if name in self.analysis:
+                if name is None:
+                    LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
+                else:
+                    LOG.fatal("ICU tokenizer configuration has two token "
+                              "analyzers with id '%s'.", name)
+                raise UsageError("Syntax error in ICU tokenizer config.")
+            self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)
 
 
     @staticmethod
@@ -145,17 +156,32 @@ class ICURuleLoader:
         return ';'.join(flatten_config_list(content, section)) + ';'
 
 
-    def _parse_variant_list(self):
-        rules = self.analysis_rules
+class TokenAnalyzerRule:
+    """ Factory for a single analysis module. The class saves the configuration
+        and creates a new token analyzer on request.
+    """
+
+    def __init__(self, rules, normalization_rules):
+        self._parse_variant_list(rules.get('variants'), normalization_rules)
+
+
+    def create(self, normalization_rules, transliteration_rules):
+        """ Create an analyzer from the given rules.
+        """
+        return ICUNameProcessor(normalization_rules,
+                                transliteration_rules,
+                                self.variants)
 
-        self.variants.clear()
+
+    def _parse_variant_list(self, rules, normalization_rules):
+        self.variants = set()
 
         if not rules:
             return
 
         rules = flatten_config_list(rules, 'variants')
 
-        vmaker = _VariantMaker(self.normalization_rules)
+        vmaker = _VariantMaker(normalization_rules)
 
         properties = []
         for section in rules:
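
The effect of _setup_analysis() can be summarised in a standalone sketch
(plain Python, not Nominatim's actual classes): every 'token-analysis'
entry is registered under its optional 'id', the id-less entry becomes the
default under the key None, and duplicate ids are rejected.

    def setup_analysis(analysis_rules):
        analysis = {}
        if not isinstance(analysis_rules, list):
            raise ValueError("Configuration section 'token-analysis' must be a list.")
        for section in analysis_rules:
            name = section.get('id', None)     # entries without an id map to None
            if name in analysis:
                raise ValueError("duplicate token analyzer id: {!r}".format(name))
            analysis[name] = section           # Nominatim wraps this in TokenAnalyzerRule
        return analysis

    rules = setup_analysis([{'variants': []}, {'id': 'de', 'variants': []}])
    assert set(rules) == {None, 'de'}          # make_token_analysis() uses rules[None]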
settings/icu_tokenizer.yaml
index 08b7a7ff3570890da389447c5ae93efe64959a5d..f85c33ffc75f02d1fe6d47eb9de4985abd02a34e 100644
@@ -27,34 +27,35 @@ transliteration:
 sanitizers:
     - step: split-name-list
     - step: strip-brace-terms
-variants:
-    - !include icu-rules/variants-bg.yaml
-    - !include icu-rules/variants-ca.yaml
-    - !include icu-rules/variants-cs.yaml
-    - !include icu-rules/variants-da.yaml
-    - !include icu-rules/variants-de.yaml
-    - !include icu-rules/variants-el.yaml
-    - !include icu-rules/variants-en.yaml
-    - !include icu-rules/variants-es.yaml
-    - !include icu-rules/variants-et.yaml
-    - !include icu-rules/variants-eu.yaml
-    - !include icu-rules/variants-fi.yaml
-    - !include icu-rules/variants-fr.yaml
-    - !include icu-rules/variants-gl.yaml
-    - !include icu-rules/variants-hu.yaml
-    - !include icu-rules/variants-it.yaml
-    - !include icu-rules/variants-ja.yaml
-    - !include icu-rules/variants-mg.yaml
-    - !include icu-rules/variants-ms.yaml
-    - !include icu-rules/variants-nl.yaml
-    - !include icu-rules/variants-no.yaml
-    - !include icu-rules/variants-pl.yaml
-    - !include icu-rules/variants-pt.yaml
-    - !include icu-rules/variants-ro.yaml
-    - !include icu-rules/variants-ru.yaml
-    - !include icu-rules/variants-sk.yaml
-    - !include icu-rules/variants-sl.yaml
-    - !include icu-rules/variants-sv.yaml
-    - !include icu-rules/variants-tr.yaml
-    - !include icu-rules/variants-uk.yaml
-    - !include icu-rules/variants-vi.yaml
+token-analysis:
+    - variants:
+          - !include icu-rules/variants-bg.yaml
+          - !include icu-rules/variants-ca.yaml
+          - !include icu-rules/variants-cs.yaml
+          - !include icu-rules/variants-da.yaml
+          - !include icu-rules/variants-de.yaml
+          - !include icu-rules/variants-el.yaml
+          - !include icu-rules/variants-en.yaml
+          - !include icu-rules/variants-es.yaml
+          - !include icu-rules/variants-et.yaml
+          - !include icu-rules/variants-eu.yaml
+          - !include icu-rules/variants-fi.yaml
+          - !include icu-rules/variants-fr.yaml
+          - !include icu-rules/variants-gl.yaml
+          - !include icu-rules/variants-hu.yaml
+          - !include icu-rules/variants-it.yaml
+          - !include icu-rules/variants-ja.yaml
+          - !include icu-rules/variants-mg.yaml
+          - !include icu-rules/variants-ms.yaml
+          - !include icu-rules/variants-nl.yaml
+          - !include icu-rules/variants-no.yaml
+          - !include icu-rules/variants-pl.yaml
+          - !include icu-rules/variants-pt.yaml
+          - !include icu-rules/variants-ro.yaml
+          - !include icu-rules/variants-ru.yaml
+          - !include icu-rules/variants-sk.yaml
+          - !include icu-rules/variants-sl.yaml
+          - !include icu-rules/variants-sv.yaml
+          - !include icu-rules/variants-tr.yaml
+          - !include icu-rules/variants-uk.yaml
+          - !include icu-rules/variants-vi.yaml
test/python/test_tokenizer_icu.py
index 9a6f5a94f01c7b3486bfe659bd4878979581838b..16caf3edf12299cfe95e57bf5c72e889a39d0c63 100644
@@ -69,10 +69,10 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
                      variants=('~gasse -> gasse', 'street => st', ),
                      sanitizers=[]):
-        cfgstr = {'normalization' : list(norm),
-                  'sanitizers' : sanitizers,
-                  'transliteration' : list(trans),
-                  'variants' : [ {'words': list(variants)}]}
+        cfgstr = {'normalization': list(norm),
+                  'sanitizers': sanitizers,
+                  'transliteration': list(trans),
+                  'token-analysis': [{'variants': [{'words': list(variants)}]}]}
         (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
         tok.loader = ICURuleLoader(test_config)
 
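With the fixture's default arguments, the cfgstr dict above serialises to a
configuration equivalent to this sketch (keys alphabetised, as yaml.dump
sorts them by default):

    normalization: ["[[:Punctuation:][:Space:]]+ > ' '"]
    sanitizers: []
    token-analysis:
        - variants:
              - words: ['~gasse -> gasse', 'street => st']
    transliteration: [':: upper()']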
test/python/test_tokenizer_icu_name_processor.py
index d0ed21ecd8b6d8625a967b5b4a7a158360afcec7..366d2aee23855bd2e9a9f00a2df81a0974ebba79 100644
@@ -28,10 +28,10 @@ def cfgfile(def_config, tmp_path):
             - "::  Latin ()"
             - "'🜵' > ' '"
         """)
-        content += "variants:\n  - words:\n"
-        content += '\n'.join(("      - " + s for s in variants)) + '\n'
+        content += "token-analysis:\n  - variants:\n      - words:\n"
+        content += '\n'.join(("          - " + s for s in variants)) + '\n'
         for k, v in kwargs:
-            content += "    {}: {}\n".format(k, v)
+            content += "        {}: {}\n".format(k, v)
         (project_dir / 'icu_tokenizer.yaml').write_text(content)
 
         return def_config
test/python/test_tokenizer_icu_rule_loader.py
index 6ec53edcfa10ca0f403d7ebfa308b4cc555d9d7f..5d931043d737e92f96cc5560088a226839cd4276 100644
@@ -34,8 +34,8 @@ def cfgrules(test_config):
             - "::  Latin ()"
             - "[[:Punctuation:][:Space:]]+ > ' '"
         """)
-        content += "variants:\n  - words:\n"
-        content += '\n'.join(("      - " + s for s in variants)) + '\n'
+        content += "token-analysis:\n  - variants:\n      - words:\n"
+        content += '\n'.join(("          - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += "    {}: {}\n".format(k, v)
         (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
@@ -49,20 +49,20 @@ def test_empty_rule_set(test_config):
     (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
         normalization:
         transliteration:
-        variants:
+        token-analysis:
+          - variants:
         """))
 
     rules = ICURuleLoader(test_config)
     assert rules.get_search_rules() == ''
     assert rules.get_normalization_rules() == ''
     assert rules.get_transliteration_rules() == ''
-    assert list(rules.get_replacement_pairs()) == []
 
-CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
+CONFIG_SECTIONS = ('normalization', 'transliteration', 'token-analysis')
 
 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
 def test_missing_section(section, test_config):
-    rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
+    rule_cfg = { s: [] for s in CONFIG_SECTIONS if s != section}
     (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
 
     with pytest.raises(UsageError):
@@ -107,7 +107,8 @@ def test_transliteration_rules_from_file(test_config):
         transliteration:
             - "'ax' > 'b'"
             - !include transliteration.yaml
-        variants:
+        token-analysis:
+            - variants:
         """))
     transpath = test_config.project_dir / ('transliteration.yaml')
     transpath.write_text('- "x > y"')
@@ -127,7 +128,7 @@ class TestGetReplacements:
 
     def get_replacements(self, *variants):
         loader = ICURuleLoader(self.cfgrules(*variants))
-        rules = loader.get_replacement_pairs()
+        rules = loader.analysis[None].variants
 
         return set((v.source, v.replacement) for v in rules)
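
Since get_replacement_pairs() is gone, callers now reach the variants
through the default analyzer entry. A minimal sketch of the new access
pattern, assuming a test_config as in the fixtures above:

    loader = ICURuleLoader(test_config)
    default_rule = loader.analysis[None]     # the entry configured without an 'id'
    pairs = set((v.source, v.replacement) for v in default_rule.variants)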