Adds parsing of multiple variant lists from the configuration.
Every entry except one must have a unique 'id' parameter to
distinguish the entries. The entry without id is considered
the default. Currently only the list without an id is used
for analysis.
rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
- self.variants = set()
-
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
- self.analysis_rules = self._get_section(rules, 'variants')
- self._parse_variant_list()
+ self.analysis_rules = self._get_section(rules, 'token-analysis')
+ self._setup_analysis()
# Load optional sanitizer rule set.
self.sanitizer_rules = rules.get('sanitizers', [])
self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
- self._parse_variant_list()
+ self._setup_analysis()
def save_config_to_db(self, conn):
def make_token_analysis(self):
        """ Create a token analyser from the previously loaded rules.
"""
- return ICUNameProcessor(self.normalization_rules,
- self.transliteration_rules,
- self.variants)
+ return self.analysis[None].create(self.normalization_rules,
+ self.transliteration_rules)
def get_search_rules(self):
rules.write(self.transliteration_rules)
return rules.getvalue()
+
def get_normalization_rules(self):
""" Return rules for normalisation of a term.
"""
return self.normalization_rules
+
def get_transliteration_rules(self):
        """ Return the rules for converting a string into its ascii representation.
"""
return self.transliteration_rules
- def get_replacement_pairs(self):
- """ Return the list of possible compound decompositions with
- application of abbreviations included.
- The result is a list of pairs: the first item is the sequence to
- replace, the second is a list of replacements.
+
+ def _setup_analysis(self):
+ """ Process the rules used for creating the various token analyzers.
"""
- return self.variants
+ self.analysis = {}
+
+ if not isinstance(self.analysis_rules, list):
+ raise UsageError("Configuration section 'token-analysis' must be a list.")
+
+ for section in self.analysis_rules:
+ name = section.get('id', None)
+ if name in self.analysis:
+ if name is None:
+ LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
+ else:
+ LOG.fatal("ICU tokenizer configuration has two token "
+ "analyzers with id '%s'.", name)
+                raise UsageError("Syntax error in ICU tokenizer config.")
+ self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)
@staticmethod
return ';'.join(flatten_config_list(content, section)) + ';'
- def _parse_variant_list(self):
- rules = self.analysis_rules
+class TokenAnalyzerRule:
+ """ Factory for a single analysis module. The class saves the configuration
+ and creates a new token analyzer on request.
+ """
+
+ def __init__(self, rules, normalization_rules):
+ self._parse_variant_list(rules.get('variants'), normalization_rules)
+
+
+ def create(self, normalization_rules, transliteration_rules):
+ """ Create an analyzer from the given rules.
+ """
+ return ICUNameProcessor(normalization_rules,
+ transliteration_rules,
+ self.variants)
- self.variants.clear()
+
+ def _parse_variant_list(self, rules, normalization_rules):
+ self.variants = set()
if not rules:
return
rules = flatten_config_list(rules, 'variants')
- vmaker = _VariantMaker(self.normalization_rules)
+ vmaker = _VariantMaker(normalization_rules)
properties = []
for section in rules:
sanitizers:
- step: split-name-list
- step: strip-brace-terms
-variants:
- - !include icu-rules/variants-bg.yaml
- - !include icu-rules/variants-ca.yaml
- - !include icu-rules/variants-cs.yaml
- - !include icu-rules/variants-da.yaml
- - !include icu-rules/variants-de.yaml
- - !include icu-rules/variants-el.yaml
- - !include icu-rules/variants-en.yaml
- - !include icu-rules/variants-es.yaml
- - !include icu-rules/variants-et.yaml
- - !include icu-rules/variants-eu.yaml
- - !include icu-rules/variants-fi.yaml
- - !include icu-rules/variants-fr.yaml
- - !include icu-rules/variants-gl.yaml
- - !include icu-rules/variants-hu.yaml
- - !include icu-rules/variants-it.yaml
- - !include icu-rules/variants-ja.yaml
- - !include icu-rules/variants-mg.yaml
- - !include icu-rules/variants-ms.yaml
- - !include icu-rules/variants-nl.yaml
- - !include icu-rules/variants-no.yaml
- - !include icu-rules/variants-pl.yaml
- - !include icu-rules/variants-pt.yaml
- - !include icu-rules/variants-ro.yaml
- - !include icu-rules/variants-ru.yaml
- - !include icu-rules/variants-sk.yaml
- - !include icu-rules/variants-sl.yaml
- - !include icu-rules/variants-sv.yaml
- - !include icu-rules/variants-tr.yaml
- - !include icu-rules/variants-uk.yaml
- - !include icu-rules/variants-vi.yaml
+token-analysis:
+ - variants:
+ - !include icu-rules/variants-bg.yaml
+ - !include icu-rules/variants-ca.yaml
+ - !include icu-rules/variants-cs.yaml
+ - !include icu-rules/variants-da.yaml
+ - !include icu-rules/variants-de.yaml
+ - !include icu-rules/variants-el.yaml
+ - !include icu-rules/variants-en.yaml
+ - !include icu-rules/variants-es.yaml
+ - !include icu-rules/variants-et.yaml
+ - !include icu-rules/variants-eu.yaml
+ - !include icu-rules/variants-fi.yaml
+ - !include icu-rules/variants-fr.yaml
+ - !include icu-rules/variants-gl.yaml
+ - !include icu-rules/variants-hu.yaml
+ - !include icu-rules/variants-it.yaml
+ - !include icu-rules/variants-ja.yaml
+ - !include icu-rules/variants-mg.yaml
+ - !include icu-rules/variants-ms.yaml
+ - !include icu-rules/variants-nl.yaml
+ - !include icu-rules/variants-no.yaml
+ - !include icu-rules/variants-pl.yaml
+ - !include icu-rules/variants-pt.yaml
+ - !include icu-rules/variants-ro.yaml
+ - !include icu-rules/variants-ru.yaml
+ - !include icu-rules/variants-sk.yaml
+ - !include icu-rules/variants-sl.yaml
+ - !include icu-rules/variants-sv.yaml
+ - !include icu-rules/variants-tr.yaml
+ - !include icu-rules/variants-uk.yaml
+ - !include icu-rules/variants-vi.yaml
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
variants=('~gasse -> gasse', 'street => st', ),
sanitizers=[]):
- cfgstr = {'normalization' : list(norm),
- 'sanitizers' : sanitizers,
- 'transliteration' : list(trans),
- 'variants' : [ {'words': list(variants)}]}
+ cfgstr = {'normalization': list(norm),
+ 'sanitizers': sanitizers,
+ 'transliteration': list(trans),
+ 'token-analysis': [{'variants': [{'words': list(variants)}]}]}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
tok.loader = ICURuleLoader(test_config)
- ":: Latin ()"
- "'🜵' > ' '"
""")
- content += "variants:\n - words:\n"
- content += '\n'.join((" - " + s for s in variants)) + '\n'
+ content += "token-analysis:\n - variants:\n - words:\n"
+ content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs:
- content += " {}: {}\n".format(k, v)
+ content += " {}: {}\n".format(k, v)
(project_dir / 'icu_tokenizer.yaml').write_text(content)
return def_config
- ":: Latin ()"
- "[[:Punctuation:][:Space:]]+ > ' '"
""")
- content += "variants:\n - words:\n"
- content += '\n'.join((" - " + s for s in variants)) + '\n'
+ content += "token-analysis:\n - variants:\n - words:\n"
+ content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs:
content += " {}: {}\n".format(k, v)
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
normalization:
transliteration:
- variants:
+ token-analysis:
+ - variants:
"""))
rules = ICURuleLoader(test_config)
assert rules.get_search_rules() == ''
assert rules.get_normalization_rules() == ''
assert rules.get_transliteration_rules() == ''
- assert list(rules.get_replacement_pairs()) == []
-CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
+CONFIG_SECTIONS = ('normalization', 'transliteration', 'token-analysis')
@pytest.mark.parametrize("section", CONFIG_SECTIONS)
def test_missing_section(section, test_config):
- rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
+ rule_cfg = { s: [] for s in CONFIG_SECTIONS if s != section}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
with pytest.raises(UsageError):
transliteration:
- "'ax' > 'b'"
- !include transliteration.yaml
- variants:
+ token-analysis:
+ - variants:
"""))
transpath = test_config.project_dir / ('transliteration.yaml')
transpath.write_text('- "x > y"')
def get_replacements(self, *variants):
loader = ICURuleLoader(self.cfgrules(*variants))
- rules = loader.get_replacement_pairs()
+ rules = loader.analysis[None].variants
return set((v.source, v.replacement) for v in rules)