nominatim/tokenizer/icu_rule_loader.py

   1 """
   2 Helper class to create ICU rules from a configuration file.
   3 """
   4 import io
   5 import json
   6 import logging
   7 import itertools
   8 import re
   9
  10 from icu import Transliterator
  11
  12 from nominatim.db.properties import set_property, get_property
  13 from nominatim.errors import UsageError
  14 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
  15 import nominatim.tokenizer.icu_variants as variants
  16
# Logger used by this module. getLogger() without a name returns the
# root logger.
LOG = logging.getLogger()

# Names of the database properties under which ICURuleLoader saves its
# rule sets in save_config_to_db() and reads them back in
# load_config_from_db().
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
  22
  23
  24 def _flatten_config_list(content):
  25     if not content:
  26         return []
  27
  28     if not isinstance(content, list):
  29         raise UsageError("List expected in ICU configuration.")
  30
  31     output = []
  32     for ele in content:
  33         if isinstance(ele, list):
  34             output.extend(_flatten_config_list(ele))
  35         else:
  36             output.append(ele)
  37
  38     return output
  39
  40
  41 class VariantRule:
  42     """ Saves a single variant expansion.
  43
  44         An expansion consists of the normalized replacement term and
  45         a dicitonary of properties that describe when the expansion applies.
  46     """
  47
  48     def __init__(self, replacement, properties):
  49         self.replacement = replacement
  50         self.properties = properties or {}
  51
  52
  53 class ICURuleLoader:
  54     """ Compiler for ICU rules from a tokenizer configuration file.
  55     """
  56
  57     def __init__(self, config):
  58         rules = config.load_sub_configuration('icu_tokenizer.yaml',
  59                                               config='TOKENIZER_CONFIG')
  60
  61         self.variants = set()
  62
  63         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
  64         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
  65         self.analysis_rules = self._get_section(rules, 'variants')
  66         self._parse_variant_list()
  67
  68
  69     def load_config_from_db(self, conn):
  70         """ Get previously saved parts of the configuration from the
  71             database.
  72         """
  73         self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
  74         self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
  75         self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
  76         self._parse_variant_list()
  77
  78
  79     def save_config_to_db(self, conn):
  80         """ Save the part of the configuration that cannot be changed into
  81             the database.
  82         """
  83         set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
  84         set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
  85         set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
  86
  87
  88     def make_token_analysis(self):
  89         """ Create a token analyser from the reviouly loaded rules.
  90         """
  91         return ICUNameProcessor(self.normalization_rules,
  92                                 self.transliteration_rules,
  93                                 self.variants)
  94
  95
  96     def get_search_rules(self):
  97         """ Return the ICU rules to be used during search.
  98             The rules combine normalization and transliteration.
  99         """
 100         # First apply the normalization rules.
 101         rules = io.StringIO()
 102         rules.write(self.normalization_rules)
 103
 104         # Then add transliteration.
 105         rules.write(self.transliteration_rules)
 106         return rules.getvalue()
 107
 108     def get_normalization_rules(self):
 109         """ Return rules for normalisation of a term.
 110         """
 111         return self.normalization_rules
 112
 113     def get_transliteration_rules(self):
 114         """ Return the rules for converting a string into its asciii representation.
 115         """
 116         return self.transliteration_rules
 117
 118     def get_replacement_pairs(self):
 119         """ Return the list of possible compound decompositions with
 120             application of abbreviations included.
 121             The result is a list of pairs: the first item is the sequence to
 122             replace, the second is a list of replacements.
 123         """
 124         return self.variants
 125
 126
 127     @staticmethod
 128     def _get_section(rules, section):
 129         """ Get the section named 'section' from the rules. If the section does
 130             not exist, raise a usage error with a meaningful message.
 131         """
 132         if section not in rules:
 133             LOG.fatal("Section '%s' not found in tokenizer config.", section)
 134             raise UsageError("Syntax error in tokenizer configuration file.")
 135
 136         return rules[section]
 137
 138
 139     def _cfg_to_icu_rules(self, rules, section):
 140         """ Load an ICU ruleset from the given section. If the section is a
 141             simple string, it is interpreted as a file name and the rules are
 142             loaded verbatim from the given file. The filename is expected to be
 143             relative to the tokenizer rule file. If the section is a list then
 144             each line is assumed to be a rule. All rules are concatenated and returned.
 145         """
 146         content = self._get_section(rules, section)
 147
 148         if content is None:
 149             return ''
 150
 151         return ';'.join(_flatten_config_list(content)) + ';'
 152
 153
 154     def _parse_variant_list(self):
 155         rules = self.analysis_rules
 156
 157         self.variants.clear()
 158
 159         if not rules:
 160             return
 161
 162         rules = _flatten_config_list(rules)
 163
 164         vmaker = _VariantMaker(self.normalization_rules)
 165
 166         properties = []
 167         for section in rules:
 168             # Create the property field and deduplicate against existing
 169             # instances.
 170             props = variants.ICUVariantProperties.from_rules(section)
 171             for existing in properties:
 172                 if existing == props:
 173                     props = existing
 174                     break
 175             else:
 176                 properties.append(props)
 177
 178             for rule in (section.get('words') or []):
 179                 self.variants.update(vmaker.compute(rule, props))
 180
 181
 182 class _VariantMaker:
 183     """ Generater for all necessary ICUVariants from a single variant rule.
 184
 185         All text in rules is normalized to make sure the variants match later.
 186     """
 187
 188     def __init__(self, norm_rules):
 189         self.norm = Transliterator.createFromRules("rule_loader_normalization",
 190                                                    norm_rules)
 191
 192
 193     def compute(self, rule, props):
 194         """ Generator for all ICUVariant tuples from a single variant rule.
 195         """
 196         parts = re.split(r'(\|)?([=-])>', rule)
 197         if len(parts) != 4:
 198             raise UsageError("Syntax error in variant rule: " + rule)
 199
 200         decompose = parts[1] is None
 201         src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
 202         repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
 203
 204         # If the source should be kept, add a 1:1 replacement
 205         if parts[2] == '-':
 206             for src in src_terms:
 207                 if src:
 208                     for froms, tos in _create_variants(*src, src[0], decompose):
 209                         yield variants.ICUVariant(froms, tos, props)
 210
 211         for src, repl in itertools.product(src_terms, repl_terms):
 212             if src and repl:
 213                 for froms, tos in _create_variants(*src, repl, decompose):
 214                     yield variants.ICUVariant(froms, tos, props)
 215
 216
 217     def _parse_variant_word(self, name):
 218         name = name.strip()
 219         match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
 220         if match is None or (match.group(1) == '~' and match.group(3) == '~'):
 221             raise UsageError("Invalid variant word descriptor '{}'".format(name))
 222         norm_name = self.norm.transliterate(match.group(2))
 223         if not norm_name:
 224             return None
 225
 226         return norm_name, match.group(1), match.group(3)
 227
 228
 229 _FLAG_MATCH = {'^': '^ ',
 230                '$': ' ^',
 231                '': ' '}
 232
 233
 234 def _create_variants(src, preflag, postflag, repl, decompose):
 235     if preflag == '~':
 236         postfix = _FLAG_MATCH[postflag]
 237         # suffix decomposition
 238         src = src + postfix
 239         repl = repl + postfix
 240
 241         yield src, repl
 242         yield ' ' + src, ' ' + repl
 243
 244         if decompose:
 245             yield src, ' ' + repl
 246             yield ' ' + src, repl
 247     elif postflag == '~':
 248         # prefix decomposition
 249         prefix = _FLAG_MATCH[preflag]
 250         src = prefix + src
 251         repl = prefix + repl
 252
 253         yield src, repl
 254         yield src + ' ', repl + ' '
 255
 256         if decompose:
 257             yield src, repl + ' '
 258             yield src + ' ', repl
 259     else:
 260         prefix = _FLAG_MATCH[preflag]
 261         postfix = _FLAG_MATCH[postflag]
 262
 263         yield prefix + src + postfix, prefix + repl + postfix