nominatim/tokenizer/icu_rule_loader.py

   1 """
   2 Helper class to create ICU rules from a configuration file.
   3 """
   4 import io
   5 import json
   6 import logging
   7 import itertools
   8 import re
   9
  10 from icu import Transliterator
  11
  12 from nominatim.db.properties import set_property, get_property
  13 from nominatim.errors import UsageError
  14 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
  15 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
  16 import nominatim.tokenizer.icu_variants as variants
  17
  18 LOG = logging.getLogger()
  19
  20 DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
  21 DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
  22 DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
  23
  24
  25 def _flatten_config_list(content):
  26     if not content:
  27         return []
  28
  29     if not isinstance(content, list):
  30         raise UsageError("List expected in ICU configuration.")
  31
  32     output = []
  33     for ele in content:
  34         if isinstance(ele, list):
  35             output.extend(_flatten_config_list(ele))
  36         else:
  37             output.append(ele)
  38
  39     return output
  40
  41
  42 class VariantRule:
  43     """ Saves a single variant expansion.
  44
  45         An expansion consists of the normalized replacement term and
  46         a dicitonary of properties that describe when the expansion applies.
  47     """
  48
  49     def __init__(self, replacement, properties):
  50         self.replacement = replacement
  51         self.properties = properties or {}
  52
  53
  54 class ICURuleLoader:
  55     """ Compiler for ICU rules from a tokenizer configuration file.
  56     """
  57
  58     def __init__(self, config):
  59         rules = config.load_sub_configuration('icu_tokenizer.yaml',
  60                                               config='TOKENIZER_CONFIG')
  61
  62         self.variants = set()
  63
  64         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
  65         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
  66         self.analysis_rules = self._get_section(rules, 'variants')
  67         self._parse_variant_list()
  68
  69         # Load optional sanitizer rule set.
  70         self.sanitizer_rules = rules.get('sanitizers', [])
  71
  72
  73     def load_config_from_db(self, conn):
  74         """ Get previously saved parts of the configuration from the
  75             database.
  76         """
  77         self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
  78         self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
  79         self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
  80         self._parse_variant_list()
  81
  82
  83     def save_config_to_db(self, conn):
  84         """ Save the part of the configuration that cannot be changed into
  85             the database.
  86         """
  87         set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
  88         set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
  89         set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
  90
  91
  92     def make_sanitizer(self):
  93         """ Create a place sanitizer from the configured rules.
  94         """
  95         return PlaceSanitizer(self.sanitizer_rules)
  96
  97
  98     def make_token_analysis(self):
  99         """ Create a token analyser from the reviouly loaded rules.
 100         """
 101         return ICUNameProcessor(self.normalization_rules,
 102                                 self.transliteration_rules,
 103                                 self.variants)
 104
 105
 106     def get_search_rules(self):
 107         """ Return the ICU rules to be used during search.
 108             The rules combine normalization and transliteration.
 109         """
 110         # First apply the normalization rules.
 111         rules = io.StringIO()
 112         rules.write(self.normalization_rules)
 113
 114         # Then add transliteration.
 115         rules.write(self.transliteration_rules)
 116         return rules.getvalue()
 117
 118     def get_normalization_rules(self):
 119         """ Return rules for normalisation of a term.
 120         """
 121         return self.normalization_rules
 122
 123     def get_transliteration_rules(self):
 124         """ Return the rules for converting a string into its asciii representation.
 125         """
 126         return self.transliteration_rules
 127
 128     def get_replacement_pairs(self):
 129         """ Return the list of possible compound decompositions with
 130             application of abbreviations included.
 131             The result is a list of pairs: the first item is the sequence to
 132             replace, the second is a list of replacements.
 133         """
 134         return self.variants
 135
 136
 137     @staticmethod
 138     def _get_section(rules, section):
 139         """ Get the section named 'section' from the rules. If the section does
 140             not exist, raise a usage error with a meaningful message.
 141         """
 142         if section not in rules:
 143             LOG.fatal("Section '%s' not found in tokenizer config.", section)
 144             raise UsageError("Syntax error in tokenizer configuration file.")
 145
 146         return rules[section]
 147
 148
 149     def _cfg_to_icu_rules(self, rules, section):
 150         """ Load an ICU ruleset from the given section. If the section is a
 151             simple string, it is interpreted as a file name and the rules are
 152             loaded verbatim from the given file. The filename is expected to be
 153             relative to the tokenizer rule file. If the section is a list then
 154             each line is assumed to be a rule. All rules are concatenated and returned.
 155         """
 156         content = self._get_section(rules, section)
 157
 158         if content is None:
 159             return ''
 160
 161         return ';'.join(_flatten_config_list(content)) + ';'
 162
 163
 164     def _parse_variant_list(self):
 165         rules = self.analysis_rules
 166
 167         self.variants.clear()
 168
 169         if not rules:
 170             return
 171
 172         rules = _flatten_config_list(rules)
 173
 174         vmaker = _VariantMaker(self.normalization_rules)
 175
 176         properties = []
 177         for section in rules:
 178             # Create the property field and deduplicate against existing
 179             # instances.
 180             props = variants.ICUVariantProperties.from_rules(section)
 181             for existing in properties:
 182                 if existing == props:
 183                     props = existing
 184                     break
 185             else:
 186                 properties.append(props)
 187
 188             for rule in (section.get('words') or []):
 189                 self.variants.update(vmaker.compute(rule, props))
 190
 191
 192 class _VariantMaker:
 193     """ Generater for all necessary ICUVariants from a single variant rule.
 194
 195         All text in rules is normalized to make sure the variants match later.
 196     """
 197
 198     def __init__(self, norm_rules):
 199         self.norm = Transliterator.createFromRules("rule_loader_normalization",
 200                                                    norm_rules)
 201
 202
 203     def compute(self, rule, props):
 204         """ Generator for all ICUVariant tuples from a single variant rule.
 205         """
 206         parts = re.split(r'(\|)?([=-])>', rule)
 207         if len(parts) != 4:
 208             raise UsageError("Syntax error in variant rule: " + rule)
 209
 210         decompose = parts[1] is None
 211         src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
 212         repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
 213
 214         # If the source should be kept, add a 1:1 replacement
 215         if parts[2] == '-':
 216             for src in src_terms:
 217                 if src:
 218                     for froms, tos in _create_variants(*src, src[0], decompose):
 219                         yield variants.ICUVariant(froms, tos, props)
 220
 221         for src, repl in itertools.product(src_terms, repl_terms):
 222             if src and repl:
 223                 for froms, tos in _create_variants(*src, repl, decompose):
 224                     yield variants.ICUVariant(froms, tos, props)
 225
 226
 227     def _parse_variant_word(self, name):
 228         name = name.strip()
 229         match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
 230         if match is None or (match.group(1) == '~' and match.group(3) == '~'):
 231             raise UsageError("Invalid variant word descriptor '{}'".format(name))
 232         norm_name = self.norm.transliterate(match.group(2))
 233         if not norm_name:
 234             return None
 235
 236         return norm_name, match.group(1), match.group(3)
 237
 238
 239 _FLAG_MATCH = {'^': '^ ',
 240                '$': ' ^',
 241                '': ' '}
 242
 243
 244 def _create_variants(src, preflag, postflag, repl, decompose):
 245     if preflag == '~':
 246         postfix = _FLAG_MATCH[postflag]
 247         # suffix decomposition
 248         src = src + postfix
 249         repl = repl + postfix
 250
 251         yield src, repl
 252         yield ' ' + src, ' ' + repl
 253
 254         if decompose:
 255             yield src, ' ' + repl
 256             yield ' ' + src, repl
 257     elif postflag == '~':
 258         # prefix decomposition
 259         prefix = _FLAG_MATCH[preflag]
 260         src = prefix + src
 261         repl = prefix + repl
 262
 263         yield src, repl
 264         yield src + ' ', repl + ' '
 265
 266         if decompose:
 267             yield src, repl + ' '
 268             yield src + ' ', repl
 269     else:
 270         prefix = _FLAG_MATCH[preflag]
 271         postfix = _FLAG_MATCH[postflag]
 272
 273         yield prefix + src + postfix, prefix + repl + postfix