nominatim/tokenizer/icu_rule_loader.py

   1 """
   2 Helper class to create ICU rules from a configuration file.
   3 """
   4 import io
   5 import logging
   6 from collections import defaultdict
   7 import itertools
   8
   9 import yaml
  10 from icu import Transliterator
  11
  12 from nominatim.errors import UsageError
  13
  14 LOG = logging.getLogger()
  15
  16
  17 class ICURuleLoader:
  18     """ Compiler for ICU rules from a tokenizer configuration file.
  19     """
  20
  21     def __init__(self, configfile):
  22         self.configfile = configfile
  23         self.compound_suffixes = set()
  24         self.abbreviations = defaultdict()
  25
  26         if configfile.suffix == '.yaml':
  27             self._load_from_yaml()
  28         else:
  29             raise UsageError("Unknown format of tokenizer configuration.")
  30
  31
  32     def get_search_rules(self):
  33         """ Return the ICU rules to be used during search.
  34             The rules combine normalization and transliteration.
  35         """
  36         # First apply the normalization rules.
  37         rules = io.StringIO()
  38         rules.write(self.normalization_rules)
  39
  40         # Then add transliteration.
  41         rules.write(self.transliteration_rules)
  42         return rules.getvalue()
  43
  44     def get_normalization_rules(self):
  45         """ Return rules for normalisation of a term.
  46         """
  47         return self.normalization_rules
  48
  49     def get_transliteration_rules(self):
  50         """ Return the rules for converting a string into its asciii representation.
  51         """
  52         return self.transliteration_rules
  53
  54     def get_replacement_pairs(self):
  55         """ Return the list of possible compound decompositions with
  56             application of abbreviations included.
  57             The result is a list of pairs: the first item is the sequence to
  58             replace, the second is a list of replacements.
  59         """
  60         synonyms = defaultdict(set)
  61
  62         # First add entries for compound decomposition.
  63         for suffix in self.compound_suffixes:
  64             variants = (suffix + ' ', ' ' + suffix + ' ')
  65             for key in variants:
  66                 synonyms[key].update(variants)
  67
  68         for full, abbr in self.abbreviations.items():
  69             key = ' ' + full + ' '
  70             # Entries in the abbreviation list always apply to full words:
  71             synonyms[key].update((' ' + a + ' ' for a in abbr))
  72             # Replacements are optional, so add a noop
  73             synonyms[key].add(key)
  74
  75             if full in self.compound_suffixes:
  76                 # Full word abbreviating to compunded version.
  77                 synonyms[key].update((a + ' ' for a in abbr))
  78
  79                 key = full + ' '
  80                 # Uncompunded suffix abbrevitating to decompounded version.
  81                 synonyms[key].update((' ' + a + ' ' for a in abbr))
  82                 # Uncompunded suffix abbrevitating to compunded version.
  83                 synonyms[key].update((a + ' ' for a in abbr))
  84
  85         # sort the resulting list by descending length (longer matches are prefered).
  86         sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
  87
  88         return [(k, list(synonyms[k])) for k in sorted_keys]
  89
  90
  91     def _load_from_yaml(self):
  92         rules = yaml.safe_load(self.configfile.read_text())
  93
  94         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
  95         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
  96         self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
  97         self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))
  98
  99
 100     def _get_section(self, rules, section):
 101         """ Get the section named 'section' from the rules. If the section does
 102             not exist, raise a usage error with a meaningful message.
 103         """
 104         if section not in rules:
 105             LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
 106                       section, str(self.configfile))
 107             raise UsageError("Syntax error in tokenizer configuration file.")
 108
 109         return rules[section]
 110
 111
 112     def _cfg_to_icu_rules(self, rules, section):
 113         """ Load an ICU ruleset from the given section. If the section is a
 114             simple string, it is interpreted as a file name and the rules are
 115             loaded verbatim from the given file. The filename is expected to be
 116             relative to the tokenizer rule file. If the section is a list then
 117             each line is assumed to be a rule. All rules are concatenated and returned.
 118         """
 119         content = self._get_section(rules, section)
 120
 121         if content is None:
 122             return ''
 123
 124         if isinstance(content, str):
 125             return (self.configfile.parent / content).read_text().replace('\n', ' ')
 126
 127         return ';'.join(content) + ';'
 128
 129
 130     def _parse_compound_suffix_list(self, rules):
 131         if not rules:
 132             self.compound_suffixes = set()
 133             return
 134
 135         norm = Transliterator.createFromRules("rule_loader_normalization",
 136                                               self.normalization_rules)
 137
 138         # Make sure all suffixes are in their normalised form.
 139         self.compound_suffixes = set((norm.transliterate(s) for s in rules))
 140
 141
 142     def _parse_abbreviation_list(self, rules):
 143         self.abbreviations = defaultdict(list)
 144
 145         if not rules:
 146             return
 147
 148         norm = Transliterator.createFromRules("rule_loader_normalization",
 149                                               self.normalization_rules)
 150
 151         for rule in rules:
 152             parts = rule.split('=>')
 153             if len(parts) != 2:
 154                 LOG.fatal("Syntax error in abbreviation section, line: %s", rule)
 155                 raise UsageError("Syntax error in tokenizer configuration file.")
 156
 157             # Make sure all terms match the normalised version.
 158             fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(','))
 159             abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))
 160
 161             for full, abbr in itertools.product(fullterms, abbrterms):
 162                 if full and abbr:
 163                     self.abbreviations[full].append(abbr)