"""
Helper class to create ICU rules from a configuration file.
"""
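# Expected configuration layout (an illustrative sketch; the section names
# 'normalization', 'transliteration', 'compound_suffixes' and 'abbreviations'
# are the ones read below, the rule values are made up):
#
#   normalization:
#       - ":: lower ()"
#       - "ß > 'ss'"
#   transliteration:
#       - ":: Latin ()"
#   compound_suffixes:
#       - strasse
#   abbreviations:
#       - strasse => str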
import io
import logging
import itertools
from collections import defaultdict

import yaml
from icu import Transliterator

from nominatim.errors import UsageError
LOG = logging.getLogger()


class ICURuleLoader:
    """ Compiler for ICU rules from a tokenizer configuration file.
    """
    def __init__(self, configfile):
        self.configfile = configfile
        self.compound_suffixes = set()
        self.abbreviations = defaultdict()

        if configfile.suffix == '.yaml':
            self._load_from_yaml()
        else:
            raise UsageError("Unknown format of tokenizer configuration.")
    def get_search_rules(self):
        """ Return the ICU rules to be used during search.
            The rules combine normalization and transliteration.
        """
        # First apply the normalization rules.
        rules = io.StringIO()
        rules.write(self.normalization_rules)

        # Then add transliteration.
        rules.write(self.transliteration_rules)
        return rules.getvalue()
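    # Usage sketch (an assumption, not part of the original file): the combined
    # string can be compiled into a single transliterator, e.g.
    #
    #   trans = Transliterator.createFromRules("search", loader.get_search_rules())
    #   trans.transliterate("Hauptstraße")  # output depends on the configured rules
    #
    # where `loader` is an ICURuleLoader instance.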
    def get_normalization_rules(self):
        """ Return rules for normalisation of a term.
        """
        return self.normalization_rules
    def get_transliteration_rules(self):
        """ Return the rules for converting a string into its ascii representation.
        """
        return self.transliteration_rules
    def get_replacement_pairs(self):
        """ Return the list of possible compound decompositions with
            application of abbreviations included.
            The result is a list of pairs: the first item is the sequence to
            replace, the second is a list of replacements.
        """
        synonyms = defaultdict(set)

        # First add entries for compound decomposition.
        for suffix in self.compound_suffixes:
            variants = (suffix + ' ', ' ' + suffix + ' ')
            for key in variants:
                synonyms[key].update(variants)
        for full, abbr in self.abbreviations.items():
            key = ' ' + full + ' '
            # Entries in the abbreviation list always apply to full words:
            synonyms[key].update((' ' + a + ' ' for a in abbr))
            # Replacements are optional, so add a noop.
            synonyms[key].add(key)

            if full in self.compound_suffixes:
                # Full word abbreviating to the compounded version.
                synonyms[key].update((a + ' ' for a in abbr))

                key = full + ' '
                # Uncompounded suffix abbreviating to the decompounded version.
                synonyms[key].update((' ' + a + ' ' for a in abbr))
                # Uncompounded suffix abbreviating to the compounded version.
                synonyms[key].update((a + ' ' for a in abbr))
        # Sort the resulting list by descending length (longer matches are preferred).
        sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)

        return [(k, list(synonyms[k])) for k in sorted_keys]
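    # Worked example (illustrative, made-up data): with
    #   compound_suffixes = {'strasse'} and abbreviations = {'strasse': ['str']}
    # the result contains, sorted by descending key length,
    #   (' strasse ', [...]) and ('strasse ', [...])
    # where each replacement list holds the variants
    #   ' strasse ', 'strasse ', ' str ', 'str '
    # in unspecified order (they are collected in a set). The identity entry
    # serves as the noop replacement mentioned above.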
    def _load_from_yaml(self):
        rules = yaml.safe_load(self.configfile.read_text())

        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
        self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
        self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))
    def _get_section(self, rules, section):
        """ Get the section named 'section' from the rules. If the section does
            not exist, raise a usage error with a meaningful message.
        """
        if section not in rules:
            LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
                      section, str(self.configfile))
            raise UsageError("Syntax error in tokenizer configuration file.")

        return rules[section]
    def _cfg_to_icu_rules(self, rules, section):
        """ Load an ICU ruleset from the given section. If the section is a
            simple string, it is interpreted as a file name and the rules are
            loaded verbatim from the given file. The filename is expected to be
            relative to the tokenizer rule file. If the section is a list, then
            each line is assumed to be a rule. All rules are concatenated and returned.
        """
        content = self._get_section(rules, section)

        if content is None:
            return ''

        if isinstance(content, str):
            return (self.configfile.parent / content).read_text().replace('\n', ' ')

        return ';'.join(content) + ';'
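    # For example (illustrative values): a string section such as
    #   transliteration: my-rules.txt        # hypothetical file name
    # is read verbatim from that file, while a list such as
    #   normalization: [":: lower ()", "ß > 'ss'"]
    # is joined into the single rule string ":: lower ();ß > 'ss';".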
    def _parse_compound_suffix_list(self, rules):
        if not rules:
            self.compound_suffixes = set()
            return

        norm = Transliterator.createFromRules("rule_loader_normalization",
                                              self.normalization_rules)

        # Make sure all suffixes are in their normalised form.
        self.compound_suffixes = set((norm.transliterate(s) for s in rules))
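    # For instance (made-up data): with normalization rules that lower-case
    # input and map ß to 'ss', a configured suffix "Straße" is stored as
    # "strasse", so later lookups compare normalised text with normalised text.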
    def _parse_abbreviation_list(self, rules):
        self.abbreviations = defaultdict(list)

        if not rules:
            return

        norm = Transliterator.createFromRules("rule_loader_normalization",
                                              self.normalization_rules)
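        # Each rule is expected in the form "<full>[, <full>...] => <abbr>[, <abbr>...]",
        # e.g. "street, str => st" (an illustrative example, not taken from any
        # shipped configuration).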
        for rule in rules:
            parts = rule.split('=>')
            if len(parts) != 2:
                LOG.fatal("Syntax error in abbreviation section, line: %s", rule)
                raise UsageError("Syntax error in tokenizer configuration file.")

            # Make sure all terms match the normalised version.
            fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(','))
            abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))

            for full, abbr in itertools.product(fullterms, abbrterms):
                if full and abbr:
                    self.abbreviations[full].append(abbr)
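
# Minimal usage sketch (an assumption; 'icu_tokenizer.yaml' is a hypothetical
# file in the format outlined at the top of this module):
#
#   from pathlib import Path
#
#   loader = ICURuleLoader(Path('icu_tokenizer.yaml'))
#   search_rules = loader.get_search_rules()
#   pairs = loader.get_replacement_pairs()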