"""
Generic processor for names that creates abbreviation variants.
"""
from collections import defaultdict, namedtuple
import itertools
import re

import datrie
from icu import Transliterator

from nominatim.config import flatten_config_list
from nominatim.errors import UsageError
### Configuration section
# A single variant expansion: maps a normalized source term to one spelling
# variant (replacement). A (hashable) namedtuple so variants can be
# deduplicated in a set before being grouped by source.
ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
def configure(rules, normalization_rules):
    """ Extract and preprocess the configuration for this module.

        Reads the 'variants' section of `rules`, expands each variant rule
        into concrete ICUVariant pairs and returns a config dict with:

        * 'replacements': list of (source, [replacement, ...]) tuples,
          grouped by source string.
        * 'chars': string with every character occurring in any source,
          as needed for the datrie alphabet in GenericTokenAnalysis.

        Returns empty replacement/char data when no variants are
        configured.
    """
    rules = rules.get('variants')
    immediate = defaultdict(list)
    chars = set()

    if rules:
        vset = set()

        rules = flatten_config_list(rules, 'variants')

        vmaker = _VariantMaker(normalization_rules)

        for section in rules:
            for rule in (section.get('words') or []):
                vset.update(vmaker.compute(rule))

        # Intermediate reorder by source. Also compute required character set.
        for variant in vset:
            if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
                # Both end on a word boundary: drop the trailing space of
                # the replacement because the matcher keeps the source's
                # trailing space in the unprocessed stream.
                replstr = variant.replacement[:-1]
            else:
                replstr = variant.replacement
            immediate[variant.source].append(replstr)
            chars.update(variant.source)

    return {'replacements': list(immediate.items()),
            'chars': ''.join(chars)}
49 """ Generater for all necessary ICUVariants from a single variant rule.
51 All text in rules is normalized to make sure the variants match later.
54 def __init__(self, norm_rules):
55 self.norm = Transliterator.createFromRules("rule_loader_normalization",
59 def compute(self, rule):
60 """ Generator for all ICUVariant tuples from a single variant rule.
62 parts = re.split(r'(\|)?([=-])>', rule)
64 raise UsageError("Syntax error in variant rule: " + rule)
66 decompose = parts[1] is None
67 src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
68 repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
70 # If the source should be kept, add a 1:1 replacement
74 for froms, tos in _create_variants(*src, src[0], decompose):
75 yield ICUVariant(froms, tos)
77 for src, repl in itertools.product(src_terms, repl_terms):
79 for froms, tos in _create_variants(*src, repl, decompose):
80 yield ICUVariant(froms, tos)
83 def _parse_variant_word(self, name):
85 match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
86 if match is None or (match.group(1) == '~' and match.group(3) == '~'):
87 raise UsageError("Invalid variant word descriptor '{}'".format(name))
88 norm_name = self.norm.transliterate(match.group(2))
92 return norm_name, match.group(1), match.group(3)
95 _FLAG_MATCH = {'^': '^ ',
100 def _create_variants(src, preflag, postflag, repl, decompose):
102 postfix = _FLAG_MATCH[postflag]
103 # suffix decomposition
105 repl = repl + postfix
108 yield ' ' + src, ' ' + repl
111 yield src, ' ' + repl
112 yield ' ' + src, repl
113 elif postflag == '~':
114 # prefix decomposition
115 prefix = _FLAG_MATCH[preflag]
120 yield src + ' ', repl + ' '
123 yield src, repl + ' '
124 yield src + ' ', repl
126 prefix = _FLAG_MATCH[preflag]
127 postfix = _FLAG_MATCH[postflag]
129 yield prefix + src + postfix, prefix + repl + postfix
### Analysis section

def create(trans_rules, config):
    """ Create a new token analysis instance for this module.

        `trans_rules` is the transliterator used for the final ASCII
        conversion, `config` the dict produced by configure().
    """
    return GenericTokenAnalysis(trans_rules, config)
class GenericTokenAnalysis:
    """ Collects the different transformation rules for normalisation of names
        and provides the functions to apply the transformations.
    """

    def __init__(self, to_ascii, config):
        # Transliterator for the final ASCII conversion of each variant.
        self.to_ascii = to_ascii

        # Prefix trie over variant sources; the alphabet must cover every
        # character that can appear in a source (config['chars']).
        self.replacements = datrie.Trie(config['chars'])
        for src, repllist in config['replacements']:
            self.replacements[src] = repllist


    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.
        """
        # Pad with start/end markers so anchored variants and word
        # boundaries can be matched uniformly.
        baseform = '^ ' + norm_name + ' ^'
        partials = ['']

        startpos = 0
        pos = 0
        force_space = False
        while pos < len(baseform):
            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                               (None, None))
            if full is not None:
                done = baseform[startpos:pos]
                partials = [v + done + r
                            for v, r in itertools.product(partials, repl)
                            if not force_space or r.startswith(' ')]
                if len(partials) > 128:
                    # If too many variants are produced, they are unlikely
                    # to be helpful. Only use the original term.
                    startpos = 0
                    break
                startpos = pos + len(full)
                if full[-1] == ' ':
                    # Leave the trailing space in the stream so the next
                    # match can reuse it as its leading word boundary.
                    startpos -= 1
                    force_space = True
                else:
                    force_space = False
                pos = startpos
            else:
                pos += 1
                force_space = False

        # No variants detected? Fast return.
        if startpos == 0:
            trans_name = self.to_ascii.transliterate(norm_name).strip()
            return [trans_name] if trans_name else []

        return self._compute_result_set(partials, baseform[startpos:])


    def _compute_result_set(self, partials, prefix):
        """ Append the unprocessed tail to every partial variant,
            transliterate (dropping the boundary markers) and return the
            deduplicated non-empty results.
        """
        results = set()

        for variant in partials:
            vname = variant + prefix
            trans_name = self.to_ascii.transliterate(vname[1:-1]).strip()
            if trans_name:
                results.add(trans_name)

        return list(results)