"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
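

# Keys under which the tokenizer configuration is persisted as database
# properties (written by _save_config() and read back by init_from_project()).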
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"


LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.add_country_names(...)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
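

    # The PHP stub written here only defines the constants needed by the PHP
    # frontend (word frequency limit, normalization and transliteration rules);
    # the actual query-time logic is pulled in from legacy_icu_tokenizer.php.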
    # pylint: disable=missing-format-attribute
    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, phpdir)))


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
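            # Use a named (server-side) cursor so that the potentially large
            # result set of the frequency query is streamed instead of being
            # loaded into memory at once.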
            with conn.cursor(name="words") as cur:
                cur.execute(""" SELECT v, count(*) FROM
                                  (SELECT svals(name) as v FROM place)x
                                WHERE length(v) < 75 GROUP BY v""")

                for name, cnt in cur:
                    terms = set()
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        if ' ' in word:
                            terms.update(word.split())
                    for term in terms:
                        words[term] += cnt

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for args in words.items():
                    copystr.add(*args)

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['word_token', 'search_name_count'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null""")

            conn.commit()


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
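        # Autocommit mode: every statement issued by the analyzer is committed
        # immediately, so no explicit transaction handling is needed here.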
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is taken to be a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        tokens = {}
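        # The lookup assumes that full names carry a leading blank in their
        # word_token, while partial words are looked up as-is.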
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
                tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT info->>'postcode' as word FROM word
                                WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")
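
            # Postcodes missing from the word table are added via the copy
            # buffer below; entries no longer present in location_postcode are
            # collected in to_delete and removed.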
            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self.name_processor.get_search_normalized(postcode),
                                    'P', {'postcode': postcode})

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type = 'P' and info->>'postcode' = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'info'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT info FROM word WHERE type = 'S'")
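            # A missing operator is normalised to '-' so that the tuples can
            # be compared as sets against the incoming phrases.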
            for (info, ) in cur:
                existing_phrases.add((info['word'], info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(term, 'S',
                                {'word': word, 'class': cls, 'type': typ,
                                 'op': oper if oper in ('in', 'near') else None})
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase set.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE info->>'word' = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            norm_name = self.name_processor.get_search_normalized(name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and info->>'cc'= %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, info)
                               (SELECT token, 'C', json_build_object('cc', %s)
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)
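
            # Country features additionally register their names as country
            # names, provided they carry a valid two-letter country code.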
            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')
        if address:
            self._process_place_address(token_info, address)

        return token_info.data


    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(*self._compute_name_tokens({'name': value}))
            elif key == 'place':
                token_info.add_place(*self._compute_name_tokens({'name': value}))
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, *self._compute_name_tokens({'name': value})))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
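            # Tokens are cached per normalised name so that the
            # getorcreate_full_word() database function is only called once
            # for each distinct name.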
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
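            # Values containing list separators cannot be a single valid
            # postcode and are silently ignored.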
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, info)
                                   (SELECT %s, 'P', json_build_object('postcode', pc)
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and info->>'postcode' = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
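        # Build a PostgreSQL array literal (e.g. '{1,2,3}') from the tokens.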
        return '{%s}' % ','.join((str(s) for s in tokens))


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)


    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
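
        # Serve tokens from the cache first and collect the housenumbers that
        # still need to be resolved in a single database round trip.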
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens