# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def _cleanup_housenumbers(self):
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT word_id, word_token FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(word_token) > 6
                                      OR word_token not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()


    def update_word_tokens(self):
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.

            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
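
    # Illustrative result only (token ids are hypothetical): for the input
    # ['#Main Street', 'main'] this may return
    #   [('#Main Street', 'main street', 1234), ('main', 'main', 567)]
    # where the second element is the transliterated search token and the third
    # is the word_id from the word table, or None if the token is unknown.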


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
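
    # For example, normalize_postcode(' ab1 2cd ') returns 'AB1 2CD': only
    # surrounding whitespace is stripped and letters are upper-cased.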


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)
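
    # Each entry in to_delete is a 4-tuple (word, class, type, op), so with
    # psycopg2's execute_values every tuple fills one row of the VALUES list
    # that the DELETE statement joins against.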


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])


    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get the names that are already in the word table.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data


    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        streets = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind == 'housenumber':
                norm_name = self._make_standard_hnr(item.name)
                if norm_name:
                    hnrs.append(norm_name)
            elif item.kind == 'street':
                streets.extend(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(addr_terms)

        if streets:
            token_info.add_street(streets)
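
    # After processing, token_info.data is a plain dict ready for JSON
    # serialisation, for example (values illustrative only):
    #   {'names': '{1,2,3}', 'hnr_tokens': '{4}', 'hnr': '12;12a',
    #    'street': '{5,6}', 'addr': {'city': '{7,8}'}}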


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens
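
    # Partial tokens are resolved per space-separated word: a name like
    # "rue de la paix" yields one token per word, with unknown partials created
    # on the fly via the getorcreate_partial_word() SQL function and cached.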


    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                      (SELECT * FROM word
                                       WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
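
    # _mk_array([23, 42]) produces the string '{23,42}', i.e. the literal text
    # form that PostgreSQL expects for an integer array column.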


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)
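
    # For housenumbers ['12', '12a'] this stores (token ids illustrative only)
    # data['hnr_tokens'] = '{101,102}' and data['hnr'] = '12;12a'.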


    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.data['street'] = self._mk_array(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.fulls = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens