# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)
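
# Illustrative usage sketch (not part of the original module); the DSN string
# and the `project_dir`, `config` and `place` objects below are hypothetical
# placeholders:
#
#   tokenizer = create('dbname=nominatim', project_dir / 'tokenizer')
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       analyzer.process_place(place)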


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()

    def _cleanup_housenumbers(self):
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()

    def update_word_tokens(self):
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()
224 """ Free all resources used by the analyzer.
231 def _search_normalized(self, name):
232 """ Return the search token transliteration of the given name.
234 return self.token_analysis.search.transliterate(name).strip()
237 def _normalized(self, name):
238 """ Return the normalized version of the given name with all
239 non-relevant information removed.
241 return self.token_analysis.normalizer.transliterate(name).strip()

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
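
    # Illustrative example (not from the original source): with '#' marking
    # full names, a call could look like
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #
    # and would return tuples of (original word, word token, word id), where
    # the id is None if the token is not present in the word table.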

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
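
    # For example, normalize_postcode(' ec1a 1bb ') returns 'EC1A 1BB'.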

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
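
    # Illustrative sketch (values made up): each phrase is a 4-tuple of
    # (label, class, type, operator), e.g.
    #
    #   analyzer.update_special_phrases(
    #       [('Restaurant', 'amenity', 'restaurant', '-')], should_replace=True)
    #
    # Operators other than 'in' and 'near' are stored as NULL in the word
    # table (see _add_special_phrases below).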

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)

    def _add_country_full_names(self, country_code, names, internal=False):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get the names that are already in the word table.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            existing_tokens = {True: set(), False: set()} # internal/external names
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                           """
                cur.execute(sql, (country_code, list(new_tokens)))

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()
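
    # Illustrative shape of the returned structure (values made up; the keys
    # match what _TokenInfo.to_dict() below may emit):
    #
    #   {'names': '{1,2,3}', 'hnr': '12;12a', 'hnr_tokens': '{4}',
    #    'street': '{5}', 'place': '{6}', 'addr': {'city': '{7}'}}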

    def _process_place_address(self, token_info, address):
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))

    def _compute_housenumber_token(self, hnr):
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result = None, None

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                        result = cur.fetchone()[0], norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            norm_name = analyzer.normalize(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    variants = analyzer.get_variants_ascii(norm_name)
                    if variants:
                        with self.conn.cursor() as cur:
                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                        (norm_name, list(variants)))
                            result = cur.fetchone()[0], variants[0]
                            self._cache.housenumbers[norm_name] = result

        return result

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        norm_name = self._search_normalized(name)

        # Return cached result if possible.
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of sanitized names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            norm_name = analyzer.normalize(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)
628 """ Collect token information to be sent back to the database.
632 self.housenumbers = set()
633 self.housenumber_tokens = set()
634 self.street_tokens = set()
635 self.place_tokens = set()
636 self.address_tokens = {}

    @staticmethod
    def _mk_array(tokens):
        return f"{{{','.join((str(s) for s in tokens))}}}"
645 """ Return the token information in database importable format.
650 out['names'] = self.names
652 if self.housenumbers:
653 out['hnr'] = ';'.join(self.housenumbers)
654 out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
656 if self.street_tokens:
657 out['street'] = self._mk_array(self.street_tokens)
659 if self.place_tokens:
660 out['place'] = self._mk_array(self.place_tokens)
662 if self.address_tokens:
663 out['addr'] = self.address_tokens

    def set_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumber(self, token, hnr):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.street_tokens.update(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)

    def add_address_term(self, key, partials):
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)
703 """ Cache for token information to avoid repeated database queries.
705 This cache is not thread-safe and needs to be instantiated per
712 self.postcodes = set()
713 self.housenumbers = {}