nominatim/tokenizer/icu_tokenizer.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4 but using
   9 libICU instead of the PostgreSQL module.
  10 """
  11 import itertools
  12 import json
  13 import logging
  14 import re
  15 from textwrap import dedent
  16
  17 from nominatim.db.connection import connect
  18 from nominatim.db.utils import CopyBuffer
  19 from nominatim.db.sql_preprocessor import SQLPreprocessor
  20 from nominatim.indexer.place_info import PlaceInfo
  21 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  22 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  23
# Key name for the term normalization setting. It is not referenced by the
# code shown in this module (presumably used as a database property name by
# other tokenizer code -- TODO confirm).
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# Logger for the progress and house-keeping messages of this module.
# getLogger() without a name returns the root logger.
LOG = logging.getLogger()
  27
  28 def create(dsn, data_dir):
  29     """ Create a new instance of the tokenizer provided by this module.
  30     """
  31     return LegacyICUTokenizer(dsn, data_dir)
  32
  33
  34 class LegacyICUTokenizer(AbstractTokenizer):
  35     """ This tokenizer uses libICU to covert names and queries to ASCII.
  36         Otherwise it uses the same algorithms and data structures as the
  37         normalization routines in Nominatim 3.
  38     """
  39
  40     def __init__(self, dsn, data_dir):
  41         self.dsn = dsn
  42         self.data_dir = data_dir
  43         self.loader = None
  44
  45
  46     def init_new_db(self, config, init_db=True):
  47         """ Set up a new tokenizer for the database.
  48
  49             This copies all necessary data in the project directory to make
  50             sure the tokenizer remains stable even over updates.
  51         """
  52         self.loader = ICURuleLoader(config)
  53
  54         self._install_php(config.lib_dir.php)
  55         self._save_config()
  56
  57         if init_db:
  58             self.update_sql_functions(config)
  59             self._init_db_tables(config)
  60
  61
  62     def init_from_project(self, config):
  63         """ Initialise the tokenizer from the project directory.
  64         """
  65         self.loader = ICURuleLoader(config)
  66
  67         with connect(self.dsn) as conn:
  68             self.loader.load_config_from_db(conn)
  69
  70
  71     def finalize_import(self, config):
  72         """ Do any required postprocessing to make the tokenizer data ready
  73             for use.
  74         """
  75         with connect(self.dsn) as conn:
  76             sqlp = SQLPreprocessor(conn, config)
  77             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  78
  79
  80     def update_sql_functions(self, config):
  81         """ Reimport the SQL functions for this tokenizer.
  82         """
  83         with connect(self.dsn) as conn:
  84             sqlp = SQLPreprocessor(conn, config)
  85             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  86
  87
  88     def check_database(self, config):
  89         """ Check that the tokenizer is set up correctly.
  90         """
  91         # Will throw an error if there is an issue.
  92         self.init_from_project(config)
  93
  94
  95     def update_statistics(self):
  96         """ Recompute frequencies for all name words.
  97         """
  98         with connect(self.dsn) as conn:
  99             if conn.table_exists('search_name'):
 100                 with conn.cursor() as cur:
 101                     cur.drop_table("word_frequencies")
 102                     LOG.info("Computing word frequencies")
 103                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
 104                                      SELECT unnest(name_vector) as id, count(*)
 105                                      FROM search_name GROUP BY id""")
 106                     cur.execute("CREATE INDEX ON word_frequencies(id)")
 107                     LOG.info("Update word table with recomputed frequencies")
 108                     cur.execute("""UPDATE word
 109                                    SET info = info || jsonb_build_object('count', count)
 110                                    FROM word_frequencies WHERE word_id = id""")
 111                     cur.drop_table("word_frequencies")
 112             conn.commit()
 113
 114
 115     def _cleanup_housenumbers(self):
 116         """ Remove unused house numbers.
 117         """
 118         with connect(self.dsn) as conn:
 119             with conn.cursor(name="hnr_counter") as cur:
 120                 cur.execute("""SELECT word_id, word_token FROM word
 121                                WHERE type = 'H'
 122                                  AND NOT EXISTS(SELECT * FROM search_name
 123                                                 WHERE ARRAY[word.word_id] && name_vector)
 124                                  AND (char_length(word_token) > 6
 125                                       OR word_token not similar to '\d+')
 126                             """)
 127                 candidates = {token: wid for wid, token in cur}
 128             with conn.cursor(name="hnr_counter") as cur:
 129                 cur.execute("""SELECT housenumber FROM placex
 130                                WHERE housenumber is not null
 131                                      AND (char_length(housenumber) > 6
 132                                           OR housenumber not similar to '\d+')
 133                             """)
 134                 for row in cur:
 135                     for hnr in row[0].split(';'):
 136                         candidates.pop(hnr, None)
 137             LOG.info("There are %s outdated housenumbers.", len(candidates))
 138             if candidates:
 139                 with conn.cursor() as cur:
 140                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
 141                                 (list(candidates.values()), ))
 142                 conn.commit()
 143
 144
 145
 146     def update_word_tokens(self):
 147         """ Remove unused tokens.
 148         """
 149         LOG.warn("Cleaning up housenumber tokens.")
 150         self._cleanup_housenumbers()
 151         LOG.warn("Tokenizer house-keeping done.")
 152
 153
 154     def name_analyzer(self):
 155         """ Create a new analyzer for tokenizing names and queries
 156             using this tokinzer. Analyzers are context managers and should
 157             be used accordingly:
 158
 159             ```
 160             with tokenizer.name_analyzer() as analyzer:
 161                 analyser.tokenize()
 162             ```
 163
 164             When used outside the with construct, the caller must ensure to
 165             call the close() function before destructing the analyzer.
 166
 167             Analyzers are not thread-safe. You need to instantiate one per thread.
 168         """
 169         return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 170                                      self.loader.make_token_analysis())
 171
 172
 173     def _install_php(self, phpdir):
 174         """ Install the php script for the tokenizer.
 175         """
 176         php_file = self.data_dir / "tokenizer.php"
 177         php_file.write_text(dedent(f"""\
 178             <?php
 179             @define('CONST_Max_Word_Frequency', 10000000);
 180             @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
 181             @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 182             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 183
 184
 185     def _save_config(self):
 186         """ Save the configuration that needs to remain stable for the given
 187             database as database properties.
 188         """
 189         with connect(self.dsn) as conn:
 190             self.loader.save_config_to_db(conn)
 191
 192
 193     def _init_db_tables(self, config):
 194         """ Set up the word table and fill it with pre-computed word
 195             frequencies.
 196         """
 197         with connect(self.dsn) as conn:
 198             sqlp = SQLPreprocessor(conn, config)
 199             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
 200             conn.commit()
 201
 202
 203 class LegacyICUNameAnalyzer(AbstractAnalyzer):
 204     """ The legacy analyzer uses the ICU library for splitting names.
 205
 206         Each instance opens a connection to the database to request the
 207         normalization.
 208     """
 209
 210     def __init__(self, dsn, sanitizer, token_analysis):
 211         self.conn = connect(dsn).connection
 212         self.conn.autocommit = True
 213         self.sanitizer = sanitizer
 214         self.token_analysis = token_analysis
 215
 216         self._cache = _TokenCache()
 217
 218
 219     def close(self):
 220         """ Free all resources used by the analyzer.
 221         """
 222         if self.conn:
 223             self.conn.close()
 224             self.conn = None
 225
 226
 227     def _search_normalized(self, name):
 228         """ Return the search token transliteration of the given name.
 229         """
 230         return self.token_analysis.search.transliterate(name).strip()
 231
 232
 233     def _normalized(self, name):
 234         """ Return the normalized version of the given name with all
 235             non-relevant information removed.
 236         """
 237         return self.token_analysis.normalizer.transliterate(name).strip()
 238
 239
 240     def get_word_token_info(self, words):
 241         """ Return token information for the given list of words.
 242             If a word starts with # it is assumed to be a full name
 243             otherwise is a partial name.
 244
 245             The function returns a list of tuples with
 246             (original word, word token, word id).
 247
 248             The function is used for testing and debugging only
 249             and not necessarily efficient.
 250         """
 251         full_tokens = {}
 252         partial_tokens = {}
 253         for word in words:
 254             if word.startswith('#'):
 255                 full_tokens[word] = self._search_normalized(word[1:])
 256             else:
 257                 partial_tokens[word] = self._search_normalized(word)
 258
 259         with self.conn.cursor() as cur:
 260             cur.execute("""SELECT word_token, word_id
 261                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 262                         """, (list(full_tokens.values()),))
 263             full_ids = {r[0]: r[1] for r in cur}
 264             cur.execute("""SELECT word_token, word_id
 265                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 266                         (list(partial_tokens.values()),))
 267             part_ids = {r[0]: r[1] for r in cur}
 268
 269         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 270                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 271
 272
 273     @staticmethod
 274     def normalize_postcode(postcode):
 275         """ Convert the postcode to a standardized form.
 276
 277             This function must yield exactly the same result as the SQL function
 278             'token_normalized_postcode()'.
 279         """
 280         return postcode.strip().upper()
 281
 282
 283     def _make_standard_hnr(self, hnr):
 284         """ Create a normalised version of a housenumber.
 285
 286             This function takes minor shortcuts on transliteration.
 287         """
 288         return self._search_normalized(hnr)
 289
 290     def update_postcodes_from_db(self):
 291         """ Update postcode tokens in the word table from the location_postcode
 292             table.
 293         """
 294         to_delete = []
 295         with self.conn.cursor() as cur:
 296             # This finds us the rows in location_postcode and word that are
 297             # missing in the other table.
 298             cur.execute("""SELECT * FROM
 299                             (SELECT pc, word FROM
 300                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 301                               FULL JOIN
 302                               (SELECT word FROM word WHERE type = 'P') w
 303                               ON pc = word) x
 304                            WHERE pc is null or word is null""")
 305
 306             with CopyBuffer() as copystr:
 307                 for postcode, word in cur:
 308                     if postcode is None:
 309                         to_delete.append(word)
 310                     else:
 311                         copystr.add(self._search_normalized(postcode),
 312                                     'P', postcode)
 313
 314                 if to_delete:
 315                     cur.execute("""DELETE FROM WORD
 316                                    WHERE type ='P' and word = any(%s)
 317                                 """, (to_delete, ))
 318
 319                 copystr.copy_out(cur, 'word',
 320                                  columns=['word_token', 'type', 'word'])
 321
 322
 323     def update_special_phrases(self, phrases, should_replace):
 324         """ Replace the search index for special phrases with the new phrases.
 325             If `should_replace` is True, then the previous set of will be
 326             completely replaced. Otherwise the phrases are added to the
 327             already existing ones.
 328         """
 329         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 330                             for p in phrases))
 331
 332         with self.conn.cursor() as cur:
 333             # Get the old phrases.
 334             existing_phrases = set()
 335             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 336             for word, info in cur:
 337                 existing_phrases.add((word, info['class'], info['type'],
 338                                       info.get('op') or '-'))
 339
 340             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 341             if should_replace:
 342                 deleted = self._remove_special_phrases(cur, norm_phrases,
 343                                                        existing_phrases)
 344             else:
 345                 deleted = 0
 346
 347         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 348                  len(norm_phrases), added, deleted)
 349
 350
 351     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
 352         """ Add all phrases to the database that are not yet there.
 353         """
 354         to_add = new_phrases - existing_phrases
 355
 356         added = 0
 357         with CopyBuffer() as copystr:
 358             for word, cls, typ, oper in to_add:
 359                 term = self._search_normalized(word)
 360                 if term:
 361                     copystr.add(term, 'S', word,
 362                                 json.dumps({'class': cls, 'type': typ,
 363                                             'op': oper if oper in ('in', 'near') else None}))
 364                     added += 1
 365
 366             copystr.copy_out(cursor, 'word',
 367                              columns=['word_token', 'type', 'word', 'info'])
 368
 369         return added
 370
 371
 372     @staticmethod
 373     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
 374         """ Remove all phrases from the databse that are no longer in the
 375             new phrase list.
 376         """
 377         to_delete = existing_phrases - new_phrases
 378
 379         if to_delete:
 380             cursor.execute_values(
 381                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 382                     WHERE type = 'S' and word = name
 383                           and info->>'class' = in_class and info->>'type' = in_type
 384                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 385                 """, to_delete)
 386
 387         return len(to_delete)
 388
 389
 390     def add_country_names(self, country_code, names):
 391         """ Add names for the given country to the search index.
 392         """
 393         # Make sure any name preprocessing for country names applies.
 394         info = PlaceInfo({'name': names, 'country_code': country_code,
 395                           'rank_address': 4, 'class': 'boundary',
 396                           'type': 'administrative'})
 397         self._add_country_full_names(country_code,
 398                                      self.sanitizer.process_names(info)[0])
 399
 400
 401     def _add_country_full_names(self, country_code, names):
 402         """ Add names for the given country from an already sanitized
 403             name list.
 404         """
 405         word_tokens = set()
 406         for name in names:
 407             norm_name = self._search_normalized(name.name)
 408             if norm_name:
 409                 word_tokens.add(norm_name)
 410
 411         with self.conn.cursor() as cur:
 412             # Get existing names
 413             cur.execute("""SELECT word_token FROM word
 414                             WHERE type = 'C' and word = %s""",
 415                         (country_code, ))
 416             word_tokens.difference_update((t[0] for t in cur))
 417
 418             # Only add those names that are not yet in the list.
 419             if word_tokens:
 420                 cur.execute("""INSERT INTO word (word_token, type, word)
 421                                (SELECT token, 'C', %s
 422                                 FROM unnest(%s) as token)
 423                             """, (country_code, list(word_tokens)))
 424
 425             # No names are deleted at the moment.
 426             # If deletion is made possible, then the static names from the
 427             # initial 'country_name' table should be kept.
 428
 429
 430     def process_place(self, place):
 431         """ Determine tokenizer information about the given place.
 432
 433             Returns a JSON-serializable structure that will be handed into
 434             the database via the token_info field.
 435         """
 436         token_info = _TokenInfo(self._cache)
 437
 438         names, address = self.sanitizer.process_names(place)
 439
 440         if names:
 441             fulls, partials = self._compute_name_tokens(names)
 442
 443             token_info.add_names(fulls, partials)
 444
 445             if place.is_country():
 446                 self._add_country_full_names(place.country_code, names)
 447
 448         if address:
 449             self._process_place_address(token_info, address)
 450
 451         return token_info.data
 452
 453
 454     def _process_place_address(self, token_info, address):
 455         hnrs = set()
 456         addr_terms = []
 457         streets = []
 458         for item in address:
 459             if item.kind == 'postcode':
 460                 self._add_postcode(item.name)
 461             elif item.kind == 'housenumber':
 462                 norm_name = self._make_standard_hnr(item.name)
 463                 if norm_name:
 464                     hnrs.add(norm_name)
 465             elif item.kind == 'street':
 466                 streets.extend(self._retrieve_full_tokens(item.name))
 467             elif item.kind == 'place':
 468                 if not item.suffix:
 469                     token_info.add_place(self._compute_partial_tokens(item.name))
 470             elif not item.kind.startswith('_') and not item.suffix and \
 471                  item.kind not in ('country', 'full'):
 472                 addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 473
 474         if hnrs:
 475             token_info.add_housenumbers(self.conn, hnrs)
 476
 477         if addr_terms:
 478             token_info.add_address_terms(addr_terms)
 479
 480         if streets:
 481             token_info.add_street(streets)
 482
 483
 484     def _compute_partial_tokens(self, name):
 485         """ Normalize the given term, split it into partial words and return
 486             then token list for them.
 487         """
 488         norm_name = self._search_normalized(name)
 489
 490         tokens = []
 491         need_lookup = []
 492         for partial in norm_name.split():
 493             token = self._cache.partials.get(partial)
 494             if token:
 495                 tokens.append(token)
 496             else:
 497                 need_lookup.append(partial)
 498
 499         if need_lookup:
 500             with self.conn.cursor() as cur:
 501                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 502                                FROM unnest(%s) word""",
 503                             (need_lookup, ))
 504
 505                 for partial, token in cur:
 506                     tokens.append(token)
 507                     self._cache.partials[partial] = token
 508
 509         return tokens
 510
 511
 512     def _retrieve_full_tokens(self, name):
 513         """ Get the full name token for the given name, if it exists.
 514             The name is only retrived for the standard analyser.
 515         """
 516         norm_name = self._search_normalized(name)
 517
 518         # return cached if possible
 519         if norm_name in self._cache.fulls:
 520             return self._cache.fulls[norm_name]
 521
 522         with self.conn.cursor() as cur:
 523             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
 524                         (norm_name, ))
 525             full = [row[0] for row in cur]
 526
 527         self._cache.fulls[norm_name] = full
 528
 529         return full
 530
 531
 532     def _compute_name_tokens(self, names):
 533         """ Computes the full name and partial name tokens for the given
 534             dictionary of names.
 535         """
 536         full_tokens = set()
 537         partial_tokens = set()
 538
 539         for name in names:
 540             analyzer_id = name.get_attr('analyzer')
 541             norm_name = self._normalized(name.name)
 542             if analyzer_id is None:
 543                 token_id = norm_name
 544             else:
 545                 token_id = f'{norm_name}@{analyzer_id}'
 546
 547             full, part = self._cache.names.get(token_id, (None, None))
 548             if full is None:
 549                 variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
 550                 if not variants:
 551                     continue
 552
 553                 with self.conn.cursor() as cur:
 554                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
 555                                 (token_id, variants))
 556                     full, part = cur.fetchone()
 557
 558                 self._cache.names[token_id] = (full, part)
 559
 560             full_tokens.add(full)
 561             partial_tokens.update(part)
 562
 563         return full_tokens, partial_tokens
 564
 565
 566     def _add_postcode(self, postcode):
 567         """ Make sure the normalized postcode is present in the word table.
 568         """
 569         if re.search(r'[:,;]', postcode) is None:
 570             postcode = self.normalize_postcode(postcode)
 571
 572             if postcode not in self._cache.postcodes:
 573                 term = self._search_normalized(postcode)
 574                 if not term:
 575                     return
 576
 577                 with self.conn.cursor() as cur:
 578                     # no word_id needed for postcodes
 579                     cur.execute("""INSERT INTO word (word_token, type, word)
 580                                    (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
 581                                     WHERE NOT EXISTS
 582                                      (SELECT * FROM word
 583                                       WHERE type = 'P' and word = pc))
 584                                 """, (term, postcode))
 585                 self._cache.postcodes.add(postcode)
 586
 587
 588 class _TokenInfo:
 589     """ Collect token information to be sent back to the database.
 590     """
 591     def __init__(self, cache):
 592         self._cache = cache
 593         self.data = {}
 594
 595     @staticmethod
 596     def _mk_array(tokens):
 597         return '{%s}' % ','.join((str(s) for s in tokens))
 598
 599
 600     def add_names(self, fulls, partials):
 601         """ Adds token information for the normalised names.
 602         """
 603         self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 604
 605
 606     def add_housenumbers(self, conn, hnrs):
 607         """ Extract housenumber information from a list of normalised
 608             housenumbers.
 609         """
 610         self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
 611         self.data['hnr'] = ';'.join(hnrs)
 612
 613
 614     def add_street(self, tokens):
 615         """ Add addr:street match terms.
 616         """
 617         self.data['street'] = self._mk_array(tokens)
 618
 619
 620     def add_place(self, tokens):
 621         """ Add addr:place search and match terms.
 622         """
 623         if tokens:
 624             self.data['place'] = self._mk_array(tokens)
 625
 626
 627     def add_address_terms(self, terms):
 628         """ Add additional address terms.
 629         """
 630         tokens = {key: self._mk_array(partials)
 631                   for key, partials in terms if partials}
 632
 633         if tokens:
 634             self.data['addr'] = tokens
 635
 636
 637 class _TokenCache:
 638     """ Cache for token information to avoid repeated database queries.
 639
 640         This cache is not thread-safe and needs to be instantiated per
 641         analyzer.
 642     """
 643     def __init__(self):
 644         self.names = {}
 645         self.partials = {}
 646         self.fulls = {}
 647         self.postcodes = set()
 648         self.housenumbers = {}
 649
 650
 651     def get_hnr_tokens(self, conn, terms):
 652         """ Get token ids for a list of housenumbers, looking them up in the
 653             database if necessary. `terms` is an iterable of normalized
 654             housenumbers.
 655         """
 656         tokens = []
 657         askdb = []
 658
 659         for term in terms:
 660             token = self.housenumbers.get(term)
 661             if token is None:
 662                 askdb.append(term)
 663             else:
 664                 tokens.append(token)
 665
 666         if askdb:
 667             with conn.cursor() as cur:
 668                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 669                             (askdb, ))
 670                 for term, tid in cur:
 671                     self.housenumbers[term] = tid
 672                     tokens.append(tid)
 673
 674         return tokens