nominatim/tokenizer/icu_tokenizer.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4 but using
   9 libICU instead of the PostgreSQL module.
  10 """
  11 import itertools
  12 import json
  13 import logging
  14 import re
  15 from textwrap import dedent
  16
  17 from nominatim.db.connection import connect
  18 from nominatim.db.utils import CopyBuffer
  19 from nominatim.db.sql_preprocessor import SQLPreprocessor
  20 from nominatim.indexer.place_info import PlaceInfo
  21 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  22 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  23
# Presumably the name of the database property that holds the term
# normalization rules - TODO confirm; the constant is not referenced
# anywhere in the visible part of this module.
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# Logger used for all messages of this module. getLogger() is called
# without a name and therefore returns the root logger.
LOG = logging.getLogger()
  27
  28 def create(dsn, data_dir):
  29     """ Create a new instance of the tokenizer provided by this module.
  30     """
  31     return LegacyICUTokenizer(dsn, data_dir)
  32
  33
  34 class LegacyICUTokenizer(AbstractTokenizer):
  35     """ This tokenizer uses libICU to covert names and queries to ASCII.
  36         Otherwise it uses the same algorithms and data structures as the
  37         normalization routines in Nominatim 3.
  38     """
  39
  40     def __init__(self, dsn, data_dir):
  41         self.dsn = dsn
  42         self.data_dir = data_dir
  43         self.loader = None
  44
  45
  46     def init_new_db(self, config, init_db=True):
  47         """ Set up a new tokenizer for the database.
  48
  49             This copies all necessary data in the project directory to make
  50             sure the tokenizer remains stable even over updates.
  51         """
  52         self.loader = ICURuleLoader(config)
  53
  54         self._install_php(config.lib_dir.php)
  55         self._save_config()
  56
  57         if init_db:
  58             self.update_sql_functions(config)
  59             self._init_db_tables(config)
  60
  61
  62     def init_from_project(self, config):
  63         """ Initialise the tokenizer from the project directory.
  64         """
  65         self.loader = ICURuleLoader(config)
  66
  67         with connect(self.dsn) as conn:
  68             self.loader.load_config_from_db(conn)
  69
  70
  71     def finalize_import(self, config):
  72         """ Do any required postprocessing to make the tokenizer data ready
  73             for use.
  74         """
  75         with connect(self.dsn) as conn:
  76             sqlp = SQLPreprocessor(conn, config)
  77             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  78
  79
  80     def update_sql_functions(self, config):
  81         """ Reimport the SQL functions for this tokenizer.
  82         """
  83         with connect(self.dsn) as conn:
  84             sqlp = SQLPreprocessor(conn, config)
  85             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  86
  87
  88     def check_database(self, config):
  89         """ Check that the tokenizer is set up correctly.
  90         """
  91         # Will throw an error if there is an issue.
  92         self.init_from_project(config)
  93
  94
  95     def update_statistics(self):
  96         """ Recompute frequencies for all name words.
  97         """
  98         with connect(self.dsn) as conn:
  99             if conn.table_exists('search_name'):
 100                 with conn.cursor() as cur:
 101                     cur.drop_table("word_frequencies")
 102                     LOG.info("Computing word frequencies")
 103                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
 104                                      SELECT unnest(name_vector) as id, count(*)
 105                                      FROM search_name GROUP BY id""")
 106                     cur.execute("CREATE INDEX ON word_frequencies(id)")
 107                     LOG.info("Update word table with recomputed frequencies")
 108                     cur.execute("""UPDATE word
 109                                    SET info = info || jsonb_build_object('count', count)
 110                                    FROM word_frequencies WHERE word_id = id""")
 111                     cur.drop_table("word_frequencies")
 112             conn.commit()
 113
 114
 115     def _cleanup_housenumbers(self):
 116         """ Remove unused house numbers.
 117         """
 118         with connect(self.dsn) as conn:
 119             if not conn.table_exists('search_name'):
 120                 return
 121             with conn.cursor(name="hnr_counter") as cur:
 122                 cur.execute("""SELECT word_id, word_token FROM word
 123                                WHERE type = 'H'
 124                                  AND NOT EXISTS(SELECT * FROM search_name
 125                                                 WHERE ARRAY[word.word_id] && name_vector)
 126                                  AND (char_length(word_token) > 6
 127                                       OR word_token not similar to '\\d+')
 128                             """)
 129                 candidates = {token: wid for wid, token in cur}
 130             with conn.cursor(name="hnr_counter") as cur:
 131                 cur.execute("""SELECT housenumber FROM placex
 132                                WHERE housenumber is not null
 133                                      AND (char_length(housenumber) > 6
 134                                           OR housenumber not similar to '\\d+')
 135                             """)
 136                 for row in cur:
 137                     for hnr in row[0].split(';'):
 138                         candidates.pop(hnr, None)
 139             LOG.info("There are %s outdated housenumbers.", len(candidates))
 140             if candidates:
 141                 with conn.cursor() as cur:
 142                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
 143                                 (list(candidates.values()), ))
 144                 conn.commit()
 145
 146
 147
 148     def update_word_tokens(self):
 149         """ Remove unused tokens.
 150         """
 151         LOG.warning("Cleaning up housenumber tokens.")
 152         self._cleanup_housenumbers()
 153         LOG.warning("Tokenizer house-keeping done.")
 154
 155
 156     def name_analyzer(self):
 157         """ Create a new analyzer for tokenizing names and queries
 158             using this tokinzer. Analyzers are context managers and should
 159             be used accordingly:
 160
 161             ```
 162             with tokenizer.name_analyzer() as analyzer:
 163                 analyser.tokenize()
 164             ```
 165
 166             When used outside the with construct, the caller must ensure to
 167             call the close() function before destructing the analyzer.
 168
 169             Analyzers are not thread-safe. You need to instantiate one per thread.
 170         """
 171         return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 172                                      self.loader.make_token_analysis())
 173
 174
 175     def _install_php(self, phpdir):
 176         """ Install the php script for the tokenizer.
 177         """
 178         php_file = self.data_dir / "tokenizer.php"
 179         php_file.write_text(dedent(f"""\
 180             <?php
 181             @define('CONST_Max_Word_Frequency', 10000000);
 182             @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
 183             @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 184             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 185
 186
 187     def _save_config(self):
 188         """ Save the configuration that needs to remain stable for the given
 189             database as database properties.
 190         """
 191         with connect(self.dsn) as conn:
 192             self.loader.save_config_to_db(conn)
 193
 194
 195     def _init_db_tables(self, config):
 196         """ Set up the word table and fill it with pre-computed word
 197             frequencies.
 198         """
 199         with connect(self.dsn) as conn:
 200             sqlp = SQLPreprocessor(conn, config)
 201             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
 202             conn.commit()
 203
 204
 205 class LegacyICUNameAnalyzer(AbstractAnalyzer):
 206     """ The legacy analyzer uses the ICU library for splitting names.
 207
 208         Each instance opens a connection to the database to request the
 209         normalization.
 210     """
 211
 212     def __init__(self, dsn, sanitizer, token_analysis):
 213         self.conn = connect(dsn).connection
 214         self.conn.autocommit = True
 215         self.sanitizer = sanitizer
 216         self.token_analysis = token_analysis
 217
 218         self._cache = _TokenCache()
 219
 220
 221     def close(self):
 222         """ Free all resources used by the analyzer.
 223         """
 224         if self.conn:
 225             self.conn.close()
 226             self.conn = None
 227
 228
 229     def _search_normalized(self, name):
 230         """ Return the search token transliteration of the given name.
 231         """
 232         return self.token_analysis.search.transliterate(name).strip()
 233
 234
 235     def _normalized(self, name):
 236         """ Return the normalized version of the given name with all
 237             non-relevant information removed.
 238         """
 239         return self.token_analysis.normalizer.transliterate(name).strip()
 240
 241
 242     def get_word_token_info(self, words):
 243         """ Return token information for the given list of words.
 244             If a word starts with # it is assumed to be a full name
 245             otherwise is a partial name.
 246
 247             The function returns a list of tuples with
 248             (original word, word token, word id).
 249
 250             The function is used for testing and debugging only
 251             and not necessarily efficient.
 252         """
 253         full_tokens = {}
 254         partial_tokens = {}
 255         for word in words:
 256             if word.startswith('#'):
 257                 full_tokens[word] = self._search_normalized(word[1:])
 258             else:
 259                 partial_tokens[word] = self._search_normalized(word)
 260
 261         with self.conn.cursor() as cur:
 262             cur.execute("""SELECT word_token, word_id
 263                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 264                         """, (list(full_tokens.values()),))
 265             full_ids = {r[0]: r[1] for r in cur}
 266             cur.execute("""SELECT word_token, word_id
 267                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 268                         (list(partial_tokens.values()),))
 269             part_ids = {r[0]: r[1] for r in cur}
 270
 271         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 272                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 273
 274
 275     @staticmethod
 276     def normalize_postcode(postcode):
 277         """ Convert the postcode to a standardized form.
 278
 279             This function must yield exactly the same result as the SQL function
 280             'token_normalized_postcode()'.
 281         """
 282         return postcode.strip().upper()
 283
 284
 285     def update_postcodes_from_db(self):
 286         """ Update postcode tokens in the word table from the location_postcode
 287             table.
 288         """
 289         to_delete = []
 290         with self.conn.cursor() as cur:
 291             # This finds us the rows in location_postcode and word that are
 292             # missing in the other table.
 293             cur.execute("""SELECT * FROM
 294                             (SELECT pc, word FROM
 295                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 296                               FULL JOIN
 297                               (SELECT word FROM word WHERE type = 'P') w
 298                               ON pc = word) x
 299                            WHERE pc is null or word is null""")
 300
 301             with CopyBuffer() as copystr:
 302                 for postcode, word in cur:
 303                     if postcode is None:
 304                         to_delete.append(word)
 305                     else:
 306                         copystr.add(self._search_normalized(postcode),
 307                                     'P', postcode)
 308
 309                 if to_delete:
 310                     cur.execute("""DELETE FROM WORD
 311                                    WHERE type ='P' and word = any(%s)
 312                                 """, (to_delete, ))
 313
 314                 copystr.copy_out(cur, 'word',
 315                                  columns=['word_token', 'type', 'word'])
 316
 317
 318     def update_special_phrases(self, phrases, should_replace):
 319         """ Replace the search index for special phrases with the new phrases.
 320             If `should_replace` is True, then the previous set of will be
 321             completely replaced. Otherwise the phrases are added to the
 322             already existing ones.
 323         """
 324         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 325                             for p in phrases))
 326
 327         with self.conn.cursor() as cur:
 328             # Get the old phrases.
 329             existing_phrases = set()
 330             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 331             for word, info in cur:
 332                 existing_phrases.add((word, info['class'], info['type'],
 333                                       info.get('op') or '-'))
 334
 335             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 336             if should_replace:
 337                 deleted = self._remove_special_phrases(cur, norm_phrases,
 338                                                        existing_phrases)
 339             else:
 340                 deleted = 0
 341
 342         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 343                  len(norm_phrases), added, deleted)
 344
 345
 346     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
 347         """ Add all phrases to the database that are not yet there.
 348         """
 349         to_add = new_phrases - existing_phrases
 350
 351         added = 0
 352         with CopyBuffer() as copystr:
 353             for word, cls, typ, oper in to_add:
 354                 term = self._search_normalized(word)
 355                 if term:
 356                     copystr.add(term, 'S', word,
 357                                 json.dumps({'class': cls, 'type': typ,
 358                                             'op': oper if oper in ('in', 'near') else None}))
 359                     added += 1
 360
 361             copystr.copy_out(cursor, 'word',
 362                              columns=['word_token', 'type', 'word', 'info'])
 363
 364         return added
 365
 366
 367     @staticmethod
 368     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
 369         """ Remove all phrases from the databse that are no longer in the
 370             new phrase list.
 371         """
 372         to_delete = existing_phrases - new_phrases
 373
 374         if to_delete:
 375             cursor.execute_values(
 376                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 377                     WHERE type = 'S' and word = name
 378                           and info->>'class' = in_class and info->>'type' = in_type
 379                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 380                 """, to_delete)
 381
 382         return len(to_delete)
 383
 384
 385     def add_country_names(self, country_code, names):
 386         """ Add default names for the given country to the search index.
 387         """
 388         # Make sure any name preprocessing for country names applies.
 389         info = PlaceInfo({'name': names, 'country_code': country_code,
 390                           'rank_address': 4, 'class': 'boundary',
 391                           'type': 'administrative'})
 392         self._add_country_full_names(country_code,
 393                                      self.sanitizer.process_names(info)[0],
 394                                      internal=True)
 395
 396
 397     def _add_country_full_names(self, country_code, names, internal=False):
 398         """ Add names for the given country from an already sanitized
 399             name list.
 400         """
 401         word_tokens = set()
 402         for name in names:
 403             norm_name = self._search_normalized(name.name)
 404             if norm_name:
 405                 word_tokens.add(norm_name)
 406
 407         with self.conn.cursor() as cur:
 408             # Get existing names
 409             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
 410                              FROM word
 411                              WHERE type = 'C' and word = %s""",
 412                         (country_code, ))
 413             existing_tokens = {True: set(), False: set()} # internal/external names
 414             for word in cur:
 415                 existing_tokens[word[1]].add(word[0])
 416
 417             # Delete names that no longer exist.
 418             gone_tokens = existing_tokens[internal] - word_tokens
 419             if internal:
 420                 gone_tokens.update(existing_tokens[False] & word_tokens)
 421             if gone_tokens:
 422                 cur.execute("""DELETE FROM word
 423                                USING unnest(%s) as token
 424                                WHERE type = 'C' and word = %s
 425                                      and word_token = token""",
 426                             (list(gone_tokens), country_code))
 427
 428             # Only add those names that are not yet in the list.
 429             new_tokens = word_tokens - existing_tokens[True]
 430             if not internal:
 431                 new_tokens -= existing_tokens[False]
 432             if new_tokens:
 433                 if internal:
 434                     sql = """INSERT INTO word (word_token, type, word, info)
 435                                (SELECT token, 'C', %s, '{"internal": "yes"}'
 436                                   FROM unnest(%s) as token)
 437                            """
 438                 else:
 439                     sql = """INSERT INTO word (word_token, type, word)
 440                                    (SELECT token, 'C', %s
 441                                     FROM unnest(%s) as token)
 442                           """
 443                 cur.execute(sql, (country_code, list(new_tokens)))
 444
 445
 446     def process_place(self, place):
 447         """ Determine tokenizer information about the given place.
 448
 449             Returns a JSON-serializable structure that will be handed into
 450             the database via the token_info field.
 451         """
 452         token_info = _TokenInfo()
 453
 454         names, address = self.sanitizer.process_names(place)
 455
 456         if names:
 457             token_info.set_names(*self._compute_name_tokens(names))
 458
 459             if place.is_country():
 460                 self._add_country_full_names(place.country_code, names)
 461
 462         if address:
 463             self._process_place_address(token_info, address)
 464
 465         return token_info.to_dict()
 466
 467
 468     def _process_place_address(self, token_info, address):
 469         for item in address:
 470             if item.kind == 'postcode':
 471                 self._add_postcode(item.name)
 472             elif item.kind == 'housenumber':
 473                 token_info.add_housenumber(*self._compute_housenumber_token(item))
 474             elif item.kind == 'street':
 475                 token_info.add_street(self._retrieve_full_tokens(item.name))
 476             elif item.kind == 'place':
 477                 if not item.suffix:
 478                     token_info.add_place(self._compute_partial_tokens(item.name))
 479             elif not item.kind.startswith('_') and not item.suffix and \
 480                  item.kind not in ('country', 'full'):
 481                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
 482
 483
 484     def _compute_housenumber_token(self, hnr):
 485         """ Normalize the housenumber and return the word token and the
 486             canonical form.
 487         """
 488         norm_name = self._search_normalized(hnr.name)
 489         if not norm_name:
 490             return None, None
 491
 492         token = self._cache.housenumbers.get(norm_name)
 493         if token is None:
 494             with self.conn.cursor() as cur:
 495                 cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
 496                 token = cur.fetchone()[0]
 497                 self._cache.housenumbers[norm_name] = token
 498
 499         return token, norm_name
 500
 501
 502     def _compute_partial_tokens(self, name):
 503         """ Normalize the given term, split it into partial words and return
 504             then token list for them.
 505         """
 506         norm_name = self._search_normalized(name)
 507
 508         tokens = []
 509         need_lookup = []
 510         for partial in norm_name.split():
 511             token = self._cache.partials.get(partial)
 512             if token:
 513                 tokens.append(token)
 514             else:
 515                 need_lookup.append(partial)
 516
 517         if need_lookup:
 518             with self.conn.cursor() as cur:
 519                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 520                                FROM unnest(%s) word""",
 521                             (need_lookup, ))
 522
 523                 for partial, token in cur:
 524                     tokens.append(token)
 525                     self._cache.partials[partial] = token
 526
 527         return tokens
 528
 529
 530     def _retrieve_full_tokens(self, name):
 531         """ Get the full name token for the given name, if it exists.
 532             The name is only retrived for the standard analyser.
 533         """
 534         norm_name = self._search_normalized(name)
 535
 536         # return cached if possible
 537         if norm_name in self._cache.fulls:
 538             return self._cache.fulls[norm_name]
 539
 540         with self.conn.cursor() as cur:
 541             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
 542                         (norm_name, ))
 543             full = [row[0] for row in cur]
 544
 545         self._cache.fulls[norm_name] = full
 546
 547         return full
 548
 549
 550     def _compute_name_tokens(self, names):
 551         """ Computes the full name and partial name tokens for the given
 552             dictionary of names.
 553         """
 554         full_tokens = set()
 555         partial_tokens = set()
 556
 557         for name in names:
 558             analyzer_id = name.get_attr('analyzer')
 559             analyzer = self.token_analysis.get_analyzer(analyzer_id)
 560             norm_name = analyzer.normalize(name.name)
 561             if analyzer_id is None:
 562                 token_id = norm_name
 563             else:
 564                 token_id = f'{norm_name}@{analyzer_id}'
 565
 566             full, part = self._cache.names.get(token_id, (None, None))
 567             if full is None:
 568                 variants = analyzer.get_variants_ascii(norm_name)
 569                 if not variants:
 570                     continue
 571
 572                 with self.conn.cursor() as cur:
 573                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
 574                                 (token_id, variants))
 575                     full, part = cur.fetchone()
 576
 577                 self._cache.names[token_id] = (full, part)
 578
 579             full_tokens.add(full)
 580             partial_tokens.update(part)
 581
 582         return full_tokens, partial_tokens
 583
 584
 585     def _add_postcode(self, postcode):
 586         """ Make sure the normalized postcode is present in the word table.
 587         """
 588         if re.search(r'[:,;]', postcode) is None:
 589             postcode = self.normalize_postcode(postcode)
 590
 591             if postcode not in self._cache.postcodes:
 592                 term = self._search_normalized(postcode)
 593                 if not term:
 594                     return
 595
 596                 with self.conn.cursor() as cur:
 597                     # no word_id needed for postcodes
 598                     cur.execute("""INSERT INTO word (word_token, type, word)
 599                                    (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
 600                                     WHERE NOT EXISTS
 601                                      (SELECT * FROM word
 602                                       WHERE type = 'P' and word = pc))
 603                                 """, (term, postcode))
 604                 self._cache.postcodes.add(postcode)
 605
 606
 607 class _TokenInfo:
 608     """ Collect token information to be sent back to the database.
 609     """
 610     def __init__(self):
 611         self.names = None
 612         self.housenumbers = set()
 613         self.housenumber_tokens = set()
 614         self.street_tokens = set()
 615         self.place_tokens = set()
 616         self.address_tokens = {}
 617
 618
 619     @staticmethod
 620     def _mk_array(tokens):
 621         return f"{{{','.join((str(s) for s in tokens))}}}"
 622
 623
 624     def to_dict(self):
 625         """ Return the token information in database importable format.
 626         """
 627         out = {}
 628
 629         if self.names:
 630             out['names'] = self.names
 631
 632         if self.housenumbers:
 633             out['hnr'] = ';'.join(self.housenumbers)
 634             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 635
 636         if self.street_tokens:
 637             out['street'] = self._mk_array(self.street_tokens)
 638
 639         if self.place_tokens:
 640             out['place'] = self._mk_array(self.place_tokens)
 641
 642         if self.address_tokens:
 643             out['addr'] = self.address_tokens
 644
 645         return out
 646
 647
 648     def set_names(self, fulls, partials):
 649         """ Adds token information for the normalised names.
 650         """
 651         self.names = self._mk_array(itertools.chain(fulls, partials))
 652
 653
 654     def add_housenumber(self, token, hnr):
 655         """ Extract housenumber information from a list of normalised
 656             housenumbers.
 657         """
 658         if token:
 659             self.housenumbers.add(hnr)
 660             self.housenumber_tokens.add(token)
 661
 662
 663     def add_street(self, tokens):
 664         """ Add addr:street match terms.
 665         """
 666         self.street_tokens.update(tokens)
 667
 668
 669     def add_place(self, tokens):
 670         """ Add addr:place search and match terms.
 671         """
 672         self.place_tokens.update(tokens)
 673
 674
 675     def add_address_term(self, key, partials):
 676         """ Add additional address terms.
 677         """
 678         if partials:
 679             self.address_tokens[key] = self._mk_array(partials)
 680
 681
 682 class _TokenCache:
 683     """ Cache for token information to avoid repeated database queries.
 684
 685         This cache is not thread-safe and needs to be instantiated per
 686         analyzer.
 687     """
 688     def __init__(self):
 689         self.names = {}
 690         self.partials = {}
 691         self.fulls = {}
 692         self.postcodes = set()
 693         self.housenumbers = {}