nominatim/tokenizer/legacy_tokenizer.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4.
   9 """
  10 from collections import OrderedDict
  11 import logging
  12 import re
  13 import shutil
  14 from textwrap import dedent
  15
  16 from icu import Transliterator
  17 import psycopg2
  18 import psycopg2.extras
  19
  20 from nominatim.db.connection import connect
  21 from nominatim.db import properties
  22 from nominatim.db import utils as db_utils
  23 from nominatim.db.sql_preprocessor import SQLPreprocessor
  24 from nominatim.errors import UsageError
  25 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  26
# Names of the database properties under which the tokenizer keeps the
# settings it saves for a database: the term normalization rules and the
# configured maximum word frequency.
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

# getLogger() is called without a name, so this is the root logger.
LOG = logging.getLogger()
  31
  32 def create(dsn, data_dir):
  33     """ Create a new instance of the tokenizer provided by this module.
  34     """
  35     return LegacyTokenizer(dsn, data_dir)
  36
  37
  38 def _install_module(config_module_path, src_dir, module_dir):
  39     """ Copies the PostgreSQL normalisation module into the project
  40         directory if necessary. For historical reasons the module is
  41         saved in the '/module' subdirectory and not with the other tokenizer
  42         data.
  43
  44         The function detects when the installation is run from the
  45         build directory. It doesn't touch the module in that case.
  46     """
  47     # Custom module locations are simply used as is.
  48     if config_module_path:
  49         LOG.info("Using custom path for database module at '%s'", config_module_path)
  50         return config_module_path
  51
  52     # Compatibility mode for builddir installations.
  53     if module_dir.exists() and src_dir.samefile(module_dir):
  54         LOG.info('Running from build directory. Leaving database module as is.')
  55         return module_dir
  56
  57     # In any other case install the module in the project directory.
  58     if not module_dir.exists():
  59         module_dir.mkdir()
  60
  61     destfile = module_dir / 'nominatim.so'
  62     shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
  63     destfile.chmod(0o755)
  64
  65     LOG.info('Database module installed at %s', str(destfile))
  66
  67     return module_dir
  68
  69
  70 def _check_module(module_dir, conn):
  71     """ Try to use the PostgreSQL module to confirm that it is correctly
  72         installed and accessible from PostgreSQL.
  73     """
  74     with conn.cursor() as cur:
  75         try:
  76             cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
  77                            RETURNS text AS '{}/nominatim.so', 'transliteration'
  78                            LANGUAGE c IMMUTABLE STRICT;
  79                            DROP FUNCTION nominatim_test_import_func(text)
  80                         """.format(module_dir))
  81         except psycopg2.DatabaseError as err:
  82             LOG.fatal("Error accessing database module: %s", err)
  83             raise UsageError("Database module cannot be accessed.") from err
  84
  85
  86 class LegacyTokenizer(AbstractTokenizer):
  87     """ The legacy tokenizer uses a special PostgreSQL module to normalize
  88         names and queries. The tokenizer thus implements normalization through
  89         calls to the database.
  90     """
  91
  92     def __init__(self, dsn, data_dir):
  93         self.dsn = dsn
  94         self.data_dir = data_dir
  95         self.normalization = None
  96
  97
  98     def init_new_db(self, config, init_db=True):
  99         """ Set up a new tokenizer for the database.
 100
 101             This copies all necessary data in the project directory to make
 102             sure the tokenizer remains stable even over updates.
 103         """
 104         module_dir = _install_module(config.DATABASE_MODULE_PATH,
 105                                      config.lib_dir.module,
 106                                      config.project_dir / 'module')
 107
 108         self.normalization = config.TERM_NORMALIZATION
 109
 110         self._install_php(config)
 111
 112         with connect(self.dsn) as conn:
 113             _check_module(module_dir, conn)
 114             self._save_config(conn, config)
 115             conn.commit()
 116
 117         if init_db:
 118             self.update_sql_functions(config)
 119             self._init_db_tables(config)
 120
 121
 122     def init_from_project(self, _):
 123         """ Initialise the tokenizer from the project directory.
 124         """
 125         with connect(self.dsn) as conn:
 126             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 127
 128
 129     def finalize_import(self, config):
 130         """ Do any required postprocessing to make the tokenizer data ready
 131             for use.
 132         """
 133         with connect(self.dsn) as conn:
 134             sqlp = SQLPreprocessor(conn, config)
 135             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
 136
 137
 138     def update_sql_functions(self, config):
 139         """ Reimport the SQL functions for this tokenizer.
 140         """
 141         with connect(self.dsn) as conn:
 142             max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
 143             modulepath = config.DATABASE_MODULE_PATH or \
 144                          str((config.project_dir / 'module').resolve())
 145             sqlp = SQLPreprocessor(conn, config)
 146             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
 147                               max_word_freq=max_word_freq,
 148                               modulepath=modulepath)
 149
 150
 151     def check_database(self, _):
 152         """ Check that the tokenizer is set up correctly.
 153         """
 154         hint = """\
 155              The Postgresql extension nominatim.so was not correctly loaded.
 156
 157              Error: {error}
 158
 159              Hints:
 160              * Check the output of the CMmake/make installation step
 161              * Does nominatim.so exist?
 162              * Does nominatim.so exist on the database server?
 163              * Can nominatim.so be accessed by the database user?
 164              """
 165         with connect(self.dsn) as conn:
 166             with conn.cursor() as cur:
 167                 try:
 168                     out = cur.scalar("SELECT make_standard_name('a')")
 169                 except psycopg2.Error as err:
 170                     return hint.format(error=str(err))
 171
 172         if out != 'a':
 173             return hint.format(error='Unexpected result for make_standard_name()')
 174
 175         return None
 176
 177
 178     def migrate_database(self, config):
 179         """ Initialise the project directory of an existing database for
 180             use with this tokenizer.
 181
 182             This is a special migration function for updating existing databases
 183             to new software versions.
 184         """
 185         self.normalization = config.TERM_NORMALIZATION
 186         module_dir = _install_module(config.DATABASE_MODULE_PATH,
 187                                      config.lib_dir.module,
 188                                      config.project_dir / 'module')
 189
 190         with connect(self.dsn) as conn:
 191             _check_module(module_dir, conn)
 192             self._save_config(conn, config)
 193
 194
 195     def update_statistics(self):
 196         """ Recompute the frequency of full words.
 197         """
 198         with connect(self.dsn) as conn:
 199             if conn.table_exists('search_name'):
 200                 with conn.cursor() as cur:
 201                     cur.drop_table("word_frequencies")
 202                     LOG.info("Computing word frequencies")
 203                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
 204                                      SELECT unnest(name_vector) as id, count(*)
 205                                      FROM search_name GROUP BY id""")
 206                     cur.execute("CREATE INDEX ON word_frequencies(id)")
 207                     LOG.info("Update word table with recomputed frequencies")
 208                     cur.execute("""UPDATE word SET search_name_count = count
 209                                    FROM word_frequencies
 210                                    WHERE word_token like ' %' and word_id = id""")
 211                     cur.drop_table("word_frequencies")
 212             conn.commit()
 213
 214     def name_analyzer(self):
 215         """ Create a new analyzer for tokenizing names and queries
 216             using this tokinzer. Analyzers are context managers and should
 217             be used accordingly:
 218
 219             ```
 220             with tokenizer.name_analyzer() as analyzer:
 221                 analyser.tokenize()
 222             ```
 223
 224             When used outside the with construct, the caller must ensure to
 225             call the close() function before destructing the analyzer.
 226
 227             Analyzers are not thread-safe. You need to instantiate one per thread.
 228         """
 229         normalizer = Transliterator.createFromRules("phrase normalizer",
 230                                                     self.normalization)
 231         return LegacyNameAnalyzer(self.dsn, normalizer)
 232
 233
 234     def _install_php(self, config):
 235         """ Install the php script for the tokenizer.
 236         """
 237         php_file = self.data_dir / "tokenizer.php"
 238         php_file.write_text(dedent("""\
 239             <?php
 240             @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
 241             @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
 242             require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
 243             """.format(config)))
 244
 245
 246     def _init_db_tables(self, config):
 247         """ Set up the word table and fill it with pre-computed word
 248             frequencies.
 249         """
 250         with connect(self.dsn) as conn:
 251             sqlp = SQLPreprocessor(conn, config)
 252             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 253             conn.commit()
 254
 255         LOG.warning("Precomputing word tokens")
 256         db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
 257
 258
 259     def _save_config(self, conn, config):
 260         """ Save the configuration that needs to remain stable for the given
 261             database as database properties.
 262         """
 263         properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
 264         properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 265
 266
 267 class LegacyNameAnalyzer(AbstractAnalyzer):
 268     """ The legacy analyzer uses the special Postgresql module for
 269         splitting names.
 270
 271         Each instance opens a connection to the database to request the
 272         normalization.
 273     """
 274
 275     def __init__(self, dsn, normalizer):
 276         self.conn = connect(dsn).connection
 277         self.conn.autocommit = True
 278         self.normalizer = normalizer
 279         psycopg2.extras.register_hstore(self.conn)
 280
 281         self._cache = _TokenCache(self.conn)
 282
 283
 284     def close(self):
 285         """ Free all resources used by the analyzer.
 286         """
 287         if self.conn:
 288             self.conn.close()
 289             self.conn = None
 290
 291
 292     def get_word_token_info(self, words):
 293         """ Return token information for the given list of words.
 294             If a word starts with # it is assumed to be a full name
 295             otherwise is a partial name.
 296
 297             The function returns a list of tuples with
 298             (original word, word token, word id).
 299
 300             The function is used for testing and debugging only
 301             and not necessarily efficient.
 302         """
 303         with self.conn.cursor() as cur:
 304             cur.execute("""SELECT t.term, word_token, word_id
 305                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
 306                            WHERE word_token = (CASE
 307                                    WHEN left(t.term, 1) = '#' THEN
 308                                      ' ' || make_standard_name(substring(t.term from 2))
 309                                    ELSE
 310                                      make_standard_name(t.term)
 311                                    END)
 312                                  and class is null and country_code is null""",
 313                         (words, ))
 314
 315             return [(r[0], r[1], r[2]) for r in cur]
 316
 317
 318     def normalize(self, phrase):
 319         """ Normalize the given phrase, i.e. remove all properties that
 320             are irrelevant for search.
 321         """
 322         return self.normalizer.transliterate(phrase)
 323
 324
 325     @staticmethod
 326     def normalize_postcode(postcode):
 327         """ Convert the postcode to a standardized form.
 328
 329             This function must yield exactly the same result as the SQL function
 330             'token_normalized_postcode()'.
 331         """
 332         return postcode.strip().upper()
 333
 334
 335     def update_postcodes_from_db(self):
 336         """ Update postcode tokens in the word table from the location_postcode
 337             table.
 338         """
 339         with self.conn.cursor() as cur:
 340             # This finds us the rows in location_postcode and word that are
 341             # missing in the other table.
 342             cur.execute("""SELECT * FROM
 343                             (SELECT pc, word FROM
 344                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 345                               FULL JOIN
 346                               (SELECT word FROM word
 347                                 WHERE class ='place' and type = 'postcode') w
 348                               ON pc = word) x
 349                            WHERE pc is null or word is null""")
 350
 351             to_delete = []
 352             to_add = []
 353
 354             for postcode, word in cur:
 355                 if postcode is None:
 356                     to_delete.append(word)
 357                 else:
 358                     to_add.append(postcode)
 359
 360             if to_delete:
 361                 cur.execute("""DELETE FROM WORD
 362                                WHERE class ='place' and type = 'postcode'
 363                                      and word = any(%s)
 364                             """, (to_delete, ))
 365             if to_add:
 366                 cur.execute("""SELECT count(create_postcode_id(pc))
 367                                FROM unnest(%s) as pc
 368                             """, (to_add, ))
 369
 370
 371
 372     def update_special_phrases(self, phrases, should_replace):
 373         """ Replace the search index for special phrases with the new phrases.
 374         """
 375         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
 376                             for p in phrases))
 377
 378         with self.conn.cursor() as cur:
 379             # Get the old phrases.
 380             existing_phrases = set()
 381             cur.execute("""SELECT word, class, type, operator FROM word
 382                            WHERE class != 'place'
 383                                  OR (type != 'house' AND type != 'postcode')""")
 384             for label, cls, typ, oper in cur:
 385                 existing_phrases.add((label, cls, typ, oper or '-'))
 386
 387             to_add = norm_phrases - existing_phrases
 388             to_delete = existing_phrases - norm_phrases
 389
 390             if to_add:
 391                 cur.execute_values(
 392                     """ INSERT INTO word (word_id, word_token, word, class, type,
 393                                           search_name_count, operator)
 394                         (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
 395                                 class, type, 0,
 396                                 CASE WHEN op in ('in', 'near') THEN op ELSE null END
 397                            FROM (VALUES %s) as v(name, class, type, op))""",
 398                     to_add)
 399
 400             if to_delete and should_replace:
 401                 cur.execute_values(
 402                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 403                         WHERE word = name and class = in_class and type = in_type
 404                               and ((op = '-' and operator is null) or op = operator)""",
 405                     to_delete)
 406
 407         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 408                  len(norm_phrases), len(to_add), len(to_delete))
 409
 410
 411     def add_country_names(self, country_code, names):
 412         """ Add names for the given country to the search index.
 413         """
 414         with self.conn.cursor() as cur:
 415             cur.execute(
 416                 """INSERT INTO word (word_id, word_token, country_code)
 417                    (SELECT nextval('seq_word'), lookup_token, %s
 418                       FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
 419                             FROM unnest(%s)n) y
 420                       WHERE NOT EXISTS(SELECT * FROM word
 421                                        WHERE word_token = lookup_token and country_code = %s))
 422                 """, (country_code, list(names.values()), country_code))
 423
 424
 425     def process_place(self, place):
 426         """ Determine tokenizer information about the given place.
 427
 428             Returns a JSON-serialisable structure that will be handed into
 429             the database via the token_info field.
 430         """
 431         token_info = _TokenInfo(self._cache)
 432
 433         names = place.name
 434
 435         if names:
 436             token_info.add_names(self.conn, names)
 437
 438             if place.is_country():
 439                 self.add_country_names(place.country_code, names)
 440
 441         address = place.address
 442         if address:
 443             self._process_place_address(token_info, address)
 444
 445         return token_info.data
 446
 447
 448     def _process_place_address(self, token_info, address):
 449         hnrs = []
 450         addr_terms = []
 451
 452         for key, value in address.items():
 453             if key == 'postcode':
 454                 # Make sure the normalized postcode is present in the word table.
 455                 if re.search(r'[:,;]', value) is None:
 456                     self._cache.add_postcode(self.conn,
 457                                              self.normalize_postcode(value))
 458             elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 459                 hnrs.append(value)
 460             elif key == 'street':
 461                 token_info.add_street(self.conn, value)
 462             elif key == 'place':
 463                 token_info.add_place(self.conn, value)
 464             elif not key.startswith('_') and key not in ('country', 'full'):
 465                 addr_terms.append((key, value))
 466
 467         if hnrs:
 468             token_info.add_housenumbers(self.conn, hnrs)
 469
 470         if addr_terms:
 471             token_info.add_address_terms(self.conn, addr_terms)
 472
 473
 474
 475 class _TokenInfo:
 476     """ Collect token information to be sent back to the database.
 477     """
 478     def __init__(self, cache):
 479         self.cache = cache
 480         self.data = {}
 481
 482
 483     def add_names(self, conn, names):
 484         """ Add token information for the names of the place.
 485         """
 486         with conn.cursor() as cur:
 487             # Create the token IDs for all names.
 488             self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
 489                                             (names, ))
 490
 491
 492     def add_housenumbers(self, conn, hnrs):
 493         """ Extract housenumber information from the address.
 494         """
 495         if len(hnrs) == 1:
 496             token = self.cache.get_housenumber(hnrs[0])
 497             if token is not None:
 498                 self.data['hnr_tokens'] = token
 499                 self.data['hnr'] = hnrs[0]
 500                 return
 501
 502         # split numbers if necessary
 503         simple_list = []
 504         for hnr in hnrs:
 505             simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 506
 507         if len(simple_list) > 1:
 508             simple_list = list(set(simple_list))
 509
 510         with conn.cursor() as cur:
 511             cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
 512             self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
 513
 514
 515     def add_street(self, conn, street):
 516         """ Add addr:street match terms.
 517         """
 518         def _get_street(name):
 519             with conn.cursor() as cur:
 520                 return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
 521
 522         tokens = self.cache.streets.get(street, _get_street)
 523         if tokens:
 524             self.data['street'] = tokens
 525
 526
 527     def add_place(self, conn, place):
 528         """ Add addr:place search and match terms.
 529         """
 530         def _get_place(name):
 531             with conn.cursor() as cur:
 532                 cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
 533                                       word_ids_from_name(%s)::text""",
 534                             (name, name))
 535                 return cur.fetchone()
 536
 537         self.data['place_search'], self.data['place_match'] = \
 538             self.cache.places.get(place, _get_place)
 539
 540
 541     def add_address_terms(self, conn, terms):
 542         """ Add additional address terms.
 543         """
 544         def _get_address_term(name):
 545             with conn.cursor() as cur:
 546                 cur.execute("""SELECT addr_ids_from_name(%s)::text,
 547                                       word_ids_from_name(%s)::text""",
 548                             (name, name))
 549                 return cur.fetchone()
 550
 551         tokens = {}
 552         for key, value in terms:
 553             items = self.cache.address_terms.get(value, _get_address_term)
 554             if items[0] or items[1]:
 555                 tokens[key] = items
 556
 557         if tokens:
 558             self.data['addr'] = tokens
 559
 560
 561 class _LRU:
 562     """ Least recently used cache that accepts a generator function to
 563         produce the item when there is a cache miss.
 564     """
 565
 566     def __init__(self, maxsize=128, init_data=None):
 567         self.data = init_data or OrderedDict()
 568         self.maxsize = maxsize
 569         if init_data is not None and len(init_data) > maxsize:
 570             self.maxsize = len(init_data)
 571
 572     def get(self, key, generator):
 573         """ Get the item with the given key from the cache. If nothing
 574             is found in the cache, generate the value through the
 575             generator function and store it in the cache.
 576         """
 577         value = self.data.get(key)
 578         if value is not None:
 579             self.data.move_to_end(key)
 580         else:
 581             value = generator(key)
 582             if len(self.data) >= self.maxsize:
 583                 self.data.popitem(last=False)
 584             self.data[key] = value
 585
 586         return value
 587
 588
 589 class _TokenCache:
 590     """ Cache for token information to avoid repeated database queries.
 591
 592         This cache is not thread-safe and needs to be instantiated per
 593         analyzer.
 594     """
 595     def __init__(self, conn):
 596         # various LRU caches
 597         self.streets = _LRU(maxsize=256)
 598         self.places = _LRU(maxsize=128)
 599         self.address_terms = _LRU(maxsize=1024)
 600
 601         # Lookup houseunumbers up to 100 and cache them
 602         with conn.cursor() as cur:
 603             cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
 604                            FROM generate_series(1, 100) as i""")
 605             self._cached_housenumbers = {str(r[0]): r[1] for r in cur}
 606
 607         # For postcodes remember the ones that have already been added
 608         self.postcodes = set()
 609
 610     def get_housenumber(self, number):
 611         """ Get a housenumber token from the cache.
 612         """
 613         return self._cached_housenumbers.get(number)
 614
 615
 616     def add_postcode(self, conn, postcode):
 617         """ Make sure the given postcode is in the database.
 618         """
 619         if postcode not in self.postcodes:
 620             with conn.cursor() as cur:
 621                 cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
 622             self.postcodes.add(postcode)