nominatim/tokenizer/legacy_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4.
   3 """
   4 import logging
   5 import shutil
   6
   7 import psycopg2
   8
   9 from nominatim.db.connection import connect
  10 from nominatim.db import properties
  11 from nominatim.db import utils as db_utils
  12 from nominatim.db.sql_preprocessor import SQLPreprocessor
  13 from nominatim.errors import UsageError
  14
     # Name of the database property under which the tokenizer saves and
     # looks up its normalization setting (see properties.set_property() and
     # properties.get_property() calls in LegacyTokenizer).
  15 DBCFG_NORMALIZATION = "tokenizer_normalization"
     # Name of the database property under which config.MAX_WORD_FREQUENCY
     # is saved and from which it is read back when the SQL functions
     # are (re)imported.
  16 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
  17
     # getLogger() without a name returns the root logger, so messages from
     # this module are not attributed to a module-specific logger.
  18 LOG = logging.getLogger()
  19
  20 def create(dsn, data_dir):
  21     """ Create a new instance of the tokenizer provided by this module.
  22     """
  23     return LegacyTokenizer(dsn, data_dir)
  24
  25
  26 def _install_module(config_module_path, src_dir, module_dir):
  27     """ Copies the PostgreSQL normalisation module into the project
  28         directory if necessary. For historical reasons the module is
  29         saved in the '/module' subdirectory and not with the other tokenizer
  30         data.
  31
  32         The function detects when the installation is run from the
  33         build directory. It doesn't touch the module in that case.
  34     """
  35     # Custom module locations are simply used as is.
  36     if config_module_path:
  37         LOG.info("Using custom path for database module at '%s'", config_module_path)
  38         return config_module_path
  39
  40     # Compatibility mode for builddir installations.
  41     if module_dir.exists() and src_dir.samefile(module_dir):
  42         LOG.info('Running from build directory. Leaving database module as is.')
  43         return module_dir
  44
  45     # In any other case install the module in the project directory.
  46     if not module_dir.exists():
  47         module_dir.mkdir()
  48
  49     destfile = module_dir / 'nominatim.so'
  50     shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
  51     destfile.chmod(0o755)
  52
  53     LOG.info('Database module installed at %s', str(destfile))
  54
  55     return module_dir
  56
  57
  58 def _check_module(module_dir, conn):
  59     """ Try to use the PostgreSQL module to confirm that it is correctly
  60         installed and accessible from PostgreSQL.
  61     """
  62     with conn.cursor() as cur:
  63         try:
  64             cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
  65                            RETURNS text AS '{}/nominatim.so', 'transliteration'
  66                            LANGUAGE c IMMUTABLE STRICT;
  67                            DROP FUNCTION nominatim_test_import_func(text)
  68                         """.format(module_dir))
  69         except psycopg2.DatabaseError as err:
  70             LOG.fatal("Error accessing database module: %s", err)
  71             raise UsageError("Database module cannot be accessed.") from err
  72
  73
  74 class LegacyTokenizer:
  75     """ The legacy tokenizer uses a special PostgreSQL module to normalize
  76         names and queries. The tokenizer thus implements normalization through
  77         calls to the database.
  78     """
  79
  80     def __init__(self, dsn, data_dir):
  81         self.dsn = dsn
  82         self.data_dir = data_dir
  83         self.normalization = None
  84
  85
  86     def init_new_db(self, config):
  87         """ Set up a new tokenizer for the database.
  88
  89             This copies all necessary data in the project directory to make
  90             sure the tokenizer remains stable even over updates.
  91         """
  92         module_dir = _install_module(config.DATABASE_MODULE_PATH,
  93                                      config.lib_dir.module,
  94                                      config.project_dir / 'module')
  95
  96         self.normalization = config.TERM_NORMALIZATION
  97
  98         with connect(self.dsn) as conn:
  99             _check_module(module_dir, conn)
 100             self._save_config(conn, config)
 101             conn.commit()
 102
 103         self.update_sql_functions(config)
 104         self._init_db_tables(config)
 105
 106
 107     def init_from_project(self):
 108         """ Initialise the tokenizer from the project directory.
 109         """
 110         with connect(self.dsn) as conn:
 111             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 112
 113
 114     def update_sql_functions(self, config):
 115         """ Reimport the SQL functions for this tokenizer.
 116         """
 117         with connect(self.dsn) as conn:
 118             max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
 119             modulepath = config.DATABASE_MODULE_PATH or \
 120                          str((config.project_dir / 'module').resolve())
 121             sqlp = SQLPreprocessor(conn, config)
 122             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
 123                               max_word_freq=max_word_freq,
 124                               modulepath=modulepath)
 125
 126
 127     def migrate_database(self, config):
 128         """ Initialise the project directory of an existing database for
 129             use with this tokenizer.
 130
 131             This is a special migration function for updating existing databases
 132             to new software versions.
 133         """
 134         module_dir = _install_module(config.DATABASE_MODULE_PATH,
 135                                      config.lib_dir.module,
 136                                      config.project_dir / 'module')
 137
 138         with connect(self.dsn) as conn:
 139             _check_module(module_dir, conn)
 140             self._save_config(conn, config)
 141
 142
 143     def _init_db_tables(self, config):
 144         """ Set up the word table and fill it with pre-computed word
 145             frequencies.
 146         """
 147         with connect(self.dsn) as conn:
 148             sqlp = SQLPreprocessor(conn, config)
 149             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 150             conn.commit()
 151
 152         LOG.warning("Precomputing word tokens")
 153         db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
 154
 155
 156     def _save_config(self, conn, config):
 157         """ Save the configuration that needs to remain stable for the given
 158             database as database properties.
 159         """
 160         properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
 161         properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)