nominatim/tokenizer/legacy_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4.
   3 """
   4 import logging
   5 import shutil
   6
   7 import psycopg2
   8 import psycopg2.extras
   9
  10 from nominatim.db.connection import connect
  11 from nominatim.db import properties
  12 from nominatim.db import utils as db_utils
  13 from nominatim.db.sql_preprocessor import SQLPreprocessor
  14 from nominatim.errors import UsageError
  15
# Name of the database property under which the tokenizer saves its
# term normalization rules.
DBCFG_NORMALIZATION = "tokenizer_normalization"
# Name of the database property under which the tokenizer saves the
# configured maximum word frequency (config.MAX_WORD_FREQUENCY).
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

# getLogger() without a name returns the root logger.
LOG = logging.getLogger()
  20
  21 def create(dsn, data_dir):
  22     """ Create a new instance of the tokenizer provided by this module.
  23     """
  24     return LegacyTokenizer(dsn, data_dir)
  25
  26
  27 def _install_module(config_module_path, src_dir, module_dir):
  28     """ Copies the PostgreSQL normalisation module into the project
  29         directory if necessary. For historical reasons the module is
  30         saved in the '/module' subdirectory and not with the other tokenizer
  31         data.
  32
  33         The function detects when the installation is run from the
  34         build directory. It doesn't touch the module in that case.
  35     """
  36     # Custom module locations are simply used as is.
  37     if config_module_path:
  38         LOG.info("Using custom path for database module at '%s'", config_module_path)
  39         return config_module_path
  40
  41     # Compatibility mode for builddir installations.
  42     if module_dir.exists() and src_dir.samefile(module_dir):
  43         LOG.info('Running from build directory. Leaving database module as is.')
  44         return module_dir
  45
  46     # In any other case install the module in the project directory.
  47     if not module_dir.exists():
  48         module_dir.mkdir()
  49
  50     destfile = module_dir / 'nominatim.so'
  51     shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
  52     destfile.chmod(0o755)
  53
  54     LOG.info('Database module installed at %s', str(destfile))
  55
  56     return module_dir
  57
  58
  59 def _check_module(module_dir, conn):
  60     """ Try to use the PostgreSQL module to confirm that it is correctly
  61         installed and accessible from PostgreSQL.
  62     """
  63     with conn.cursor() as cur:
  64         try:
  65             cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
  66                            RETURNS text AS '{}/nominatim.so', 'transliteration'
  67                            LANGUAGE c IMMUTABLE STRICT;
  68                            DROP FUNCTION nominatim_test_import_func(text)
  69                         """.format(module_dir))
  70         except psycopg2.DatabaseError as err:
  71             LOG.fatal("Error accessing database module: %s", err)
  72             raise UsageError("Database module cannot be accessed.") from err
  73
  74
  75 class LegacyTokenizer:
  76     """ The legacy tokenizer uses a special PostgreSQL module to normalize
  77         names and queries. The tokenizer thus implements normalization through
  78         calls to the database.
  79     """
  80
  81     def __init__(self, dsn, data_dir):
  82         self.dsn = dsn
  83         self.data_dir = data_dir
  84         self.normalization = None
  85
  86
  87     def init_new_db(self, config):
  88         """ Set up a new tokenizer for the database.
  89
  90             This copies all necessary data in the project directory to make
  91             sure the tokenizer remains stable even over updates.
  92         """
  93         module_dir = _install_module(config.DATABASE_MODULE_PATH,
  94                                      config.lib_dir.module,
  95                                      config.project_dir / 'module')
  96
  97         self.normalization = config.TERM_NORMALIZATION
  98
  99         with connect(self.dsn) as conn:
 100             _check_module(module_dir, conn)
 101             self._save_config(conn, config)
 102             conn.commit()
 103
 104         self.update_sql_functions(config)
 105         self._init_db_tables(config)
 106
 107
 108     def init_from_project(self):
 109         """ Initialise the tokenizer from the project directory.
 110         """
 111         with connect(self.dsn) as conn:
 112             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 113
 114
 115     def update_sql_functions(self, config):
 116         """ Reimport the SQL functions for this tokenizer.
 117         """
 118         with connect(self.dsn) as conn:
 119             max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
 120             modulepath = config.DATABASE_MODULE_PATH or \
 121                          str((config.project_dir / 'module').resolve())
 122             sqlp = SQLPreprocessor(conn, config)
 123             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
 124                               max_word_freq=max_word_freq,
 125                               modulepath=modulepath)
 126
 127
 128     def migrate_database(self, config):
 129         """ Initialise the project directory of an existing database for
 130             use with this tokenizer.
 131
 132             This is a special migration function for updating existing databases
 133             to new software versions.
 134         """
 135         module_dir = _install_module(config.DATABASE_MODULE_PATH,
 136                                      config.lib_dir.module,
 137                                      config.project_dir / 'module')
 138
 139         with connect(self.dsn) as conn:
 140             _check_module(module_dir, conn)
 141             self._save_config(conn, config)
 142
 143
 144     def name_analyzer(self):
 145         """ Create a new analyzer for tokenizing names and queries
 146             using this tokinzer. Analyzers are context managers and should
 147             be used accordingly:
 148
 149             ```
 150             with tokenizer.name_analyzer() as analyzer:
 151                 analyser.tokenize()
 152             ```
 153
 154             When used outside the with construct, the caller must ensure to
 155             call the close() function before destructing the analyzer.
 156
 157             Analyzers are not thread-safe. You need to instantiate one per thread.
 158         """
 159         return LegacyNameAnalyzer(self.dsn)
 160
 161
 162     def _init_db_tables(self, config):
 163         """ Set up the word table and fill it with pre-computed word
 164             frequencies.
 165         """
 166         with connect(self.dsn) as conn:
 167             sqlp = SQLPreprocessor(conn, config)
 168             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 169             conn.commit()
 170
 171         LOG.warning("Precomputing word tokens")
 172         db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
 173
 174
 175     def _save_config(self, conn, config):
 176         """ Save the configuration that needs to remain stable for the given
 177             database as database properties.
 178         """
 179         properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
 180         properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 181
 182
 183
 184 class LegacyNameAnalyzer:
 185     """ The legacy analyzer uses the special Postgresql module for
 186         splitting names.
 187
 188         Each instance opens a connection to the database to request the
 189         normalization.
 190     """
 191
 192     def __init__(self, dsn):
 193         self.conn = connect(dsn).connection
 194         self.conn.autocommit = True
 195         psycopg2.extras.register_hstore(self.conn)
 196
 197
 198     def __enter__(self):
 199         return self
 200
 201
 202     def __exit__(self, exc_type, exc_value, traceback):
 203         self.close()
 204
 205
 206     def close(self):
 207         """ Free all resources used by the analyzer.
 208         """
 209         if self.conn:
 210             self.conn.close()
 211             self.conn = None
 212
 213     def process_place(self, place):
 214         """ Determine tokenizer information about the given place.
 215
 216             Returns a JSON-serialisable structure that will be handed into
 217             the database via the token_info field.
 218         """
 219         return {}