"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
import logging
import shutil

import psycopg2

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
# Database property keys under which the tokenizer persists its settings,
# so a database stays usable across software updates.
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

# Module-level logger (root logger, matching the project's convention here).
LOG = logging.getLogger()
def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.

        This is the factory entry point used by the tokenizer loader.
    """
    return LegacyTokenizer(dsn, data_dir)
def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.

        Returns the path of the directory that contains the module.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    # The PostgreSQL server process runs as a different user and must be
    # able to read and load the shared object.
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir
def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.

        Creates and immediately drops a throw-away C function; raises
        UsageError when the shared object cannot be loaded by the server.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err
74 class LegacyTokenizer:
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """
80 def __init__(self, dsn, data_dir):
82 self.data_dir = data_dir
83 self.normalization = None
86 def init_new_db(self, config):
87 """ Set up a new tokenizer for the database.
89 This copies all necessary data in the project directory to make
90 sure the tokenizer remains stable even over updates.
92 module_dir = _install_module(config.DATABASE_MODULE_PATH,
93 config.lib_dir.module,
94 config.project_dir / 'module')
96 self.normalization = config.TERM_NORMALIZATION
98 with connect(self.dsn) as conn:
99 _check_module(module_dir, conn)
100 self._save_config(conn, config)
103 self.update_sql_functions(config)
104 self._init_db_tables(config)
107 def init_from_project(self):
108 """ Initialise the tokenizer from the project directory.
110 with connect(self.dsn) as conn:
111 self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
114 def update_sql_functions(self, config):
115 """ Reimport the SQL functions for this tokenizer.
117 with connect(self.dsn) as conn:
118 max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
119 modulepath = config.DATABASE_MODULE_PATH or \
120 str((config.project_dir / 'module').resolve())
121 sqlp = SQLPreprocessor(conn, config)
122 sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
123 max_word_freq=max_word_freq,
124 modulepath=modulepath)
127 def migrate_database(self, config):
128 """ Initialise the project directory of an existing database for
129 use with this tokenizer.
131 This is a special migration function for updating existing databases
132 to new software versions.
134 module_dir = _install_module(config.DATABASE_MODULE_PATH,
135 config.lib_dir.module,
136 config.project_dir / 'module')
138 with connect(self.dsn) as conn:
139 _check_module(module_dir, conn)
140 self._save_config(conn, config)
143 def _init_db_tables(self, config):
144 """ Set up the word table and fill it with pre-computed word
147 with connect(self.dsn) as conn:
148 sqlp = SQLPreprocessor(conn, config)
149 sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
152 LOG.warning("Precomputing word tokens")
153 db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
156 def _save_config(self, conn, config):
157 """ Save the configuration that needs to remain stable for the given
158 database as database properties.
160 properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
161 properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)