"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
import logging
import shutil

import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)
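

# The sketch below is illustrative only and not part of the original module:
# it shows one plausible way the factory above gets driven. The dsn string
# and project path are made-up values, and `config` is assumed to be a
# Nominatim configuration object providing the attributes used by the
# tokenizer methods further down.
def _example_tokenizer_setup(config, new_db=False):
    from pathlib import Path

    # Hypothetical connection string and tokenizer data directory.
    tokenizer = create('dbname=nominatim', Path('/srv/project/tokenizer'))
    if new_db:
        # One-time setup when importing into a fresh database.
        tokenizer.init_new_db(config)
    else:
        # Re-attach to a database that was set up earlier.
        tokenizer.init_from_project()
    return tokenizer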


def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'",
                 config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir


def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer:
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None

    def init_new_db(self, config):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        self.update_sql_functions(config)
        self._init_db_tables(config)

    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)

    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.process_place(place)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyNameAnalyzer(self.dsn)
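
    # Hedged sketch, not part of the original source: one analyzer instance
    # per worker thread, as the thread-safety note in the docstring above
    # requires. The names 'worker', 'places' and 'store_token_info' are
    # invented purely for illustration.
    #
    #     def worker(tokenizer, places):
    #         with tokenizer.name_analyzer() as analyzer:
    #             for place in places:
    #                 store_token_info(analyzer.process_place(place))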

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')

    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer:
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        psycopg2.extras.register_hstore(self.conn)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
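        # Hedged illustration, not from the original source: "JSON-serialisable"
        # here means the result must survive a json.dumps() round-trip, because
        # it is written to the database through the token_info field:
        #
        #     info = analyzer.process_place(place)
        #     json.dumps(info)   # must succeed without a custom encoder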