git.openstreetmap.org Git - nominatim.git/commitdiff
move index creation for word table to tokenizer
author: Sarah Hoffmann <lonvia@denofr.de>
Fri, 30 Apr 2021 15:28:34 +0000 (17:28 +0200)
committer: Sarah Hoffmann <lonvia@denofr.de>
Fri, 30 Apr 2021 15:41:08 +0000 (17:41 +0200)
This introduces a finalization routine for the tokenizer
where it can post-process the import if necessary.

lib-sql/indices.sql
lib-sql/tokenizer/legacy_tokenizer_indices.sql [new file with mode: 0644]
nominatim/clicmd/setup.py
nominatim/tokenizer/legacy_tokenizer.py
test/python/dummy_tokenizer.py
test/python/test_cli.py

index a6f7cf95fcb6c7e0346ee0cdc8ca54fe2a77be81..81299544573c0c4c1ffea2850b8be54477f6a2bb 100644 (file)
@@ -1,9 +1,6 @@
 -- Indices used only during search and update.
 -- These indices are created only after the indexing process is done.
 
-CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
-  ON word USING BTREE (word_id) {{db.tablespace.search_index}};
-
 CREATE INDEX {{sql.if_index_not_exists}} idx_place_addressline_address_place_id
   ON place_addressline USING BTREE (address_place_id) {{db.tablespace.search_index}};
 
diff --git a/lib-sql/tokenizer/legacy_tokenizer_indices.sql b/lib-sql/tokenizer/legacy_tokenizer_indices.sql
new file mode 100644 (file)
index 0000000..44a2909
--- /dev/null
@@ -0,0 +1,2 @@
+CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
+  ON word USING BTREE (word_id) {{db.tablespace.search_index}};
index 0f19d0975d460b6a243151e9c902ec460ed7afbd..eb0178a9f560f563a867488b5e608e67a0c024ad 100644 (file)
@@ -135,6 +135,7 @@ class SetupAll:
             LOG.warning('Create search index for default country names.')
             database_import.create_country_names(conn, tokenizer,
                                                  args.config.LANGUAGES)
+        tokenizer.finalize_import(args.config)
 
         webdir = args.project_dir / 'website'
         LOG.warning('Setup website at %s', webdir)
index b1fd9e9673febce83bdc04cedbf38e0482529db9..2f060b84aa98761b653f15a41f68566dc31cb2d8 100644 (file)
@@ -119,6 +119,15 @@ class LegacyTokenizer:
             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 
 
+    def finalize_import(self, config):
+        """ Do any required postprocessing to make the tokenizer data ready
+            for use.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+
+
     def update_sql_functions(self, config):
         """ Reimport the SQL functions for this tokenizer.
         """
index d3f006deb3b029ffe05db5b9b6d7f13ba3d20b91..6352a644c9988d52ea187634c10519642caa7cc6 100644 (file)
@@ -26,6 +26,10 @@ class DummyTokenizer:
         self.init_state = "loaded"
 
 
+    def finalize_import(self, _):
+        pass
+
+
     def name_analyzer(self):
         return DummyNameAnalyzer(self.analyser_cache)
 
index e0d4fb86ca0c7b8f9638fea54b5a1afc14eed89b..a286995611ddf524c54ae5c1657b3e3e15bf55c2 100644 (file)
@@ -62,13 +62,19 @@ def tokenizer_mock(monkeypatch):
     class DummyTokenizer:
         def __init__(self, *args, **kwargs):
             self.update_sql_functions_called = False
+            self.finalize_import_called = False
 
         def update_sql_functions(self, *args):
             self.update_sql_functions_called = True
 
+        def finalize_import(self, *args):
+            self.finalize_import_called = True
+
     tok = DummyTokenizer()
     monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db' ,
                         lambda *args: tok)
+    monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer' ,
+                        lambda *args: tok)
 
     return tok
 
@@ -101,7 +107,7 @@ def test_import_bad_file(temp_db):
     assert 1 == call_nominatim('import', '--osm-file', '.')
 
 
-def test_import_full(temp_db, mock_func_factory):
+def test_import_full(temp_db, mock_func_factory, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'),
         mock_func_factory(nominatim.tools.database_import, 'import_osm_data'),
@@ -113,7 +119,6 @@ def test_import_full(temp_db, mock_func_factory):
         mock_func_factory(nominatim.tools.database_import, 'create_partition_tables'),
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
-        mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
         mock_func_factory(nominatim.tools.refresh, 'load_address_levels_from_file'),
         mock_func_factory(nominatim.tools.postcodes, 'import_postcodes'),
         mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
@@ -124,6 +129,7 @@ def test_import_full(temp_db, mock_func_factory):
     cf_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
 
     assert 0 == call_nominatim('import', '--osm-file', __file__)
+    assert tokenizer_mock.finalize_import_called
 
     assert cf_mock.called > 1
 
@@ -131,13 +137,12 @@ def test_import_full(temp_db, mock_func_factory):
         assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
 
 
-def test_import_continue_load_data(temp_db, mock_func_factory):
+def test_import_continue_load_data(temp_db, mock_func_factory, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
         mock_func_factory(nominatim.tools.database_import, 'load_data'),
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
-        mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
         mock_func_factory(nominatim.tools.postcodes, 'import_postcodes'),
         mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
         mock_func_factory(nominatim.tools.refresh, 'setup_website'),
@@ -145,17 +150,18 @@ def test_import_continue_load_data(temp_db, mock_func_factory):
     ]
 
     assert 0 == call_nominatim('import', '--continue', 'load-data')
+    assert tokenizer_mock.finalize_import_called
 
     for mock in mocks:
         assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
 
 
-def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp_db_conn):
+def test_import_continue_indexing(temp_db, mock_func_factory, placex_table,
+                                  temp_db_conn, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
         mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
-        mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
         mock_func_factory(nominatim.tools.refresh, 'setup_website'),
         mock_func_factory(nominatim.db.properties, 'set_property')
     ]
@@ -172,17 +178,18 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp
     assert temp_db_conn.index_exists('idx_placex_pendingsector')
 
 
-def test_import_continue_postprocess(temp_db, mock_func_factory):
+def test_import_continue_postprocess(temp_db, mock_func_factory, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
         mock_func_factory(nominatim.tools.refresh, 'setup_website'),
-        mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
         mock_func_factory(nominatim.db.properties, 'set_property')
     ]
 
     assert 0 == call_nominatim('import', '--continue', 'db-postprocess')
 
+    assert tokenizer_mock.finalize_import_called
+
     for mock in mocks:
         assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)