git.openstreetmap.org Git - nominatim.git/commitdiff
restore the tokenizer directory when missing
author Sarah Hoffmann <lonvia@denofr.de>
Sun, 20 Mar 2022 10:31:42 +0000 (11:31 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Sun, 20 Mar 2022 10:31:42 +0000 (11:31 +0100)
Automatically repopulate the tokenizer/ directory with the PHP stub
and the postgresql module when the directory is missing. This allows
switching working directories and, in particular, running the service
from a different machine than the one where it was installed.
Users still need to make sure that .env files are set up correctly
or they will shoot themselves in the foot.

See #2515.
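
For orientation, the sketch below condenses what the commit does: instead of
aborting when tokenizer/ is gone, the factory recreates the directory and lets
the tokenizer's init_from_project() write the missing artifacts back. This is
an illustration only; restore_tokenizer_dir is a made-up name and the real
logic is spread over the files listed below.

    from pathlib import Path

    def restore_tokenizer_dir(project_dir: Path) -> Path:
        """Stand-in for the new factory behaviour (sketch only)."""
        basedir = project_dir / 'tokenizer'
        if not basedir.is_dir():
            # The PHP stub and, for the legacy tokenizer, the postgresql
            # module are written back later by init_from_project().
            basedir.mkdir()
        return basedir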

nominatim/db/properties.py
nominatim/tokenizer/factory.py
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/legacy_tokenizer.py
test/bdd/steps/nominatim_environment.py
test/python/tokenizer/test_factory.py

index 19c090069ac9ab9ccaf33bd20caff1f53088fcd7..270204872dd56459691c4105156fe8073d6aa815 100644 (file)
@@ -27,6 +27,9 @@ def get_property(conn, name):
     """ Return the current value of the given propery or None if the property
         is not set.
     """
+    if not conn.table_exists('nominatim_properties'):
+        return None
+
     with conn.cursor() as cur:
         cur.execute('SELECT value FROM nominatim_properties WHERE property = %s',
                     (name, ))
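
This guard lets get_property() be called against a database that does not have
the nominatim_properties table yet, returning None instead of failing
mid-query. A rough equivalent of such a table check, assuming a plain psycopg2
cursor; the real Connection.table_exists() helper may be implemented
differently:

    def table_exists(cur, table):
        """Probe the information schema for the table (sketch only)."""
        cur.execute("""SELECT count(*) FROM information_schema.tables
                        WHERE table_name = %s""", (table,))
        return cur.fetchone()[0] > 0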
index fbda246238f16bebb6b75806a735975e564ca815..108c7841e0c7c3e4f8bf6bd25b3aa8d9c35bba42 100644 (file)
@@ -78,8 +78,8 @@ def get_tokenizer_for_db(config):
     """
     basedir = config.project_dir / 'tokenizer'
     if not basedir.is_dir():
-        LOG.fatal("Cannot find tokenizer data in '%s'.", basedir)
-        raise UsageError('Cannot initialize tokenizer.')
+        # Directory will be repopulated by tokenizer below.
+        basedir.mkdir()
 
     with connect(config.get_libpq_dsn()) as conn:
         name = properties.get_property(conn, 'tokenizer')
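
One caveat with the bare mkdir(): it raises FileNotFoundError when the project
directory itself does not exist and FileExistsError if another process creates
tokenizer/ concurrently. A more forgiving variant, shown here only as an
alternative and not what the commit uses, would be:

    basedir.mkdir(parents=True, exist_ok=True)

The bare call is arguably the right choice here, though, since a missing
project directory should still surface as an error rather than be silently
created.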
index 1799ae86d0330ee61c2fc5fe05118ff00e0ef162..b553dbc641d708175e8f7281f05cf14cf4673484 100644 (file)
@@ -51,7 +51,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """
         self.loader = ICURuleLoader(config)
 
-        self._install_php(config.lib_dir.php)
+        self._install_php(config.lib_dir.php, overwrite=True)
         self._save_config()
 
         if init_db:
@@ -67,6 +67,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
         with connect(self.dsn) as conn:
             self.loader.load_config_from_db(conn)
 
+        self._install_php(config.lib_dir.php, overwrite=False)
+
 
     def finalize_import(self, config):
         """ Do any required postprocessing to make the tokenizer data ready
@@ -174,16 +176,18 @@ class LegacyICUTokenizer(AbstractTokenizer):
                                      self.loader.make_token_analysis())
 
 
-    def _install_php(self, phpdir):
+    def _install_php(self, phpdir, overwrite=True):
         """ Install the php script for the tokenizer.
         """
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent(f"""\
-            <?php
-            @define('CONST_Max_Word_Frequency', 10000000);
-            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
-            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+
+        if not php_file.exists() or overwrite:
+            php_file.write_text(dedent(f"""\
+                <?php
+                @define('CONST_Max_Word_Frequency', 10000000);
+                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
+                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
 
     def _save_config(self):
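
Both tokenizers now protect their PHP stub with the same write-if-missing
idiom: overwrite=True during initial setup, overwrite=False when loading an
existing project. Factored out into a standalone helper, the pattern looks
roughly like this (a sketch, not code from the commit):

    from pathlib import Path

    def install_file(path: Path, content: str, overwrite: bool = True) -> bool:
        """Write content to path unless it exists and overwrite is False.
        Returns True when the file was (re)written."""
        if path.exists() and not overwrite:
            return False
        path.write_text(content)
        return True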
index 28f4b32756c0756ea172ca3aa16a458ac6ce929d..3b8f75692f964e9c2e84dc3ada92b156dd0afb7b 100644 (file)
@@ -107,7 +107,7 @@ class LegacyTokenizer(AbstractTokenizer):
 
         self.normalization = config.TERM_NORMALIZATION
 
-        self._install_php(config)
+        self._install_php(config, overwrite=True)
 
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
@@ -119,12 +119,18 @@ class LegacyTokenizer(AbstractTokenizer):
             self._init_db_tables(config)
 
 
-    def init_from_project(self, _):
+    def init_from_project(self, config):
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 
+        if not (config.project_dir / 'module' / 'nominatim.so').exists():
+            _install_module(config.DATABASE_MODULE_PATH,
+                            config.lib_dir.module,
+                            config.project_dir / 'module')
+
+        self._install_php(config, overwrite=False)
 
     def finalize_import(self, config):
         """ Do any required postprocessing to make the tokenizer data ready
@@ -238,16 +244,18 @@ class LegacyTokenizer(AbstractTokenizer):
         return LegacyNameAnalyzer(self.dsn, normalizer)
 
 
-    def _install_php(self, config):
+    def _install_php(self, config, overwrite=True):
         """ Install the php script for the tokenizer.
         """
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent("""\
-            <?php
-            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
-            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
-            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-            """.format(config)))
+
+        if not php_file.exists() or overwrite:
+            php_file.write_text(dedent("""\
+                <?php
+                @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
+                @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
+                require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+                """.format(config)))
 
 
     def _init_db_tables(self, config):
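
The legacy tokenizer also restores the compiled postgresql module, keyed off
the existence of a single file; a module that is present but stale is left
untouched. A stricter, hypothetical variant could compare modification times
as well (this is not what the commit does; it only checks for existence):

    # Hypothetical freshness check; the commit tests nominatim.so existence only.
    src = config.lib_dir.module / 'nominatim.so'
    dst = config.project_dir / 'module' / 'nominatim.so'
    if not dst.exists() or dst.stat().st_mtime < src.stat().st_mtime:
        _install_module(config.DATABASE_MODULE_PATH,
                        config.lib_dir.module,
                        config.project_dir / 'module')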
index 6f4f14a71b8a0f44fb190ea41d2c7722af70f8a4..7de32e484f50f1f645c79137a45541b2820538ca 100644 (file)
@@ -217,7 +217,7 @@ class NominatimEnvironment:
                     self.db_drop_database(self.api_test_db)
                     raise
 
-        tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
+        tokenizer_factory.get_tokenizer_for_db(self.get_test_config())
 
 
     def setup_unknown_db(self):
index aa763e28fd835f3b0a7119b254b52ac5fd059b4f..166e6ba6388f424dbbd2347751398294eec45a96 100644 (file)
@@ -63,13 +63,13 @@ class TestFactory:
         assert tokenizer.init_state == "loaded"
 
 
-    def test_load_no_tokenizer_dir(self):
+    def test_load_repopulate_tokenizer_dir(self):
         factory.create_tokenizer(self.config)
 
-        self.config.project_dir = self.config.project_dir / 'foo'
+        shutil.rmtree(self.config.project_dir / 'tokenizer')  # assumes 'import shutil'
 
-        with pytest.raises(UsageError):
-            factory.get_tokenizer_for_db(self.config)
+        factory.get_tokenizer_for_db(self.config)
+        assert (self.config.project_dir / 'tokenizer').exists()
 
 
     def test_load_missing_property(self, temp_db_cursor):