]> git.openstreetmap.org Git - nominatim.git/commitdiff
port wikipedia importance functions to python
authorSarah Hoffmann <lonvia@denofr.de>
Wed, 24 Feb 2021 21:02:13 +0000 (22:02 +0100)
committerSarah Hoffmann <lonvia@denofr.de>
Thu, 25 Feb 2021 17:42:54 +0000 (18:42 +0100)
lib-php/admin/setup.php
lib-php/admin/update.php
lib-php/setup/SetupClass.php
nominatim/clicmd/refresh.py
nominatim/db/utils.py
nominatim/tools/refresh.py
test/python/conftest.py
test/python/test_cli.py
test/python/test_db_connection.py
test/python/test_db_utils.py
test/python/test_tools_refresh_address_levels.py

index d38313221c1ad4e7307ccd1ce1bee7abad4750e1..91b72c15f844298d57726c5e429b3cab9d20f86b 100644 (file)
@@ -131,7 +131,7 @@ if ($aCMDResult['create-partition-functions'] || $aCMDResult['all']) {
 
 if ($aCMDResult['import-wikipedia-articles'] || $aCMDResult['all']) {
     $bDidSomething = true;
-    $oSetup->importWikipediaArticles();
+    (clone($oNominatimCmd))->addParams('refresh', '--wiki-data')->run();
 }
 
 if ($aCMDResult['load-data'] || $aCMDResult['all']) {
@@ -157,7 +157,7 @@ if ($aCMDResult['index'] || $aCMDResult['all']) {
 
 if ($aCMDResult['drop']) {
     $bDidSomething = true;
-    $oSetup->drop($aCMDResult);
+    (clone($oNominatimCmd))->addParams('freeze')->run(true);
 }
 
 if ($aCMDResult['create-search-indices'] || $aCMDResult['all']) {
@@ -172,7 +172,7 @@ if ($aCMDResult['create-country-names'] || $aCMDResult['all']) {
 
 if ($aCMDResult['setup-website'] || $aCMDResult['all']) {
     $bDidSomething = true;
-    $oSetup->setupWebsite();
+    (clone($oNominatimCmd))->addParams('refresh', '--website')->run(true);
 }
 
 // ******************************************************
index fba5300b07ee4ec0d5791d01895bfe16eb1310e3..4f639c8d5b262b697b4250f59bf98703452a42e0 100644 (file)
@@ -211,20 +211,7 @@ if ($aResult['update-address-levels']) {
 }
 
 if ($aResult['recompute-importance']) {
-    echo "Updating importance values for database.\n";
-    $oDB = new Nominatim\DB();
-    $oDB->connect();
-
-    $sSQL = 'ALTER TABLE placex DISABLE TRIGGER ALL;';
-    $sSQL .= 'UPDATE placex SET (wikipedia, importance) =';
-    $sSQL .= '   (SELECT wikipedia, importance';
-    $sSQL .= '    FROM compute_importance(extratags, country_code, osm_type, osm_id));';
-    $sSQL .= 'UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance';
-    $sSQL .= ' FROM placex d';
-    $sSQL .= ' WHERE s.place_id = d.linked_place_id and d.wikipedia is not null';
-    $sSQL .= '       and (s.wikipedia is null or s.importance < d.importance);';
-    $sSQL .= 'ALTER TABLE placex ENABLE TRIGGER ALL;';
-    $oDB->exec($sSQL);
+    (clone($oNominatimCmd))->addParams('refresh', '--importance')->run(true);
 }
 
 if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) {
index e11ecd137c65dc211970089387193972b123c899..66093322bc5d11f2f55064e346f41158176934cc 100755 (executable)
@@ -6,7 +6,6 @@ require_once(CONST_LibDir.'/Shell.php');
 
 class SetupFunctions
 {
-    protected $iCacheMemory;
     protected $iInstances;
     protected $aDSNInfo;
     protected $bQuiet;
@@ -31,16 +30,6 @@ class SetupFunctions
             warn('resetting threads to '.$this->iInstances);
         }
 
-        if (isset($aCMDResult['osm2pgsql-cache'])) {
-            $this->iCacheMemory = $aCMDResult['osm2pgsql-cache'];
-        } elseif (getSetting('FLATNODE_FILE')) {
-            // When flatnode files are enabled then disable cache per default.
-            $this->iCacheMemory = 0;
-        } else {
-            // Otherwise: Assume we can steal all the cache memory in the box.
-            $this->iCacheMemory = getCacheMemoryMB();
-        }
-
         // parse database string
         $this->aDSNInfo = \Nominatim\DB::parseDSN(getSetting('DATABASE_DSN'));
         if (!isset($this->aDSNInfo['port'])) {
@@ -82,6 +71,7 @@ class SetupFunctions
         if ($this->bVerbose) {
             $this->oNominatimCmd->addParams('--verbose');
         }
+        $this->oNominatimCmd->addParams('--threads', $this->iInstances);
     }
 
     public function createFunctions()
@@ -136,20 +126,6 @@ class SetupFunctions
         $this->createSqlFunctions(); // also create partition functions
     }
 
-    public function importWikipediaArticles()
-    {
-        $sWikiArticlePath = getSetting('WIKIPEDIA_DATA_PATH', CONST_InstallDir);
-        $sWikiArticlesFile = $sWikiArticlePath.'/wikimedia-importance.sql.gz';
-        if (file_exists($sWikiArticlesFile)) {
-            info('Importing wikipedia articles and redirects');
-            $this->dropTable('wikipedia_article');
-            $this->dropTable('wikipedia_redirect');
-            $this->pgsqlRunScriptFile($sWikiArticlesFile);
-        } else {
-            warn('wikipedia importance dump file not found - places will have default importance');
-        }
-    }
-
     public function loadData($bDisableTokenPrecalc)
     {
         info('Drop old Data');
@@ -505,21 +481,6 @@ class SetupFunctions
         $this->pgsqlRunScript($sSQL);
     }
 
-    public function drop()
-    {
-        (clone($this->oNominatimCmd))->addParams('freeze')->run();
-    }
-
-    /**
-     * Setup the directory for the API scripts.
-     *
-     * @return null
-     */
-    public function setupWebsite()
-    {
-        (clone($this->oNominatimCmd))->addParams('refresh', '--website')->run();
-    }
-
     /**
      * Return the connection to the database.
      *
@@ -538,15 +499,6 @@ class SetupFunctions
         return $this->oDB;
     }
 
-    private function removeFlatnodeFile()
-    {
-        $sFName = getSetting('FLATNODE_FILE');
-        if ($sFName && file_exists($sFName)) {
-            if ($this->bVerbose) echo 'Deleting '.$sFName."\n";
-            unlink($sFName);
-        }
-    }
-
     private function pgsqlRunScript($sScript, $bfatal = true)
     {
         runSQLScript(
@@ -570,7 +522,7 @@ class SetupFunctions
             $oCmd->addParams('--enable-debug-statements');
         }
 
-        $oCmd->run();
+        $oCmd->run(!$this->sIgnoreErrors);
     }
 
     private function pgsqlRunPartitionScript($sTemplate)
index 5dca41defc3a2241c0f5c1ef39624fa0819d02d1..9dca4e42e073db9317ba5d02578a93a9340261b9 100644 (file)
@@ -5,7 +5,6 @@ import logging
 from pathlib import Path
 
 from ..db.connection import connect
-from ..tools.exec_utils import run_legacy_script
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
@@ -69,12 +68,20 @@ class UpdateRefresh:
                                          args.diffs, args.enable_debug_statements)
 
         if args.wiki_data:
-            run_legacy_script('setup.php', '--import-wikipedia-articles',
-                              nominatim_env=args, throw_on_fail=True)
+            data_path = Path(args.config.WIKIPEDIA_DATA_PATH
+                             or args.project_dir)
+            LOG.warning('Import wikipedia article importance from %s', data_path)
+            if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
+                                                 data_path) > 0:
+                LOG.fatal('FATAL: Wikipedia importance dump file not found')
+                return 1
+
         # Attention: importance MUST come after wiki data import.
         if args.importance:
-            run_legacy_script('update.php', '--recompute-importance',
-                              nominatim_env=args, throw_on_fail=True)
+            LOG.warning('Update importance values for database')
+            with connect(args.config.get_libpq_dsn()) as conn:
+                refresh.recompute_importance(conn)
+
         if args.website:
             webdir = args.project_dir / 'website'
             LOG.warning('Setting up website directory at %s', webdir)
index 575f301082e044e2d04a3a31fbcc69c8dfd2f8ac..0490bbc869fe3f19a4bc87eac2fbed73d4b54d6f 100644 (file)
@@ -21,9 +21,12 @@ def _pipe_to_proc(proc, fdesc):
 
     return len(chunk)
 
-def execute_file(dsn, fname, ignore_errors=False):
+def execute_file(dsn, fname, ignore_errors=False, pre_code=None, post_code=None):
     """ Read an SQL file and run its contents against the given database
-        using psql.
+        using psql. Use `pre_code` and `post_code` to run extra commands
+        before or after executing the file. The commands are run within the
+        same session, so they may be used to wrap the file execution in a
+        transaction.
     """
     cmd = ['psql']
     if not ignore_errors:
@@ -33,6 +36,9 @@ def execute_file(dsn, fname, ignore_errors=False):
     if not LOG.isEnabledFor(logging.INFO):
         proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8'))
 
+    if pre_code:
+        proc.stdin.write((pre_code + ';').encode('utf-8'))
+
     if fname.suffix == '.gz':
         with gzip.open(str(fname), 'rb') as fdesc:
             remain = _pipe_to_proc(proc, fdesc)
@@ -40,6 +46,9 @@ def execute_file(dsn, fname, ignore_errors=False):
         with fname.open('rb') as fdesc:
             remain = _pipe_to_proc(proc, fdesc)
 
+    if remain == 0 and post_code:
+        proc.stdin.write((';' + post_code).encode('utf-8'))
+
     proc.stdin.close()
 
     ret = proc.wait()
index 33efe8f875121dad7acd3868432756e79c1a8452..b59c37bf9d5b46f1e74053ea3fd2c14621f5714b 100644 (file)
@@ -200,6 +200,53 @@ PHP_CONST_DEFS = (
 )
 
 
+def import_wikipedia_articles(dsn, data_path, ignore_errors=False):
+    """ Replaces the wikipedia importance tables with new data.
+        The import is run in a single transaction so that the new data
+        is replaced seamlessly.
+
+        Returns 0 if all was well and 1 if the importance file could not
+        be found. Throws an exception if there was an error reading the file.
+    """
+    datafile = data_path / 'wikimedia-importance.sql.gz'
+
+    if not datafile.exists():
+        return 1
+
+    pre_code = """BEGIN;
+                  DROP TABLE IF EXISTS "wikipedia_article";
+                  DROP TABLE IF EXISTS "wikipedia_redirect"
+               """
+    post_code = "COMMIT"
+    execute_file(dsn, datafile, ignore_errors=ignore_errors,
+                 pre_code=pre_code, post_code=post_code)
+
+    return 0
+
+
+def recompute_importance(conn):
+    """ Recompute wikipedia links and importance for all entries in placex.
+        This is a long-running operation that must not be executed in
+        parallel with updates.
+    """
+    with conn.cursor() as cur:
+        cur.execute('ALTER TABLE placex DISABLE TRIGGER ALL')
+        cur.execute("""
+            UPDATE placex SET (wikipedia, importance) =
+               (SELECT wikipedia, importance
+                FROM compute_importance(extratags, country_code, osm_type, osm_id))
+            """)
+        cur.execute("""
+            UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
+             FROM placex d
+             WHERE s.place_id = d.linked_place_id and d.wikipedia is not null
+                   and (s.wikipedia is null or s.importance < d.importance);
+            """)
+
+        cur.execute('ALTER TABLE placex ENABLE TRIGGER ALL')
+    conn.commit()
+
+
 def setup_website(basedir, phplib_dir, config):
     """ Create the website script stubs.
     """
index f0569ab80477c4e6e1394881695bbf6b03c60096..40b611c03a6bd4168ba62dcd0078d4c0c6e70962 100644 (file)
@@ -71,6 +71,12 @@ def temp_db(monkeypatch):
 
     conn.close()
 
+
+@pytest.fixture
+def dsn(temp_db):
+    return 'dbname=' + temp_db
+
+
 @pytest.fixture
 def temp_db_with_extensions(temp_db):
     conn = psycopg2.connect(database=temp_db)
@@ -101,6 +107,14 @@ def temp_db_cursor(temp_db):
     conn.close()
 
 
+@pytest.fixture
+def table_factory(temp_db_cursor):
+    def mk_table(name, definition='id INT'):
+        temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
+
+    return mk_table
+
+
 @pytest.fixture
 def def_config():
     return Configuration(None, SRC_DIR.resolve() / 'settings')
index 9170406d7c13b727445bb67804d13dc3c7f4dd66..e2a44e37f44122c81a927bf11d54b770d6145106 100644 (file)
@@ -135,24 +135,13 @@ def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ra
     assert rank_mock.called == do_ranks
 
 
-@pytest.mark.parametrize("command,params", [
-                         ('wiki-data', ('setup.php', '--import-wikipedia-articles')),
-                         ('importance', ('update.php', '--recompute-importance')),
-                         ])
-def test_refresh_legacy_command(mock_func_factory, temp_db, command, params):
-    mock_run_legacy = mock_func_factory(nominatim.clicmd.refresh, 'run_legacy_script')
-
-    assert 0 == call_nominatim('refresh', '--' + command)
-
-    assert mock_run_legacy.called == 1
-    assert len(mock_run_legacy.last_args) >= len(params)
-    assert mock_run_legacy.last_args[:len(params)] == params
-
 @pytest.mark.parametrize("command,func", [
                          ('postcodes', 'update_postcodes'),
                          ('word-counts', 'recompute_word_counts'),
                          ('address-levels', 'load_address_levels_from_file'),
                          ('functions', 'create_functions'),
+                         ('wiki-data', 'import_wikipedia_articles'),
+                         ('importance', 'recompute_importance'),
                          ('website', 'setup_website'),
                          ])
 def test_refresh_command(mock_func_factory, temp_db, command, func):
@@ -162,13 +151,16 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
     assert func_mock.called == 1
 
 
-def test_refresh_importance_computed_after_wiki_import(mock_func_factory, temp_db):
-    mock_run_legacy = mock_func_factory(nominatim.clicmd.refresh, 'run_legacy_script')
+def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
+    calls = []
+    monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
+                        lambda *args, **kwargs: calls.append('import') or 0)
+    monkeypatch.setattr(nominatim.tools.refresh, 'recompute_importance',
+                        lambda *args, **kwargs: calls.append('update'))
 
     assert 0 == call_nominatim('refresh', '--importance', '--wiki-data')
 
-    assert mock_run_legacy.called == 2
-    assert mock_run_legacy.last_args == ('update.php', '--recompute-importance')
+    assert calls == ['import', 'update']
 
 
 def test_serve_command(mock_func_factory):
index 635465f5fafffa15b6c107a42ff096170102cb80..ce81c4f3995330c9f5d6b89b462d3e05669a112c 100644 (file)
@@ -12,10 +12,10 @@ def db(temp_db):
         yield conn
 
 
-def test_connection_table_exists(db, temp_db_cursor):
+def test_connection_table_exists(db, table_factory):
     assert db.table_exists('foobar') == False
 
-    temp_db_cursor.execute('CREATE TABLE foobar (id INT)')
+    table_factory('foobar')
 
     assert db.table_exists('foobar') == True
 
@@ -31,10 +31,10 @@ def test_connection_index_exists(db, temp_db_cursor):
     assert db.index_exists('some_index', table='bar') == False
 
 
-def test_drop_table_existing(db, temp_db_cursor):
-    temp_db_cursor.execute('CREATE TABLE dummy (id INT)')
-
+def test_drop_table_existing(db, table_factory):
+    table_factory('dummy')
     assert db.table_exists('dummy')
+
     db.drop_table('dummy')
     assert not db.table_exists('dummy')
 
@@ -65,8 +65,8 @@ def test_connection_postgis_version_tuple(db, temp_db_cursor):
     assert ver[0] >= 2
 
 
-def test_cursor_scalar(db, temp_db_cursor):
-    temp_db_cursor.execute('CREATE TABLE dummy (id INT)')
+def test_cursor_scalar(db, table_factory):
+    table_factory('dummy')
 
     with db.cursor() as cur:
         assert cur.scalar('SELECT count(*) FROM dummy') == 0
index 1c3b834a6727f0befbf9f85edca30bce98383f44..b8a49ccf604801ff673d9606c44e8789c24649ed 100644 (file)
@@ -7,10 +7,6 @@ import pytest
 import nominatim.db.utils as db_utils
 from nominatim.errors import UsageError
 
-@pytest.fixture
-def dsn(temp_db):
-    return 'dbname=' + temp_db
-
 def test_execute_file_success(dsn, temp_db_cursor, tmp_path):
     tmpfile = tmp_path / 'test.sql'
     tmpfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);')
@@ -40,3 +36,27 @@ def test_execute_file_bad_sql_ignore_errors(dsn, tmp_path):
     tmpfile.write_text('CREATE STABLE test (id INT)')
 
     db_utils.execute_file(dsn, tmpfile, ignore_errors=True)
+
+
+def test_execute_file_with_pre_code(dsn, tmp_path, temp_db_cursor):
+    tmpfile = tmp_path / 'test.sql'
+    tmpfile.write_text('INSERT INTO test VALUES(4)')
+
+    db_utils.execute_file(dsn, tmpfile, pre_code='CREATE TABLE test (id INT)')
+
+    temp_db_cursor.execute('SELECT * FROM test')
+
+    assert temp_db_cursor.rowcount == 1
+    assert temp_db_cursor.fetchone()[0] == 4
+
+
+def test_execute_file_with_post_code(dsn, tmp_path, temp_db_cursor):
+    tmpfile = tmp_path / 'test.sql'
+    tmpfile.write_text('CREATE TABLE test (id INT)')
+
+    db_utils.execute_file(dsn, tmpfile, post_code='INSERT INTO test VALUES(23)')
+
+    temp_db_cursor.execute('SELECT * FROM test')
+
+    assert temp_db_cursor.rowcount == 1
+    assert temp_db_cursor.fetchone()[0] == 23
index 87e34c6166990cb6fbefde16affd3dd363480c33..2bd917209ed517c3438d4417c7a29ebbb22dd9f1 100644 (file)
@@ -2,9 +2,10 @@
 Tests for function for importing address ranks.
 """
 import json
-import pytest
 from pathlib import Path
 
+import pytest
+
 from nominatim.tools.refresh import load_address_levels, load_address_levels_from_file
 
 def test_load_ranks_def_config(temp_db_conn, temp_db_cursor, def_config):