From: Sarah Hoffmann Date: Wed, 24 Feb 2021 21:02:13 +0000 (+0100) Subject: port wikipedia importance functions to python X-Git-Tag: v3.7.0~29^2~11 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/c7fd0a7af4ce261e61feb0e63d0a6db736280081?hp=32683f73c787464e16f2a146d4c08c4041087dd5 port wikipedia importance functions to python --- diff --git a/lib-php/admin/setup.php b/lib-php/admin/setup.php index d3831322..91b72c15 100644 --- a/lib-php/admin/setup.php +++ b/lib-php/admin/setup.php @@ -131,7 +131,7 @@ if ($aCMDResult['create-partition-functions'] || $aCMDResult['all']) { if ($aCMDResult['import-wikipedia-articles'] || $aCMDResult['all']) { $bDidSomething = true; - $oSetup->importWikipediaArticles(); + (clone($oNominatimCmd))->addParams('refresh', '--wiki-data')->run(); } if ($aCMDResult['load-data'] || $aCMDResult['all']) { @@ -157,7 +157,7 @@ if ($aCMDResult['index'] || $aCMDResult['all']) { if ($aCMDResult['drop']) { $bDidSomething = true; - $oSetup->drop($aCMDResult); + (clone($oNominatimCmd))->addParams('freeze')->run(true); } if ($aCMDResult['create-search-indices'] || $aCMDResult['all']) { @@ -172,7 +172,7 @@ if ($aCMDResult['create-country-names'] || $aCMDResult['all']) { if ($aCMDResult['setup-website'] || $aCMDResult['all']) { $bDidSomething = true; - $oSetup->setupWebsite(); + (clone($oNominatimCmd))->addParams('refresh', '--website')->run(true); } // ****************************************************** diff --git a/lib-php/admin/update.php b/lib-php/admin/update.php index fba5300b..4f639c8d 100644 --- a/lib-php/admin/update.php +++ b/lib-php/admin/update.php @@ -211,20 +211,7 @@ if ($aResult['update-address-levels']) { } if ($aResult['recompute-importance']) { - echo "Updating importance values for database.\n"; - $oDB = new Nominatim\DB(); - $oDB->connect(); - - $sSQL = 'ALTER TABLE placex DISABLE TRIGGER ALL;'; - $sSQL .= 'UPDATE placex SET (wikipedia, importance) ='; - $sSQL .= ' (SELECT wikipedia, importance'; - $sSQL .= ' 
FROM compute_importance(extratags, country_code, osm_type, osm_id));'; - $sSQL .= 'UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance'; - $sSQL .= ' FROM placex d'; - $sSQL .= ' WHERE s.place_id = d.linked_place_id and d.wikipedia is not null'; - $sSQL .= ' and (s.wikipedia is null or s.importance < d.importance);'; - $sSQL .= 'ALTER TABLE placex ENABLE TRIGGER ALL;'; - $oDB->exec($sSQL); + (clone($oNominatimCmd))->addParams('refresh', '--importance')->run(true); } if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) { diff --git a/lib-php/setup/SetupClass.php b/lib-php/setup/SetupClass.php index e11ecd13..66093322 100755 --- a/lib-php/setup/SetupClass.php +++ b/lib-php/setup/SetupClass.php @@ -6,7 +6,6 @@ require_once(CONST_LibDir.'/Shell.php'); class SetupFunctions { - protected $iCacheMemory; protected $iInstances; protected $aDSNInfo; protected $bQuiet; @@ -31,16 +30,6 @@ class SetupFunctions warn('resetting threads to '.$this->iInstances); } - if (isset($aCMDResult['osm2pgsql-cache'])) { - $this->iCacheMemory = $aCMDResult['osm2pgsql-cache']; - } elseif (getSetting('FLATNODE_FILE')) { - // When flatnode files are enabled then disable cache per default. - $this->iCacheMemory = 0; - } else { - // Otherwise: Assume we can steal all the cache memory in the box. 
- $this->iCacheMemory = getCacheMemoryMB(); - } - // parse database string $this->aDSNInfo = \Nominatim\DB::parseDSN(getSetting('DATABASE_DSN')); if (!isset($this->aDSNInfo['port'])) { @@ -82,6 +71,7 @@ class SetupFunctions if ($this->bVerbose) { $this->oNominatimCmd->addParams('--verbose'); } + $this->oNominatimCmd->addParams('--threads', $this->iInstances); } public function createFunctions() @@ -136,20 +126,6 @@ class SetupFunctions $this->createSqlFunctions(); // also create partition functions } - public function importWikipediaArticles() - { - $sWikiArticlePath = getSetting('WIKIPEDIA_DATA_PATH', CONST_InstallDir); - $sWikiArticlesFile = $sWikiArticlePath.'/wikimedia-importance.sql.gz'; - if (file_exists($sWikiArticlesFile)) { - info('Importing wikipedia articles and redirects'); - $this->dropTable('wikipedia_article'); - $this->dropTable('wikipedia_redirect'); - $this->pgsqlRunScriptFile($sWikiArticlesFile); - } else { - warn('wikipedia importance dump file not found - places will have default importance'); - } - } - public function loadData($bDisableTokenPrecalc) { info('Drop old Data'); @@ -505,21 +481,6 @@ class SetupFunctions $this->pgsqlRunScript($sSQL); } - public function drop() - { - (clone($this->oNominatimCmd))->addParams('freeze')->run(); - } - - /** - * Setup the directory for the API scripts. - * - * @return null - */ - public function setupWebsite() - { - (clone($this->oNominatimCmd))->addParams('refresh', '--website')->run(); - } - /** * Return the connection to the database. 
* @@ -538,15 +499,6 @@ class SetupFunctions return $this->oDB; } - private function removeFlatnodeFile() - { - $sFName = getSetting('FLATNODE_FILE'); - if ($sFName && file_exists($sFName)) { - if ($this->bVerbose) echo 'Deleting '.$sFName."\n"; - unlink($sFName); - } - } - private function pgsqlRunScript($sScript, $bfatal = true) { runSQLScript( @@ -570,7 +522,7 @@ class SetupFunctions $oCmd->addParams('--enable-debug-statements'); } - $oCmd->run(); + $oCmd->run(!$this->sIgnoreErrors); } private function pgsqlRunPartitionScript($sTemplate) diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py index 5dca41de..9dca4e42 100644 --- a/nominatim/clicmd/refresh.py +++ b/nominatim/clicmd/refresh.py @@ -5,7 +5,6 @@ import logging from pathlib import Path from ..db.connection import connect -from ..tools.exec_utils import run_legacy_script # Do not repeat documentation of subcommand classes. # pylint: disable=C0111 @@ -69,12 +68,20 @@ class UpdateRefresh: args.diffs, args.enable_debug_statements) if args.wiki_data: - run_legacy_script('setup.php', '--import-wikipedia-articles', - nominatim_env=args, throw_on_fail=True) + data_path = Path(args.config.WIKIPEDIA_DATA_PATH + or args.project_dir) + LOG.warning('Import wikipdia article importance from %s', data_path) + if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(), + data_path) > 0: + LOG.fatal('FATAL: Wikipedia importance dump file not found') + return 1 + # Attention: importance MUST come after wiki data import. 
if args.importance: - run_legacy_script('update.php', '--recompute-importance', - nominatim_env=args, throw_on_fail=True) + LOG.warning('Update importance values for database') + with connect(args.config.get_libpq_dsn()) as conn: + refresh.recompute_importance(conn) + if args.website: webdir = args.project_dir / 'website' LOG.warning('Setting up website directory at %s', webdir) diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py index 575f3010..0490bbc8 100644 --- a/nominatim/db/utils.py +++ b/nominatim/db/utils.py @@ -21,9 +21,12 @@ def _pipe_to_proc(proc, fdesc): return len(chunk) -def execute_file(dsn, fname, ignore_errors=False): +def execute_file(dsn, fname, ignore_errors=False, pre_code=None, post_code=None): """ Read an SQL file and run its contents against the given database - using psql. + using psql. Use `pre_code` and `post_code` to run extra commands + before or after executing the file. The commands are run within the + same session, so they may be used to wrap the file execution in a + transaction. 
""" cmd = ['psql'] if not ignore_errors: @@ -33,6 +36,9 @@ if not LOG.isEnabledFor(logging.INFO): proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8')) + if pre_code: + proc.stdin.write((pre_code + ';').encode('utf-8')) + if fname.suffix == '.gz': with gzip.open(str(fname), 'rb') as fdesc: remain = _pipe_to_proc(proc, fdesc) @@ -40,6 +46,9 @@ with fname.open('rb') as fdesc: remain = _pipe_to_proc(proc, fdesc) + if remain == 0 and post_code: + proc.stdin.write((';' + post_code).encode('utf-8')) + proc.stdin.close() ret = proc.wait() diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py index 33efe8f8..b59c37bf 100644 --- a/nominatim/tools/refresh.py +++ b/nominatim/tools/refresh.py @@ -200,6 +200,53 @@ PHP_CONST_DEFS = ( ) +def import_wikipedia_articles(dsn, data_path, ignore_errors=False): + """ Replaces the wikipedia importance tables with new data. + The import is run in a single transaction so that the new data + is replaced seamlessly. + + Returns 0 if all was well and 1 if the importance file could not + be found. Throws an exception if there was an error reading the file. + """ + datafile = data_path / 'wikimedia-importance.sql.gz' + + if not datafile.exists(): + return 1 + + pre_code = """BEGIN; + DROP TABLE IF EXISTS "wikipedia_article"; + DROP TABLE IF EXISTS "wikipedia_redirect" + """ + post_code = "COMMIT" + execute_file(dsn, datafile, ignore_errors=ignore_errors, + pre_code=pre_code, post_code=post_code) + + return 0 + + +def recompute_importance(conn): + """ Recompute wikipedia links and importance for all entries in placex. + This is a long-running operation that must not be executed in + parallel with updates. 
+ """ + with conn.cursor() as cur: + cur.execute('ALTER TABLE placex DISABLE TRIGGER ALL') + cur.execute(""" + UPDATE placex SET (wikipedia, importance) = + (SELECT wikipedia, importance + FROM compute_importance(extratags, country_code, osm_type, osm_id)) + """) + cur.execute(""" + UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance + FROM placex d + WHERE s.place_id = d.linked_place_id and d.wikipedia is not null + and (s.wikipedia is null or s.importance < d.importance); + """) + + cur.execute('ALTER TABLE placex ENABLE TRIGGER ALL') + conn.commit() + + def setup_website(basedir, phplib_dir, config): """ Create the website script stubs. """ diff --git a/test/python/conftest.py b/test/python/conftest.py index f0569ab8..40b611c0 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -71,6 +71,12 @@ def temp_db(monkeypatch): conn.close() + +@pytest.fixture +def dsn(temp_db): + return 'dbname=' + temp_db + + @pytest.fixture def temp_db_with_extensions(temp_db): conn = psycopg2.connect(database=temp_db) @@ -101,6 +107,14 @@ def temp_db_cursor(temp_db): conn.close() +@pytest.fixture +def table_factory(temp_db_cursor): + def mk_table(name, definition='id INT'): + temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition)) + + return mk_table + + @pytest.fixture def def_config(): return Configuration(None, SRC_DIR.resolve() / 'settings') diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 9170406d..e2a44e37 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -135,24 +135,13 @@ def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ra assert rank_mock.called == do_ranks -@pytest.mark.parametrize("command,params", [ - ('wiki-data', ('setup.php', '--import-wikipedia-articles')), - ('importance', ('update.php', '--recompute-importance')), - ]) -def test_refresh_legacy_command(mock_func_factory, temp_db, command, params): - mock_run_legacy = 
mock_func_factory(nominatim.clicmd.refresh, 'run_legacy_script') - - assert 0 == call_nominatim('refresh', '--' + command) - - assert mock_run_legacy.called == 1 - assert len(mock_run_legacy.last_args) >= len(params) - assert mock_run_legacy.last_args[:len(params)] == params - @pytest.mark.parametrize("command,func", [ ('postcodes', 'update_postcodes'), ('word-counts', 'recompute_word_counts'), ('address-levels', 'load_address_levels_from_file'), ('functions', 'create_functions'), + ('wiki-data', 'import_wikipedia_articles'), + ('importance', 'recompute_importance'), ('website', 'setup_website'), ]) def test_refresh_command(mock_func_factory, temp_db, command, func): @@ -162,13 +151,16 @@ def test_refresh_command(mock_func_factory, temp_db, command, func): assert func_mock.called == 1 -def test_refresh_importance_computed_after_wiki_import(mock_func_factory, temp_db): - mock_run_legacy = mock_func_factory(nominatim.clicmd.refresh, 'run_legacy_script') +def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db): + calls = [] + monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles', + lambda *args, **kwargs: calls.append('import') or 0) + monkeypatch.setattr(nominatim.tools.refresh, 'recompute_importance', + lambda *args, **kwargs: calls.append('update')) assert 0 == call_nominatim('refresh', '--importance', '--wiki-data') - assert mock_run_legacy.called == 2 - assert mock_run_legacy.last_args == ('update.php', '--recompute-importance') + assert calls == ['import', 'update'] def test_serve_command(mock_func_factory): diff --git a/test/python/test_db_connection.py b/test/python/test_db_connection.py index 635465f5..ce81c4f3 100644 --- a/test/python/test_db_connection.py +++ b/test/python/test_db_connection.py @@ -12,10 +12,10 @@ def db(temp_db): yield conn -def test_connection_table_exists(db, temp_db_cursor): +def test_connection_table_exists(db, table_factory): assert db.table_exists('foobar') == False - 
temp_db_cursor.execute('CREATE TABLE foobar (id INT)') + table_factory('foobar') assert db.table_exists('foobar') == True @@ -31,10 +31,10 @@ def test_connection_index_exists(db, temp_db_cursor): assert db.index_exists('some_index', table='bar') == False -def test_drop_table_existing(db, temp_db_cursor): - temp_db_cursor.execute('CREATE TABLE dummy (id INT)') - +def test_drop_table_existing(db, table_factory): + table_factory('dummy') assert db.table_exists('dummy') + db.drop_table('dummy') assert not db.table_exists('dummy') @@ -65,8 +65,8 @@ def test_connection_postgis_version_tuple(db, temp_db_cursor): assert ver[0] >= 2 -def test_cursor_scalar(db, temp_db_cursor): - temp_db_cursor.execute('CREATE TABLE dummy (id INT)') +def test_cursor_scalar(db, table_factory): + table_factory('dummy') with db.cursor() as cur: assert cur.scalar('SELECT count(*) FROM dummy') == 0 diff --git a/test/python/test_db_utils.py b/test/python/test_db_utils.py index 1c3b834a..b8a49ccf 100644 --- a/test/python/test_db_utils.py +++ b/test/python/test_db_utils.py @@ -7,10 +7,6 @@ import pytest import nominatim.db.utils as db_utils from nominatim.errors import UsageError -@pytest.fixture -def dsn(temp_db): - return 'dbname=' + temp_db - def test_execute_file_success(dsn, temp_db_cursor, tmp_path): tmpfile = tmp_path / 'test.sql' tmpfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);') @@ -40,3 +36,27 @@ def test_execute_file_bad_sql_ignore_errors(dsn, tmp_path): tmpfile.write_text('CREATE STABLE test (id INT)') db_utils.execute_file(dsn, tmpfile, ignore_errors=True) + + +def test_execute_file_with_pre_code(dsn, tmp_path, temp_db_cursor): + tmpfile = tmp_path / 'test.sql' + tmpfile.write_text('INSERT INTO test VALUES(4)') + + db_utils.execute_file(dsn, tmpfile, pre_code='CREATE TABLE test (id INT)') + + temp_db_cursor.execute('SELECT * FROM test') + + assert temp_db_cursor.rowcount == 1 + assert temp_db_cursor.fetchone()[0] == 4 + + +def 
test_execute_file_with_post_code(dsn, tmp_path, temp_db_cursor): + tmpfile = tmp_path / 'test.sql' + tmpfile.write_text('CREATE TABLE test (id INT)') + + db_utils.execute_file(dsn, tmpfile, post_code='INSERT INTO test VALUES(23)') + + temp_db_cursor.execute('SELECT * FROM test') + + assert temp_db_cursor.rowcount == 1 + assert temp_db_cursor.fetchone()[0] == 23 diff --git a/test/python/test_tools_refresh_address_levels.py b/test/python/test_tools_refresh_address_levels.py index 87e34c61..2bd91720 100644 --- a/test/python/test_tools_refresh_address_levels.py +++ b/test/python/test_tools_refresh_address_levels.py @@ -2,9 +2,10 @@ Tests for function for importing address ranks. """ import json -import pytest from pathlib import Path +import pytest + from nominatim.tools.refresh import load_address_levels, load_address_levels_from_file def test_load_ranks_def_config(temp_db_conn, temp_db_cursor, def_config):