From: Sarah Hoffmann Date: Tue, 19 Jan 2021 07:42:22 +0000 (+0100) Subject: Merge pull request #2143 from lonvia/integrate-indexer-into-nominatim-tool X-Git-Tag: v3.7.0~52 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/3475e1dfd6c1ef44b7e2cc046d62a6f50108b6a0?hp=cd0001b55a4706e5c20a72e4209ccc7a330ba878 Merge pull request #2143 from lonvia/integrate-indexer-into-nominatim-tool Integrate indexer into nominatim tool --- diff --git a/cmake/script.tmpl b/cmake/script.tmpl index 30b8717b..aa25a124 100755 --- a/cmake/script.tmpl +++ b/cmake/script.tmpl @@ -8,5 +8,6 @@ require('@CMAKE_SOURCE_DIR@/lib/dotenv_loader.php'); @define('CONST_DataDir', '@CMAKE_SOURCE_DIR@'); loadDotEnv(); +$_SERVER['NOMINATIM_NOMINATIM_TOOL'] = '@CMAKE_BINARY_DIR@/nominatim'; require_once('@CMAKE_SOURCE_DIR@/lib/admin/@script_source@'); diff --git a/cmake/tool.tmpl b/cmake/tool.tmpl index 40f2b8ea..43646792 100755 --- a/cmake/tool.tmpl +++ b/cmake/tool.tmpl @@ -1,8 +1,11 @@ #!/usr/bin/env python3 import sys +import os sys.path.insert(1, '@CMAKE_SOURCE_DIR@') +os.environ['NOMINATIM_NOMINATIM_TOOL'] = __file__ + from nominatim import cli exit(cli.nominatim(module_dir='@CMAKE_BINARY_DIR@/module', diff --git a/lib/Shell.php b/lib/Shell.php index 59c4473b..72f90735 100644 --- a/lib/Shell.php +++ b/lib/Shell.php @@ -7,7 +7,7 @@ class Shell public function __construct($sBaseCmd, ...$aParams) { if (!$sBaseCmd) { - throw new Exception('Command missing in new() call'); + throw new \Exception('Command missing in new() call'); } $this->baseCmd = $sBaseCmd; $this->aParams = array(); diff --git a/lib/admin/update.php b/lib/admin/update.php index 50f611d7..fe9658b5 100644 --- a/lib/admin/update.php +++ b/lib/admin/update.php @@ -105,25 +105,14 @@ if ($fPostgresVersion >= 11.0) { } -$oIndexCmd = (new \Nominatim\Shell(CONST_DataDir.'/nominatim/nominatim.py')) - ->addParams('--database', $aDSNInfo['database']) - ->addParams('--port', $aDSNInfo['port']) - ->addParams('--threads', $aResult['index-instances']); -if (!$aResult['quiet']) { - $oIndexCmd->addParams('--verbose'); +$oIndexCmd = (new \Nominatim\Shell(getSetting('NOMINATIM_TOOL'))) + ->addParams('index'); +if ($aResult['quiet']) { + $oIndexCmd->addParams('--quiet'); } if ($aResult['verbose']) { $oIndexCmd->addParams('--verbose'); } -if (isset($aDSNInfo['hostspec']) && $aDSNInfo['hostspec']) { - $oIndexCmd->addParams('--host', $aDSNInfo['hostspec']); -} -if (isset($aDSNInfo['username']) && $aDSNInfo['username']) { - $oIndexCmd->addParams('--username', $aDSNInfo['username']); -} -if (isset($aDSNInfo['password']) && $aDSNInfo['password']) { - $oIndexCmd->addEnvPair('PGPASSWORD', $aDSNInfo['password']); -} $sPyosmiumBin = getSetting('PYOSMIUM_BINARY'); $sBaseURL = getSetting('REPLICATION_URL'); @@ -288,15 +277,9 @@ if ($aResult['recompute-word-counts']) { } if ($aResult['index']) { - $oCmd = (clone $oIndexCmd) - ->addParams('--minrank', $aResult['index-rank'], '-b'); - $oCmd->run(); - $oCmd = (clone $oIndexCmd) ->addParams('--minrank', $aResult['index-rank']); $oCmd->run(); - - $oDB->exec('update import_status set indexed = true'); } if ($aResult['update-address-levels']) { @@ -438,15 +421,6 @@ if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) { if (!$aResult['no-index']) { $fCMDStartTime = time(); - $oThisIndexCmd = clone($oIndexCmd); - $oThisIndexCmd->addParams('-b'); - echo $oThisIndexCmd->escapedCmd()."\n"; - $iErrorLevel = $oThisIndexCmd->run(); - if ($iErrorLevel) { - echo "Error: $iErrorLevel\n"; - exit($iErrorLevel); - } - $oThisIndexCmd = clone($oIndexCmd); echo $oThisIndexCmd->escapedCmd()."\n"; $iErrorLevel = $oThisIndexCmd->run(); @@ -463,9 +437,6 @@ if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) { var_Dump($sSQL); $oDB->exec($sSQL); echo date('Y-m-d H:i:s')." Completed index step for $sBatchEnd in ".round((time()-$fCMDStartTime)/60, 2)." minutes\n"; - - $sSQL = 'update import_status set indexed = true'; - $oDB->exec($sSQL); } else { if ($aResult['import-osmosis-all']) { echo "Error: --no-index cannot be used with continuous imports (--import-osmosis-all).\n"; diff --git a/lib/setup/SetupClass.php b/lib/setup/SetupClass.php index 77b14a8a..d17fdca7 100755 --- a/lib/setup/SetupClass.php +++ b/lib/setup/SetupClass.php @@ -549,26 +549,15 @@ class SetupFunctions { $this->checkModulePresence(); // raises exception on failure - $oBaseCmd = (new \Nominatim\Shell(CONST_DataDir.'/nominatim/nominatim.py')) - ->addParams('--database', $this->aDSNInfo['database']) - ->addParams('--port', $this->aDSNInfo['port']) - ->addParams('--threads', $this->iInstances); + $oBaseCmd = (new \Nominatim\Shell(getSetting('NOMINATIM_TOOL'))) + ->addParams('index'); - if (!$this->bQuiet) { - $oBaseCmd->addParams('-v'); + if ($this->bQuiet) { + $oBaseCmd->addParams('-q'); } if ($this->bVerbose) { $oBaseCmd->addParams('-v'); } - if (isset($this->aDSNInfo['hostspec'])) { - $oBaseCmd->addParams('--host', $this->aDSNInfo['hostspec']); - } - if (isset($this->aDSNInfo['username'])) { - $oBaseCmd->addParams('--user', $this->aDSNInfo['username']); - } - if (isset($this->aDSNInfo['password'])) { - $oBaseCmd->addEnvPair('PGPASSWORD', $this->aDSNInfo['password']); - } info('Index ranks 0 - 4'); $oCmd = (clone $oBaseCmd)->addParams('--maxrank', 4); @@ -581,14 +570,14 @@ class SetupFunctions if (!$bIndexNoanalyse) $this->pgsqlRunScript('ANALYSE'); info('Index administrative boundaries'); - $oCmd = (clone $oBaseCmd)->addParams('-b'); + $oCmd = (clone $oBaseCmd)->addParams('--boundaries-only'); $iStatus = $oCmd->run(); if ($iStatus != 0) { fail('error status ' . $iStatus . ' running nominatim!'); } info('Index ranks 5 - 25'); - $oCmd = (clone $oBaseCmd)->addParams('--minrank', 5, '--maxrank', 25); + $oCmd = (clone $oBaseCmd)->addParams('--no-boundaries', '--minrank', 5, '--maxrank', 25); $iStatus = $oCmd->run(); if ($iStatus != 0) { fail('error status ' . $iStatus . ' running nominatim!'); @@ -597,7 +586,7 @@ class SetupFunctions if (!$bIndexNoanalyse) $this->pgsqlRunScript('ANALYSE'); info('Index ranks 26 - 30'); - $oCmd = (clone $oBaseCmd)->addParams('--minrank', 26); + $oCmd = (clone $oBaseCmd)->addParams('--no-boundaries', '--minrank', 26); $iStatus = $oCmd->run(); if ($iStatus != 0) { fail('error status ' . $iStatus . ' running nominatim!'); diff --git a/nominatim/cli.py b/nominatim/cli.py index 8d4071db..65ea90bb 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -11,6 +11,17 @@ from pathlib import Path from .config import Configuration from .admin.exec_utils import run_legacy_script +from .indexer.indexer import Indexer + +def _num_system_cpus(): + try: + cpus = len(os.sched_getaffinity(0)) + except NotImplementedError: + cpus = None + + return cpus or os.cpu_count() + + class CommandlineParser: """ Wraps some of the common functions for parsing the command line and setting up subcommands. @@ -67,7 +78,7 @@ class CommandlineParser: args.project_dir = Path(args.project_dir) logging.basicConfig(stream=sys.stderr, - format='%(asctime)s %(levelname)s: %(message)s', + format='%(asctime)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=max(4 - args.verbose, 1) * 10) @@ -297,11 +308,30 @@ class UpdateIndex: @staticmethod def add_args(parser): - pass + group = parser.add_argument_group('Filter arguments') + group.add_argument('--boundaries-only', action='store_true', + help="""Index only administrative boundaries.""") + group.add_argument('--no-boundaries', action='store_true', + help="""Index everything except administrative boundaries.""") + group.add_argument('--minrank', '-r', type=int, metavar='RANK', default=0, + help='Minimum/starting rank') + group.add_argument('--maxrank', '-R', type=int, metavar='RANK', default=30, + help='Maximum/finishing rank') @staticmethod def run(args): - return run_legacy_script('update.php', '--index', nominatim_env=args) + indexer = Indexer(args.config.get_libpq_dsn(), + args.threads or _num_system_cpus() or 1) + + if not args.no_boundaries: + indexer.index_boundaries(args.minrank, args.maxrank) + if not args.boundaries_only: + indexer.index_by_rank(args.minrank, args.maxrank) + + if not args.no_boundaries and not args.boundaries_only: + indexer.update_status_table() + + return 0 class UpdateRefresh: diff --git a/nominatim/config.py b/nominatim/config.py index 911c7ddf..458c828f 100644 --- a/nominatim/config.py +++ b/nominatim/config.py @@ -29,6 +29,18 @@ class Configuration: return os.environ.get(name) or self._config[name] + def get_libpq_dsn(self): + """ Get configured database DSN converted into the key/value format + understood by libpq and psycopg. + """ + dsn = self.DATABASE_DSN + + if dsn.startswith('pgsql:'): + # Old PHP DSN format. Convert before returning. + return dsn[6:].replace(';', ' ') + + return dsn + def get_os_env(self): """ Return a copy of the OS environment with the Nominatim configuration merged in. diff --git a/nominatim/db/__init__.py b/nominatim/db/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nominatim/indexer/db.py b/nominatim/db/async_connection.py similarity index 87% rename from nominatim/indexer/db.py rename to nominatim/db/async_connection.py index 85b84431..45e83664 100644 --- a/nominatim/indexer/db.py +++ b/nominatim/db/async_connection.py @@ -11,26 +11,14 @@ from psycopg2.extras import wait_select LOG = logging.getLogger() -def make_connection(options, asynchronous=False): - """ Create a psycopg2 connection from the given options. - """ - params = {'dbname' : options.dbname, - 'user' : options.user, - 'password' : options.password, - 'host' : options.host, - 'port' : options.port, - 'async' : asynchronous} - - return psycopg2.connect(**params) - class DBConnection: """ A single non-blocking database connection. """ - def __init__(self, options): + def __init__(self, dsn): self.current_query = None self.current_params = None - self.options = options + self.dsn = dsn self.conn = None self.cursor = None @@ -46,7 +34,9 @@ class DBConnection: self.cursor.close() self.conn.close() - self.conn = make_connection(self.options, asynchronous=True) + # Use a dict to hand in the parameters because async is a reserved + # word in Python3. + self.conn = psycopg2.connect(**{'dsn' : self.dsn, 'async' : True}) self.wait() self.cursor = self.conn.cursor() diff --git a/nominatim/nominatim.py b/nominatim/indexer/indexer.py old mode 100755 new mode 100644 similarity index 54% rename from nominatim/nominatim.py rename to nominatim/indexer/indexer.py index 8cac583e..094d1279 --- a/nominatim/nominatim.py +++ b/nominatim/indexer/indexer.py @@ -1,35 +1,14 @@ -#! /usr/bin/env python3 -#----------------------------------------------------------------------------- -# nominatim - [description] -#----------------------------------------------------------------------------- -# -# Indexing tool for the Nominatim database. -# -# Based on C version by Brian Quinion -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -#----------------------------------------------------------------------------- +""" +Main work horse for indexing (computing addresses) the database. +""" # pylint: disable=C0111 -from argparse import ArgumentParser, RawDescriptionHelpFormatter import logging -import sys -import getpass import select -from indexer.progress import ProgressLogger # pylint: disable=E0401 -from indexer.db import DBConnection, make_connection # pylint: disable=E0401 +import psycopg2 + +from .progress import ProgressLogger +from ..db.async_connection import DBConnection LOG = logging.getLogger() @@ -117,34 +96,40 @@ class Indexer: """ Main indexing routine. """ - def __init__(self, opts): - self.minrank = max(1, opts.minrank) - self.maxrank = min(30, opts.maxrank) - self.conn = make_connection(opts) - self.threads = [DBConnection(opts) for _ in range(opts.threads)] + def __init__(self, dsn, num_threads): + self.conn = psycopg2.connect(dsn) + self.threads = [DBConnection(dsn) for _ in range(num_threads)] - def index_boundaries(self): + def index_boundaries(self, minrank, maxrank): LOG.warning("Starting indexing boundaries using %s threads", len(self.threads)) - for rank in range(max(self.minrank, 5), min(self.maxrank, 26)): + for rank in range(max(minrank, 5), min(maxrank, 26)): self.index(BoundaryRunner(rank)) - def index_by_rank(self): + def index_by_rank(self, minrank, maxrank): """ Run classic indexing by rank. """ + maxrank = min(maxrank, 30) LOG.warning("Starting indexing rank (%i to %i) using %i threads", - self.minrank, self.maxrank, len(self.threads)) + minrank, maxrank, len(self.threads)) - for rank in range(max(1, self.minrank), self.maxrank): + for rank in range(max(1, minrank), maxrank): self.index(RankRunner(rank)) - if self.maxrank == 30: + if maxrank == 30: self.index(RankRunner(0)) self.index(InterpolationRunner(), 20) - self.index(RankRunner(self.maxrank), 20) + self.index(RankRunner(30), 20) else: - self.index(RankRunner(self.maxrank)) + self.index(RankRunner(maxrank)) + + def update_status_table(self): + """ Update the status in the status table to 'indexed'. + """ + with self.conn.cursor() as cur: + cur.execute('UPDATE import_status SET indexed = true') + self.conn.commit() def index(self, obj, batch=1): """ Index a single rank or table. `obj` describes the SQL to use @@ -212,60 +197,3 @@ class Indexer: ready, _, _ = select.select(self.threads, [], []) assert False, "Unreachable code" - - -def nominatim_arg_parser(): - """ Setup the command-line parser for the tool. - """ - parser = ArgumentParser(description="Indexing tool for Nominatim.", - formatter_class=RawDescriptionHelpFormatter) - - parser.add_argument('-d', '--database', - dest='dbname', action='store', default='nominatim', - help='Name of the PostgreSQL database to connect to.') - parser.add_argument('-U', '--username', - dest='user', action='store', - help='PostgreSQL user name.') - parser.add_argument('-W', '--password', - dest='password_prompt', action='store_true', - help='Force password prompt.') - parser.add_argument('-H', '--host', - dest='host', action='store', - help='PostgreSQL server hostname or socket location.') - parser.add_argument('-P', '--port', - dest='port', action='store', - help='PostgreSQL server port') - parser.add_argument('-b', '--boundary-only', - dest='boundary_only', action='store_true', - help='Only index administrative boundaries (ignores min/maxrank).') - parser.add_argument('-r', '--minrank', - dest='minrank', type=int, metavar='RANK', default=0, - help='Minimum/starting rank.') - parser.add_argument('-R', '--maxrank', - dest='maxrank', type=int, metavar='RANK', default=30, - help='Maximum/finishing rank.') - parser.add_argument('-t', '--threads', - dest='threads', type=int, metavar='NUM', default=1, - help='Number of threads to create for indexing.') - parser.add_argument('-v', '--verbose', - dest='loglevel', action='count', default=0, - help='Increase verbosity') - - return parser - -if __name__ == '__main__': - logging.basicConfig(stream=sys.stderr, format='%(levelname)s: %(message)s') - - OPTIONS = nominatim_arg_parser().parse_args(sys.argv[1:]) - - LOG.setLevel(max(3 - OPTIONS.loglevel, 0) * 10) - - OPTIONS.password = None - if OPTIONS.password_prompt: - PASSWORD = getpass.getpass("Database password: ") - OPTIONS.password = PASSWORD - - if OPTIONS.boundary_only: - Indexer(OPTIONS).index_boundaries() - else: - Indexer(OPTIONS).index_by_rank() diff --git a/nominatim/indexer/progress.py b/nominatim/indexer/progress.py index 99120673..c9d8816b 100644 --- a/nominatim/indexer/progress.py +++ b/nominatim/indexer/progress.py @@ -26,7 +26,7 @@ class ProgressLogger: self.done_places = 0 self.rank_start_time = datetime.now() self.log_interval = log_interval - self.next_info = INITIAL_PROGRESS if LOG.isEnabledFor(logging.INFO) else total + 1 + self.next_info = INITIAL_PROGRESS if LOG.isEnabledFor(logging.WARNING) else total + 1 def add(self, num=1): """ Mark `num` places as processed. Print a log message if the @@ -47,9 +47,9 @@ class ProgressLogger: places_per_sec = self.done_places / done_time eta = (self.total_places - self.done_places) / places_per_sec - LOG.info("Done %d in %d @ %.3f per second - %s ETA (seconds): %.2f", - self.done_places, int(done_time), - places_per_sec, self.name, eta) + LOG.warning("Done %d in %d @ %.3f per second - %s ETA (seconds): %.2f", + self.done_places, int(done_time), + places_per_sec, self.name, eta) self.next_info += int(places_per_sec) * self.log_interval diff --git a/test/bdd/steps/nominatim_environment.py b/test/bdd/steps/nominatim_environment.py index 68d7b2f4..0ee92137 100644 --- a/test/bdd/steps/nominatim_environment.py +++ b/test/bdd/steps/nominatim_environment.py @@ -91,6 +91,7 @@ class NominatimEnvironment: self.test_env['NOMINATIM_BINDIR'] = self.src_dir / 'utils' self.test_env['NOMINATIM_DATABASE_MODULE_PATH'] = self.build_dir / 'module' self.test_env['NOMINATIM_OSM2PGSQL_BINARY'] = self.build_dir / 'osm2pgsql' / 'osm2pgsql' + self.test_env['NOMINATIM_NOMINATIM_TOOL'] = self.build_dir / 'nominatim' if self.server_module_path: self.test_env['NOMINATIM_DATABASE_MODULE_PATH'] = self.server_module_path diff --git a/test/python/test_config.py b/test/python/test_config.py index 03e4a800..e5d18f91 100644 --- a/test/python/test_config.py +++ b/test/python/test_config.py @@ -54,3 +54,22 @@ def test_get_os_env_prefer_os_environ(): assert config.get_os_env()['NOMINATIM_DATABASE_WEBUSER'] == 'nobody' del os.environ['NOMINATIM_DATABASE_WEBUSER'] + +def test_get_libpq_dsn_convert_default(): + config = Configuration(None, DEFCFG_DIR) + + assert config.get_libpq_dsn() == 'dbname=nominatim' + +def test_get_libpq_dsn_convert_php(): + config = Configuration(None, DEFCFG_DIR) + + os.environ['NOMINATIM_DATABASE_DSN'] = 'pgsql:dbname=gis;password=foo;host=localhost' + + assert config.get_libpq_dsn() == 'dbname=gis password=foo host=localhost' + +def test_get_libpq_dsn_convert_libpq(): + config = Configuration(None, DEFCFG_DIR) + + os.environ['NOMINATIM_DATABASE_DSN'] = 'host=localhost dbname=gis password=foo' + + assert config.get_libpq_dsn() == 'host=localhost dbname=gis password=foo'