From: Sarah Hoffmann Date: Tue, 18 May 2021 09:31:18 +0000 (+0200) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Tag: deploy~167 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/f5e56d1ef1e2631c67cc95bd124403410248fbdd?hp=3f73a363f42cb3cb6627ddcea371e3a22810daa6 Merge remote-tracking branch 'upstream/master' --- diff --git a/.pylintrc b/.pylintrc index 6cf97be9..022243ad 100644 --- a/.pylintrc +++ b/.pylintrc @@ -12,4 +12,4 @@ ignored-modules=icu ignored-classes=NominatimArgs,closing disable=too-few-public-methods,duplicate-code -good-names=i,x,y +good-names=i,x,y,fd diff --git a/CMakeLists.txt b/CMakeLists.txt index a2a98455..b8a6a2e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -279,4 +279,5 @@ install(FILES settings/env.defaults settings/import-address.style settings/import-full.style settings/import-extratags.style + settings/legacy_icu_tokenizer.json DESTINATION ${NOMINATIM_CONFIGDIR}) diff --git a/docs/admin/Customization.md b/docs/admin/Customization.md index 10c8d3dd..76f0f85a 100644 --- a/docs/admin/Customization.md +++ b/docs/admin/Customization.md @@ -51,11 +51,11 @@ entire US adds about 10GB to your database. 1. Get preprocessed TIGER 2020 data: cd $PROJECT_DIR - wget https://nominatim.org/data/tiger2020-nominatim-preprocessed.tar.gz + wget https://nominatim.org/data/tiger2020-nominatim-preprocessed.csv.tar.gz 2. Import the data into your Nominatim database: - nominatim add-data --tiger-data tiger2020-nominatim-preprocessed.tar.gz + nominatim add-data --tiger-data tiger2020-nominatim-preprocessed.csv.tar.gz 3. Enable use of the Tiger data in your `.env` by adding: diff --git a/docs/admin/Import.md b/docs/admin/Import.md index 2a9a86e6..2686942e 100644 --- a/docs/admin/Import.md +++ b/docs/admin/Import.md @@ -252,6 +252,9 @@ to verify that your installation is working. Go to `http://localhost:8088/status.php` and you should see the message `OK`. You can also run a search query, e.g. `http://localhost:8088/search.php?q=Berlin`. +Note that search query is not supported for reverse-only imports. You can run a +reverse query, e.g. `http://localhost:8088/reverse.php?lat=27.1750090510034&lon=78.04209025`. + To run Nominatim via webservers like Apache or nginx, please read the [Deployment chapter](Deployment.md). diff --git a/lib-php/template/details-json.php b/lib-php/template/details-json.php index 0449dbb9..a813b9a6 100644 --- a/lib-php/template/details-json.php +++ b/lib-php/template/details-json.php @@ -81,10 +81,14 @@ if ($bIncludeKeywords) { if ($aPlaceSearchNameKeywords) { $aPlaceDetails['keywords']['name'] = array_map($funcMapKeyword, $aPlaceSearchNameKeywords); + } else { + $aPlaceDetails['keywords']['name'] = array(); } if ($aPlaceSearchAddressKeywords) { $aPlaceDetails['keywords']['address'] = array_map($funcMapKeyword, $aPlaceSearchAddressKeywords); + } else { + $aPlaceDetails['keywords']['address'] = array(); } } diff --git a/lib-php/website/reverse-only-search.php b/lib-php/website/reverse-only-search.php new file mode 100644 index 00000000..719f1001 --- /dev/null +++ b/lib-php/website/reverse-only-search.php @@ -0,0 +1,12 @@ +getSet('format', array('xml', 'json', 'jsonv2', 'geojson', 'geocodejson'), 'jsonv2'); +set_exception_handler_by_format($sOutputFormat); + +throw new Exception('Reverse-only import does not support forward searching.', 404); diff --git a/lib-sql/tiger_import_finish.sql b/lib-sql/tiger_import_finish.sql index 85bb5e3e..a084a2e2 100644 --- a/lib-sql/tiger_import_finish.sql +++ b/lib-sql/tiger_import_finish.sql @@ -12,4 +12,6 @@ ALTER TABLE location_property_tiger_import RENAME TO location_property_tiger; ALTER INDEX IF EXISTS idx_location_property_tiger_parent_place_id_imp RENAME TO idx_location_property_tiger_housenumber_parent_place_id; ALTER INDEX IF EXISTS idx_location_property_tiger_place_id_imp RENAME TO idx_location_property_tiger_place_id; -DROP FUNCTION tiger_line_import (linegeo geometry, in_startnumber integer, in_endnumber integer, interpolationtype text, in_street text, in_isin text, in_postcode text); +DROP FUNCTION tiger_line_import (linegeo GEOMETRY, in_startnumber INTEGER, + in_endnumber INTEGER, interpolationtype TEXT, + token_info JSONB, in_postcode TEXT); diff --git a/lib-sql/tiger_import_start.sql b/lib-sql/tiger_import_start.sql index 4b9c33fc..faa4efbb 100644 --- a/lib-sql/tiger_import_start.sql +++ b/lib-sql/tiger_import_start.sql @@ -1,9 +1,9 @@ DROP TABLE IF EXISTS location_property_tiger_import; CREATE TABLE location_property_tiger_import (linegeo GEOMETRY, place_id BIGINT, partition INTEGER, parent_place_id BIGINT, startnumber INTEGER, endnumber INTEGER, interpolationtype TEXT, postcode TEXT); -CREATE OR REPLACE FUNCTION tiger_line_import(linegeo GEOMETRY, in_startnumber INTEGER, - in_endnumber INTEGER, interpolationtype TEXT, - in_street TEXT, in_isin TEXT, in_postcode TEXT) RETURNS INTEGER +CREATE OR REPLACE FUNCTION tiger_line_import(linegeo GEOMETRY, in_startnumber INTEGER, + in_endnumber INTEGER, interpolationtype TEXT, + token_info JSONB, in_postcode TEXT) RETURNS INTEGER AS $$ DECLARE startnumber INTEGER; @@ -27,13 +27,13 @@ BEGIN END IF; IF startnumber < 0 THEN - RAISE WARNING 'Negative house number range (% to %) on %, %', startnumber, endnumber, in_street, in_isin; + RAISE WARNING 'Negative house number range (% to %)', startnumber, endnumber; RETURN 0; END IF; numberrange := endnumber - startnumber; - IF (interpolationtype = 'odd' AND startnumber%2 = 0) OR (interpolationtype = 'even' AND startnumber%2 = 1) THEN + IF (interpolationtype = 'odd' AND startnumber % 2 = 0) OR (interpolationtype = 'even' AND startnumber % 2 = 1) THEN startnumber := startnumber + 1; stepsize := 2; ELSE @@ -45,10 +45,10 @@ BEGIN END IF; -- Filter out really broken tiger data - IF numberrange > 0 AND (numberrange::float/stepsize::float > 500) + IF numberrange > 0 AND (numberrange::float/stepsize::float > 500) AND ST_length(linegeo)/(numberrange::float/stepsize::float) < 0.000001 THEN - RAISE WARNING 'Road too short for number range % to % on %, % (%)',startnumber,endnumber,in_street,in_isin, - ST_length(linegeo)/(numberrange::float/stepsize::float); + RAISE WARNING 'Road too short for number range % to % (%)',startnumber,endnumber, + ST_length(linegeo)/(numberrange::float/stepsize::float); RETURN 0; END IF; @@ -56,7 +56,7 @@ BEGIN out_partition := get_partition('us'); out_parent_place_id := null; - address_street_word_ids := word_ids_from_name(in_street); + address_street_word_ids := token_addr_street_match_tokens(token_info); IF address_street_word_ids IS NOT NULL THEN out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid, address_street_word_ids); diff --git a/nominatim/cli.py b/nominatim/cli.py index 20a9c5f1..c66e78d3 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -13,7 +13,6 @@ from nominatim.tools.exec_utils import run_legacy_script, run_php_server from nominatim.errors import UsageError from nominatim import clicmd from nominatim.clicmd.args import NominatimArgs -from nominatim.tools import tiger_data LOG = logging.getLogger() @@ -147,9 +146,14 @@ class UpdateAddData: @staticmethod def run(args): + from nominatim.tokenizer import factory as tokenizer_factory + from nominatim.tools import tiger_data + if args.tiger_data: + tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config) return tiger_data.add_tiger_data(args.tiger_data, - args.config, args.threads or 1) + args.config, args.threads or 1, + tokenizer) params = ['update.php'] if args.file: diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py index 567c416b..e696e7b6 100644 --- a/nominatim/clicmd/refresh.py +++ b/nominatim/clicmd/refresh.py @@ -94,6 +94,6 @@ class UpdateRefresh: if args.website: webdir = args.project_dir / 'website' LOG.warning('Setting up website directory at %s', webdir) - refresh.setup_website(webdir, args.config) - + with connect(args.config.get_libpq_dsn()) as conn: + refresh.setup_website(webdir, args.config, conn) return 0 diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py index 236a28bc..3f590686 100644 --- a/nominatim/clicmd/setup.py +++ b/nominatim/clicmd/setup.py @@ -139,7 +139,8 @@ class SetupAll: webdir = args.project_dir / 'website' LOG.warning('Setup website at %s', webdir) - refresh.setup_website(webdir, args.config) + with connect(args.config.get_libpq_dsn()) as conn: + refresh.setup_website(webdir, args.config, conn) with connect(args.config.get_libpq_dsn()) as conn: try: diff --git a/nominatim/db/async_connection.py b/nominatim/db/async_connection.py index a4f55496..db4b89ce 100644 --- a/nominatim/db/async_connection.py +++ b/nominatim/db/async_connection.py @@ -6,6 +6,9 @@ """ Database helper functions for the indexer. """ import logging +import select +import time + import psycopg2 from psycopg2.extras import wait_select @@ -25,8 +28,9 @@ class DeadlockHandler: normally. """ - def __init__(self, handler): + def __init__(self, handler, ignore_sql_errors=False): self.handler = handler + self.ignore_sql_errors = ignore_sql_errors def __enter__(self): pass @@ -41,6 +45,11 @@ class DeadlockHandler: if exc_value.pgcode == '40P01': self.handler() return True + + if self.ignore_sql_errors and isinstance(exc_value, psycopg2.Error): + LOG.info("SQL error ignored: %s", exc_value) + return True + return False @@ -48,10 +57,11 @@ class DBConnection: """ A single non-blocking database connection. """ - def __init__(self, dsn, cursor_factory=None): + def __init__(self, dsn, cursor_factory=None, ignore_sql_errors=False): self.current_query = None self.current_params = None self.dsn = dsn + self.ignore_sql_errors = ignore_sql_errors self.conn = None self.cursor = None @@ -98,7 +108,7 @@ class DBConnection: """ Block until any pending operation is done. """ while True: - with DeadlockHandler(self._deadlock_handler): + with DeadlockHandler(self._deadlock_handler, self.ignore_sql_errors): wait_select(self.conn) self.current_query = None return @@ -125,9 +135,78 @@ class DBConnection: if self.current_query is None: return True - with DeadlockHandler(self._deadlock_handler): + with DeadlockHandler(self._deadlock_handler, self.ignore_sql_errors): if self.conn.poll() == psycopg2.extensions.POLL_OK: self.current_query = None return True return False + + +class WorkerPool: + """ A pool of asynchronous database connections. + + The pool may be used as a context manager. + """ + REOPEN_CONNECTIONS_AFTER = 100000 + + def __init__(self, dsn, pool_size, ignore_sql_errors=False): + self.threads = [DBConnection(dsn, ignore_sql_errors=ignore_sql_errors) + for _ in range(pool_size)] + self.free_workers = self._yield_free_worker() + self.wait_time = 0 + + + def finish_all(self): + """ Wait for all connection to finish. + """ + for thread in self.threads: + while not thread.is_done(): + thread.wait() + + self.free_workers = self._yield_free_worker() + + def close(self): + """ Close all connections and clear the pool. + """ + for thread in self.threads: + thread.close() + self.threads = [] + self.free_workers = None + + + def next_free_worker(self): + """ Get the next free connection. + """ + return next(self.free_workers) + + + def _yield_free_worker(self): + ready = self.threads + command_stat = 0 + while True: + for thread in ready: + if thread.is_done(): + command_stat += 1 + yield thread + + if command_stat > self.REOPEN_CONNECTIONS_AFTER: + for thread in self.threads: + while not thread.is_done(): + thread.wait() + thread.connect() + ready = self.threads + command_stat = 0 + else: + tstart = time.time() + _, ready, _ = select.select([], self.threads, []) + self.wait_time += time.time() - tstart + + + def __enter__(self): + return self + + + def __exit__(self, exc_type, exc_value, traceback): + self.finish_all() + self.close() diff --git a/nominatim/indexer/indexer.py b/nominatim/indexer/indexer.py index b7673aba..5ab0eac3 100644 --- a/nominatim/indexer/indexer.py +++ b/nominatim/indexer/indexer.py @@ -2,14 +2,13 @@ Main work horse for indexing (computing addresses) the database. """ import logging -import select import time import psycopg2.extras from nominatim.indexer.progress import ProgressLogger from nominatim.indexer import runners -from nominatim.db.async_connection import DBConnection +from nominatim.db.async_connection import DBConnection, WorkerPool from nominatim.db.connection import connect LOG = logging.getLogger() @@ -81,73 +80,6 @@ class PlaceFetcher: self.conn.wait() self.close() -class WorkerPool: - """ A pool of asynchronous database connections. - - The pool may be used as a context manager. - """ - REOPEN_CONNECTIONS_AFTER = 100000 - - def __init__(self, dsn, pool_size): - self.threads = [DBConnection(dsn) for _ in range(pool_size)] - self.free_workers = self._yield_free_worker() - self.wait_time = 0 - - - def finish_all(self): - """ Wait for all connection to finish. - """ - for thread in self.threads: - while not thread.is_done(): - thread.wait() - - self.free_workers = self._yield_free_worker() - - def close(self): - """ Close all connections and clear the pool. - """ - for thread in self.threads: - thread.close() - self.threads = [] - self.free_workers = None - - - def next_free_worker(self): - """ Get the next free connection. - """ - return next(self.free_workers) - - - def _yield_free_worker(self): - ready = self.threads - command_stat = 0 - while True: - for thread in ready: - if thread.is_done(): - command_stat += 1 - yield thread - - if command_stat > self.REOPEN_CONNECTIONS_AFTER: - for thread in self.threads: - while not thread.is_done(): - thread.wait() - thread.connect() - ready = self.threads - command_stat = 0 - else: - tstart = time.time() - _, ready, _ = select.select([], self.threads, []) - self.wait_time += time.time() - tstart - - - def __enter__(self): - return self - - - def __exit__(self, exc_type, exc_value, traceback): - self.finish_all() - self.close() - class Indexer: """ Main indexing routine. diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py index 805bd634..25a97127 100644 --- a/nominatim/tools/refresh.py +++ b/nominatim/tools/refresh.py @@ -155,7 +155,7 @@ def recompute_importance(conn): conn.commit() -def setup_website(basedir, config): +def setup_website(basedir, config, conn): """ Create the website script stubs. """ if not basedir.exists(): @@ -187,5 +187,10 @@ def setup_website(basedir, config): template += "\nrequire_once('{}/website/{{}}');\n".format(config.lib_dir.php) + search_name_table_exists = bool(conn and conn.table_exists('search_name')) + for script in WEBSITE_SCRIPTS: - (basedir / script).write_text(template.format(script), 'utf-8') + if not search_name_table_exists and script == 'search.php': + (basedir / script).write_text(template.format('reverse-only-search.php'), 'utf-8') + else: + (basedir / script).write_text(template.format(script), 'utf-8') diff --git a/nominatim/tools/tiger_data.py b/nominatim/tools/tiger_data.py index 07772c70..ff498f77 100644 --- a/nominatim/tools/tiger_data.py +++ b/nominatim/tools/tiger_data.py @@ -1,15 +1,18 @@ """ Functions for importing tiger data and handling tarbar and directory files """ +import csv +import io import logging import os import tarfile -import selectors + +import psycopg2.extras from nominatim.db.connection import connect -from nominatim.db.async_connection import DBConnection +from nominatim.db.async_connection import WorkerPool from nominatim.db.sql_preprocessor import SQLPreprocessor - +from nominatim.errors import UsageError LOG = logging.getLogger() @@ -20,96 +23,81 @@ def handle_tarfile_or_directory(data_dir): tar = None if data_dir.endswith('.tar.gz'): - tar = tarfile.open(data_dir) - sql_files = [i for i in tar.getmembers() if i.name.endswith('.sql')] - LOG.warning("Found %d SQL files in tarfile with path %s", len(sql_files), data_dir) - if not sql_files: + try: + tar = tarfile.open(data_dir) + except tarfile.ReadError as err: + LOG.fatal("Cannot open '%s'. Is this a tar file?", data_dir) + raise UsageError("Cannot open Tiger data file.") from err + + csv_files = [i for i in tar.getmembers() if i.name.endswith('.csv')] + LOG.warning("Found %d CSV files in tarfile with path %s", len(csv_files), data_dir) + if not csv_files: LOG.warning("Tiger data import selected but no files in tarfile's path %s", data_dir) return None, None else: files = os.listdir(data_dir) - sql_files = [os.path.join(data_dir, i) for i in files if i.endswith('.sql')] - LOG.warning("Found %d SQL files in path %s", len(sql_files), data_dir) - if not sql_files: + csv_files = [os.path.join(data_dir, i) for i in files if i.endswith('.csv')] + LOG.warning("Found %d CSV files in path %s", len(csv_files), data_dir) + if not csv_files: LOG.warning("Tiger data import selected but no files found in path %s", data_dir) return None, None - return sql_files, tar + return csv_files, tar -def handle_threaded_sql_statements(sel, file): +def handle_threaded_sql_statements(pool, fd, analyzer): """ Handles sql statement with multiplexing """ - lines = 0 - end_of_file = False # Using pool of database connections to execute sql statements - while not end_of_file: - for key, _ in sel.select(1): - conn = key.data - try: - if conn.is_done(): - sql_query = file.readline() - lines += 1 - if not sql_query: - end_of_file = True - break - conn.perform(sql_query) - if lines == 1000: - print('. ', end='', flush=True) - lines = 0 - except Exception as exc: # pylint: disable=broad-except - LOG.info('Wrong SQL statement: %s', exc) - -def handle_unregister_connection_pool(sel, place_threads): - """ Handles unregistering pool of connections - """ - while place_threads > 0: - for key, _ in sel.select(1): - conn = key.data - sel.unregister(conn) - try: - conn.wait() - except Exception as exc: # pylint: disable=broad-except - LOG.info('Wrong SQL statement: %s', exc) - conn.close() - place_threads -= 1 - -def add_tiger_data(data_dir, config, threads): + sql = "SELECT tiger_line_import(%s, %s, %s, %s, %s, %s)" + + for row in csv.DictReader(fd, delimiter=';'): + try: + address = dict(street=row['street'], postcode=row['postcode']) + args = ('SRID=4326;' + row['geometry'], + int(row['from']), int(row['to']), row['interpolation'], + psycopg2.extras.Json(analyzer.process_place(dict(address=address))), + analyzer.normalize_postcode(row['postcode'])) + except ValueError: + continue + pool.next_free_worker().perform(sql, args=args) + + lines += 1 + if lines == 1000: + print('.', end='', flush=True) + lines = 0 + + +def add_tiger_data(data_dir, config, threads, tokenizer): """ Import tiger data from directory or tar file `data dir`. """ dsn = config.get_libpq_dsn() - sql_files, tar = handle_tarfile_or_directory(data_dir) + files, tar = handle_tarfile_or_directory(data_dir) - if not sql_files: + if not files: return with connect(dsn) as conn: sql = SQLPreprocessor(conn, config) sql.run_sql_file(conn, 'tiger_import_start.sql') - # Reading sql_files and then for each file line handling + # Reading files and then for each file line handling # sql_query in chunks. - sel = selectors.DefaultSelector() place_threads = max(1, threads - 1) - # Creates a pool of database connections - for _ in range(place_threads): - conn = DBConnection(dsn) - conn.connect() - sel.register(conn, selectors.EVENT_WRITE, conn) - - for sql_file in sql_files: - if not tar: - file = open(sql_file) - else: - file = tar.extractfile(sql_file) + with WorkerPool(dsn, place_threads, ignore_sql_errors=True) as pool: + with tokenizer.name_analyzer() as analyzer: + for fname in files: + if not tar: + fd = open(fname) + else: + fd = io.TextIOWrapper(tar.extractfile(fname)) - handle_threaded_sql_statements(sel, file) + handle_threaded_sql_statements(pool, fd, analyzer) - # Unregistering pool of database connections - handle_unregister_connection_pool(sel, place_threads) + fd.close() if tar: tar.close() diff --git a/test/bdd/steps/nominatim_environment.py b/test/bdd/steps/nominatim_environment.py index de02e346..7eb6f3dd 100644 --- a/test/bdd/steps/nominatim_environment.py +++ b/test/bdd/steps/nominatim_environment.py @@ -9,6 +9,7 @@ sys.path.insert(1, str((Path(__file__) / '..' / '..' / '..' / '..').resolve())) from nominatim import cli from nominatim.config import Configuration +from nominatim.db.connection import _Connection from nominatim.tools import refresh from nominatim.tokenizer import factory as tokenizer_factory from steps.utils import run_script @@ -54,7 +55,7 @@ class NominatimEnvironment: dbargs['user'] = self.db_user if self.db_pass: dbargs['password'] = self.db_pass - conn = psycopg2.connect(**dbargs) + conn = psycopg2.connect(connection_factory=_Connection, **dbargs) return conn def next_code_coverage_file(self): @@ -110,8 +111,13 @@ class NominatimEnvironment: self.website_dir.cleanup() self.website_dir = tempfile.TemporaryDirectory() + + try: + conn = self.connect_database(dbname) + except: + conn = False refresh.setup_website(Path(self.website_dir.name) / 'website', - self.get_test_config()) + self.get_test_config(), conn) def get_test_config(self): @@ -231,13 +237,13 @@ class NominatimEnvironment: """ Setup a test against a fresh, empty test database. """ self.setup_template_db() - self.write_nominatim_config(self.test_db) conn = self.connect_database(self.template_db) conn.set_isolation_level(0) cur = conn.cursor() cur.execute('DROP DATABASE IF EXISTS {}'.format(self.test_db)) cur.execute('CREATE DATABASE {} TEMPLATE = {}'.format(self.test_db, self.template_db)) conn.close() + self.write_nominatim_config(self.test_db) context.db = self.connect_database(self.test_db) context.db.autocommit = True psycopg2.extras.register_hstore(context.db, globally=False) diff --git a/test/python/test_db_async_connection.py b/test/python/test_db_async_connection.py index b52f7053..330b86f7 100644 --- a/test/python/test_db_async_connection.py +++ b/test/python/test_db_async_connection.py @@ -56,13 +56,21 @@ def test_bad_query(conn): conn.wait() +def test_bad_query_ignore(temp_db): + with closing(DBConnection('dbname=' + temp_db, ignore_sql_errors=True)) as conn: + conn.connect() + + conn.perform('SELECT efasfjsea') + + conn.wait() + + def exec_with_deadlock(cur, sql, detector): with DeadlockHandler(lambda *args: detector.append(1)): cur.execute(sql) def test_deadlock(simple_conns): - print(psycopg2.__version__) cur1, cur2 = simple_conns cur1.execute("""CREATE TABLE t1 (id INT PRIMARY KEY, t TEXT); diff --git a/test/python/test_tools_refresh_setup_website.py b/test/python/test_tools_refresh_setup_website.py index dc822e3c..9b60c0e5 100644 --- a/test/python/test_tools_refresh_setup_website.py +++ b/test/python/test_tools_refresh_setup_website.py @@ -18,16 +18,16 @@ def envdir(tmpdir): @pytest.fixture def test_script(envdir): def _create_file(code): - outfile = envdir / 'php' / 'website' / 'search.php' + outfile = envdir / 'php' / 'website' / 'reverse-only-search.php' outfile.write_text('