X-Git-Url: https://git.openstreetmap.org/nominatim.git/blobdiff_plain/20891abe1c0f1e07a160d13a9bc044e05da8ee8a..a2ee58d8a13bf79cc4280c3eae550d53019347db:/nominatim/indexer/indexer.py diff --git a/nominatim/indexer/indexer.py b/nominatim/indexer/indexer.py index b7673aba..233423f0 100644 --- a/nominatim/indexer/indexer.py +++ b/nominatim/indexer/indexer.py @@ -1,16 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. """ Main work horse for indexing (computing addresses) the database. """ +from typing import Optional, Any, cast import logging -import select import time import psycopg2.extras +from nominatim.tokenizer.base import AbstractTokenizer from nominatim.indexer.progress import ProgressLogger from nominatim.indexer import runners -from nominatim.db.async_connection import DBConnection -from nominatim.db.connection import connect +from nominatim.db.async_connection import DBConnection, WorkerPool +from nominatim.db.connection import connect, Connection, Cursor +from nominatim.typing import DictCursorResults LOG = logging.getLogger() @@ -18,10 +26,11 @@ LOG = logging.getLogger() class PlaceFetcher: """ Asynchronous connection that fetches place details for processing. """ - def __init__(self, dsn, setup_conn): - self.wait_time = 0 - self.current_ids = None - self.conn = DBConnection(dsn, cursor_factory=psycopg2.extras.DictCursor) + def __init__(self, dsn: str, setup_conn: Connection) -> None: + self.wait_time = 0.0 + self.current_ids: Optional[DictCursorResults] = None + self.conn: Optional[DBConnection] = DBConnection(dsn, + cursor_factory=psycopg2.extras.DictCursor) with setup_conn.cursor() as cur: # need to fetch those manually because register_hstore cannot @@ -32,7 +41,7 @@ class PlaceFetcher: psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid, array_oid=hstore_array_oid) - def close(self): + def close(self) -> None: """ Close the underlying asynchronous connection. """ if self.conn: @@ -40,127 +49,73 @@ class PlaceFetcher: self.conn = None - def fetch_next_batch(self, cur, runner): + def fetch_next_batch(self, cur: Cursor, runner: runners.Runner) -> bool: """ Send a request for the next batch of places. If details for the places are required, they will be fetched asynchronously. Returns true if there is still data available. """ - ids = cur.fetchmany(100) + ids = cast(Optional[DictCursorResults], cur.fetchmany(100)) if not ids: self.current_ids = None return False - if hasattr(runner, 'get_place_details'): - runner.get_place_details(self.conn, ids) - self.current_ids = [] - else: - self.current_ids = ids + assert self.conn is not None + self.current_ids = runner.get_place_details(self.conn, ids) return True - def get_batch(self): + def get_batch(self) -> DictCursorResults: """ Get the next batch of data, previously requested with `fetch_next_batch`. """ + assert self.conn is not None + assert self.conn.cursor is not None + if self.current_ids is not None and not self.current_ids: tstart = time.time() self.conn.wait() self.wait_time += time.time() - tstart - self.current_ids = self.conn.cursor.fetchall() + self.current_ids = cast(Optional[DictCursorResults], + self.conn.cursor.fetchall()) - return self.current_ids + return self.current_ids if self.current_ids is not None else [] - def __enter__(self): + def __enter__(self) -> 'PlaceFetcher': return self - def __exit__(self, exc_type, exc_value, traceback): + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + assert self.conn is not None self.conn.wait() self.close() -class WorkerPool: - """ A pool of asynchronous database connections. - - The pool may be used as a context manager. - """ - REOPEN_CONNECTIONS_AFTER = 100000 - - def __init__(self, dsn, pool_size): - self.threads = [DBConnection(dsn) for _ in range(pool_size)] - self.free_workers = self._yield_free_worker() - self.wait_time = 0 - - - def finish_all(self): - """ Wait for all connection to finish. - """ - for thread in self.threads: - while not thread.is_done(): - thread.wait() - - self.free_workers = self._yield_free_worker() - - def close(self): - """ Close all connections and clear the pool. - """ - for thread in self.threads: - thread.close() - self.threads = [] - self.free_workers = None - - - def next_free_worker(self): - """ Get the next free connection. - """ - return next(self.free_workers) - - - def _yield_free_worker(self): - ready = self.threads - command_stat = 0 - while True: - for thread in ready: - if thread.is_done(): - command_stat += 1 - yield thread - - if command_stat > self.REOPEN_CONNECTIONS_AFTER: - for thread in self.threads: - while not thread.is_done(): - thread.wait() - thread.connect() - ready = self.threads - command_stat = 0 - else: - tstart = time.time() - _, ready, _ = select.select([], self.threads, []) - self.wait_time += time.time() - tstart - - - def __enter__(self): - return self - - - def __exit__(self, exc_type, exc_value, traceback): - self.finish_all() - self.close() - class Indexer: """ Main indexing routine. """ - def __init__(self, dsn, tokenizer, num_threads): + def __init__(self, dsn: str, tokenizer: AbstractTokenizer, num_threads: int): self.dsn = dsn self.tokenizer = tokenizer self.num_threads = num_threads - def index_full(self, analyse=True): - """ Index the complete database. This will first index boudnaries + def has_pending(self) -> bool: + """ Check if any data still needs indexing. + This function must only be used after the import has finished. + Otherwise it will be very expensive. + """ + with connect(self.dsn) as conn: + with conn.cursor() as cur: + cur.execute("SELECT 'a' FROM placex WHERE indexed_status > 0 LIMIT 1") + return cur.rowcount > 0 + + + def index_full(self, analyse: bool = True) -> None: + """ Index the complete database. This will first index boundaries followed by all other objects. When `analyse` is True, then the database will be analysed at the appropriate places to ensure that database statistics are updated. @@ -168,72 +123,72 @@ class Indexer: with connect(self.dsn) as conn: conn.autocommit = True - if analyse: - def _analyze(): + def _analyze() -> None: + if analyse: with conn.cursor() as cur: cur.execute('ANALYZE') - else: - def _analyze(): - pass - self.index_by_rank(0, 4) - _analyze() + if self.index_by_rank(0, 4) > 0: + _analyze() - self.index_boundaries(0, 30) - _analyze() + if self.index_boundaries(0, 30) > 100: + _analyze() - self.index_by_rank(5, 25) - _analyze() + if self.index_by_rank(5, 25) > 100: + _analyze() - self.index_by_rank(26, 30) - _analyze() + if self.index_by_rank(26, 30) > 1000: + _analyze() - self.index_postcodes() - _analyze() + if self.index_postcodes() > 100: + _analyze() - def index_boundaries(self, minrank, maxrank): + def index_boundaries(self, minrank: int, maxrank: int) -> int: """ Index only administrative boundaries within the given rank range. """ + total = 0 LOG.warning("Starting indexing boundaries using %s threads", self.num_threads) with self.tokenizer.name_analyzer() as analyzer: for rank in range(max(minrank, 4), min(maxrank, 26)): - self._index(runners.BoundaryRunner(rank, analyzer)) + total += self._index(runners.BoundaryRunner(rank, analyzer)) + + return total - def index_by_rank(self, minrank, maxrank): + def index_by_rank(self, minrank: int, maxrank: int) -> int: """ Index all entries of placex in the given rank range (inclusive) in order of their address rank. When rank 30 is requested then also interpolations and places with address rank 0 will be indexed. """ + total = 0 maxrank = min(maxrank, 30) LOG.warning("Starting indexing rank (%i to %i) using %i threads", minrank, maxrank, self.num_threads) with self.tokenizer.name_analyzer() as analyzer: - for rank in range(max(1, minrank), maxrank): - self._index(runners.RankRunner(rank, analyzer)) + for rank in range(max(1, minrank), maxrank + 1): + total += self._index(runners.RankRunner(rank, analyzer), 20 if rank == 30 else 1) if maxrank == 30: - self._index(runners.RankRunner(0, analyzer)) - self._index(runners.InterpolationRunner(analyzer), 20) - self._index(runners.RankRunner(30, analyzer), 20) - else: - self._index(runners.RankRunner(maxrank, analyzer)) + total += self._index(runners.RankRunner(0, analyzer)) + total += self._index(runners.InterpolationRunner(analyzer), 20) + + return total - def index_postcodes(self): - """Index the entries ofthe location_postcode table. + def index_postcodes(self) -> int: + """Index the entries of the location_postcode table. """ LOG.warning("Starting indexing postcodes using %s threads", self.num_threads) - self._index(runners.PostcodeRunner(), 20) + return self._index(runners.PostcodeRunner(), 20) - def update_status_table(self): + def update_status_table(self) -> None: """ Update the status in the status table to 'indexed'. """ with connect(self.dsn) as conn: @@ -242,7 +197,7 @@ class Indexer: conn.commit() - def _index(self, runner, batch=1): + def _index(self, runner: runners.Runner, batch: int = 1) -> int: """ Index a single rank or table. `runner` describes the SQL to use for indexing. `batch` describes the number of objects that should be processed with a single SQL statement @@ -272,9 +227,9 @@ class Indexer: # asynchronously get the next batch has_more = fetcher.fetch_next_batch(cur, runner) - # And insert the curent batch + # And insert the current batch for idx in range(0, len(places), batch): - part = places[idx:idx+batch] + part = places[idx:idx + batch] LOG.debug("Processing places: %s", str(part)) runner.index_places(pool.next_free_worker(), part) progress.add(len(part)) @@ -284,4 +239,4 @@ class Indexer: conn.commit() - progress.done() + return progress.done()