From: Sarah Hoffmann Date: Sat, 24 Apr 2021 09:25:47 +0000 (+0200) Subject: require tokeinzer for indexer X-Git-Tag: v4.0.0~93^2~25 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/e1c5673ac31831e1b0cca58e21ebe6622c372ea7 require tokeinzer for indexer --- diff --git a/nominatim/clicmd/index.py b/nominatim/clicmd/index.py index 8fd4f601..ea95e456 100644 --- a/nominatim/clicmd/index.py +++ b/nominatim/clicmd/index.py @@ -32,8 +32,11 @@ class UpdateIndex: @staticmethod def run(args): from ..indexer.indexer import Indexer + from ..tokenizer import factory as tokenizer_factory - indexer = Indexer(args.config.get_libpq_dsn(), + tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config) + + indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, args.threads or psutil.cpu_count() or 1) if not args.no_boundaries: diff --git a/nominatim/clicmd/replication.py b/nominatim/clicmd/replication.py index c75322d9..69939430 100644 --- a/nominatim/clicmd/replication.py +++ b/nominatim/clicmd/replication.py @@ -83,6 +83,7 @@ class UpdateReplication: def _update(args): from ..tools import replication from ..indexer.indexer import Indexer + from ..tokenizer import factory as tokenizer_factory params = args.osm2pgsql_options(default_cache=2000, default_threads=1) params.update(base_url=args.config.REPLICATION_URL, @@ -106,6 +107,8 @@ class UpdateReplication: raise UsageError("Bad argument '--no-index'.") recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL') + tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config) + while True: with connect(args.config.get_libpq_dsn()) as conn: start = dt.datetime.now(dt.timezone.utc) @@ -116,7 +119,7 @@ class UpdateReplication: if state is not replication.UpdateState.NO_CHANGES and args.do_index: index_start = dt.datetime.now(dt.timezone.utc) - indexer = Indexer(args.config.get_libpq_dsn(), + indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, args.threads or 1) indexer.index_boundaries(0, 30) indexer.index_by_rank(0, 30) diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py index d8d953e3..499eff76 100644 --- a/nominatim/clicmd/setup.py +++ b/nominatim/clicmd/setup.py @@ -123,7 +123,7 @@ class SetupAll: with connect(args.config.get_libpq_dsn()) as conn: SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX) LOG.warning('Indexing places') - indexer = Indexer(args.config.get_libpq_dsn(), + indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, args.threads or psutil.cpu_count() or 1) indexer.index_full(analyse=not args.index_noanalyse) diff --git a/nominatim/indexer/indexer.py b/nominatim/indexer/indexer.py index cfa48433..3a39a151 100644 --- a/nominatim/indexer/indexer.py +++ b/nominatim/indexer/indexer.py @@ -79,8 +79,9 @@ class Indexer: """ Main indexing routine. """ - def __init__(self, dsn, num_threads): + def __init__(self, dsn, tokenizer, num_threads): self.dsn = dsn + self.tokenizer = tokenizer self.num_threads = num_threads diff --git a/test/bdd/db/import/search_name.feature b/test/bdd/db/import/search_name.feature index 0e922e1d..fd207059 100644 --- a/test/bdd/db/import/search_name.feature +++ b/test/bdd/db/import/search_name.feature @@ -24,7 +24,7 @@ Feature: Creation of search terms When importing Then search_name contains | object | nameaddress_vector | - | N1 | Rose, Street, Walltown | + | N1 | #Rose Street, Walltown | When searching for "23 Rose Street, Walltown" Then results contain | osm_type | osm_id | name | @@ -248,7 +248,7 @@ Feature: Creation of search terms When importing Then search_name contains | object | name_vector | nameaddress_vector | - | N1 | #Green Moss | Rose, Street, Walltown | + | N1 | #Green Moss | #Rose Street, Walltown | When searching for "Green Moss, Rose Street, Walltown" Then results contain | osm_type | osm_id | name | @@ -299,7 +299,7 @@ Feature: Creation of search terms When importing Then search_name contains | object | name_vector | nameaddress_vector | - | N1 | foo | the road | + | N1 | foo | #the road | Scenario: Some addr: tags are added to address Given the scene roads-with-pois diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py index dea09833..52a50a51 100644 --- a/test/bdd/steps/steps_db_ops.py +++ b/test/bdd/steps/steps_db_ops.py @@ -109,7 +109,7 @@ def import_and_index_data_from_place_table(context): # Call directly as the refresh function does not include postcodes. indexer.LOG.setLevel(logging.ERROR) - indexer.Indexer(context.nominatim.get_libpq_dsn(), 1).index_full(analyse=False) + indexer.Indexer(context.nominatim.get_libpq_dsn(), tokenizer, 1).index_full(analyse=False) check_database_integrity(context) @@ -234,7 +234,7 @@ def check_search_name_contents(context, exclude): if exclude: assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1]) else: - assert present, "Missing term for {}/{}: {}".fromat(row['object'], name, wid[1]) + assert present, "Missing term for {}/{}: {}".format(row['object'], name, wid[1]) elif name != 'object': assert db_row.contains(name, value), db_row.assert_msg(name, value) diff --git a/test/python/conftest.py b/test/python/conftest.py index b55c206d..d4649c24 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -1,3 +1,4 @@ +import importlib import itertools import sys from pathlib import Path @@ -15,6 +16,9 @@ sys.path.insert(0, str(SRC_DIR.resolve())) from nominatim.config import Configuration from nominatim.db import connection from nominatim.db.sql_preprocessor import SQLPreprocessor +from nominatim.db import properties + +import dummy_tokenizer class _TestingCursor(psycopg2.extras.DictCursor): """ Extension to the DictCursor class that provides execution @@ -292,3 +296,17 @@ def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory): sql=tmp_path, data=SRC_DIR / 'data') return SQLPreprocessor(temp_db_conn, cfg) + + +@pytest.fixture +def tokenizer_mock(monkeypatch, property_table, temp_db_conn): + """ Sets up the configuration so that the test dummy tokenizer will be + loaded. + """ + monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy') + + def _import_dummy(module, *args, **kwargs): + return dummy_tokenizer + + monkeypatch.setattr(importlib, "import_module", _import_dummy) + properties.set_property(temp_db_conn, 'tokenizer', 'dummy') diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 1d4aefc7..4cca3080 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -57,6 +57,22 @@ def mock_func_factory(monkeypatch): return get_mock +@pytest.fixture +def tokenizer_mock(monkeypatch): + class DummyTokenizer: + def __init__(self, *args, **kwargs): + self.update_sql_functions_called = False + + def update_sql_functions(self, *args): + self.update_sql_functions_called = True + + tok = DummyTokenizer() + monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db' , + lambda *args: tok) + + return tok + + def test_cli_help(capsys): """ Running nominatim tool without arguments prints help. """ @@ -221,7 +237,8 @@ def test_add_data_command(mock_run_legacy, name, oid): (['--boundaries-only'], 1, 0), (['--no-boundaries'], 0, 1), (['--boundaries-only', '--no-boundaries'], 0, 0)]) -def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ranks): +def test_index_command(mock_func_factory, temp_db_cursor, tokenizer_mock, + params, do_bnds, do_ranks): temp_db_cursor.execute("CREATE TABLE import_status (indexed bool)") bnd_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_boundaries') rank_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_by_rank') @@ -253,20 +270,12 @@ def test_refresh_command(mock_func_factory, temp_db, command, func): assert func_mock.called == 1 -def test_refresh_create_functions(mock_func_factory, monkeypatch, temp_db): - class DummyTokenizer: - def update_sql_functions(self, *args): - self.called = True - +def test_refresh_create_functions(mock_func_factory, temp_db, tokenizer_mock): func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions') - tok = DummyTokenizer() - monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db' , - lambda *args: tok) - assert 0 == call_nominatim('refresh', '--functions') assert func_mock.called == 1 - assert hasattr(tok, 'called') + assert tokenizer_mock.update_sql_functions_called def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db): diff --git a/test/python/test_cli_replication.py b/test/python/test_cli_replication.py index a62ad1a4..d4f8290f 100644 --- a/test/python/test_cli_replication.py +++ b/test/python/test_cli_replication.py @@ -27,7 +27,7 @@ def call_nominatim(*args): cli_args=['replication'] + list(args)) @pytest.fixture -def index_mock(monkeypatch): +def index_mock(monkeypatch, tokenizer_mock): mock = MockParamCapture() monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', mock) monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', mock) @@ -52,7 +52,7 @@ def init_status(temp_db_conn, status_table): @pytest.fixture -def update_mock(mock_func_factory, init_status): +def update_mock(mock_func_factory, init_status, tokenizer_mock): return mock_func_factory(nominatim.tools.replication, 'update') @pytest.mark.parametrize("params,func", [ diff --git a/test/python/test_indexing.py b/test/python/test_indexing.py index 223a599e..fdd50a42 100644 --- a/test/python/test_indexing.py +++ b/test/python/test_indexing.py @@ -6,6 +6,7 @@ import psycopg2 import pytest from nominatim.indexer import indexer +from nominatim.tokenizer import factory class IndexerTestDB: @@ -115,8 +116,14 @@ def test_db(temp_db_conn): yield IndexerTestDB(temp_db_conn) +@pytest.fixture +def test_tokenizer(tokenizer_mock, def_config, tmp_path): + def_config.project_dir = tmp_path + return factory.create_tokenizer(def_config) + + @pytest.mark.parametrize("threads", [1, 15]) -def test_index_all_by_rank(test_db, threads): +def test_index_all_by_rank(test_db, threads, test_tokenizer): for rank in range(31): test_db.add_place(rank_address=rank, rank_search=rank) test_db.add_osmline() @@ -124,7 +131,7 @@ def test_index_all_by_rank(test_db, threads): assert 31 == test_db.placex_unindexed() assert 1 == test_db.osmline_unindexed() - idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) + idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads) idx.index_by_rank(0, 30) assert 0 == test_db.placex_unindexed() @@ -155,7 +162,7 @@ def test_index_all_by_rank(test_db, threads): @pytest.mark.parametrize("threads", [1, 15]) -def test_index_partial_without_30(test_db, threads): +def test_index_partial_without_30(test_db, threads, test_tokenizer): for rank in range(31): test_db.add_place(rank_address=rank, rank_search=rank) test_db.add_osmline() @@ -163,7 +170,8 @@ def test_index_partial_without_30(test_db, threads): assert 31 == test_db.placex_unindexed() assert 1 == test_db.osmline_unindexed() - idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) + idx = indexer.Indexer('dbname=test_nominatim_python_unittest', + test_tokenizer, threads) idx.index_by_rank(4, 15) assert 19 == test_db.placex_unindexed() @@ -175,7 +183,7 @@ def test_index_partial_without_30(test_db, threads): @pytest.mark.parametrize("threads", [1, 15]) -def test_index_partial_with_30(test_db, threads): +def test_index_partial_with_30(test_db, threads, test_tokenizer): for rank in range(31): test_db.add_place(rank_address=rank, rank_search=rank) test_db.add_osmline() @@ -183,7 +191,7 @@ def test_index_partial_with_30(test_db, threads): assert 31 == test_db.placex_unindexed() assert 1 == test_db.osmline_unindexed() - idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) + idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads) idx.index_by_rank(28, 30) assert 27 == test_db.placex_unindexed() @@ -194,7 +202,7 @@ def test_index_partial_with_30(test_db, threads): WHERE indexed_status = 0 AND rank_address between 1 and 27""") @pytest.mark.parametrize("threads", [1, 15]) -def test_index_boundaries(test_db, threads): +def test_index_boundaries(test_db, threads, test_tokenizer): for rank in range(4, 10): test_db.add_admin(rank_address=rank, rank_search=rank) for rank in range(31): @@ -204,7 +212,7 @@ def test_index_boundaries(test_db, threads): assert 37 == test_db.placex_unindexed() assert 1 == test_db.osmline_unindexed() - idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) + idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads) idx.index_boundaries(0, 30) assert 31 == test_db.placex_unindexed() @@ -216,13 +224,13 @@ def test_index_boundaries(test_db, threads): @pytest.mark.parametrize("threads", [1, 15]) -def test_index_postcodes(test_db, threads): +def test_index_postcodes(test_db, threads, test_tokenizer): for postcode in range(1000): test_db.add_postcode('de', postcode) for postcode in range(32000, 33000): test_db.add_postcode('us', postcode) - idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) + idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads) idx.index_postcodes() assert 0 == test_db.scalar("""SELECT count(*) FROM location_postcode @@ -230,7 +238,7 @@ def test_index_postcodes(test_db, threads): @pytest.mark.parametrize("analyse", [True, False]) -def test_index_full(test_db, analyse): +def test_index_full(test_db, analyse, test_tokenizer): for rank in range(4, 10): test_db.add_admin(rank_address=rank, rank_search=rank) for rank in range(31): @@ -239,7 +247,7 @@ def test_index_full(test_db, analyse): for postcode in range(1000): test_db.add_postcode('de', postcode) - idx = indexer.Indexer('dbname=test_nominatim_python_unittest', 4) + idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, 4) idx.index_full(analyse=analyse) assert 0 == test_db.placex_unindexed() @@ -249,13 +257,13 @@ def test_index_full(test_db, analyse): @pytest.mark.parametrize("threads", [1, 15]) -def test_index_reopen_connection(test_db, threads, monkeypatch): +def test_index_reopen_connection(test_db, threads, monkeypatch, test_tokenizer): monkeypatch.setattr(indexer.WorkerPool, "REOPEN_CONNECTIONS_AFTER", 15) for _ in range(1000): test_db.add_place(rank_address=30, rank_search=30) - idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) + idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads) idx.index_by_rank(28, 30) assert 0 == test_db.placex_unindexed() diff --git a/test/python/test_tokenizer_factory.py b/test/python/test_tokenizer_factory.py index 63c6915b..69517e94 100644 --- a/test/python/test_tokenizer_factory.py +++ b/test/python/test_tokenizer_factory.py @@ -7,7 +7,7 @@ import pytest from nominatim.db import properties from nominatim.tokenizer import factory from nominatim.errors import UsageError -import dummy_tokenizer +from dummy_tokenizer import DummyTokenizer @pytest.fixture def test_config(def_config, tmp_path): @@ -15,37 +15,27 @@ def test_config(def_config, tmp_path): return def_config -@pytest.fixture -def tokenizer_import(monkeypatch): - monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy') - - def _import_dummy(module, *args, **kwargs): - return dummy_tokenizer - - monkeypatch.setattr(importlib, "import_module", _import_dummy) - - def test_setup_dummy_tokenizer(temp_db_conn, test_config, - tokenizer_import, property_table): + tokenizer_mock, property_table): tokenizer = factory.create_tokenizer(test_config) - assert isinstance(tokenizer, dummy_tokenizer.DummyTokenizer) + assert isinstance(tokenizer, DummyTokenizer) assert tokenizer.init_state == "new" assert (test_config.project_dir / 'tokenizer').is_dir() assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy' -def test_setup_tokenizer_dir_exists(test_config, tokenizer_import, property_table): +def test_setup_tokenizer_dir_exists(test_config, tokenizer_mock, property_table): (test_config.project_dir / 'tokenizer').mkdir() tokenizer = factory.create_tokenizer(test_config) - assert isinstance(tokenizer, dummy_tokenizer.DummyTokenizer) + assert isinstance(tokenizer, DummyTokenizer) assert tokenizer.init_state == "new" -def test_setup_tokenizer_dir_failure(test_config, tokenizer_import, property_table): +def test_setup_tokenizer_dir_failure(test_config, tokenizer_mock, property_table): (test_config.project_dir / 'tokenizer').write_text("foo") with pytest.raises(UsageError): @@ -59,16 +49,16 @@ def test_setup_bad_tokenizer_name(test_config, monkeypatch): factory.create_tokenizer(test_config) def test_load_tokenizer(temp_db_conn, test_config, - tokenizer_import, property_table): + tokenizer_mock, property_table): factory.create_tokenizer(test_config) tokenizer = factory.get_tokenizer_for_db(test_config) - assert isinstance(tokenizer, dummy_tokenizer.DummyTokenizer) + assert isinstance(tokenizer, DummyTokenizer) assert tokenizer.init_state == "loaded" -def test_load_no_tokenizer_dir(test_config, tokenizer_import, property_table): +def test_load_no_tokenizer_dir(test_config, tokenizer_mock, property_table): factory.create_tokenizer(test_config) test_config.project_dir = test_config.project_dir / 'foo' @@ -77,7 +67,7 @@ def test_load_no_tokenizer_dir(test_config, tokenizer_import, property_table): factory.get_tokenizer_for_db(test_config) -def test_load_missing_propoerty(temp_db_cursor, test_config, tokenizer_import, property_table): +def test_load_missing_propoerty(temp_db_cursor, test_config, tokenizer_mock, property_table): factory.create_tokenizer(test_config) temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")