git.openstreetmap.org Git - nominatim.git/commitdiff
require tokenizer for indexer
author Sarah Hoffmann <lonvia@denofr.de>
Sat, 24 Apr 2021 09:25:47 +0000 (11:25 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Fri, 30 Apr 2021 09:30:51 +0000 (11:30 +0200)
nominatim/clicmd/index.py
nominatim/clicmd/replication.py
nominatim/clicmd/setup.py
nominatim/indexer/indexer.py
test/bdd/db/import/search_name.feature
test/bdd/steps/steps_db_ops.py
test/python/conftest.py
test/python/test_cli.py
test/python/test_cli_replication.py
test/python/test_indexing.py
test/python/test_tokenizer_factory.py
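
The change threads a tokenizer instance through every construction of the
Indexer: callers fetch it once from the tokenizer factory and pass it as a
new second constructor argument. A minimal sketch of the new wiring,
assuming a loaded Nominatim configuration object `config` and a thread
count `num_threads` (all other names are taken from the diffs below):

    from nominatim.indexer.indexer import Indexer
    from nominatim.tokenizer import factory as tokenizer_factory

    # Load the tokenizer that was set up for this database ...
    tokenizer = tokenizer_factory.get_tokenizer_for_db(config)

    # ... and hand it to the indexer together with DSN and thread count.
    indexer = Indexer(config.get_libpq_dsn(), tokenizer, num_threads)
    indexer.index_full(analyse=True)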

diff --git a/nominatim/clicmd/index.py b/nominatim/clicmd/index.py
index 8fd4f6011251f480a3d37ff2572d979c7b971cb5..ea95e4565270a72dad8b9cdd2c609f51ee27a9d3 100644
@@ -32,8 +32,11 @@ class UpdateIndex:
     @staticmethod
     def run(args):
         from ..indexer.indexer import Indexer
+        from ..tokenizer import factory as tokenizer_factory
 
-        indexer = Indexer(args.config.get_libpq_dsn(),
+        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
+        indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                           args.threads or psutil.cpu_count() or 1)
 
         if not args.no_boundaries:
diff --git a/nominatim/clicmd/replication.py b/nominatim/clicmd/replication.py
index c75322d9bf4c5591b9d252bdd288496424dccf54..69939430188838bb69f89bfe9ae041eb3cac1989 100644
@@ -83,6 +83,7 @@ class UpdateReplication:
     def _update(args):
         from ..tools import replication
         from ..indexer.indexer import Indexer
+        from ..tokenizer import factory as tokenizer_factory
 
         params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
         params.update(base_url=args.config.REPLICATION_URL,
@@ -106,6 +107,8 @@ class UpdateReplication:
                 raise UsageError("Bad argument '--no-index'.")
             recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')
 
+        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
         while True:
             with connect(args.config.get_libpq_dsn()) as conn:
                 start = dt.datetime.now(dt.timezone.utc)
@@ -116,7 +119,7 @@ class UpdateReplication:
 
             if state is not replication.UpdateState.NO_CHANGES and args.do_index:
                 index_start = dt.datetime.now(dt.timezone.utc)
-                indexer = Indexer(args.config.get_libpq_dsn(),
+                indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                                   args.threads or 1)
                 indexer.index_boundaries(0, 30)
                 indexer.index_by_rank(0, 30)
diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py
index d8d953e30e55ee7e36f45e4cd8351382ef66b6f7..499eff7673dc8f1d393eeb889c731eb980dc216f 100644
@@ -123,7 +123,7 @@ class SetupAll:
                 with connect(args.config.get_libpq_dsn()) as conn:
                     SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)
             LOG.warning('Indexing places')
-            indexer = Indexer(args.config.get_libpq_dsn(),
+            indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                               args.threads or psutil.cpu_count() or 1)
             indexer.index_full(analyse=not args.index_noanalyse)
 
diff --git a/nominatim/indexer/indexer.py b/nominatim/indexer/indexer.py
index cfa48433fd120c664ecd7aea3b432aec329a3261..3a39a151dd4f1f1c845ee15ae1c1591a75cd0809 100644
@@ -79,8 +79,9 @@ class Indexer:
     """ Main indexing routine.
     """
 
-    def __init__(self, dsn, num_threads):
+    def __init__(self, dsn, tokenizer, num_threads):
         self.dsn = dsn
+        self.tokenizer = tokenizer
         self.num_threads = num_threads
 
 
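Note that `__init__` merely stores the tokenizer on the instance, so for
constructing an Indexer any object with the expected interface will do;
the test suite below exploits this with a dummy. A hypothetical stand-in
(real tokenizers come from nominatim.tokenizer.factory):

    class StubTokenizer:
        # hypothetical minimal stand-in, not part of this commit
        def update_sql_functions(self, config):
            pass

    idx = Indexer('dbname=nominatim', StubTokenizer(), 4)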
diff --git a/test/bdd/db/import/search_name.feature b/test/bdd/db/import/search_name.feature
index 0e922e1d6b503716f649ff8b13409adf21bd9a1f..fd207059408fc96092b537149c0f83e807a5260c 100644
@@ -24,7 +24,7 @@ Feature: Creation of search terms
         When importing
         Then search_name contains
          | object | nameaddress_vector |
-         | N1     | Rose, Street, Walltown |
+         | N1     | #Rose Street, Walltown |
         When searching for "23 Rose Street, Walltown"
         Then results contain
          | osm_type | osm_id | name |
@@ -248,7 +248,7 @@ Feature: Creation of search terms
         When importing
         Then search_name contains
          | object | name_vector | nameaddress_vector |
-         | N1     | #Green Moss | Rose, Street, Walltown |
+         | N1     | #Green Moss | #Rose Street, Walltown |
         When searching for "Green Moss, Rose Street, Walltown"
         Then results contain
          | osm_type | osm_id | name |
@@ -299,7 +299,7 @@ Feature: Creation of search terms
         When importing
         Then search_name contains
          | object | name_vector | nameaddress_vector |
-         | N1     | foo         | the road |
+         | N1     | foo         | #the road |
 
     Scenario: Some addr: tags are added to address
         Given the scene roads-with-pois
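Aside on the table syntax: in the `search_name contains` steps a leading
'#' appears to mark a term that must match as a single full-name token,
while unprefixed, comma-separated entries are looked up as individual
partial terms, hence the expected vectors change from partial words to
'#'-prefixed phrases. A simplified sketch of how the step code classifies
a table cell (see steps_db_ops.py; not the literal implementation):

    items = [x.strip() for x in value.split(',')]
    partial_terms = [t for t in items if not t.startswith('#')]  # one token each
    full_terms = [t for t in items if t.startswith('#')]         # full-name tokens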
diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py
index dea09833f9cc84efd66fd743a4e79dc027562429..52a50a511c8a095c0c9ce5bd8bd9cb9f7e0783fb 100644
@@ -109,7 +109,7 @@ def import_and_index_data_from_place_table(context):
 
     # Call directly as the refresh function does not include postcodes.
     indexer.LOG.setLevel(logging.ERROR)
-    indexer.Indexer(context.nominatim.get_libpq_dsn(), 1).index_full(analyse=False)
+    indexer.Indexer(context.nominatim.get_libpq_dsn(), tokenizer, 1).index_full(analyse=False)
 
     check_database_integrity(context)
 
@@ -234,7 +234,7 @@ def check_search_name_contents(context, exclude):
                                 if exclude:
                                     assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1])
                                 else:
-                                    assert present, "Missing term for {}/{}: {}".fromat(row['object'], name, wid[1])
+                                    assert present, "Missing term for {}/{}: {}".format(row['object'], name, wid[1])
                     elif name != 'object':
                         assert db_row.contains(name, value), db_row.assert_msg(name, value)
 
diff --git a/test/python/conftest.py b/test/python/conftest.py
index b55c206d69ededeac268e00440060d235df45b22..d4649c24929b91835b6aa3dd7d34f985904ceed6 100644
@@ -1,3 +1,4 @@
+import importlib
 import itertools
 import sys
 from pathlib import Path
@@ -15,6 +16,9 @@ sys.path.insert(0, str(SRC_DIR.resolve()))
 from nominatim.config import Configuration
 from nominatim.db import connection
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.db import properties
+
+import dummy_tokenizer
 
 class _TestingCursor(psycopg2.extras.DictCursor):
     """ Extension to the DictCursor class that provides execution
@@ -292,3 +296,17 @@ def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
                     sql=tmp_path, data=SRC_DIR / 'data')
 
     return SQLPreprocessor(temp_db_conn, cfg)
+
+
+@pytest.fixture
+def tokenizer_mock(monkeypatch, property_table, temp_db_conn):
+    """ Sets up the configuration so that the test dummy tokenizer will be
+        loaded.
+    """
+    monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
+
+    def _import_dummy(module, *args, **kwargs):
+        return dummy_tokenizer
+
+    monkeypatch.setattr(importlib, "import_module", _import_dummy)
+    properties.set_property(temp_db_conn, 'tokenizer', 'dummy')
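The fixture relies on a `dummy_tokenizer` helper module living next to the
tests. Its contents are not part of this diff; judging from how the tests
use it, its shape is roughly this (hypothetical reconstruction, the actual
module may differ):

    # test/python/dummy_tokenizer.py (inferred)
    def create(dsn, data_dir):
        return DummyTokenizer(dsn, data_dir)

    class DummyTokenizer:
        def __init__(self, dsn, data_dir):
            self.dsn = dsn
            self.data_dir = data_dir
            self.init_state = None

        def init_new_db(self, *args, **kwargs):
            self.init_state = "new"     # asserted in test_setup_dummy_tokenizer

        def init_from_project(self):
            self.init_state = "loaded"  # asserted in test_load_tokenizer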
diff --git a/test/python/test_cli.py b/test/python/test_cli.py
index 1d4aefc7258368ea9496191335010cca554d1d1f..4cca30802369319b37e2abd563c0d80388fc5360 100644
@@ -57,6 +57,22 @@ def mock_func_factory(monkeypatch):
     return get_mock
 
 
+@pytest.fixture
+def tokenizer_mock(monkeypatch):
+    class DummyTokenizer:
+        def __init__(self, *args, **kwargs):
+            self.update_sql_functions_called = False
+
+        def update_sql_functions(self, *args):
+            self.update_sql_functions_called = True
+
+    tok = DummyTokenizer()
+    monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db' ,
+                        lambda *args: tok)
+
+    return tok
+
+
 def test_cli_help(capsys):
     """ Running nominatim tool without arguments prints help.
     """
@@ -221,7 +237,8 @@ def test_add_data_command(mock_run_legacy, name, oid):
                           (['--boundaries-only'], 1, 0),
                           (['--no-boundaries'], 0, 1),
                           (['--boundaries-only', '--no-boundaries'], 0, 0)])
-def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ranks):
+def test_index_command(mock_func_factory, temp_db_cursor, tokenizer_mock,
+                       params, do_bnds, do_ranks):
     temp_db_cursor.execute("CREATE TABLE import_status (indexed bool)")
     bnd_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_boundaries')
     rank_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_by_rank')
@@ -253,20 +270,12 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
     assert func_mock.called == 1
 
 
-def test_refresh_create_functions(mock_func_factory, monkeypatch, temp_db):
-    class DummyTokenizer:
-        def update_sql_functions(self, *args):
-            self.called = True
-
+def test_refresh_create_functions(mock_func_factory, temp_db, tokenizer_mock):
     func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
-    tok = DummyTokenizer()
-    monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db' ,
-                        lambda *args: tok)
-
 
     assert 0 == call_nominatim('refresh', '--functions')
     assert func_mock.called == 1
-    assert hasattr(tok, 'called')
+    assert tokenizer_mock.update_sql_functions_called
 
 
 def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
diff --git a/test/python/test_cli_replication.py b/test/python/test_cli_replication.py
index a62ad1a4a69be28887755ea301aeb747788d4e4d..d4f8290f2caac5701debe0bada8cdd124fcb0d87 100644
@@ -27,7 +27,7 @@ def call_nominatim(*args):
                                    cli_args=['replication'] + list(args))
 
 @pytest.fixture
-def index_mock(monkeypatch):
+def index_mock(monkeypatch, tokenizer_mock):
     mock = MockParamCapture()
     monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', mock)
     monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', mock)
@@ -52,7 +52,7 @@ def init_status(temp_db_conn, status_table):
 
 
 @pytest.fixture
-def update_mock(mock_func_factory, init_status):
+def update_mock(mock_func_factory, init_status, tokenizer_mock):
     return mock_func_factory(nominatim.tools.replication, 'update')
 
 @pytest.mark.parametrize("params,func", [
diff --git a/test/python/test_indexing.py b/test/python/test_indexing.py
index 223a599ec4fc818c0bd3ad1a3712251b6ab8ec7a..fdd50a42c2e59489d0194e3ba83e8fb0f4436e44 100644
@@ -6,6 +6,7 @@ import psycopg2
 import pytest
 
 from nominatim.indexer import indexer
+from nominatim.tokenizer import factory
 
 class IndexerTestDB:
 
@@ -115,8 +116,14 @@ def test_db(temp_db_conn):
     yield IndexerTestDB(temp_db_conn)
 
 
+@pytest.fixture
+def test_tokenizer(tokenizer_mock, def_config, tmp_path):
+    def_config.project_dir = tmp_path
+    return factory.create_tokenizer(def_config)
+
+
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_all_by_rank(test_db, threads):
+def test_index_all_by_rank(test_db, threads, test_tokenizer):
     for rank in range(31):
         test_db.add_place(rank_address=rank, rank_search=rank)
     test_db.add_osmline()
@@ -124,7 +131,7 @@ def test_index_all_by_rank(test_db, threads):
     assert 31 == test_db.placex_unindexed()
     assert 1 == test_db.osmline_unindexed()
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_by_rank(0, 30)
 
     assert 0 == test_db.placex_unindexed()
@@ -155,7 +162,7 @@ def test_index_all_by_rank(test_db, threads):
 
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_partial_without_30(test_db, threads):
+def test_index_partial_without_30(test_db, threads, test_tokenizer):
     for rank in range(31):
         test_db.add_place(rank_address=rank, rank_search=rank)
     test_db.add_osmline()
@@ -163,7 +170,8 @@ def test_index_partial_without_30(test_db, threads):
     assert 31 == test_db.placex_unindexed()
     assert 1 == test_db.osmline_unindexed()
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest',
+                          test_tokenizer, threads)
     idx.index_by_rank(4, 15)
 
     assert 19 == test_db.placex_unindexed()
@@ -175,7 +183,7 @@ def test_index_partial_without_30(test_db, threads):
 
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_partial_with_30(test_db, threads):
+def test_index_partial_with_30(test_db, threads, test_tokenizer):
     for rank in range(31):
         test_db.add_place(rank_address=rank, rank_search=rank)
     test_db.add_osmline()
@@ -183,7 +191,7 @@ def test_index_partial_with_30(test_db, threads):
     assert 31 == test_db.placex_unindexed()
     assert 1 == test_db.osmline_unindexed()
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_by_rank(28, 30)
 
     assert 27 == test_db.placex_unindexed()
@@ -194,7 +202,7 @@ def test_index_partial_with_30(test_db, threads):
                       WHERE indexed_status = 0 AND rank_address between 1 and 27""")
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_boundaries(test_db, threads):
+def test_index_boundaries(test_db, threads, test_tokenizer):
     for rank in range(4, 10):
         test_db.add_admin(rank_address=rank, rank_search=rank)
     for rank in range(31):
@@ -204,7 +212,7 @@ def test_index_boundaries(test_db, threads):
     assert 37 == test_db.placex_unindexed()
     assert 1 == test_db.osmline_unindexed()
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_boundaries(0, 30)
 
     assert 31 == test_db.placex_unindexed()
@@ -216,13 +224,13 @@ def test_index_boundaries(test_db, threads):
 
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_postcodes(test_db, threads):
+def test_index_postcodes(test_db, threads, test_tokenizer):
     for postcode in range(1000):
         test_db.add_postcode('de', postcode)
     for postcode in range(32000, 33000):
         test_db.add_postcode('us', postcode)
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_postcodes()
 
     assert 0 == test_db.scalar("""SELECT count(*) FROM location_postcode
@@ -230,7 +238,7 @@ def test_index_postcodes(test_db, threads):
 
 
 @pytest.mark.parametrize("analyse", [True, False])
-def test_index_full(test_db, analyse):
+def test_index_full(test_db, analyse, test_tokenizer):
     for rank in range(4, 10):
         test_db.add_admin(rank_address=rank, rank_search=rank)
     for rank in range(31):
@@ -239,7 +247,7 @@ def test_index_full(test_db, analyse):
     for postcode in range(1000):
         test_db.add_postcode('de', postcode)
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', 4)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, 4)
     idx.index_full(analyse=analyse)
 
     assert 0 == test_db.placex_unindexed()
@@ -249,13 +257,13 @@ def test_index_full(test_db, analyse):
 
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_reopen_connection(test_db, threads, monkeypatch):
+def test_index_reopen_connection(test_db, threads, monkeypatch, test_tokenizer):
     monkeypatch.setattr(indexer.WorkerPool, "REOPEN_CONNECTIONS_AFTER", 15)
 
     for _ in range(1000):
         test_db.add_place(rank_address=30, rank_search=30)
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_by_rank(28, 30)
 
     assert 0 == test_db.placex_unindexed()
diff --git a/test/python/test_tokenizer_factory.py b/test/python/test_tokenizer_factory.py
index 63c6915b1f81b371295c161718e115d32e31a433..69517e9401739c1e23ae8cdc3d892cf84f9ca2a2 100644
@@ -7,7 +7,7 @@ import pytest
 from nominatim.db import properties
 from nominatim.tokenizer import factory
 from nominatim.errors import UsageError
-import dummy_tokenizer
+from dummy_tokenizer import DummyTokenizer
 
 @pytest.fixture
 def test_config(def_config, tmp_path):
@@ -15,37 +15,27 @@ def test_config(def_config, tmp_path):
     return def_config
 
 
-@pytest.fixture
-def tokenizer_import(monkeypatch):
-    monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
-
-    def _import_dummy(module, *args, **kwargs):
-        return dummy_tokenizer
-
-    monkeypatch.setattr(importlib, "import_module", _import_dummy)
-
-
 def test_setup_dummy_tokenizer(temp_db_conn, test_config,
-                               tokenizer_import, property_table):
+                               tokenizer_mock, property_table):
     tokenizer = factory.create_tokenizer(test_config)
 
-    assert isinstance(tokenizer, dummy_tokenizer.DummyTokenizer)
+    assert isinstance(tokenizer, DummyTokenizer)
     assert tokenizer.init_state == "new"
     assert (test_config.project_dir / 'tokenizer').is_dir()
 
     assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
 
 
-def test_setup_tokenizer_dir_exists(test_config, tokenizer_import, property_table):
+def test_setup_tokenizer_dir_exists(test_config, tokenizer_mock, property_table):
     (test_config.project_dir / 'tokenizer').mkdir()
 
     tokenizer = factory.create_tokenizer(test_config)
 
-    assert isinstance(tokenizer, dummy_tokenizer.DummyTokenizer)
+    assert isinstance(tokenizer, DummyTokenizer)
     assert tokenizer.init_state == "new"
 
 
-def test_setup_tokenizer_dir_failure(test_config, tokenizer_import, property_table):
+def test_setup_tokenizer_dir_failure(test_config, tokenizer_mock, property_table):
     (test_config.project_dir / 'tokenizer').write_text("foo")
 
     with pytest.raises(UsageError):
@@ -59,16 +49,16 @@ def test_setup_bad_tokenizer_name(test_config, monkeypatch):
         factory.create_tokenizer(test_config)
 
 def test_load_tokenizer(temp_db_conn, test_config,
-                        tokenizer_import, property_table):
+                        tokenizer_mock, property_table):
     factory.create_tokenizer(test_config)
 
     tokenizer = factory.get_tokenizer_for_db(test_config)
 
-    assert isinstance(tokenizer, dummy_tokenizer.DummyTokenizer)
+    assert isinstance(tokenizer, DummyTokenizer)
     assert tokenizer.init_state == "loaded"
 
 
-def test_load_no_tokenizer_dir(test_config, tokenizer_import, property_table):
+def test_load_no_tokenizer_dir(test_config, tokenizer_mock, property_table):
     factory.create_tokenizer(test_config)
 
     test_config.project_dir = test_config.project_dir / 'foo'
@@ -77,7 +67,7 @@ def test_load_no_tokenizer_dir(test_config, tokenizer_import, property_table):
         factory.get_tokenizer_for_db(test_config)
 
 
-def test_load_missing_propoerty(temp_db_cursor, test_config, tokenizer_import, property_table):
+def test_load_missing_propoerty(temp_db_cursor, test_config, tokenizer_mock, property_table):
     factory.create_tokenizer(test_config)
 
     temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")