git.openstreetmap.org Git - nominatim.git/commitdiff
Added --no-replace command for special phrases importation and added corresponding...
author AntoJvlt <antonin.jolivat@gmail.com>
Mon, 17 May 2021 10:40:50 +0000 (12:40 +0200)
committer AntoJvlt <antonin.jolivat@gmail.com>
Mon, 17 May 2021 11:25:06 +0000 (13:25 +0200)
nominatim/clicmd/special_phrases.py
nominatim/tokenizer/legacy_icu_tokenizer.py
nominatim/tokenizer/legacy_tokenizer.py
nominatim/tools/special_phrases/sp_importer.py
test/python/dummy_tokenizer.py
test/python/test_cli.py
test/python/test_tokenizer_legacy.py
test/python/test_tokenizer_legacy_icu.py
test/python/test_tools_import_special_phrases.py
test/python/test_tools_sp_csv_loader.py
test/python/test_tools_sp_wiki_loader.py

nominatim/clicmd/special_phrases.py
index 66fb41ed4557fedbf972d58b84139c3f450744d7..b20a41010680f020ffe6c4bfa9b63928be175c4f 100644 (file)
@@ -27,6 +27,8 @@ class ImportSpecialPhrases:
                            help='Import special phrases from the OSM wiki to the database.')
         group.add_argument('--import-from-csv', metavar='FILE',
                            help='Import special phrases from a CSV file.')
+        group.add_argument('--no-replace', action='store_true',
+                           help='Keep the old phrases and only add the new ones.')
 
     @staticmethod
     def run(args):
@@ -51,7 +53,8 @@ class ImportSpecialPhrases:
         from ..tokenizer import factory as tokenizer_factory
 
         tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+        should_replace = not args.no_replace
         with connect(args.config.get_libpq_dsn()) as db_connection:
             SPImporter(
                 args.config, args.phplib_dir, db_connection, loader
-            ).import_phrases(tokenizer)
+            ).import_phrases(tokenizer, should_replace)
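
The wiring is deliberately small: the new flag defaults to False, so a plain `nominatim special-phrases --import-from-wiki` keeps the existing replace behaviour, while adding `--no-replace` switches the run to append-only. A minimal, self-contained sketch of that pattern (argparse only; the real parser lives in ImportSpecialPhrases.add_args and carries more options):

    import argparse

    # Names mirror the diff above; everything else is simplified.
    parser = argparse.ArgumentParser(prog='nominatim special-phrases')
    parser.add_argument('--import-from-wiki', action='store_true',
                        help='Import special phrases from the OSM wiki to the database.')
    parser.add_argument('--no-replace', action='store_true',
                        help='Keep the old phrases and only add the new ones.')

    args = parser.parse_args(['--import-from-wiki', '--no-replace'])
    should_replace = not args.no_replace
    assert should_replace is False      # append-only run

    args = parser.parse_args(['--import-from-wiki'])
    assert not args.no_replace          # default run still replaces
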
nominatim/tokenizer/legacy_icu_tokenizer.py
index 065fdb03a27041eb79bb435db387fdfd316d6801..e07602d90aea7192939d0abf2c0c36240a56c2b3 100644 (file)
@@ -306,7 +306,7 @@ class LegacyICUNameAnalyzer:
             #                WHERE word_id is null and type = 'postcode'""")
 
 
-    def update_special_phrases(self, phrases):
+    def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
         """
         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
@@ -345,7 +345,7 @@ class LegacyICUNameAnalyzer:
                               columns=['word', 'word_token', 'class', 'type',
                                        'operator', 'search_name_count'])
 
-            if to_delete:
+            if to_delete and should_replace:
                 psycopg2.extras.execute_values(
                     cur,
                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
nominatim/tokenizer/legacy_tokenizer.py
index 438a5aff9ed3861995606c5d8409ff0c7ac13c35..5bd45c51284f211ffc78b4fa4f25a5e169a19d2e 100644 (file)
@@ -314,7 +314,7 @@ class LegacyNameAnalyzer:
                                  FROM location_postcode) x""")
 
 
-    def update_special_phrases(self, phrases):
+    def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
         """
         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
@@ -343,7 +343,7 @@ class LegacyNameAnalyzer:
                            FROM (VALUES %s) as v(name, class, type, op))""",
                     to_add)
 
-            if to_delete:
+            if to_delete and should_replace:
                 psycopg2.extras.execute_values(
                     cur,
                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
nominatim/tools/special_phrases/sp_importer.py
index e82f721e6aca52e32ff0f8d1f8574a6fb8c07d0a..ef4e85bdf1d62161a20de21547e536711c326c2f 100644 (file)
@@ -23,7 +23,7 @@ LOG = logging.getLogger()
 class SPImporter():
     # pylint: disable-msg=too-many-instance-attributes
     """
-        Class handling the process of special phrases importations into the database.
+        Class handling the process of special phrases importation into the database.
 
         Take a sp loader which load the phrases from an external source.
     """
@@ -42,10 +42,14 @@ class SPImporter():
         #special phrases class/type on the wiki.
         self.table_phrases_to_delete = set()
 
-    def import_phrases(self, tokenizer):
+    def import_phrases(self, tokenizer, should_replace):
         """
-            Iterate through all specified languages and
-            extract corresponding special phrases from the wiki.
+            Iterate through all SpecialPhrases extracted from the
+            loader and import them into the database.
+
+            If should_replace is set to True, only the loaded phrases
+            will be kept in the database. All other phrases already
+            in the database will be removed.
         """
         LOG.warning('Special phrases importation starting')
         self._fetch_existing_place_classtype_tables()
@@ -60,11 +64,12 @@ class SPImporter():
                     class_type_pairs.update(result)
 
         self._create_place_classtype_table_and_indexes(class_type_pairs)
-        self._remove_non_existent_tables_from_db()
+        if should_replace:
+            self._remove_non_existent_tables_from_db()
         self.db_connection.commit()
 
         with tokenizer.name_analyzer() as analyzer:
-            analyzer.update_special_phrases(self.word_phrases)
+            analyzer.update_special_phrases(self.word_phrases, should_replace)
 
         LOG.warning('Import done.')
         self.statistics_handler.notify_import_done()
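
At the importer level the flag therefore gates two things: the removal of place_classtype tables that no longer match any loaded phrase, and the should_replace argument handed down to the tokenizer's analyzer. A standalone sketch of the table-cleanup half (table names borrowed from the tests further down; an illustration only, not the project API):

    def surviving_tables(loaded, existing, should_replace):
        """Which place_classtype tables remain after an import run."""
        result = set(existing) | set(loaded)
        if should_replace:
            # _remove_non_existent_tables_from_db(): drop tables with no
            # matching phrase in the loaded set.
            result -= set(existing) - set(loaded)
        return result

    existing = {'place_classtype_wrongclass_wrongtype'}
    loaded = {'place_classtype_amenity_animal_shelter'}

    # A default (replace) run drops the stale table ...
    assert surviving_tables(loaded, existing, True) == loaded
    # ... while a --no-replace run keeps it alongside the new one.
    assert surviving_tables(loaded, existing, False) == loaded | existing
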
test/python/dummy_tokenizer.py
index 6352a644c9988d52ea187634c10519642caa7cc6..2e61a24524c992e8cc41161fe7fffc5d6e11d6a4 100644 (file)
@@ -54,7 +54,7 @@ class DummyNameAnalyzer:
     def add_postcodes_from_db(self):
         pass
 
-    def update_special_phrases(self, phrases):
+    def update_special_phrases(self, phrases, should_replace):
         self.analyser_cache['special_phrases'] = phrases
 
     def add_country_names(self, code, names):
test/python/test_cli.py
index 9810aee756b63d9b4d0158673a4e20b2c3c5db08..1d89ec699e38de8db1902ae2ecf9ef03a90745e4 100644 (file)
@@ -255,18 +255,27 @@ def test_index_command(mock_func_factory, temp_db_cursor, tokenizer_mock,
     assert bnd_mock.called == do_bnds
     assert rank_mock.called == do_ranks
 
-def test_special_phrases_wiki_command(temp_db, mock_func_factory, tokenizer_mock):
+@pytest.mark.parametrize("no_replace", [(True), (False)])
+def test_special_phrases_wiki_command(temp_db, mock_func_factory, tokenizer_mock, no_replace):
     func = mock_func_factory(nominatim.clicmd.special_phrases.SPImporter, 'import_phrases')
 
-    call_nominatim('special-phrases', '--import-from-wiki')
+    if no_replace:
+        call_nominatim('special-phrases', '--import-from-wiki', '--no-replace')
+    else:
+        call_nominatim('special-phrases', '--import-from-wiki')
 
     assert func.called == 1
 
-def test_special_phrases_csv_command(temp_db, mock_func_factory, tokenizer_mock):
+@pytest.mark.parametrize("no_replace", [(True), (False)])
+def test_special_phrases_csv_command(temp_db, mock_func_factory, tokenizer_mock, no_replace):
     func = mock_func_factory(nominatim.clicmd.special_phrases.SPImporter, 'import_phrases')
-    testdata = Path('__file__') / '..' / '..' / 'testdb'
+    testdata = SRC_DIR / 'test' / 'testdb'
     csv_path = str((testdata / 'full_en_phrases_test.csv').resolve())
-    call_nominatim('special-phrases', '--import-from-csv', csv_path)
+
+    if no_replace:
+        call_nominatim('special-phrases', '--import-from-csv', csv_path, '--no-replace')
+    else:
+        call_nominatim('special-phrases', '--import-from-csv', csv_path)
 
     assert func.called == 1
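
A small aside on the two tests above: the parentheses in the parametrize list are redundant ([True, False] is enough), and the if/else can be folded into the argument list. A hypothetical, behaviour-identical phrasing of the wiki test, using the same fixtures:

    @pytest.mark.parametrize("no_replace", [True, False])
    def test_special_phrases_wiki_command(temp_db, mock_func_factory,
                                          tokenizer_mock, no_replace):
        func = mock_func_factory(nominatim.clicmd.special_phrases.SPImporter,
                                 'import_phrases')

        args = ['special-phrases', '--import-from-wiki']
        if no_replace:
            args.append('--no-replace')
        call_nominatim(*args)

        assert func.called == 1
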
 
test/python/test_tokenizer_legacy.py
index c567a4c1d331b5ed9c62ddb568b86f4f981a266d..801471723c4b72b5be4c4f6938c188b059639408 100644 (file)
@@ -209,7 +209,7 @@ def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor,
         ("König bei", "amenity", "royal", "near"),
         ("Könige", "amenity", "royal", "-"),
         ("strasse", "highway", "primary", "in")
-    ])
+    ], True)
 
     assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
                                      FROM word WHERE class != 'place'""") \
@@ -226,11 +226,24 @@ def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor,
 
     assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
 
-    analyzer.update_special_phrases([])
+    analyzer.update_special_phrases([], True)
 
     assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
 
 
+def test_update_special_phrases_no_replace(analyzer, word_table, temp_db_cursor,
+                                          make_standard_name):
+    temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
+                              VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
+                                     (' bar', 'bar', 'highway', 'road', null)""")
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+    analyzer.update_special_phrases([], False)
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+
 def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor,
                                       make_standard_name):
     temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
@@ -243,7 +256,7 @@ def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor,
       ('prison', 'amenity', 'prison', 'in'),
       ('bar', 'highway', 'road', '-'),
       ('garden', 'leisure', 'garden', 'near')
-    ])
+    ], True)
 
     assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
                                      FROM word WHERE class != 'place'""") \
test/python/test_tokenizer_legacy_icu.py
index 836f15b9e8809d50b2edf60c58cf94b710efa522..92a83249034f1d89435d90e3b8cacbc30f3efaf1 100644 (file)
@@ -159,7 +159,7 @@ def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor)
             ("König bei", "amenity", "royal", "near"),
             ("Könige", "amenity", "royal", "-"),
             ("street", "highway", "primary", "in")
-        ])
+        ], True)
 
     assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
                                      FROM word WHERE class != 'place'""") \
@@ -176,11 +176,24 @@ def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor):
     assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
 
     with analyzer() as a:
-        a.update_special_phrases([])
+        a.update_special_phrases([], True)
 
     assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
 
 
+def test_update_special_phrases_no_replace(analyzer, word_table, temp_db_cursor):
+    temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
+                              VALUES (' FOO', 'foo', 'amenity', 'prison', 'in'),
+                                     (' BAR', 'bar', 'highway', 'road', null)""")
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+    with analyzer() as a:
+        a.update_special_phrases([], False)
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+
 def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor):
     temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
                               VALUES (' FOO', 'foo', 'amenity', 'prison', 'in'),
@@ -193,7 +206,7 @@ def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor):
           ('prison', 'amenity', 'prison', 'in'),
           ('bar', 'highway', 'road', '-'),
           ('garden', 'leisure', 'garden', 'near')
-        ])
+        ], True)
 
     assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
                                      FROM word WHERE class != 'place'""") \
test/python/test_tools_import_special_phrases.py
index 6f451ade72106853442e9b65c80d6613399fef2f..1b4ab19155fd20b6e84a54c883647e5de87a542b 100644 (file)
@@ -185,8 +185,9 @@ def test_remove_non_existent_tables_from_db(sp_importer, default_phrases,
             tables_result[0][0] == 'place_classtype_testclasstypetable_to_keep'
         )
 
+@pytest.mark.parametrize("should_replace", [(True), (False)])
 def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
-                          placex_table, tokenizer_mock):
+                        placex_table, tokenizer_mock, should_replace):
     """
         Check that the main import_phrases() method is well executed.
         It should create the place_classtype table, the place_id and centroid indexes,
@@ -202,10 +203,10 @@ def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
             CREATE TABLE place_classtype_wrongclass_wrongtype();""")
     
     monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader.SPWikiLoader._get_wiki_content',
-                    mock_get_wiki_content)
+                        mock_get_wiki_content)
 
     tokenizer = tokenizer_mock()
-    sp_importer.import_phrases(tokenizer)
+    sp_importer.import_phrases(tokenizer, should_replace)
 
     assert len(tokenizer.analyser_cache['special_phrases']) == 18
 
@@ -216,7 +217,8 @@ def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
     assert check_placeid_and_centroid_indexes(temp_db_conn, class_test, type_test)
     assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test)
     assert check_table_exist(temp_db_conn, 'amenity', 'animal_shelter')
-    assert not check_table_exist(temp_db_conn, 'wrong_class', 'wrong_type')
+    if should_replace:
+        assert not check_table_exist(temp_db_conn, 'wrong_class', 'wrong_type')
 
     #Format (query, should_return_something_bool) use to easily execute all asserts
     queries_tests = set()
@@ -237,7 +239,8 @@ def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
         WHERE table_schema='public'
         AND table_name = 'place_classtype_wrongclass_wrongtype';
     """
-    queries_tests.add((query_wrong_table, False))
+    if should_replace:
+        queries_tests.add((query_wrong_table, False))
 
     with temp_db_conn.cursor() as temp_db_cursor:
         for query in queries_tests:
@@ -247,7 +250,7 @@ def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
             else:
                 assert not temp_db_cursor.fetchone()
 
-def mock_get_wiki_content(lang):
+def mock_get_wiki_content(self, lang):
     """
         Mock the _get_wiki_content() method to return
         static xml test file content.
test/python/test_tools_sp_csv_loader.py
index 628dfd16932453b180bc5b0f115ed075a57d0160..ccc9996d64c581c33c3d5107fcffe2ed80be554d 100644 (file)
@@ -16,7 +16,6 @@ def test_parse_csv(sp_csv_loader):
     phrases = sp_csv_loader.parse_csv()
     assert check_phrases_content(phrases)
 
-
 def test_next(sp_csv_loader):
     """
         Test objects returned from the next() method.
test/python/test_tools_sp_wiki_loader.py
index ad8e22c30a4b9b171fa35e5a21923e680add39c4..b4f6d529c606cad62fdb9e56c7d73dd930dd6ea6 100644 (file)
@@ -47,7 +47,7 @@ def sp_wiki_loader(monkeypatch, def_config):
                         mock_get_wiki_content)
     return loader
 
-def mock_get_wiki_content(lang):
+def mock_get_wiki_content(self, lang):
     """
         Mock the _get_wiki_content() method to return
         static xml test file content.
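
Finally, the extra self parameter in both _get_wiki_content mocks is not cosmetic: monkeypatch.setattr() installs the plain function as an attribute of the SPWikiLoader class, so Python passes the loader instance as the first positional argument whenever the method is called. A self-contained illustration of that mechanism:

    class SPWikiLoader:                       # stand-in for the real loader
        def _get_wiki_content(self, lang):
            raise RuntimeError('would fetch the live wiki')

    def mock_get_wiki_content(self, lang):    # `self` receives the instance
        return '<xml>static test content for %s</xml>' % lang

    # Equivalent to monkeypatch.setattr(SPWikiLoader, '_get_wiki_content', ...)
    SPWikiLoader._get_wiki_content = mock_get_wiki_content

    loader = SPWikiLoader()
    assert loader._get_wiki_content('en') == '<xml>static test content for en</xml>'
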