git.openstreetmap.org Git - nominatim.git/commitdiff
Merge pull request #2472 from lonvia/word-count-computation
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 19 Oct 2021 12:58:57 +0000 (14:58 +0200)
committer GitHub <noreply@github.com>
Tue, 19 Oct 2021 12:58:57 +0000 (14:58 +0200)
Fix word count computation for ICU tokenizer

lib-sql/words_from_search_name.sql [deleted file]
nominatim/clicmd/refresh.py
nominatim/tokenizer/base.py
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/legacy_tokenizer.py
nominatim/tools/refresh.py
test/python/test_cli.py
test/python/test_tokenizer_icu.py

diff --git a/lib-sql/words_from_search_name.sql b/lib-sql/words_from_search_name.sql
deleted file mode 100644 (file)
index b7727dc..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-DROP TABLE IF EXISTS word_frequencies;
-CREATE TABLE word_frequencies AS
- SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;
-
-CREATE INDEX idx_word_frequencies ON word_frequencies(id);
-
-UPDATE word SET search_name_count = count
-  FROM word_frequencies
- WHERE word_token like ' %' and word_id = id;
-
-DROP TABLE word_frequencies;
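
The script removed above recomputed search_name_count for full-word tokens
(those whose word_token starts with a blank) straight from the search_name
table. For reference, a minimal Python sketch that issues the same
statements; psycopg2 and the dsn parameter are assumptions, the SQL is taken
from the deleted file:

import psycopg2

def recompute_full_word_counts(dsn):
    # One-shot port of the retired SQL script (illustration only).
    with psycopg2.connect(dsn) as conn:
        with conn.cursor() as cur:
            # Aggregate token frequencies from the search index ...
            cur.execute("""CREATE TEMP TABLE word_frequencies AS
                             SELECT unnest(name_vector) AS id, count(*)
                             FROM search_name GROUP BY id""")
            cur.execute("CREATE INDEX ON word_frequencies(id)")
            # ... and fold them back into the word table.
            cur.execute("""UPDATE word SET search_name_count = count
                           FROM word_frequencies
                           WHERE word_token LIKE ' %' AND word_id = id""")
            cur.execute("DROP TABLE word_frequencies")
        conn.commit()
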
diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py
index aa540f6b253e7fada9500767ba01a3d6383093e3..e7d7d7ba1d33cc032b3c0260a5ffc3d05f772344 100644 (file)
@@ -71,8 +71,8 @@ class UpdateRefresh:
                           "Postcode updates on a frozen database is not possible.")
 
         if args.word_counts:
-            LOG.warning('Recompute frequency of full-word search terms')
-            refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
+            LOG.warning('Recompute word statistics')
+            self._get_tokenizer(args.config).update_statistics()
 
         if args.address_levels:
             cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
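
The --word-counts code path now delegates to whichever tokenizer the
database was imported with instead of running one fixed SQL file, which lets
every tokenizer keep its statistics in its own schema. _get_tokenizer itself
is not part of this diff; the sketch below is an assumed implementation,
modelled on the factory function that the tests further down monkeypatch:

from nominatim.tokenizer import factory as tokenizer_factory

class UpdateRefresh:
    # Sketch: only the tokenizer-loading part of the command is shown.
    def __init__(self):
        self.tokenizer = None

    def _get_tokenizer(self, config):
        if self.tokenizer is None:
            # Load the tokenizer that was used to import the database.
            self.tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
        return self.tokenizer
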
diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py
index 02bc312f18dc5e0bbe65fcacfc7e3564f805d441..94fac1fc4e1cdcab553af27f6fe1a94b21a9f95f 100644 (file)
@@ -205,6 +205,16 @@ class AbstractTokenizer(ABC):
         pass
 
 
+    @abstractmethod
+    def update_statistics(self) -> None:
+        """ Recompute any tokenizer statistics necessary for efficient lookup.
+            This function is meant to be called from time to time by the user
+            to improve performance. However, the tokenizer must not depend on
+            it to be called in order to work.
+        """
+        pass
+
+
     @abstractmethod
     def name_analyzer(self) -> AbstractAnalyzer:
         """ Create a new analyzer for tokenizing names and queries
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 12d1eccd15f1799b6b45af4df6b0b39ec6a93674..2af0bcb257ad214f3e67621a7ac1aaa83b7092d1 100644 (file)
@@ -2,7 +2,6 @@
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
-from collections import Counter
 import itertools
 import json
 import logging
@@ -93,6 +92,25 @@ class LegacyICUTokenizer(AbstractTokenizer):
         return None
 
 
+    def update_statistics(self):
+        """ Recompute frequencies for all name words.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table("word_frequencies")
+                LOG.info("Computing word frequencies")
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute("CREATE INDEX ON word_frequencies(id)")
+                LOG.info("Update word table with recomputed frequencies")
+                cur.execute("""UPDATE word
+                               SET info = info || jsonb_build_object('count', count)
+                               FROM word_frequencies WHERE word_id = id""")
+                cur.drop_table("word_frequencies")
+            conn.commit()
+
+
     def name_analyzer(self):
         """ Create a new analyzer for tokenizing names and queries
             using this tokenizer. Analyzers are context managers and should
@@ -142,43 +160,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
             conn.commit()
 
-            LOG.warning("Precomputing word tokens")
-
-            # get partial words and their frequencies
-            words = self._count_partial_terms(conn)
-
-            # copy them back into the word table
-            with CopyBuffer() as copystr:
-                for term, cnt in words.items():
-                    copystr.add('w', term, json.dumps({'count': cnt}))
-
-                with conn.cursor() as cur:
-                    copystr.copy_out(cur, 'word',
-                                     columns=['type', 'word_token', 'info'])
-                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                                   WHERE word_id is null and type = 'w'""")
-
-            conn.commit()
-
-    def _count_partial_terms(self, conn):
-        """ Count the partial terms from the names in the place table.
-        """
-        words = Counter()
-        analysis = self.loader.make_token_analysis()
-
-        with conn.cursor(name="words") as cur:
-            cur.execute(""" SELECT v, count(*) FROM
-                              (SELECT svals(name) as v FROM place)x
-                            WHERE length(v) < 75 GROUP BY v""")
-
-            for name, cnt in cur:
-                word = analysis.search.transliterate(name)
-                if word and ' ' in word:
-                    for term in set(word.split()):
-                        words[term] += cnt
-
-        return words
-
 
 class LegacyICUNameAnalyzer(AbstractAnalyzer):
     """ The legacy analyzer uses the ICU library for splitting names.
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index c935f20d4a9836e0f1c97ab74a5ce93a98b99ba1..d901a68d2e53f77e5c96210c11ede863e7e5e36f 100644 (file)
@@ -186,6 +186,24 @@ class LegacyTokenizer(AbstractTokenizer):
             self._save_config(conn, config)
 
 
+    def update_statistics(self):
+        """ Recompute the frequency of full words.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table("word_frequencies")
+                LOG.info("Computing word frequencies")
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute("CREATE INDEX ON word_frequencies(id)")
+                LOG.info("Update word table with recomputed frequencies")
+                cur.execute("""UPDATE word SET search_name_count = count
+                               FROM word_frequencies
+                               WHERE word_token like ' %' and word_id = id""")
+                cur.drop_table("word_frequencies")
+            conn.commit()
+
     def name_analyzer(self):
         """ Create a new analyzer for tokenizing names and queries
             using this tokenizer. Analyzers are context managers and should
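
Both tokenizers now share the same two-step pattern: build a temporary
frequency table from search_name, then fold it into the word table. They
differ only in the final UPDATE, since the legacy schema has a dedicated
search_name_count column and restricts the update to full words (word_token
starting with a blank), while the ICU schema merges the count into the jsonb
info column. A sketch of that common skeleton with the backend-specific
statement passed in; the factoring is illustrative, not part of the commit:

import psycopg2

def update_word_counts(dsn, fold_sql):
    # fold_sql is one of the two UPDATE statements from the hunks above.
    with psycopg2.connect(dsn) as conn:
        with conn.cursor() as cur:
            cur.execute("""CREATE TEMP TABLE word_frequencies AS
                             SELECT unnest(name_vector) AS id, count(*)
                             FROM search_name GROUP BY id""")
            cur.execute("CREATE INDEX ON word_frequencies(id)")
            cur.execute(fold_sql)
            cur.execute("DROP TABLE word_frequencies")
        conn.commit()
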
diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py
index 5aaee0c8d1d8417a5a88c4b7a317a2d2f37c4467..00ae5dc95bb4c3c5fa0205e74ab49abf61b51e13 100644 (file)
@@ -14,12 +14,6 @@ from nominatim.version import NOMINATIM_VERSION
 LOG = logging.getLogger()
 
 
-def recompute_word_counts(dsn, sql_dir):
-    """ Compute the frequency of full-word search terms.
-    """
-    execute_file(dsn, sql_dir / 'words_from_search_name.sql')
-
-
 def _add_address_level_rows_from_entry(rows, entry):
     """ Converts a single entry from the JSON format for address rank
         descriptions into a flat format suitable for inserting into a
diff --git a/test/python/test_cli.py b/test/python/test_cli.py
index c83ee3ddfba64e5fc880886275ed866cc2259e38..7e6bf99e4286c3c6430131a5e8a9ae81a53a80aa 100644 (file)
@@ -144,6 +144,7 @@ class TestCliWithDb:
             def __init__(self, *args, **kwargs):
                 self.update_sql_functions_called = False
                 self.finalize_import_called = False
+                self.update_statistics_called = False
 
             def update_sql_functions(self, *args):
                 self.update_sql_functions_called = True
@@ -151,6 +152,10 @@ class TestCliWithDb:
             def finalize_import(self, *args):
                 self.finalize_import_called = True
 
+            def update_statistics(self):
+                self.update_statistics_called = True
+
+
         tok = DummyTokenizer()
         monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
                             lambda *args: tok)
@@ -316,7 +321,6 @@ class TestCliWithDb:
         assert func.called == 1
 
     @pytest.mark.parametrize("command,func", [
-                             ('word-counts', 'recompute_word_counts'),
                              ('address-levels', 'load_address_levels_from_file'),
                              ('wiki-data', 'import_wikipedia_articles'),
                              ('importance', 'recompute_importance'),
@@ -329,6 +333,11 @@ class TestCliWithDb:
         assert func_mock.called == 1
 
 
+    def test_refresh_word_count(self):
+        assert self.call_nominatim('refresh', '--word-count') == 0
+        assert self.tokenizer_mock.update_statistics_called
+
+
     def test_refresh_postcodes(self, mock_func_factory, place_table):
         func_mock = mock_func_factory(nominatim.tools.postcodes, 'update_postcodes')
         idx_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_postcodes')
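
The CLI test swaps the tokenizer factory for a stub and then checks that the
refresh command actually reached update_statistics(). A self-contained
distillation of that pattern; run_refresh stands in for the real CLI
dispatch and is an assumption:

class DummyTokenizer:
    def __init__(self):
        self.update_statistics_called = False

    def update_statistics(self):
        self.update_statistics_called = True

def run_refresh(tokenizer, word_counts):
    # Stand-in for the dispatch in nominatim/clicmd/refresh.py above.
    if word_counts:
        tokenizer.update_statistics()

def test_refresh_word_count():
    tok = DummyTokenizer()
    run_refresh(tok, word_counts=True)
    assert tok.update_statistics_called
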
diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py
index 6a2f2f8bd04b405f7741aca6fbaa27cbe2ce113c..4d029dec6b1725553faf32d075a01f3f00010d85 100644 (file)
@@ -160,7 +160,7 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
     assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
 
 
-def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
+def test_init_word_table(tokenizer_factory, test_config, place_row, temp_db_cursor):
     place_row(names={'name' : 'Test Area', 'ref' : '52'})
     place_row(names={'name' : 'No Area'})
     place_row(names={'name' : 'Holzstrasse'})
@@ -168,8 +168,7 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
 
-    assert word_table.get_partial_words() == {('test', 1),
-                                              ('no', 1), ('area', 2)}
+    assert temp_db_cursor.table_exists('word')
 
 
 def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
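
Since init_new_db() no longer precomputes partial words, the word table
starts out without counts and test_init_word_table can only assert that the
table exists. How update_statistics() might be exercised end to end is
sketched below; the fixture names follow this file's conventions, but the
INSERT statements make assumptions about the word and search_name layout and
the test is an illustration, not part of the commit:

def test_update_statistics(tokenizer_factory, test_config, temp_db_cursor):
    tok = tokenizer_factory()
    tok.init_new_db(test_config)

    # One full word referenced once from the search index (schema assumed;
    # info starts as '{}' so that the jsonb merge has something to extend).
    temp_db_cursor.execute("""INSERT INTO word(word_id, word_token, type, info)
                              VALUES (12, 'teststreet', 'W', '{}'::jsonb)""")
    temp_db_cursor.execute("""INSERT INTO search_name(place_id, name_vector)
                              VALUES (1, ARRAY[12])""")

    tok.update_statistics()

    temp_db_cursor.execute(
        "SELECT info->>'count' FROM word WHERE word_id = 12")
    assert temp_db_cursor.fetchone()[0] == '1'
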