git.openstreetmap.org Git - nominatim.git/commitdiff
introduce name analyzer
author Sarah Hoffmann <lonvia@denofr.de>
Sat, 24 Apr 2021 20:35:46 +0000 (22:35 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Fri, 30 Apr 2021 09:30:51 +0000 (11:30 +0200)
The name analyzer is the actual workhorse of the tokenizer. It
is instantiated once per thread and provides all functions for
analysing names and queries.
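
As an illustration of the intended use, here is a minimal, hypothetical sketch of an indexer worker; the `store()` helper and the place dicts are assumptions, while the analyzer API (`name_analyzer()`, `process_place()`) is the one introduced below:

```python
# Hypothetical worker: one analyzer per thread, used as a context manager
# so its database connection is released when the work is done.
def index_worker(tokenizer, places):
    with tokenizer.name_analyzer() as analyzer:
        for place in places:
            # process_place() returns a JSON-serialisable structure that
            # ends up in the placex.token_info column.
            token_info = analyzer.process_place(place)
            store(place['place_id'], token_info)  # store() is assumed
```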

nominatim/indexer/indexer.py
nominatim/indexer/runners.py
nominatim/tokenizer/legacy_tokenizer.py
test/python/dummy_tokenizer.py
test/python/test_indexing.py

index 3a39a151dd4f1f1c845ee15ae1c1591a75cd0809..2dd8220b1da16723b14a00b3b77301e1c8060815 100644 (file)
@@ -124,8 +124,9 @@ class Indexer:
         LOG.warning("Starting indexing boundaries using %s threads",
                     self.num_threads)
 
-        for rank in range(max(minrank, 4), min(maxrank, 26)):
-            self._index(runners.BoundaryRunner(rank))
+        with self.tokenizer.name_analyzer() as analyzer:
+            for rank in range(max(minrank, 4), min(maxrank, 26)):
+                self._index(runners.BoundaryRunner(rank, analyzer))
 
     def index_by_rank(self, minrank, maxrank):
         """ Index all entries of placex in the given rank range (inclusive)
@@ -138,15 +139,16 @@ class Indexer:
         LOG.warning("Starting indexing rank (%i to %i) using %i threads",
                     minrank, maxrank, self.num_threads)
 
-        for rank in range(max(1, minrank), maxrank):
-            self._index(runners.RankRunner(rank))
+        with self.tokenizer.name_analyzer() as analyzer:
+            for rank in range(max(1, minrank), maxrank):
+                self._index(runners.RankRunner(rank, analyzer))
 
-        if maxrank == 30:
-            self._index(runners.RankRunner(0))
-            self._index(runners.InterpolationRunner(), 20)
-            self._index(runners.RankRunner(30), 20)
-        else:
-            self._index(runners.RankRunner(maxrank))
+            if maxrank == 30:
+                self._index(runners.RankRunner(0, analyzer))
+                self._index(runners.InterpolationRunner(), 20)
+                self._index(runners.RankRunner(30, analyzer), 20)
+            else:
+                self._index(runners.RankRunner(maxrank, analyzer))
 
 
     def index_postcodes(self):
index dd6ced3881f1bdf25f0df5fa0df96e5d419b2077..2bf9e51632a3ba84940755d005481aab9a9eed0b 100644 (file)
@@ -2,6 +2,10 @@
 Mix-ins that provide the actual commands for the indexer for various indexing
 tasks.
 """
+import functools
+
+import psycopg2.extras
+
 # pylint: disable=C0111
 
 class AbstractPlacexRunner:
@@ -9,28 +13,26 @@ class AbstractPlacexRunner:
     """
     SELECT_SQL = 'SELECT place_id, (placex_prepare_update(placex)).* FROM placex'
 
-    def __init__(self, rank):
+    def __init__(self, rank, analyzer):
         self.rank = rank
-        self._sql_terms = 0
-        self._cached_index_sql = None
+        self.analyzer = analyzer
 
-    def _index_sql(self, num_places):
-        if num_places != self._sql_terms:
-            self._cached_index_sql = \
-                """ UPDATE placex
-                    SET indexed_status = 0, address = v.addr
-                    FROM (VALUES {}) as v(id, addr)
-                    WHERE place_id = v.id
-                """.format(','.join(["(%s, %s::hstore)"]  * num_places))
-            self._sql_terms = num_places
 
-        return self._cached_index_sql
+    @staticmethod
+    @functools.lru_cache(maxsize=1)
+    def _index_sql(num_places):
+        return """ UPDATE placex
+                   SET indexed_status = 0, address = v.addr, token_info = v.ti
+                   FROM (VALUES {}) as v(id, addr, ti)
+                   WHERE place_id = v.id
+               """.format(','.join(["(%s, %s::hstore, %s::json)"]  * num_places))
 
 
     def index_places(self, worker, places):
         values = []
         for place in places:
             values.extend((place[x] for x in ('place_id', 'address')))
+            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
 
         worker.perform(self._index_sql(len(places)), values)
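
As a side note, a minimal sketch (not part of the commit) of why `functools.lru_cache(maxsize=1)` on the statement builder is enough to replace the previous hand-rolled caching: the indexer hands over batches of a fixed size, so consecutive calls see the same `num_places` and reuse the cached statement; only the final, smaller batch causes one extra `format()` call.

```python
import functools

@functools.lru_cache(maxsize=1)
def _index_sql(num_places):
    # Simplified stand-in for AbstractPlacexRunner._index_sql() above.
    return 'UPDATE ... FROM (VALUES {}) ...'.format(
        ','.join(['(%s, %s::hstore, %s::json)'] * num_places))

first = _index_sql(1000)
assert _index_sql(1000) is first   # same batch size: cached string reused
_index_sql(137)                    # final partial batch evicts the cache entry
```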
 
index d0a404b937cc889c45e61c9eac3ea421eb7b7b37..b0cbe9c342bc9e1d8dbd82ccb3e8614f0a1aa379 100644 (file)
@@ -5,6 +5,7 @@ import logging
 import shutil
 
 import psycopg2
+import psycopg2.extras
 
 from nominatim.db.connection import connect
 from nominatim.db import properties
@@ -140,6 +141,24 @@ class LegacyTokenizer:
             self._save_config(conn, config)
 
 
+    def name_analyzer(self):
+        """ Create a new analyzer for tokenizing names and queries
+            using this tokenizer. Analyzers are context managers and should
+            be used accordingly:
+
+            ```
+            with tokenizer.name_analyzer() as analyzer:
+                analyzer.tokenize()
+            ```
+
+            When used outside the 'with' construct, the caller must make sure
+            to call the close() function before the analyzer is destroyed.
+
+            Analyzers are not thread-safe. You need to instantiate one per thread.
+        """
+        return LegacyNameAnalyzer(self.dsn)
+
+
     def _init_db_tables(self, config):
         """ Set up the word table and fill it with pre-computed word
             frequencies.
@@ -159,3 +178,42 @@ class LegacyTokenizer:
         """
         properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
         properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
+
+
+
+class LegacyNameAnalyzer:
+    """ The legacy analyzer uses the special Postgresql module for
+        splitting names.
+
+        Each instance opens a connection to the database to request the
+        normalization.
+    """
+
+    def __init__(self, dsn):
+        self.conn = connect(dsn).connection
+        self.conn.autocommit = True
+        psycopg2.extras.register_hstore(self.conn)
+
+
+    def __enter__(self):
+        return self
+
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+
+    def close(self):
+        """ Free all resources used by the analyzer.
+        """
+        if self.conn:
+            self.conn.close()
+            self.conn = None
+
+    def process_place(self, place):
+        """ Determine tokenizer information about the given place.
+
+            Returns a JSON-serialisable structure that will be handed into
+            the database via the token_info field.
+        """
+        return {}
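
Following the name_analyzer() docstring, a short sketch of using the analyzer without a `with` block, where close() must be called explicitly; the DSN and the place dict are placeholders:

```python
from nominatim.tokenizer.legacy_tokenizer import LegacyNameAnalyzer

analyzer = LegacyNameAnalyzer('dbname=nominatim')  # placeholder DSN
try:
    info = analyzer.process_place({'name': {'name': 'Main Street'}})
finally:
    analyzer.close()  # required when not used as a context manager
```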
index 47cc580ce941531b6e36442c2cf816ff214d4fa2..013016c8d0ba7c2e84bddab61bfb3428944a205a 100644 (file)
@@ -23,3 +23,30 @@ class DummyTokenizer:
     def init_from_project(self):
         assert self.init_state == None
         self.init_state = "loaded"
+
+
+    def name_analyzer(self):
+        return DummyNameAnalyzer()
+
+
+class DummyNameAnalyzer:
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+
+    def close(self):
+        """ Free all resources used by the analyzer.
+        """
+        pass
+
+    def process_place(self, place):
+        """ Determine tokenizer information about the given place.
+
+            Returns a JSON-serialisable structure that will be handed into
+            the database via the token_info field.
+        """
+        return {}
index fdd50a42c2e59489d0194e3ba83e8fb0f4436e44..d68769064af9d6ff626ac1e78b9bcd684b9c6c51 100644 (file)
@@ -29,6 +29,7 @@ class IndexerTestDB:
                                                 partition SMALLINT,
                                                 admin_level SMALLINT,
                                                 address HSTORE,
+                                                token_info JSONB,
                                                 geometry_sector INTEGER)""")
             cur.execute("""CREATE TABLE location_property_osmline (
                                place_id BIGINT,