add wrapper class for place data passed to tokenizer

author Sarah Hoffmann <lonvia@denofr.de>

Wed, 29 Sep 2021 08:37:54 +0000 (10:37 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Wed, 29 Sep 2021 09:54:07 +0000 (11:54 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 29 Sep 2021 08:37:54 +0000 (10:37 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 29 Sep 2021 09:54:07 +0000 (11:54 +0200)
diff --git a/nominatim/indexer/place_info.py b/nominatim/indexer/place_info.py

new file mode 100644 (file)

index 0000000..fd179fe
--- /dev/null
+++ b/nominatim/indexer/place_info.py
@@ -0,0 +1,44 @@
+"""
+Wrapper around place information the indexer gets from the database and hands to
+the tokenizer.
+"""
+
+import psycopg2.extras
+
+class PlaceInfo:
+    """ Data class containing all information the tokenizer gets about a
+        place it should process the names for.
+    """
+
+    def __init__(self, info):
+        self._info = info
+
+
+    def analyze(self, analyzer):
+        """ Process this place with the given tokenizer and return the
+            result in psycopg2-compatible Json.
+        """
+        return psycopg2.extras.Json(analyzer.process_place(self))
+
+
+    @property
+    def name(self):
+        """ A dictionary with the names of the place or None if the place
+            has no names.
+        """
+        return self._info.get('name')
+
+
+    @property
+    def address(self):
+        """ A dictionary with the address elements of the place
+            or None if no address information is available.
+        """
+        return self._info.get('address')
+
+
+    @property
+    def country_feature(self):
+        """ Return the country code if the place is a valid country boundary.
+        """
+        return self._info.get('country_feature')
diff --git a/nominatim/indexer/runners.py b/nominatim/indexer/runners.py

index 29261ee50ebe59aac57a260e560a6016036e5139..43966419dbb3744da99b4c4223bb18bdf8a333ce 100644 (file)
--- a/nominatim/indexer/runners.py
+++ b/nominatim/indexer/runners.py
@@ -4,14 +4,16 @@ tasks.
  """
  import functools
  
-import psycopg2.extras
  from psycopg2 import sql as pysql
  
+from nominatim.indexer.place_info import PlaceInfo
+
  # pylint: disable=C0111
  
  def _mk_valuelist(template, num):
      return pysql.SQL(',').join([pysql.SQL(template)] * num)
  
+
  class AbstractPlacexRunner:
      """ Returns SQL commands for indexing of the placex table.
      """
@@ -47,7 +49,7 @@ class AbstractPlacexRunner:
          for place in places:
              for field in ('place_id', 'name', 'address', 'linked_place_id'):
                  values.append(place[field])
-            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+            values.append(PlaceInfo(place).analyze(self.analyzer))
  
          worker.perform(self._index_sql(len(places)), values)
  
@@ -141,7 +143,7 @@ class InterpolationRunner:
          values = []
          for place in places:
              values.extend((place[x] for x in ('place_id', 'address')))
-            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+            values.append(PlaceInfo(place).analyze(self.analyzer))
  
          worker.perform(self._index_sql(len(places)), values)
  
diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py

index 00ecae447c5843eb0fc772960a421e271a82cb1d..d827f8131e5e8bb6a501f6bc9410c7aea07dd310 100644 (file)
--- a/nominatim/tokenizer/base.py
+++ b/nominatim/tokenizer/base.py
@@ -6,6 +6,7 @@ from abc import ABC, abstractmethod
  from typing import List, Tuple, Dict, Any
  
  from nominatim.config import Configuration
+from nominatim.indexer.place_info import PlaceInfo
  
  # pylint: disable=unnecessary-pass
  
@@ -105,20 +106,13 @@ class AbstractAnalyzer(ABC):
  
  
      @abstractmethod
-    def process_place(self, place: Dict) -> Any:
+    def process_place(self, place: PlaceInfo) -> Any:
          """ Extract tokens for the given place and compute the
              information to be handed to the PL/pgSQL processor for building
              the search index.
  
              Arguments:
-                place: Dictionary with the information about the place. Currently
-                       the following fields may be present:
-
-                       - *name* is a dictionary of names for the place together
-                         with the designation of the name.
-                       - *address* is a dictionary of address terms.
-                       - *country_feature* is set to a country code when the
-                         place describes a country.
+                place: Place information retrieved from the database.
  
              Returns:
                  A JSON-serialisable structure that will be handed into
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index 5768fd3596652e07fca2896a9d6a02772af8ccb5..81b07568de0cd47a82f51f9220186468ce93e991 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -390,18 +390,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          """
          token_info = _TokenInfo(self._cache)
  
-        names = place.get('name')
+        names = place.name
  
          if names:
              fulls, partials = self._compute_name_tokens(names)
  
              token_info.add_names(fulls, partials)
  
-            country_feature = place.get('country_feature')
+            country_feature = place.country_feature
              if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                  self.add_country_names(country_feature.lower(), names)
  
-        address = place.get('address')
+        address = place.address
          if address:
              self._process_place_address(token_info, address)
  
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py

index 8957426b353efa7ec17f572e754f7fe47f90022c..8bfb309d406f8745e5836071a1a2cf59758d36f2 100644 (file)
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -405,16 +405,16 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
          """
          token_info = _TokenInfo(self._cache)
  
-        names = place.get('name')
+        names = place.name
  
          if names:
              token_info.add_names(self.conn, names)
  
-            country_feature = place.get('country_feature')
+            country_feature = place.country_feature
              if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                  self.add_country_names(country_feature.lower(), names)
  
-        address = place.get('address')
+        address = place.address
          if address:
              self._process_place_address(token_info, address)
  
diff --git a/nominatim/tools/tiger_data.py b/nominatim/tools/tiger_data.py

index ff498f777e527a38adbc38a48e500f5b514bc684..19a1268253feaa7ff1e2e6de20be3c43f1d74025 100644 (file)
--- a/nominatim/tools/tiger_data.py
+++ b/nominatim/tools/tiger_data.py
@@ -7,12 +7,11 @@ import logging
  import os
  import tarfile
  
-import psycopg2.extras
-
  from nominatim.db.connection import connect
  from nominatim.db.async_connection import WorkerPool
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.errors import UsageError
+from nominatim.indexer.place_info import PlaceInfo
  
  LOG = logging.getLogger()
  
@@ -58,7 +57,7 @@ def handle_threaded_sql_statements(pool, fd, analyzer):
              address = dict(street=row['street'], postcode=row['postcode'])
              args = ('SRID=4326;' + row['geometry'],
                      int(row['from']), int(row['to']), row['interpolation'],
-                    psycopg2.extras.Json(analyzer.process_place(dict(address=address))),
+                    PlaceInfo({'address': address}).analyze(analyzer),
                      analyzer.normalize_postcode(row['postcode']))
          except ValueError:
              continue
diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py

index 69202bc322ffd88e103f60f8ced809bfa8e82fd3..db0f32cda6a1f95b1e590a8b7b1ef4be83975659 100644 (file)
--- a/test/python/dummy_tokenizer.py
+++ b/test/python/dummy_tokenizer.py
@@ -1,6 +1,7 @@
  """
  Tokenizer for testing.
  """
+from nominatim.indexer.place_info import PlaceInfo
  
  def create(dsn, data_dir):
      """ Create a new instance of the tokenizer provided by this module.
@@ -68,4 +69,5 @@ class DummyNameAnalyzer:
  
      @staticmethod
      def process_place(place):
+        assert isinstance(place, PlaceInfo)
          return {}
diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py

index ed079269561ff8ba8a19b56ecaff840ec0bd0c93..28c6ef7abb48e0c7192cb1cf3039a14e43259229 100644 (file)
--- a/test/python/test_tokenizer_icu.py
+++ b/test/python/test_tokenizer_icu.py
@@ -11,6 +11,7 @@ from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.db import properties
  from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
  
  from mock_icu_word_table import MockIcuWordTable
  
@@ -322,30 +323,37 @@ class TestPlaceNames:
          assert eval(info['names']) == set((t[2] for t in tokens))
  
  
+    def process_named_place(self, names, country_feature=None):
+        place = {'name': names}
+        if country_feature:
+            place['country_feature'] = country_feature
+
+        return self.analyzer.process_place(PlaceInfo(place))
+
+
      def test_simple_names(self):
-        info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}})
+        info = self.process_named_place({'name': 'Soft bAr', 'ref': '34'})
  
          self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34')
  
  
      @pytest.mark.parametrize('sep', [',' , ';'])
      def test_names_with_separator(self, sep):
-        info = self.analyzer.process_place({'name': {'name': sep.join(('New York', 'Big Apple'))}})
+        info = self.process_named_place({'name': sep.join(('New York', 'Big Apple'))})
  
          self.expect_name_terms(info, '#New York', '#Big Apple',
                                       'new', 'york', 'big', 'apple')
  
  
      def test_full_names_with_bracket(self):
-        info = self.analyzer.process_place({'name': {'name': 'Houseboat (left)'}})
+        info = self.process_named_place({'name': 'Houseboat (left)'})
  
          self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat',
                                       'houseboat', 'left')
  
  
      def test_country_name(self, word_table):
-        info = self.analyzer.process_place({'name': {'name': 'Norge'},
-                                           'country_feature': 'no'})
+        info = self.process_named_place({'name': 'Norge'}, country_feature='no')
  
          self.expect_name_terms(info, '#norge', 'norge')
          assert word_table.get_country() == {('no', 'NORGE')}
@@ -361,7 +369,7 @@ class TestPlaceAddress:
  
  
      def process_address(self, **kwargs):
-        return self.analyzer.process_place({'address': kwargs})
+        return self.analyzer.process_place(PlaceInfo({'address': kwargs}))
  
  
      def name_token_set(self, *expected_terms):
diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py

index 4dd3a1414d4ac0872678a118fae51ae07b6920a0..2545c2db5952e59eaccca1f26f581d713ad2a601 100644 (file)
--- a/test/python/test_tokenizer_legacy.py
+++ b/test/python/test_tokenizer_legacy.py
@@ -5,6 +5,7 @@ import shutil
  
  import pytest
  
+from nominatim.indexer.place_info import PlaceInfo
  from nominatim.tokenizer import legacy_tokenizer
  from nominatim.db import properties
  from nominatim.errors import UsageError
@@ -284,21 +285,21 @@ def test_add_more_country_names(analyzer, word_table, make_standard_name):
  
  
  def test_process_place_names(analyzer, make_keywords):
-    info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
+    info = analyzer.process_place(PlaceInfo({'name' : {'name' : 'Soft bAr', 'ref': '34'}}))
  
      assert info['names'] == '{1,2,3}'
  
  
  @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
  def test_process_place_postcode(analyzer, create_postcode_id, word_table, pcode):
-    analyzer.process_place({'address': {'postcode' : pcode}})
+    analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
  
      assert word_table.get_postcodes() == {pcode, }
  
  
  @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
  def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pcode):
-    analyzer.process_place({'address': {'postcode' : pcode}})
+    analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
  
      assert not word_table.get_postcodes()
  
@@ -319,7 +320,7 @@ class TestHousenumberName:
      @staticmethod
      @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
      def test_process_place_housenumbers_simple(analyzer, hnr):
-        info = analyzer.process_place({'address': {'housenumber' : hnr}})
+        info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : hnr}}))
  
          assert info['hnr'] == hnr
          assert info['hnr_tokens'].startswith("{")
@@ -327,15 +328,15 @@ class TestHousenumberName:
  
      @staticmethod
      def test_process_place_housenumbers_lists(analyzer):
-        info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}})
+        info = analyzer.process_place(PlaceInfo({'address': {'conscriptionnumber' : '1; 2;3'}}))
  
          assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
  
  
      @staticmethod
      def test_process_place_housenumbers_duplicates(analyzer):
-        info = analyzer.process_place({'address': {'housenumber' : '134',
+        info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : '134',
                                                     'conscriptionnumber' : '134',
-                                                   'streetnumber' : '99a'}})
+                                                   'streetnumber' : '99a'}}))
  
          assert set(info['hnr'].split(';')) == set(('134', '99a'))
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 29 Sep 2021 08:37:54 +0000 (10:37 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 29 Sep 2021 09:54:07 +0000 (11:54 +0200)
nominatim/indexer/place_info.py	[new file with mode: 0644]	patch \| blob
nominatim/indexer/runners.py		patch \| blob \| history
nominatim/tokenizer/base.py		patch \| blob \| history
nominatim/tokenizer/icu_tokenizer.py		patch \| blob \| history
nominatim/tokenizer/legacy_tokenizer.py		patch \| blob \| history
nominatim/tools/tiger_data.py		patch \| blob \| history
test/python/dummy_tokenizer.py		patch \| blob \| history
test/python/test_tokenizer_icu.py		patch \| blob \| history
test/python/test_tokenizer_legacy.py		patch \| blob \| history