git.openstreetmap.org Git - nominatim.git/commitdiff
export more data for the tokenizer name preparation
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 29 Sep 2021 09:54:14 +0000 (11:54 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 29 Sep 2021 09:54:14 +0000 (11:54 +0200)
Adds class, type, country and rank to the exported information
and removes the rather odd hack for countries. Whether a place
represents a country boundary can now be computed by the tokenizer.

lib-sql/functions/placex_triggers.sql
nominatim/indexer/place_info.py
nominatim/indexer/runners.py
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/legacy_tokenizer.py
test/python/test_indexing.py
test/python/test_tokenizer_icu.py

index 9c2a67a1ae8f50039071c8bd8f24e50ed9c06ae6..8ae8cf39c0d9fb2e54649ffc821164c26e5de0e6 100644 (file)
@@ -1,30 +1,33 @@
 -- Trigger functions for the placex table.
 
+-- Information returned by update preparation.
+DROP TYPE IF EXISTS prepare_update_info CASCADE;
+CREATE TYPE prepare_update_info AS (
+  name HSTORE,
+  address HSTORE,
+  rank_address SMALLINT,
+  country_code TEXT,
+  class TEXT,
+  type TEXT,
+  linked_place_id BIGINT
+);
+
 -- Retrieve the data needed by the indexer for updating the place.
---
--- Return parameters:
---  name            list of names
---  address         list of address tags, either from the object or a surrounding
---                  building
---  country_feature If the place is a country feature, this contains the
---                  country code, otherwise it is null.
-CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
-                                                 OUT name HSTORE,
-                                                 OUT address HSTORE,
-                                                 OUT country_feature VARCHAR,
-                                                 OUT linked_place_id BIGINT)
+CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex)
+  RETURNS prepare_update_info
   AS $$
 DECLARE
   location RECORD;
+  result prepare_update_info;
 BEGIN
   -- For POI nodes, check if the address should be derived from a surrounding
   -- building.
   IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
-    address := p.address;
+    result.address := p.address;
   ELSE
     -- The additional && condition works around the misguided query
     -- planner of postgis 3.0.
-    SELECT placex.address || hstore('_inherited', '') INTO address
+    SELECT placex.address || hstore('_inherited', '') INTO result.address
       FROM placex
      WHERE ST_Covers(geometry, p.centroid)
            and geometry && p.centroid
@@ -34,27 +37,26 @@ BEGIN
      LIMIT 1;
   END IF;
 
-  address := address - '_unlisted_place'::TEXT;
-  name := p.name;
+  result.address := result.address - '_unlisted_place'::TEXT;
+  result.name := p.name;
+  result.class := p.class;
+  result.type := p.type;
+  result.country_code := p.country_code;
+  result.rank_address := p.rank_address;
 
   -- Names of linked places need to be merged in, so search for a linkable
   -- place already here.
   SELECT * INTO location FROM find_linked_place(p);
 
   IF location.place_id is not NULL THEN
-    linked_place_id := location.place_id;
+    result.linked_place_id := location.place_id;
 
     IF NOT location.name IS NULL THEN
-      name := location.name || name;
+      result.name := location.name || result.name;
     END IF;
   END IF;
 
-  country_feature := CASE WHEN p.admin_level = 2
-                               and p.class = 'boundary' and p.type = 'administrative'
-                               and p.osm_type = 'R'
-                          THEN p.country_code
-                          ELSE null
-                     END;
+  RETURN result;
 END;
 $$
 LANGUAGE plpgsql STABLE;
index fd179fef7c752fc4c53ee2d071f40fb32f9000fb..06d730e02a839c37c08a4eb73088a465d28cf76a 100644 (file)
@@ -38,7 +38,31 @@ class PlaceInfo:
 
 
     @property
-    def country_feature(self):
-        """ Return the country code if the place is a valid country boundary.
+    def country_code(self):
+        """ The country code of the country the place is in. Guaranteed
+            to be a two-letter lower-case string or None, if no country
+            could be found.
         """
-        return self._info.get('country_feature')
+        return self._info.get('country_code')
+
+
+    @property
+    def rank_address(self):
+        """ The computed rank address before rank correction.
+        """
+        return self._info.get('rank_address')
+
+
+    def is_a(self, key, value):
+        """ Check if the place's primary tag corresponds to the given
+            key and value.
+        """
+        return self._info.get('class') == key and self._info.get('type') == value
+
+
+    def is_country(self):
+        """ Check if the place is a valid country boundary.
+        """
+        return self.rank_address == 4 \
+               and self.is_a('boundary', 'administrative') \
+               and self.country_code is not None
index 43966419dbb3744da99b4c4223bb18bdf8a333ce..70536a71db8b51f2e8b3792b63a224a8a646e046 100644 (file)
@@ -39,7 +39,7 @@ class AbstractPlacexRunner:
 
     @staticmethod
     def get_place_details(worker, ids):
-        worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
+        worker.perform("""SELECT place_id, (placex_indexing_prepare(placex)).*
                           FROM placex WHERE place_id IN %s""",
                        (tuple((p[0] for p in ids)), ))
 
index 81b07568de0cd47a82f51f9220186468ce93e991..fbaa25969dec5436a159ccb5d663e72aa1fc72ad 100644 (file)
@@ -397,9 +397,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
             token_info.add_names(fulls, partials)
 
-            country_feature = place.country_feature
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self.add_country_names(place.country_code, names)
 
         address = place.address
         if address:
index 8bfb309d406f8745e5836071a1a2cf59758d36f2..dc6972dc2cc8cee28959152c2419606f94b2efba 100644 (file)
@@ -410,9 +410,8 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
         if names:
             token_info.add_names(self.conn, names)
 
-            country_feature = place.country_feature
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self.add_country_names(place.country_code, names)
 
         address = place.address
         if address:
index 60ad0bc4cbd3c5705891c60c847809ba74d7b985..4c9d940d09b1c2a0a0cddbe78383c34c9251af53 100644 (file)
@@ -29,6 +29,7 @@ class IndexerTestDB:
                                                 indexed_date TIMESTAMP,
                                                 partition SMALLINT,
                                                 admin_level SMALLINT,
+                                                country_code TEXT,
                                                 address HSTORE,
                                                 token_info JSONB,
                                                 geometry_sector INTEGER)""")
@@ -54,15 +55,26 @@ class IndexerTestDB:
                              END IF;
                              RETURN NEW;
                            END; $$ LANGUAGE plpgsql;""")
-            cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
-                                                      OUT name HSTORE,
-                                                      OUT address HSTORE,
-                                                      OUT country_feature VARCHAR,
-                                                      OUT linked_place_id BIGINT)
+            cur.execute("DROP TYPE IF EXISTS prepare_update_info CASCADE")
+            cur.execute("""CREATE TYPE prepare_update_info AS (
+                             name HSTORE,
+                             address HSTORE,
+                             rank_address SMALLINT,
+                             country_code TEXT,
+                             class TEXT,
+                             type TEXT,
+                             linked_place_id BIGINT
+                           )""")
+            cur.execute("""CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex,
+                                                     OUT result prepare_update_info)
                            AS $$
                            BEGIN
-                            address := p.address;
-                            name := p.name;
+                             result.address := p.address;
+                             result.name := p.name;
+                             result.class := p.class;
+                             result.type := p.type;
+                             result.country_code := p.country_code;
+                             result.rank_address := p.rank_address;
                            END;
                            $$ LANGUAGE plpgsql STABLE;
                         """)
index 28c6ef7abb48e0c7192cb1cf3039a14e43259229..bbfc0b120d0a98405fb77aabdc1e48266ae541a2 100644 (file)
@@ -323,10 +323,8 @@ class TestPlaceNames:
         assert eval(info['names']) == set((t[2] for t in tokens))
 
 
-    def process_named_place(self, names, country_feature=None):
+    def process_named_place(self, names):
         place = {'name': names}
-        if country_feature:
-            place['country_feature'] = country_feature
 
         return self.analyzer.process_place(PlaceInfo(place))
 
@@ -353,7 +351,13 @@ class TestPlaceNames:
 
 
     def test_country_name(self, word_table):
-        info = self.process_named_place({'name': 'Norge'}, country_feature='no')
+        place = PlaceInfo({'name' : {'name': 'Norge'},
+                           'country_code': 'no',
+                           'rank_address': 4,
+                           'class': 'boundary',
+                           'type': 'administrative'})
+
+        info = self.analyzer.process_place(place)
 
         self.expect_name_terms(info, '#norge', 'norge')
         assert word_table.get_country() == {('no', 'NORGE')}