git.openstreetmap.org Git - nominatim.git/commitdiff
move postcode normalization into tokenizer
author: Sarah Hoffmann <lonvia@denofr.de>
Sun, 25 Apr 2021 16:26:36 +0000 (18:26 +0200)
committer: Sarah Hoffmann <lonvia@denofr.de>
Fri, 30 Apr 2021 09:30:51 +0000 (11:30 +0200)
12 files changed:
lib-sql/functions/interpolation.sql
lib-sql/functions/placex_triggers.sql
lib-sql/tokenizer/legacy_tokenizer.sql
nominatim/clicmd/setup.py
nominatim/indexer/indexer.py
nominatim/indexer/runners.py
nominatim/tokenizer/legacy_tokenizer.py
nominatim/tools/postcodes.py
test/python/conftest.py
test/python/dummy_tokenizer.py
test/python/test_indexing.py
test/python/test_tools_postcodes.py

index a797cad3ac1de74b3500eb53d90bbd2111b826f1..8bfc307b7d9f1dd2147f190cb3a168695118fecd 100644 (file)
@@ -12,39 +12,47 @@ $$
 LANGUAGE plpgsql IMMUTABLE;
 
 
+CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+RETURNS HSTORE
+  AS $$
+DECLARE
+  location RECORD;
+  waynodes BIGINT[];
+BEGIN
+  IF akeys(in_address) != ARRAY['interpolation'] THEN
+    RETURN in_address;
+  END IF;
+
+  SELECT nodes INTO waynodes FROM planet_osm_ways WHERE id = wayid;
+  FOR location IN
+    SELECT placex.address, placex.osm_id FROM placex
+     WHERE osm_type = 'N' and osm_id = ANY(waynodes)
+           and placex.address is not null
+           and (placex.address ? 'street' or placex.address ? 'place')
+           and indexed_status < 100
+  LOOP
+    -- mark it as a derived address
+    RETURN location.address || in_address || hstore('_inherited', '');
+  END LOOP;
+
+  RETURN in_address;
+END;
+$$
+LANGUAGE plpgsql STABLE;
+
+
+
 -- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(wayid BIGINT, street TEXT,
+CREATE OR REPLACE FUNCTION get_interpolation_parent(street TEXT,
                                                     place TEXT, partition SMALLINT,
                                                     centroid GEOMETRY, geom GEOMETRY)
   RETURNS BIGINT
   AS $$
 DECLARE
-  addr_street TEXT;
-  addr_place TEXT;
   parent_place_id BIGINT;
-
-  waynodes BIGINT[];
-
   location RECORD;
 BEGIN
-  addr_street = street;
-  addr_place = place;
-
-  IF addr_street is null and addr_place is null THEN
-    select nodes from planet_osm_ways where id = wayid INTO waynodes;
-    FOR location IN SELECT placex.address from placex
-                    where osm_type = 'N' and osm_id = ANY(waynodes)
-                          and placex.address is not null
-                          and (placex.address ? 'street' or placex.address ? 'place')
-                          and indexed_status < 100
-                    limit 1 LOOP
-      addr_street = location.address->'street';
-      addr_place = location.address->'place';
-    END LOOP;
-  END IF;
-
-  parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                             partition, centroid);
+  parent_place_id := find_parent_for_address(street, place, partition, centroid);
 
   IF parent_place_id is null THEN
     FOR location IN SELECT place_id FROM placex
@@ -147,17 +155,20 @@ BEGIN
   NEW.interpolationtype = NEW.address->'interpolation';
 
   place_centroid := ST_PointOnSurface(NEW.linegeo);
-  NEW.parent_place_id = get_interpolation_parent(NEW.osm_id, NEW.address->'street',
+  NEW.parent_place_id = get_interpolation_parent(NEW.address->'street',
                                                  NEW.address->'place',
                                                  NEW.partition, place_centroid, NEW.linegeo);
 
   IF NEW.address is not NULL AND NEW.address ? 'postcode' AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
     interpol_postcode := NEW.address->'postcode';
-    housenum := getorcreate_postcode_id(NEW.address->'postcode');
   ELSE
     interpol_postcode := NULL;
   END IF;
 
+  IF NEW.address ? '_inherited' THEN
+    NEW.address := hstore('interpolation', NEW.interpolationtype);
+  END IF;
+
   -- if the line was newly inserted, split the line as necessary
   IF OLD.indexed_status = 1 THEN
       select nodes from planet_osm_ways where id = NEW.osm_id INTO waynodes;
index ca9ab5cc550f2889cc3b1653ed75a8aa2067e6ab..922d79bd51b3b0da6d9de7d7e7ee6eb147edcd3b 100644 (file)
@@ -817,10 +817,6 @@ BEGIN
   IF NEW.address is not NULL THEN
       addr_street := NEW.address->'street';
       addr_place := NEW.address->'place';
-
-      IF NEW.address ? 'postcode' and NEW.address->'postcode' not similar to '%(:|,|;)%' THEN
-        i := getorcreate_postcode_id(NEW.address->'postcode');
-      END IF;
   END IF;
 
   NEW.postcode := null;
index 916ba9aedc1e215906037be2bcf01ec7af583c28..4c22424ec74afa2504f51bbebdf9498fba2e740a 100644 (file)
@@ -34,6 +34,13 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
+CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+  RETURNS TEXT
+AS $$
+  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode)) END;
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
 -- Return token info that should be saved permanently in the database.
 CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
   RETURNS JSONB
@@ -133,26 +140,26 @@ $$
 LANGUAGE plpgsql;
 
 
-CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
-  RETURNS INTEGER
+CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
+  RETURNS BOOLEAN
   AS $$
 DECLARE
+  r RECORD;
   lookup_token TEXT;
-  lookup_word TEXT;
   return_word_id INTEGER;
 BEGIN
-  lookup_word := upper(trim(postcode));
-  lookup_token := ' ' || make_standard_name(lookup_word);
-  SELECT min(word_id) FROM word
-    WHERE word_token = lookup_token and word = lookup_word
+  lookup_token := ' ' || make_standard_name(postcode);
+  FOR r IN
+    SELECT word_id FROM word
+    WHERE word_token = lookup_token and word = postcode
           and class='place' and type='postcode'
-    INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
-                             'place', 'postcode', null, 0);
-  END IF;
-  RETURN return_word_id;
+  LOOP
+    RETURN false;
+  END LOOP;
+
+  INSERT INTO word VALUES (nextval('seq_word'), lookup_token, postcode,
+                           'place', 'postcode', null, 0);
+  RETURN true;
 END;
 $$
 LANGUAGE plpgsql;
index 499eff7673dc8f1d393eeb889c731eb980dc216f..1ce9cf3e0039642641c0c8ead82f52ca9b2b7d2c 100644 (file)
@@ -116,7 +116,8 @@ class SetupAll:
 
         if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Calculate postcodes')
-            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
+            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
+                                       tokenizer)
 
         if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
             if args.continue_at is not None and args.continue_at != 'load-data':
index ea7b0e5986e22c0fad8c9b826bb03b2af3d1a29c..d0c6ea0ca66c8c0927cbfb718e6cd7518bbe8238 100644 (file)
@@ -147,7 +147,7 @@ class Indexer:
 
             if maxrank == 30:
                 self._index(runners.RankRunner(0, analyzer))
-                self._index(runners.InterpolationRunner(), 20)
+                self._index(runners.InterpolationRunner(analyzer), 20)
                 self._index(runners.RankRunner(30, analyzer), 20)
             else:
                 self._index(runners.RankRunner(maxrank, analyzer))
index 2bf9e51632a3ba84940755d005481aab9a9eed0b..75429fe427428a9bc79b7c9ec9a4f491a873028c 100644 (file)
@@ -25,7 +25,7 @@ class AbstractPlacexRunner:
                    SET indexed_status = 0, address = v.addr, token_info = v.ti
                    FROM (VALUES {}) as v(id, addr, ti)
                    WHERE place_id = v.id
-               """.format(','.join(["(%s, %s::hstore, %s::json)"]  * num_places))
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"]  * num_places))
 
 
     def index_places(self, worker, places):
@@ -82,6 +82,10 @@ class InterpolationRunner:
         location_property_osmline.
     """
 
+    def __init__(self, analyzer):
+        self.analyzer = analyzer
+
+
     @staticmethod
     def name():
         return "interpolation lines (location_property_osmline)"
@@ -93,15 +97,30 @@ class InterpolationRunner:
 
     @staticmethod
     def sql_get_objects():
-        return """SELECT place_id FROM location_property_osmline
+        return """SELECT place_id, get_interpolation_address(address, osm_id) as address
+                  FROM location_property_osmline
                   WHERE indexed_status > 0
                   ORDER BY geometry_sector"""
 
+
     @staticmethod
-    def index_places(worker, ids):
-        worker.perform(""" UPDATE location_property_osmline
-                           SET indexed_status = 0 WHERE place_id IN ({})
-                       """.format(','.join((str(i[0]) for i in ids))))
+    @functools.lru_cache(maxsize=1)
+    def _index_sql(num_places):
+        return """ UPDATE location_property_osmline
+                   SET indexed_status = 0, address = v.addr, token_info = v.ti
+                   FROM (VALUES {}) as v(id, addr, ti)
+                   WHERE place_id = v.id
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"]  * num_places))
+
+
+    def index_places(self, worker, places):
+        values = []
+        for place in places:
+            values.extend((place[x] for x in ('place_id', 'address')))
+            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+        worker.perform(self._index_sql(len(places)), values)
+
 
 
 class PostcodeRunner:
index 6ffdc4ef46cc0269348428d8ba7abb76d872f3b0..b95a6208f3fac47e978669f38e476ef5c16fc49f 100644 (file)
@@ -1,6 +1,7 @@
 """
 Tokenizer implementing normalisation as used before Nominatim 4.
 """
+from collections import OrderedDict
 import logging
 import re
 import shutil
@@ -213,6 +214,15 @@ class LegacyNameAnalyzer:
             self.conn.close()
             self.conn = None
 
+
+    def add_postcodes_from_db(self):
+        """ Add postcodes from the location_postcode table to the word table.
+        """
+        with self.conn.cursor() as cur:
+            cur.execute("""SELECT count(create_postcode_id(pc))
+                           FROM (SELECT distinct(postcode) as pc
+                                 FROM location_postcode) x""")
+
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
 
@@ -226,11 +236,25 @@ class LegacyNameAnalyzer:
         address = place.get('address')
 
         if address:
+            self._add_postcode(address.get('postcode'))
             token_info.add_housenumbers(self.conn, address)
 
         return token_info.data
 
 
+    def _add_postcode(self, postcode):
+        """ Make sure the normalized postcode is present in the word table.
+        """
+        if not postcode or re.search(r'[:,;]', postcode) is not None:
+            return
+
+        def _create_postcode_from_db(pcode):
+            with self.conn.cursor() as cur:
+                cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
+
+        self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
+
+
 class _TokenInfo:
     """ Collect token information to be sent back to the database.
     """
@@ -285,6 +309,32 @@ class _TokenInfo:
             self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
 
 
+class _LRU:
+    """ Least recently used cache that accepts a generator function to
+        produce the item when there is a cache miss.
+    """
+
+    def __init__(self, maxsize=128):
+        self.data = OrderedDict()
+        self.maxsize = maxsize
+
+    def get(self, key, generator):
+        """ Get the item with the given key from the cache. If nothing
+            is found in the cache, generate the value through the
+            generator function and store it in the cache.
+        """
+        value = self.data.get(key)
+        if value is not None:
+            self.data.move_to_end(key)
+        else:
+            value = generator(key)
+            if len(self.data) >= self.maxsize:
+                self.data.popitem(last=False)
+            self.data[key] = value
+
+        return value
+
+
 class _TokenCache:
     """ Cache for token information to avoid repeated database queries.
 
@@ -292,6 +342,9 @@ class _TokenCache:
         analyzer.
     """
     def __init__(self, conn):
+        # various LRU caches
+        self.postcodes = _LRU(maxsize=32)
+
         # Lookup housenumbers up to 100 and cache them
         with conn.cursor() as cur:
             cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
index 0a568cbafc73c125c4f9d34b7141b42b05eb6b11..78bd8cb9490c5646754cef2ef2bbf2348d5e2a74 100644 (file)
@@ -6,7 +6,7 @@ of artificial postcode centroids.
 from nominatim.db.utils import execute_file
 from nominatim.db.connection import connect
 
-def import_postcodes(dsn, project_dir):
+def import_postcodes(dsn, project_dir, tokenizer):
     """ Set up the initial list of postcodes.
     """
 
@@ -41,10 +41,11 @@ def import_postcodes(dsn, project_dir):
                 INSERT INTO location_postcode
                  (place_id, indexed_status, country_code, postcode, geometry)
                 SELECT nextval('seq_place'), 1, country_code,
-                       upper(trim (both ' ' from address->'postcode')) as pc,
+                       token_normalized_postcode(address->'postcode') as pc,
                        ST_Centroid(ST_Collect(ST_Centroid(geometry)))
                   FROM placex
-                 WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
+                 WHERE address ? 'postcode'
+                       and token_normalized_postcode(address->'postcode') is not null
                        AND geometry IS NOT null
                  GROUP BY country_code, pc
             """)
@@ -52,9 +53,10 @@ def import_postcodes(dsn, project_dir):
             cur.execute("""
                 INSERT INTO location_postcode
                  (place_id, indexed_status, country_code, postcode, geometry)
-                SELECT nextval('seq_place'), 1, 'us', postcode,
+                SELECT nextval('seq_place'), 1, 'us',
+                       token_normalized_postcode(postcode),
                        ST_SetSRID(ST_Point(x,y),4326)
-                  FROM us_postcode WHERE postcode NOT IN
+                  FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
                         (SELECT postcode FROM location_postcode
                           WHERE country_code = 'us')
             """)
@@ -62,8 +64,9 @@ def import_postcodes(dsn, project_dir):
             cur.execute("""
                 INSERT INTO location_postcode
                  (place_id, indexed_status, country_code, postcode, geometry)
-                SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
-                  FROM gb_postcode WHERE postcode NOT IN
+                SELECT nextval('seq_place'), 1, 'gb',
+                       token_normalized_postcode(postcode), geometry
+                  FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
                            (SELECT postcode FROM location_postcode
                              WHERE country_code = 'gb')
             """)
@@ -72,9 +75,7 @@ def import_postcodes(dsn, project_dir):
                     DELETE FROM word WHERE class='place' and type='postcode'
                     and word NOT IN (SELECT postcode FROM location_postcode)
             """)
-
-            cur.execute("""
-                SELECT count(getorcreate_postcode_id(v)) FROM
-                (SELECT distinct(postcode) as v FROM location_postcode) p
-            """)
         conn.commit()
+
+        with tokenizer.name_analyzer() as analyzer:
+            analyzer.add_postcodes_from_db()
index d4649c24929b91835b6aa3dd7d34f985904ceed6..f43f09d074c7ab56ce3837872ff2539ba3f18f86 100644 (file)
@@ -299,7 +299,7 @@ def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
 
 
 @pytest.fixture
-def tokenizer_mock(monkeypatch, property_table, temp_db_conn):
+def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn):
     """ Sets up the configuration so that the test dummy tokenizer will be
         loaded.
     """
index 013016c8d0ba7c2e84bddab61bfb3428944a205a..ceea4a7ededdaa1e829e6ebf902ef42455e33e47 100644 (file)
@@ -43,6 +43,9 @@ class DummyNameAnalyzer:
         """
         pass
 
+    def add_postcodes_from_db(self):
+        pass
+
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
 
index d68769064af9d6ff626ac1e78b9bcd684b9c6c51..ff84e37964b660a33be2faa0908ba2ad0c1519a7 100644 (file)
@@ -33,6 +33,9 @@ class IndexerTestDB:
                                                 geometry_sector INTEGER)""")
             cur.execute("""CREATE TABLE location_property_osmline (
                                place_id BIGINT,
+                               osm_id BIGINT,
+                               address HSTORE,
+                               token_info JSONB,
                                indexed_status SMALLINT,
                                indexed_date TIMESTAMP,
                                geometry_sector INTEGER)""")
@@ -61,6 +64,14 @@ class IndexerTestDB:
                            END;
                            $$ LANGUAGE plpgsql STABLE;
                         """)
+            cur.execute("""CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+                           RETURNS HSTORE AS $$
+                           BEGIN
+                             RETURN in_address;
+                           END;
+                           $$ LANGUAGE plpgsql STABLE;
+                        """)
+
             for table in ('placex', 'location_property_osmline', 'location_postcode'):
                 cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0}
                                FOR EACH ROW EXECUTE PROCEDURE date_update()
@@ -91,9 +102,9 @@ class IndexerTestDB:
         next_id = next(self.osmline_id)
         with self.conn.cursor() as cur:
             cur.execute("""INSERT INTO location_property_osmline
-                              (place_id, indexed_status, geometry_sector)
-                              VALUES (%s, 1, %s)""",
-                        (next_id, sector))
+                              (place_id, osm_id, indexed_status, geometry_sector)
+                              VALUES (%s, %s, 1, %s)""",
+                        (next_id, next_id, sector))
         return next_id
 
     def add_postcode(self, country, postcode):
index 1fc060b0c6439677e592aa90d4d48d3db51a8ea6..37b47dfa680258e5587fa40e5057d7fd85904a96 100644 (file)
@@ -5,6 +5,11 @@ Tests for functions to maintain the artificial postcode table.
 import pytest
 
 from nominatim.tools import postcodes
+import dummy_tokenizer
+
+@pytest.fixture
+def tokenizer():
+    return dummy_tokenizer.DummyTokenizer(None, None)
 
 @pytest.fixture
 def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
@@ -20,26 +25,26 @@ def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
                       postcode TEXT,
                       geometry GEOMETRY(Geometry, 4326)""")
     temp_db_cursor.execute('CREATE SEQUENCE seq_place')
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
-                              RETURNS INTEGER AS $$ BEGIN RETURN 1; END; $$ LANGUAGE plpgsql;
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+                              RETURNS TEXT AS $$ BEGIN RETURN postcode; END; $$ LANGUAGE plpgsql;
                            """)
 
 
-def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path):
-    postcodes.import_postcodes(dsn, tmp_path)
+def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)
 
     assert temp_db_cursor.table_exists('gb_postcode')
     assert temp_db_cursor.table_exists('us_postcode')
     assert temp_db_cursor.table_rows('location_postcode') == 0
 
 
-def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path):
+def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
     temp_db_cursor.execute("""
         INSERT INTO placex (place_id, country_code, address, geometry)
           VALUES (1, 'xx', '"postcode"=>"9486"', 'SRID=4326;POINT(10 12)')
     """)
 
-    postcodes.import_postcodes(dsn, tmp_path)
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)
 
     rows = temp_db_cursor.row_set(""" SELECT postcode, country_code,
                                       ST_X(geometry), ST_Y(geometry)