From: Sarah Hoffmann <lonvia@denofr.de>
Date: Tue, 4 Oct 2022 14:25:47 +0000 (+0200)
Subject: Merge pull request #2835 from lonvia/secondary-importance
X-Git-Tag: v4.2.0~25
X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/afeafc8aa797a9051fc43a4482165c563a834a3f?hp=3381a92d92fba7adcc72b0e6397a9726664fe430

Merge pull request #2835 from lonvia/secondary-importance

Secondary importance
---

diff --git a/docs/admin/Import.md b/docs/admin/Import.md
index 90294959..8b6d6baa 100644
--- a/docs/admin/Import.md
+++ b/docs/admin/Import.md
@@ -79,10 +79,10 @@ This data is available as a binary download. Put it into your project directory:
 The file is about 400MB and adds around 4GB to the Nominatim database.
 
 !!! tip
-    If you forgot to download the wikipedia rankings, you can also add
-    importances after the import. Download the files, then run
-    `nominatim refresh --wiki-data --importance`. Updating importances for
-    a planet can take a couple of hours.
+    If you forgot to download the wikipedia rankings, then you can
+    also add importances after the import. Download the SQL files, then
+    run `nominatim refresh --wiki-data --importance`. Updating
+    importances for a planet will take a couple of hours.
 
 ### External postcodes
 
@@ -139,7 +139,7 @@ import. So this option is particularly interesting if you plan to transfer the
 database or reuse the space later.
 
 !!! warning
-    The datastructure for updates are also required when adding additional data
+    The data structure for updates are also required when adding additional data
     after the import, for example [TIGER housenumber data](../customize/Tiger.md).
     If you plan to use those, you must not use the `--no-updates` parameter.
     Do a normal import, add the external data and once you are done with
diff --git a/docs/customize/Importance.md b/docs/customize/Importance.md
new file mode 100644
index 00000000..d12bfc86
--- /dev/null
+++ b/docs/customize/Importance.md
@@ -0,0 +1,49 @@
+## Importance
+
+Search requests can yield multiple results which match equally well with
+the original query. In such case Nominatim needs to order the results
+according to a different criterion: importance. This is a measure for how
+likely it is that a user will search for a given place. This section explains
+the sources Nominatim uses for computing importance of a place and how to
+customize them.
+
+### How importance is computed
+
+The main value for importance is derived from page ranking values for Wikipedia
+pages for a place. For places that do not have their own
+Wikipedia page, a formula is used that derives a static importance from the
+places [search rank](../customize/Ranking#search-rank).
+
+In a second step, a secondary importance value is added which is meant to
+represent how well-known the general area is where the place is located. It
+functions as a tie-breaker between places with very similar primary
+importance values.
+
+nominatim.org has preprocessed importance tables for the
+[primary Wikipedia rankings](https://nominatim.org/data/wikimedia-importance.sql.gz)
+and for a secondary importance based on the number of tile views on openstreetmap.org.
+
+### Customizing secondary importance
+
+The secondary importance is implemented as a simple
+[Postgis raster](https://postgis.net/docs/raster.html) table, where Nominatim
+looks up the value for the coordinates of the centroid of a place. You can
+provide your own secondary importance raster in form of an SQL file named
+`secondary_importance.sql.gz` in your project directory.
+
+The SQL file needs to drop and (re)create a table `secondary_importance` which
+must as a minimum contain a column `rast` of type `raster`. The raster must
+be in EPSG:4326 and contain 16bit unsigned ints
+(`raster_constraint_pixel_types(rast) = '{16BUI}'). Any other columns in the
+table will be ignored. You must furthermore create an index as follows:
+
+```
+CREATE INDEX ON secondary_importance USING gist(ST_ConvexHull(gist))
+```
+
+The following raster2pgsql command will create a table that conforms to
+the requirements:
+
+```
+raster2pgsql -I -C -Y -d -t 128x128 input.tiff public.secondary_importance
+```
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index e89c32d5..ab7dec30 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -30,6 +30,7 @@ nav:
         - 'Configuration Settings': 'customize/Settings.md'
         - 'Per-Country Data': 'customize/Country-Settings.md'
         - 'Place Ranking' : 'customize/Ranking.md'
+        - 'Importance' : 'customize/Importance.md'
         - 'Tokenizers' : 'customize/Tokenizers.md'
         - 'Special Phrases': 'customize/Special-Phrases.md'
         - 'External data: US housenumbers from TIGER': 'customize/Tiger.md'
diff --git a/lib-sql/functions/importance.sql b/lib-sql/functions/importance.sql
index ac3aa7f8..44e8bc8b 100644
--- a/lib-sql/functions/importance.sql
+++ b/lib-sql/functions/importance.sql
@@ -100,32 +100,55 @@ LANGUAGE plpgsql STABLE;
 
 CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE,
                                               country_code varchar(2),
-                                              osm_type varchar(1), osm_id BIGINT)
+                                              rank_search SMALLINT,
+                                              centroid GEOMETRY)
   RETURNS place_importance
   AS $$
 DECLARE
   match RECORD;
   result place_importance;
+  osm_views_exists BIGINT;
+  views BIGINT;
 BEGIN
-  FOR match IN SELECT * FROM get_wikipedia_match(extratags, country_code)
-               WHERE language is not NULL
+  -- add importance by wikipedia article if the place has one
+  FOR match IN
+    SELECT * FROM get_wikipedia_match(extratags, country_code)
+    WHERE language is not NULL
   LOOP
     result.importance := match.importance;
     result.wikipedia := match.language || ':' || match.title;
     RETURN result;
   END LOOP;
 
-  IF extratags ? 'wikidata' THEN
+  -- Nothing? Then try with the wikidata tag.
+  IF result.importance is null AND extratags ? 'wikidata' THEN
     FOR match IN SELECT * FROM wikipedia_article
                   WHERE wd_page_title = extratags->'wikidata'
-                  ORDER BY language = 'en' DESC, langcount DESC LIMIT 1 LOOP
+                  ORDER BY language = 'en' DESC, langcount DESC LIMIT 1
+    LOOP
       result.importance := match.importance;
       result.wikipedia := match.language || ':' || match.title;
       RETURN result;
     END LOOP;
   END IF;
 
-  RETURN null;
+  -- Still nothing? Fall back to a default.
+  IF result.importance is null THEN
+    result.importance := 0.75001 - (rank_search::float / 40);
+  END IF;
+
+{% if 'secondary_importance' in db.tables %}
+  FOR match IN
+    SELECT ST_Value(rast, centroid) as importance
+    FROM secondary_importance
+    WHERE ST_Intersects(ST_ConvexHull(rast), centroid) LIMIT 1
+  LOOP
+    -- Secondary importance as tie breaker with 0.0001 weight.
+    result.importance := result.importance + match.importance::float / 655350000;
+  END LOOP;
+{% endif %}
+
+  RETURN result;
 END;
 $$
 LANGUAGE plpgsql;
diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql
index a2276f07..367d2149 100644
--- a/lib-sql/functions/placex_triggers.sql
+++ b/lib-sql/functions/placex_triggers.sql
@@ -965,7 +965,7 @@ BEGIN
 
   NEW.importance := null;
   SELECT wikipedia, importance
-    FROM compute_importance(NEW.extratags, NEW.country_code, NEW.osm_type, NEW.osm_id)
+    FROM compute_importance(NEW.extratags, NEW.country_code, NEW.rank_search, NEW.centroid)
     INTO NEW.wikipedia,NEW.importance;
 
 {% if debug %}RAISE WARNING 'Importance computed from wikipedia: %', NEW.importance;{% endif %}
@@ -1047,7 +1047,7 @@ BEGIN
   IF linked_place is not null THEN
     -- Recompute the ranks here as the ones from the linked place might
     -- have been shifted to accommodate surrounding boundaries.
-    SELECT place_id, osm_id, class, type, extratags,
+    SELECT place_id, osm_id, class, type, extratags, rank_search,
            centroid, geometry,
            (compute_place_rank(country_code, osm_type, class, type, admin_level,
                               (extratags->'capital') = 'yes', null)).*
@@ -1088,7 +1088,7 @@ BEGIN
 
     SELECT wikipedia, importance
       FROM compute_importance(location.extratags, NEW.country_code,
-                              'N', location.osm_id)
+                              location.rank_search, NEW.centroid)
       INTO linked_wikipedia,linked_importance;
 
     -- Use the maximum importance if one could be computed from the linked object.
diff --git a/lib-sql/tables.sql b/lib-sql/tables.sql
index 7ef74349..d576485e 100644
--- a/lib-sql/tables.sql
+++ b/lib-sql/tables.sql
@@ -276,7 +276,7 @@ CREATE SEQUENCE file start 1;
 
 -- null table so it won't error
 -- deliberately no drop - importing the table is expensive and static, if it is already there better to avoid removing it
-CREATE TABLE wikipedia_article (
+CREATE TABLE IF NOT EXISTS wikipedia_article (
     language text NOT NULL,
     title text NOT NULL,
     langcount integer,
@@ -290,15 +290,12 @@ CREATE TABLE wikipedia_article (
     wd_page_title text,
     instance_of text
 );
-ALTER TABLE ONLY wikipedia_article ADD CONSTRAINT wikipedia_article_pkey PRIMARY KEY (language, title);
-CREATE INDEX idx_wikipedia_article_osm_id ON wikipedia_article USING btree (osm_type, osm_id);
 
-CREATE TABLE wikipedia_redirect (
+CREATE TABLE IF NOT EXISTS wikipedia_redirect (
     language text,
     from_title text,
     to_title text
 );
-ALTER TABLE ONLY wikipedia_redirect ADD CONSTRAINT wikipedia_redirect_pkey PRIMARY KEY (language, from_title);
 
 -- osm2pgsql does not create indexes on the middle tables for Nominatim
 -- Add one for lookup of associated street relations.
diff --git a/nominatim/clicmd/args.py b/nominatim/clicmd/args.py
index 4457db5f..2f8273d6 100644
--- a/nominatim/clicmd/args.py
+++ b/nominatim/clicmd/args.py
@@ -115,6 +115,7 @@ class NominatimArgs:
     address_levels: bool
     functions: bool
     wiki_data: bool
+    secondary_importance: bool
     importance: bool
     website: bool
     diffs: bool
diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py
index dce28d98..ea605ea0 100644
--- a/nominatim/clicmd/refresh.py
+++ b/nominatim/clicmd/refresh.py
@@ -63,6 +63,8 @@ class UpdateRefresh:
                            help='Update the PL/pgSQL functions in the database')
         group.add_argument('--wiki-data', action='store_true',
                            help='Update Wikipedia/data importance numbers')
+        group.add_argument('--secondary-importance', action='store_true',
+                           help='Update secondary importance raster data')
         group.add_argument('--importance', action='store_true',
                            help='Recompute place importances (expensive!)')
         group.add_argument('--website', action='store_true',
@@ -83,7 +85,7 @@ class UpdateRefresh:
                            help='Enable debug warning statements in functions')
 
 
-    def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches
+    def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, too-many-statements
         from ..tools import refresh, postcodes
         from ..indexer.indexer import Indexer
 
@@ -115,6 +117,20 @@ class UpdateRefresh:
             with connect(args.config.get_libpq_dsn()) as conn:
                 refresh.load_address_levels_from_config(conn, args.config)
 
+        # Attention: must come BEFORE functions
+        if args.secondary_importance:
+            with connect(args.config.get_libpq_dsn()) as conn:
+                # If the table did not exist before, then the importance code
+                # needs to be enabled.
+                if not conn.table_exists('secondary_importance'):
+                    args.functions = True
+
+            LOG.warning('Import secondary importance raster data from %s', args.project_dir)
+            if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
+                                                args.project_dir) > 0:
+                LOG.fatal('FATAL: Cannot update sendary importance raster data')
+                return 1
+
         if args.functions:
             LOG.warning('Create functions')
             with connect(args.config.get_libpq_dsn()) as conn:
diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py
index 29724433..344167bb 100644
--- a/nominatim/clicmd/setup.py
+++ b/nominatim/clicmd/setup.py
@@ -59,7 +59,7 @@ class SetupAll:
                            help="Do not keep tables that are only needed for "
                                 "updating the database later")
         group2.add_argument('--offline', action='store_true',
-                           help="Do not attempt to load any additional data from the internet")
+                            help="Do not attempt to load any additional data from the internet")
         group3 = parser.add_argument_group('Expert options')
         group3.add_argument('--ignore-errors', action='store_true',
                            help='Continue import even when errors in SQL are present')
@@ -96,14 +96,21 @@ class SetupAll:
                                             drop=args.no_updates,
                                             ignore_errors=args.ignore_errors)
 
-            self._setup_tables(args.config, args.reverse_only)
-
             LOG.warning('Importing wikipedia importance data')
             data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
             if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
                                                  data_path) > 0:
                 LOG.error('Wikipedia importance dump file not found. '
-                          'Will be using default importances.')
+                          'Calculating importance values of locations will not '
+                          'use Wikipedia importance data.')
+
+            LOG.warning('Importing secondary importance raster data')
+            if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
+                                                   args.project_dir) != 0:
+                LOG.error('Secondary importance file not imported. '
+                          'Falling back to default ranking.')
+
+            self._setup_tables(args.config, args.reverse_only)
 
         if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Initialise tables')
diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py
index f6ebe90d..cb620d41 100644
--- a/nominatim/tools/database_import.py
+++ b/nominatim/tools/database_import.py
@@ -75,6 +75,11 @@ def setup_database_skeleton(dsn: str, rouser: Optional[str] = None) -> None:
         with conn.cursor() as cur:
             cur.execute('CREATE EXTENSION IF NOT EXISTS hstore')
             cur.execute('CREATE EXTENSION IF NOT EXISTS postgis')
+
+            postgis_version = conn.postgis_version_tuple()
+            if postgis_version[0] >= 3:
+                cur.execute('CREATE EXTENSION IF NOT EXISTS postgis_raster')
+
         conn.commit()
 
         _require_version('PostGIS',
diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py
index 8c1e9d9b..c50493cc 100644
--- a/nominatim/tools/refresh.py
+++ b/nominatim/tools/refresh.py
@@ -15,7 +15,7 @@ from pathlib import Path
 from psycopg2 import sql as pysql
 
 from nominatim.config import Configuration
-from nominatim.db.connection import Connection
+from nominatim.db.connection import Connection, connect
 from nominatim.db.utils import execute_file
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.version import version_str
@@ -146,6 +146,25 @@ def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = F
 
     return 0
 
+def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
+    """ Replaces the secondary importance raster data table with new data.
+
+        Returns 0 if all was well and 1 if the raster SQL file could not
+        be found. Throws an exception if there was an error reading the file.
+    """
+    datafile = data_path / 'secondary_importance.sql.gz'
+    if not datafile.exists():
+        return 1
+
+    with connect(dsn) as conn:
+        postgis_version = conn.postgis_version_tuple()
+        if postgis_version[0] < 3:
+            LOG.error('PostGIS version is too old for using OSM raster data.')
+            return 2
+
+    execute_file(dsn, datafile, ignore_errors=ignore_errors)
+
+    return 0
 
 def recompute_importance(conn: Connection) -> None:
     """ Recompute wikipedia links and importance for all entries in placex.
@@ -157,7 +176,7 @@ def recompute_importance(conn: Connection) -> None:
         cur.execute("""
             UPDATE placex SET (wikipedia, importance) =
                (SELECT wikipedia, importance
-                FROM compute_importance(extratags, country_code, osm_type, osm_id))
+                FROM compute_importance(extratags, country_code, osm_type, osm_id, centroid))
             """)
         cur.execute("""
             UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
diff --git a/test/bdd/api/search/params.feature b/test/bdd/api/search/params.feature
index 300948a9..af83bd33 100644
--- a/test/bdd/api/search/params.feature
+++ b/test/bdd/api/search/params.feature
@@ -68,14 +68,15 @@ Feature: Search queries
           | 0 |
         Then there are duplicates
 
+    @fail-legacy
     Scenario: Search with bounded viewbox in right area
-        When sending json search query "bar" with address
+        When sending json search query "post" with address
           | bounded | viewbox |
           | 1       |  9,47,10,48 |
         Then result addresses contain
           | ID | town |
           | 0  | Vaduz |
-        When sending json search query "bar" with address
+        When sending json search query "post" with address
           | bounded | viewbox |
           | 1       |  9.49712,47.17122,9.52605,47.16242 |
         Then result addresses contain
@@ -118,18 +119,18 @@ Feature: Search queries
         Then result has centroid in 9.49712,47.16242,9.52605,47.17122
 
     Scenario: Prefer results within viewbox
-        When sending json search query "GÃ¤ssle" with address
-          | accept-language |
-          | en |
-        Then result addresses contain
-          | ID | town |
-          | 0  | Balzers |
         When sending json search query "GÃ¤ssle" with address
           | accept-language | viewbox |
           | en              | 9.52413,47.10759,9.53140,47.10539 |
         Then result addresses contain
           | ID | village |
           | 0  | Triesen |
+        When sending json search query "GÃ¤ssle" with address
+          | accept-language | viewbox |
+          | en              | 9.45949,47.08421,9.54094,47.05466 |
+        Then result addresses contain
+          | ID | town |
+          | 0  | Balzers |
 
     Scenario: viewboxes cannot be points
         When sending json search query "foo"
diff --git a/test/bdd/steps/nominatim_environment.py b/test/bdd/steps/nominatim_environment.py
index e7234788..1feafd75 100644
--- a/test/bdd/steps/nominatim_environment.py
+++ b/test/bdd/steps/nominatim_environment.py
@@ -201,19 +201,21 @@ class NominatimEnvironment:
             self.api_db_done = True
 
             if not self._reuse_or_drop_db(self.api_test_db):
-                testdata = Path('__file__') / '..' / '..' / 'testdb'
-                self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
+                testdata = (Path(__file__) / '..' / '..' / '..' / 'testdb').resolve()
+                self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata)
+                simp_file = Path(self.website_dir.name) / 'secondary_importance.sql.gz'
+                simp_file.symlink_to(testdata / 'secondary_importance.sql.gz')
 
                 try:
                     self.run_nominatim('import', '--osm-file', str(self.api_test_file))
-                    self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
+                    self.run_nominatim('add-data', '--tiger-data', str(testdata / 'tiger'))
                     self.run_nominatim('freeze')
 
                     if self.tokenizer == 'legacy':
-                        phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
+                        phrase_file = str(testdata / 'specialphrases_testdb.sql')
                         run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
                     else:
-                        csv_path = str((testdata / 'full_en_phrases_test.csv').resolve())
+                        csv_path = str(testdata / 'full_en_phrases_test.csv')
                         self.run_nominatim('special-phrases', '--import-from-csv', csv_path)
                 except:
                     self.db_drop_database(self.api_test_db)
diff --git a/test/python/cli/test_cmd_import.py b/test/python/cli/test_cmd_import.py
index 737c4e5c..d098e27e 100644
--- a/test/python/cli/test_cmd_import.py
+++ b/test/python/cli/test_cmd_import.py
@@ -40,6 +40,7 @@ class TestCliImportWithDb:
             mock_func_factory(nominatim.data.country_info, 'setup_country_tables'),
             mock_func_factory(nominatim.tools.database_import, 'import_osm_data'),
             mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'),
+            mock_func_factory(nominatim.tools.refresh, 'import_secondary_importance'),
             mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
             mock_func_factory(nominatim.tools.database_import, 'load_data'),
             mock_func_factory(nominatim.tools.database_import, 'create_tables'),
diff --git a/test/python/cli/test_cmd_refresh.py b/test/python/cli/test_cmd_refresh.py
index 7f44765b..f3f93f0f 100644
--- a/test/python/cli/test_cmd_refresh.py
+++ b/test/python/cli/test_cmd_refresh.py
@@ -71,6 +71,18 @@ class TestRefresh:
 
         assert self.call_nominatim('refresh', '--wiki-data') == 1
 
+    def test_refresh_secondary_importance_file_not_found(self):
+        assert self.call_nominatim('refresh', '--secondary-importance') == 1
+
+
+    def test_refresh_secondary_importance_new_table(self, mock_func_factory):
+        mocks = [mock_func_factory(nominatim.tools.refresh, 'import_secondary_importance'),
+                 mock_func_factory(nominatim.tools.refresh, 'create_functions')]
+
+        assert self.call_nominatim('refresh', '--secondary-importance') == 0
+        assert mocks[0].called == 1
+        assert mocks[1].called == 1
+
 
     def test_refresh_importance_computed_after_wiki_import(self, monkeypatch):
         calls = []
diff --git a/test/python/tools/test_refresh.py b/test/python/tools/test_refresh.py
index ac52aa36..c6be4fe7 100644
--- a/test/python/tools/test_refresh.py
+++ b/test/python/tools/test_refresh.py
@@ -17,6 +17,21 @@ def test_refresh_import_wikipedia_not_existing(dsn):
     assert refresh.import_wikipedia_articles(dsn, Path('.')) == 1
 
 
+def test_refresh_import_secondary_importance_non_existing(dsn):
+    assert refresh.import_secondary_importance(dsn, Path('.')) == 1
+
+def test_refresh_import_secondary_importance_testdb(dsn, src_dir, temp_db_conn, temp_db_cursor):
+    temp_db_cursor.execute('CREATE EXTENSION postgis')
+
+    if temp_db_conn.postgis_version_tuple()[0] < 3:
+        assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') > 0
+    else:
+        temp_db_cursor.execute('CREATE EXTENSION postgis_raster')
+        assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') == 0
+
+        assert temp_db_conn.table_exists('secondary_importance')
+
+
 @pytest.mark.parametrize("replace", (True, False))
 def test_refresh_import_wikipedia(dsn, src_dir, table_factory, temp_db_cursor, replace):
     if replace:
@@ -34,6 +49,7 @@ def test_recompute_importance(placex_table, table_factory, temp_db_conn, temp_db
     temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE,
                                               country_code varchar(2),
                                               osm_type varchar(1), osm_id BIGINT,
+                                              centroid GEOMETRY,
                                               OUT importance FLOAT,
                                               OUT wikipedia TEXT)
                                AS $$ SELECT 0.1::float, 'foo'::text $$ LANGUAGE SQL""")
diff --git a/test/testdb/secondary_importance.sql.gz b/test/testdb/secondary_importance.sql.gz
new file mode 100644
index 00000000..e9c115d1
Binary files /dev/null and b/test/testdb/secondary_importance.sql.gz differ