git.openstreetmap.org Git - nominatim.git/commitdiff
Merge pull request #2835 from lonvia/secondary-importance
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 4 Oct 2022 14:25:47 +0000 (16:25 +0200)
committer GitHub <noreply@github.com>
Tue, 4 Oct 2022 14:25:47 +0000 (16:25 +0200)
Secondary importance

17 files changed:
docs/admin/Import.md
docs/customize/Importance.md [new file with mode: 0644]
docs/mkdocs.yml
lib-sql/functions/importance.sql
lib-sql/functions/placex_triggers.sql
lib-sql/tables.sql
nominatim/clicmd/args.py
nominatim/clicmd/refresh.py
nominatim/clicmd/setup.py
nominatim/tools/database_import.py
nominatim/tools/refresh.py
test/bdd/api/search/params.feature
test/bdd/steps/nominatim_environment.py
test/python/cli/test_cmd_import.py
test/python/cli/test_cmd_refresh.py
test/python/tools/test_refresh.py
test/testdb/secondary_importance.sql.gz [new file with mode: 0644]

index 90294959c334297c805f71510c7224544f99584e..8b6d6baa9aff27cc9ef4a72833223eab04d5df9f 100644 (file)
@@ -79,10 +79,10 @@ This data is available as a binary download. Put it into your project directory:
 The file is about 400MB and adds around 4GB to the Nominatim database.
 
 !!! tip
-    If you forgot to download the wikipedia rankings, you can also add
-    importances after the import. Download the files, then run
-    `nominatim refresh --wiki-data --importance`. Updating importances for
-    a planet can take a couple of hours.
+    If you forgot to download the wikipedia rankings, then you can
+    also add importances after the import. Download the SQL files, then
+    run `nominatim refresh --wiki-data --importance`. Updating
+    importances for a planet will take a couple of hours.
 
 ### External postcodes
 
@@ -139,7 +139,7 @@ import. So this option is particularly interesting if you plan to transfer the
 database or reuse the space later.
 
 !!! warning
-    The datastructure for updates are also required when adding additional data
+    The data structure for updates is also required when adding additional data
     after the import, for example [TIGER housenumber data](../customize/Tiger.md).
     If you plan to use those, you must not use the `--no-updates` parameter.
     Do a normal import, add the external data and once you are done with
diff --git a/docs/customize/Importance.md b/docs/customize/Importance.md
new file mode 100644 (file)
index 0000000..d12bfc8
--- /dev/null
@@ -0,0 +1,49 @@
+## Importance
+
+Search requests can yield multiple results which match equally well with
+the original query. In such case Nominatim needs to order the results
+according to a different criterion: importance. This is a measure for how
+likely it is that a user will search for a given place. This section explains
+the sources Nominatim uses for computing importance of a place and how to
+customize them.
+
+### How importance is computed
+
+The main value for importance is derived from page ranking values for Wikipedia
+pages for a place. For places that do not have their own
+Wikipedia page, a formula is used that derives a static importance from the
+place's [search rank](../customize/Ranking#search-rank).
+
+In a second step, a secondary importance value is added which is meant to
+represent how well-known the general area is where the place is located. It
+functions as a tie-breaker between places with very similar primary
+importance values.
+
+nominatim.org has preprocessed importance tables for the
+[primary Wikipedia rankings](https://nominatim.org/data/wikimedia-importance.sql.gz)
+and for a secondary importance based on the number of tile views on openstreetmap.org.
+
+### Customizing secondary importance
+
+The secondary importance is implemented as a simple
+[Postgis raster](https://postgis.net/docs/raster.html) table, where Nominatim
+looks up the value for the coordinates of the centroid of a place. You can
+provide your own secondary importance raster in the form of an SQL file named
+`secondary_importance.sql.gz` in your project directory.
+
+The SQL file needs to drop and (re)create a table `secondary_importance` which
+must as a minimum contain a column `rast` of type `raster`. The raster must
+be in EPSG:4326 and contain 16bit unsigned ints
+(`raster_constraint_pixel_types(rast) = '{16BUI}'`). Any other columns in the
+table will be ignored. You must furthermore create an index as follows:
+
+```
+CREATE INDEX ON secondary_importance USING gist(ST_ConvexHull(rast))
+```
+
+The following raster2pgsql command will create a table that conforms to
+the requirements:
+
+```
+raster2pgsql -I -C -Y -d -t 128x128 input.tiff public.secondary_importance
+```
index e89c32d577fef165d1768dc4631463ad861f94dc..ab7dec30508838bc9e2c56c094686e8d186140cc 100644 (file)
@@ -30,6 +30,7 @@ nav:
         - 'Configuration Settings': 'customize/Settings.md'
         - 'Per-Country Data': 'customize/Country-Settings.md'
         - 'Place Ranking' : 'customize/Ranking.md'
+        - 'Importance' : 'customize/Importance.md'
         - 'Tokenizers' : 'customize/Tokenizers.md'
         - 'Special Phrases': 'customize/Special-Phrases.md'
         - 'External data: US housenumbers from TIGER': 'customize/Tiger.md'
index ac3aa7f856cbae355f7111fdd12490a003e99a57..44e8bc8b8e0b31e8d0ff837d059a119deb5c3af1 100644 (file)
@@ -100,32 +100,55 @@ LANGUAGE plpgsql STABLE;
 
 CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE,
                                               country_code varchar(2),
-                                              osm_type varchar(1), osm_id BIGINT)
+                                              rank_search SMALLINT,
+                                              centroid GEOMETRY)
   RETURNS place_importance
   AS $$
 DECLARE
   match RECORD;
   result place_importance;
+  osm_views_exists BIGINT;
+  views BIGINT;
 BEGIN
-  FOR match IN SELECT * FROM get_wikipedia_match(extratags, country_code)
-               WHERE language is not NULL
+  -- add importance by wikipedia article if the place has one
+  FOR match IN
+    SELECT * FROM get_wikipedia_match(extratags, country_code)
+    WHERE language is not NULL
   LOOP
     result.importance := match.importance;
     result.wikipedia := match.language || ':' || match.title;
     RETURN result;
   END LOOP;
 
-  IF extratags ? 'wikidata' THEN
+  -- Nothing? Then try with the wikidata tag.
+  IF result.importance is null AND extratags ? 'wikidata' THEN
     FOR match IN SELECT * FROM wikipedia_article
                   WHERE wd_page_title = extratags->'wikidata'
-                  ORDER BY language = 'en' DESC, langcount DESC LIMIT 1 LOOP
+                  ORDER BY language = 'en' DESC, langcount DESC LIMIT 1
+    LOOP
       result.importance := match.importance;
       result.wikipedia := match.language || ':' || match.title;
       RETURN result;
     END LOOP;
   END IF;
 
-  RETURN null;
+  -- Still nothing? Fall back to a default.
+  IF result.importance is null THEN
+    result.importance := 0.75001 - (rank_search::float / 40);
+  END IF;
+
+{% if 'secondary_importance' in db.tables %}
+  FOR match IN
+    SELECT ST_Value(rast, centroid) as importance
+    FROM secondary_importance
+    WHERE ST_Intersects(ST_ConvexHull(rast), centroid) LIMIT 1
+  LOOP
+    -- Secondary importance as tie breaker with 0.0001 weight.
+    result.importance := result.importance + match.importance::float / 655350000;
+  END LOOP;
+{% endif %}
+
+  RETURN result;
 END;
 $$
 LANGUAGE plpgsql;
index a2276f074587658b4f0b2b781f559dcce0d49cde..367d214962b2380d6cbb48d377b4148a1f08e8ac 100644 (file)
@@ -965,7 +965,7 @@ BEGIN
 
   NEW.importance := null;
   SELECT wikipedia, importance
-    FROM compute_importance(NEW.extratags, NEW.country_code, NEW.osm_type, NEW.osm_id)
+    FROM compute_importance(NEW.extratags, NEW.country_code, NEW.rank_search, NEW.centroid)
     INTO NEW.wikipedia,NEW.importance;
 
 {% if debug %}RAISE WARNING 'Importance computed from wikipedia: %', NEW.importance;{% endif %}
@@ -1047,7 +1047,7 @@ BEGIN
   IF linked_place is not null THEN
     -- Recompute the ranks here as the ones from the linked place might
     -- have been shifted to accommodate surrounding boundaries.
-    SELECT place_id, osm_id, class, type, extratags,
+    SELECT place_id, osm_id, class, type, extratags, rank_search,
            centroid, geometry,
            (compute_place_rank(country_code, osm_type, class, type, admin_level,
                               (extratags->'capital') = 'yes', null)).*
@@ -1088,7 +1088,7 @@ BEGIN
 
     SELECT wikipedia, importance
       FROM compute_importance(location.extratags, NEW.country_code,
-                              'N', location.osm_id)
+                              location.rank_search, NEW.centroid)
       INTO linked_wikipedia,linked_importance;
 
     -- Use the maximum importance if one could be computed from the linked object.
index 7ef74349faefbd652002fee6fe4befcb0673e59e..d576485e33839ea4b7ccbe163bab1bf530b4c9bc 100644 (file)
@@ -276,7 +276,7 @@ CREATE SEQUENCE file start 1;
 
 -- null table so it won't error
 -- deliberately no drop - importing the table is expensive and static, if it is already there better to avoid removing it
-CREATE TABLE wikipedia_article (
+CREATE TABLE IF NOT EXISTS wikipedia_article (
     language text NOT NULL,
     title text NOT NULL,
     langcount integer,
@@ -290,15 +290,12 @@ CREATE TABLE wikipedia_article (
     wd_page_title text,
     instance_of text
 );
-ALTER TABLE ONLY wikipedia_article ADD CONSTRAINT wikipedia_article_pkey PRIMARY KEY (language, title);
-CREATE INDEX idx_wikipedia_article_osm_id ON wikipedia_article USING btree (osm_type, osm_id);
 
-CREATE TABLE wikipedia_redirect (
+CREATE TABLE IF NOT EXISTS wikipedia_redirect (
     language text,
     from_title text,
     to_title text
 );
-ALTER TABLE ONLY wikipedia_redirect ADD CONSTRAINT wikipedia_redirect_pkey PRIMARY KEY (language, from_title);
 
 -- osm2pgsql does not create indexes on the middle tables for Nominatim
 -- Add one for lookup of associated street relations.
index 4457db5fcb457d800fd40265f8a009a7704f9ae3..2f8273d602890be4971d59ac5495f7a54858cd48 100644 (file)
@@ -115,6 +115,7 @@ class NominatimArgs:
     address_levels: bool
     functions: bool
     wiki_data: bool
+    secondary_importance: bool
     importance: bool
     website: bool
     diffs: bool
index dce28d983a1365656c43b3e57bc79eb20c8befc8..ea605ea09e5d970996f676a51a6ebd6f44cd989b 100644 (file)
@@ -63,6 +63,8 @@ class UpdateRefresh:
                            help='Update the PL/pgSQL functions in the database')
         group.add_argument('--wiki-data', action='store_true',
                            help='Update Wikipedia/data importance numbers')
+        group.add_argument('--secondary-importance', action='store_true',
+                           help='Update secondary importance raster data')
         group.add_argument('--importance', action='store_true',
                            help='Recompute place importances (expensive!)')
         group.add_argument('--website', action='store_true',
@@ -83,7 +85,7 @@ class UpdateRefresh:
                            help='Enable debug warning statements in functions')
 
 
-    def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches
+    def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, too-many-statements
         from ..tools import refresh, postcodes
         from ..indexer.indexer import Indexer
 
@@ -115,6 +117,20 @@ class UpdateRefresh:
             with connect(args.config.get_libpq_dsn()) as conn:
                 refresh.load_address_levels_from_config(conn, args.config)
 
+        # Attention: must come BEFORE functions
+        if args.secondary_importance:
+            with connect(args.config.get_libpq_dsn()) as conn:
+                # If the table did not exist before, then the importance code
+                # needs to be enabled.
+                if not conn.table_exists('secondary_importance'):
+                    args.functions = True
+
+            LOG.warning('Import secondary importance raster data from %s', args.project_dir)
+            if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
+                                                args.project_dir) > 0:
+                LOG.fatal('FATAL: Cannot update secondary importance raster data')
+                return 1
+
         if args.functions:
             LOG.warning('Create functions')
             with connect(args.config.get_libpq_dsn()) as conn:
index 297244331b5f6e5065030516007a35f38a80965c..344167bb595577256ef691d0e291beafb0c8f252 100644 (file)
@@ -59,7 +59,7 @@ class SetupAll:
                            help="Do not keep tables that are only needed for "
                                 "updating the database later")
         group2.add_argument('--offline', action='store_true',
-                           help="Do not attempt to load any additional data from the internet")
+                            help="Do not attempt to load any additional data from the internet")
         group3 = parser.add_argument_group('Expert options')
         group3.add_argument('--ignore-errors', action='store_true',
                            help='Continue import even when errors in SQL are present')
@@ -96,14 +96,21 @@ class SetupAll:
                                             drop=args.no_updates,
                                             ignore_errors=args.ignore_errors)
 
-            self._setup_tables(args.config, args.reverse_only)
-
             LOG.warning('Importing wikipedia importance data')
             data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
             if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
                                                  data_path) > 0:
                 LOG.error('Wikipedia importance dump file not found. '
-                          'Will be using default importances.')
+                          'Calculating importance values of locations will not '
+                          'use Wikipedia importance data.')
+
+            LOG.warning('Importing secondary importance raster data')
+            if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
+                                                   args.project_dir) != 0:
+                LOG.error('Secondary importance file not imported. '
+                          'Falling back to default ranking.')
+
+            self._setup_tables(args.config, args.reverse_only)
 
         if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Initialise tables')
index f6ebe90d0983588ccc101988f5085d6692f405a0..cb620d41fb8f31126fe69a622bf14130e38494d1 100644 (file)
@@ -75,6 +75,11 @@ def setup_database_skeleton(dsn: str, rouser: Optional[str] = None) -> None:
         with conn.cursor() as cur:
             cur.execute('CREATE EXTENSION IF NOT EXISTS hstore')
             cur.execute('CREATE EXTENSION IF NOT EXISTS postgis')
+
+            postgis_version = conn.postgis_version_tuple()
+            if postgis_version[0] >= 3:
+                cur.execute('CREATE EXTENSION IF NOT EXISTS postgis_raster')
+
         conn.commit()
 
         _require_version('PostGIS',
index 8c1e9d9bbb24882601e4d0d8bed6c3c54a41f498..c50493cc847f529377653e95d521f51bcdd55321 100644 (file)
@@ -15,7 +15,7 @@ from pathlib import Path
 from psycopg2 import sql as pysql
 
 from nominatim.config import Configuration
-from nominatim.db.connection import Connection
+from nominatim.db.connection import Connection, connect
 from nominatim.db.utils import execute_file
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.version import version_str
@@ -146,6 +146,25 @@ def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = F
 
     return 0
 
+def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
+    """ Replaces the secondary importance raster data table with new data.
+
+        Returns 0 if all was well and 1 if the raster SQL file could not
+        be found. Throws an exception if there was an error reading the file.
+    """
+    datafile = data_path / 'secondary_importance.sql.gz'
+    if not datafile.exists():
+        return 1
+
+    with connect(dsn) as conn:
+        postgis_version = conn.postgis_version_tuple()
+        if postgis_version[0] < 3:
+            LOG.error('PostGIS version is too old for using OSM raster data.')
+            return 2
+
+    execute_file(dsn, datafile, ignore_errors=ignore_errors)
+
+    return 0
 
 def recompute_importance(conn: Connection) -> None:
     """ Recompute wikipedia links and importance for all entries in placex.
@@ -157,7 +176,7 @@ def recompute_importance(conn: Connection) -> None:
         cur.execute("""
             UPDATE placex SET (wikipedia, importance) =
                (SELECT wikipedia, importance
-                FROM compute_importance(extratags, country_code, osm_type, osm_id))
+                FROM compute_importance(extratags, country_code, osm_type, osm_id, centroid))
             """)
         cur.execute("""
             UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
index 300948a9a0765f2c40adea52c5d9dbd6a3587e90..af83bd33b723ac4b07054ca019e77260dbd692c8 100644 (file)
@@ -68,14 +68,15 @@ Feature: Search queries
           | 0 |
         Then there are duplicates
 
+    @fail-legacy
     Scenario: Search with bounded viewbox in right area
-        When sending json search query "bar" with address
+        When sending json search query "post" with address
           | bounded | viewbox |
           | 1       |  9,47,10,48 |
         Then result addresses contain
           | ID | town |
           | 0  | Vaduz |
-        When sending json search query "bar" with address
+        When sending json search query "post" with address
           | bounded | viewbox |
           | 1       |  9.49712,47.17122,9.52605,47.16242 |
         Then result addresses contain
@@ -118,18 +119,18 @@ Feature: Search queries
         Then result has centroid in 9.49712,47.16242,9.52605,47.17122
 
     Scenario: Prefer results within viewbox
-        When sending json search query "Gässle" with address
-          | accept-language |
-          | en |
-        Then result addresses contain
-          | ID | town |
-          | 0  | Balzers |
         When sending json search query "Gässle" with address
           | accept-language | viewbox |
           | en              | 9.52413,47.10759,9.53140,47.10539 |
         Then result addresses contain
           | ID | village |
           | 0  | Triesen |
+        When sending json search query "Gässle" with address
+          | accept-language | viewbox |
+          | en              | 9.45949,47.08421,9.54094,47.05466 |
+        Then result addresses contain
+          | ID | town |
+          | 0  | Balzers |
 
     Scenario: viewboxes cannot be points
         When sending json search query "foo"
index e72347885371efe0caa269e2764e7801eee21d52..1feafd751891a1355e51db0804ba38e03b7fee26 100644 (file)
@@ -201,19 +201,21 @@ class NominatimEnvironment:
             self.api_db_done = True
 
             if not self._reuse_or_drop_db(self.api_test_db):
-                testdata = Path('__file__') / '..' / '..' / 'testdb'
-                self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
+                testdata = (Path(__file__) / '..' / '..' / '..' / 'testdb').resolve()
+                self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata)
+                simp_file = Path(self.website_dir.name) / 'secondary_importance.sql.gz'
+                simp_file.symlink_to(testdata / 'secondary_importance.sql.gz')
 
                 try:
                     self.run_nominatim('import', '--osm-file', str(self.api_test_file))
-                    self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
+                    self.run_nominatim('add-data', '--tiger-data', str(testdata / 'tiger'))
                     self.run_nominatim('freeze')
 
                     if self.tokenizer == 'legacy':
-                        phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
+                        phrase_file = str(testdata / 'specialphrases_testdb.sql')
                         run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
                     else:
-                        csv_path = str((testdata / 'full_en_phrases_test.csv').resolve())
+                        csv_path = str(testdata / 'full_en_phrases_test.csv')
                         self.run_nominatim('special-phrases', '--import-from-csv', csv_path)
                 except:
                     self.db_drop_database(self.api_test_db)
index 737c4e5c369b0eaea017ae19b468033fa639b03b..d098e27e24cdb73125aef3d82ba29cd6cbef98c6 100644 (file)
@@ -40,6 +40,7 @@ class TestCliImportWithDb:
             mock_func_factory(nominatim.data.country_info, 'setup_country_tables'),
             mock_func_factory(nominatim.tools.database_import, 'import_osm_data'),
             mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'),
+            mock_func_factory(nominatim.tools.refresh, 'import_secondary_importance'),
             mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
             mock_func_factory(nominatim.tools.database_import, 'load_data'),
             mock_func_factory(nominatim.tools.database_import, 'create_tables'),
index 7f44765bcade9ce08cb4c0f41da6f95f19e822c5..f3f93f0ff3409553c73470f7a612a1b10cd663d1 100644 (file)
@@ -71,6 +71,18 @@ class TestRefresh:
 
         assert self.call_nominatim('refresh', '--wiki-data') == 1
 
+    def test_refresh_secondary_importance_file_not_found(self):
+        assert self.call_nominatim('refresh', '--secondary-importance') == 1
+
+
+    def test_refresh_secondary_importance_new_table(self, mock_func_factory):
+        mocks = [mock_func_factory(nominatim.tools.refresh, 'import_secondary_importance'),
+                 mock_func_factory(nominatim.tools.refresh, 'create_functions')]
+
+        assert self.call_nominatim('refresh', '--secondary-importance') == 0
+        assert mocks[0].called == 1
+        assert mocks[1].called == 1
+
 
     def test_refresh_importance_computed_after_wiki_import(self, monkeypatch):
         calls = []
index ac52aa36091643923d1c6791038b78ddc8f8c8d9..c6be4fe72774068e9476a79299feba0dc7e30c03 100644 (file)
@@ -17,6 +17,21 @@ def test_refresh_import_wikipedia_not_existing(dsn):
     assert refresh.import_wikipedia_articles(dsn, Path('.')) == 1
 
 
+def test_refresh_import_secondary_importance_non_existing(dsn):
+    assert refresh.import_secondary_importance(dsn, Path('.')) == 1
+
+def test_refresh_import_secondary_importance_testdb(dsn, src_dir, temp_db_conn, temp_db_cursor):
+    temp_db_cursor.execute('CREATE EXTENSION postgis')
+
+    if temp_db_conn.postgis_version_tuple()[0] < 3:
+        assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') > 0
+    else:
+        temp_db_cursor.execute('CREATE EXTENSION postgis_raster')
+        assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') == 0
+
+        assert temp_db_conn.table_exists('secondary_importance')
+
+
 @pytest.mark.parametrize("replace", (True, False))
 def test_refresh_import_wikipedia(dsn, src_dir, table_factory, temp_db_cursor, replace):
     if replace:
@@ -34,6 +49,7 @@ def test_recompute_importance(placex_table, table_factory, temp_db_conn, temp_db
     temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE,
                                               country_code varchar(2),
                                               osm_type varchar(1), osm_id BIGINT,
+                                              centroid GEOMETRY,
                                               OUT importance FLOAT,
                                               OUT wikipedia TEXT)
                                AS $$ SELECT 0.1::float, 'foo'::text $$ LANGUAGE SQL""")
diff --git a/test/testdb/secondary_importance.sql.gz b/test/testdb/secondary_importance.sql.gz
new file mode 100644 (file)
index 0000000..e9c115d
Binary files /dev/null and b/test/testdb/secondary_importance.sql.gz differ