From: Sarah Hoffmann Date: Tue, 4 Oct 2022 14:25:47 +0000 (+0200) Subject: Merge pull request #2835 from lonvia/secondary-importance X-Git-Tag: v4.2.0~25 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/afeafc8aa797a9051fc43a4482165c563a834a3f?hp=3381a92d92fba7adcc72b0e6397a9726664fe430 Merge pull request #2835 from lonvia/secondary-importance Secondary importance --- diff --git a/docs/admin/Import.md b/docs/admin/Import.md index 90294959..8b6d6baa 100644 --- a/docs/admin/Import.md +++ b/docs/admin/Import.md @@ -79,10 +79,10 @@ This data is available as a binary download. Put it into your project directory: The file is about 400MB and adds around 4GB to the Nominatim database. !!! tip - If you forgot to download the wikipedia rankings, you can also add - importances after the import. Download the files, then run - `nominatim refresh --wiki-data --importance`. Updating importances for - a planet can take a couple of hours. + If you forgot to download the wikipedia rankings, then you can + also add importances after the import. Download the SQL files, then + run `nominatim refresh --wiki-data --importance`. Updating + importances for a planet will take a couple of hours. ### External postcodes @@ -139,7 +139,7 @@ import. So this option is particularly interesting if you plan to transfer the database or reuse the space later. !!! warning - The datastructure for updates are also required when adding additional data + The data structure for updates are also required when adding additional data after the import, for example [TIGER housenumber data](../customize/Tiger.md). If you plan to use those, you must not use the `--no-updates` parameter. Do a normal import, add the external data and once you are done with diff --git a/docs/customize/Importance.md b/docs/customize/Importance.md new file mode 100644 index 00000000..d12bfc86 --- /dev/null +++ b/docs/customize/Importance.md @@ -0,0 +1,49 @@ +## Importance + +Search requests can yield multiple results which match equally well with +the original query. In such case Nominatim needs to order the results +according to a different criterion: importance. This is a measure for how +likely it is that a user will search for a given place. This section explains +the sources Nominatim uses for computing importance of a place and how to +customize them. + +### How importance is computed + +The main value for importance is derived from page ranking values for Wikipedia +pages for a place. For places that do not have their own +Wikipedia page, a formula is used that derives a static importance from the +places [search rank](../customize/Ranking#search-rank). + +In a second step, a secondary importance value is added which is meant to +represent how well-known the general area is where the place is located. It +functions as a tie-breaker between places with very similar primary +importance values. + +nominatim.org has preprocessed importance tables for the +[primary Wikipedia rankings](https://nominatim.org/data/wikimedia-importance.sql.gz) +and for a secondary importance based on the number of tile views on openstreetmap.org. + +### Customizing secondary importance + +The secondary importance is implemented as a simple +[Postgis raster](https://postgis.net/docs/raster.html) table, where Nominatim +looks up the value for the coordinates of the centroid of a place. You can +provide your own secondary importance raster in form of an SQL file named +`secondary_importance.sql.gz` in your project directory. + +The SQL file needs to drop and (re)create a table `secondary_importance` which +must as a minimum contain a column `rast` of type `raster`. The raster must +be in EPSG:4326 and contain 16bit unsigned ints +(`raster_constraint_pixel_types(rast) = '{16BUI}'). Any other columns in the +table will be ignored. You must furthermore create an index as follows: + +``` +CREATE INDEX ON secondary_importance USING gist(ST_ConvexHull(gist)) +``` + +The following raster2pgsql command will create a table that conforms to +the requirements: + +``` +raster2pgsql -I -C -Y -d -t 128x128 input.tiff public.secondary_importance +``` diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index e89c32d5..ab7dec30 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -30,6 +30,7 @@ nav: - 'Configuration Settings': 'customize/Settings.md' - 'Per-Country Data': 'customize/Country-Settings.md' - 'Place Ranking' : 'customize/Ranking.md' + - 'Importance' : 'customize/Importance.md' - 'Tokenizers' : 'customize/Tokenizers.md' - 'Special Phrases': 'customize/Special-Phrases.md' - 'External data: US housenumbers from TIGER': 'customize/Tiger.md' diff --git a/lib-sql/functions/importance.sql b/lib-sql/functions/importance.sql index ac3aa7f8..44e8bc8b 100644 --- a/lib-sql/functions/importance.sql +++ b/lib-sql/functions/importance.sql @@ -100,32 +100,55 @@ LANGUAGE plpgsql STABLE; CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE, country_code varchar(2), - osm_type varchar(1), osm_id BIGINT) + rank_search SMALLINT, + centroid GEOMETRY) RETURNS place_importance AS $$ DECLARE match RECORD; result place_importance; + osm_views_exists BIGINT; + views BIGINT; BEGIN - FOR match IN SELECT * FROM get_wikipedia_match(extratags, country_code) - WHERE language is not NULL + -- add importance by wikipedia article if the place has one + FOR match IN + SELECT * FROM get_wikipedia_match(extratags, country_code) + WHERE language is not NULL LOOP result.importance := match.importance; result.wikipedia := match.language || ':' || match.title; RETURN result; END LOOP; - IF extratags ? 'wikidata' THEN + -- Nothing? Then try with the wikidata tag. + IF result.importance is null AND extratags ? 'wikidata' THEN FOR match IN SELECT * FROM wikipedia_article WHERE wd_page_title = extratags->'wikidata' - ORDER BY language = 'en' DESC, langcount DESC LIMIT 1 LOOP + ORDER BY language = 'en' DESC, langcount DESC LIMIT 1 + LOOP result.importance := match.importance; result.wikipedia := match.language || ':' || match.title; RETURN result; END LOOP; END IF; - RETURN null; + -- Still nothing? Fall back to a default. + IF result.importance is null THEN + result.importance := 0.75001 - (rank_search::float / 40); + END IF; + +{% if 'secondary_importance' in db.tables %} + FOR match IN + SELECT ST_Value(rast, centroid) as importance + FROM secondary_importance + WHERE ST_Intersects(ST_ConvexHull(rast), centroid) LIMIT 1 + LOOP + -- Secondary importance as tie breaker with 0.0001 weight. + result.importance := result.importance + match.importance::float / 655350000; + END LOOP; +{% endif %} + + RETURN result; END; $$ LANGUAGE plpgsql; diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index a2276f07..367d2149 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -965,7 +965,7 @@ BEGIN NEW.importance := null; SELECT wikipedia, importance - FROM compute_importance(NEW.extratags, NEW.country_code, NEW.osm_type, NEW.osm_id) + FROM compute_importance(NEW.extratags, NEW.country_code, NEW.rank_search, NEW.centroid) INTO NEW.wikipedia,NEW.importance; {% if debug %}RAISE WARNING 'Importance computed from wikipedia: %', NEW.importance;{% endif %} @@ -1047,7 +1047,7 @@ BEGIN IF linked_place is not null THEN -- Recompute the ranks here as the ones from the linked place might -- have been shifted to accommodate surrounding boundaries. - SELECT place_id, osm_id, class, type, extratags, + SELECT place_id, osm_id, class, type, extratags, rank_search, centroid, geometry, (compute_place_rank(country_code, osm_type, class, type, admin_level, (extratags->'capital') = 'yes', null)).* @@ -1088,7 +1088,7 @@ BEGIN SELECT wikipedia, importance FROM compute_importance(location.extratags, NEW.country_code, - 'N', location.osm_id) + location.rank_search, NEW.centroid) INTO linked_wikipedia,linked_importance; -- Use the maximum importance if one could be computed from the linked object. diff --git a/lib-sql/tables.sql b/lib-sql/tables.sql index 7ef74349..d576485e 100644 --- a/lib-sql/tables.sql +++ b/lib-sql/tables.sql @@ -276,7 +276,7 @@ CREATE SEQUENCE file start 1; -- null table so it won't error -- deliberately no drop - importing the table is expensive and static, if it is already there better to avoid removing it -CREATE TABLE wikipedia_article ( +CREATE TABLE IF NOT EXISTS wikipedia_article ( language text NOT NULL, title text NOT NULL, langcount integer, @@ -290,15 +290,12 @@ CREATE TABLE wikipedia_article ( wd_page_title text, instance_of text ); -ALTER TABLE ONLY wikipedia_article ADD CONSTRAINT wikipedia_article_pkey PRIMARY KEY (language, title); -CREATE INDEX idx_wikipedia_article_osm_id ON wikipedia_article USING btree (osm_type, osm_id); -CREATE TABLE wikipedia_redirect ( +CREATE TABLE IF NOT EXISTS wikipedia_redirect ( language text, from_title text, to_title text ); -ALTER TABLE ONLY wikipedia_redirect ADD CONSTRAINT wikipedia_redirect_pkey PRIMARY KEY (language, from_title); -- osm2pgsql does not create indexes on the middle tables for Nominatim -- Add one for lookup of associated street relations. diff --git a/nominatim/clicmd/args.py b/nominatim/clicmd/args.py index 4457db5f..2f8273d6 100644 --- a/nominatim/clicmd/args.py +++ b/nominatim/clicmd/args.py @@ -115,6 +115,7 @@ class NominatimArgs: address_levels: bool functions: bool wiki_data: bool + secondary_importance: bool importance: bool website: bool diffs: bool diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py index dce28d98..ea605ea0 100644 --- a/nominatim/clicmd/refresh.py +++ b/nominatim/clicmd/refresh.py @@ -63,6 +63,8 @@ class UpdateRefresh: help='Update the PL/pgSQL functions in the database') group.add_argument('--wiki-data', action='store_true', help='Update Wikipedia/data importance numbers') + group.add_argument('--secondary-importance', action='store_true', + help='Update secondary importance raster data') group.add_argument('--importance', action='store_true', help='Recompute place importances (expensive!)') group.add_argument('--website', action='store_true', @@ -83,7 +85,7 @@ class UpdateRefresh: help='Enable debug warning statements in functions') - def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches + def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, too-many-statements from ..tools import refresh, postcodes from ..indexer.indexer import Indexer @@ -115,6 +117,20 @@ class UpdateRefresh: with connect(args.config.get_libpq_dsn()) as conn: refresh.load_address_levels_from_config(conn, args.config) + # Attention: must come BEFORE functions + if args.secondary_importance: + with connect(args.config.get_libpq_dsn()) as conn: + # If the table did not exist before, then the importance code + # needs to be enabled. + if not conn.table_exists('secondary_importance'): + args.functions = True + + LOG.warning('Import secondary importance raster data from %s', args.project_dir) + if refresh.import_secondary_importance(args.config.get_libpq_dsn(), + args.project_dir) > 0: + LOG.fatal('FATAL: Cannot update sendary importance raster data') + return 1 + if args.functions: LOG.warning('Create functions') with connect(args.config.get_libpq_dsn()) as conn: diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py index 29724433..344167bb 100644 --- a/nominatim/clicmd/setup.py +++ b/nominatim/clicmd/setup.py @@ -59,7 +59,7 @@ class SetupAll: help="Do not keep tables that are only needed for " "updating the database later") group2.add_argument('--offline', action='store_true', - help="Do not attempt to load any additional data from the internet") + help="Do not attempt to load any additional data from the internet") group3 = parser.add_argument_group('Expert options') group3.add_argument('--ignore-errors', action='store_true', help='Continue import even when errors in SQL are present') @@ -96,14 +96,21 @@ class SetupAll: drop=args.no_updates, ignore_errors=args.ignore_errors) - self._setup_tables(args.config, args.reverse_only) - LOG.warning('Importing wikipedia importance data') data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir) if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(), data_path) > 0: LOG.error('Wikipedia importance dump file not found. ' - 'Will be using default importances.') + 'Calculating importance values of locations will not ' + 'use Wikipedia importance data.') + + LOG.warning('Importing secondary importance raster data') + if refresh.import_secondary_importance(args.config.get_libpq_dsn(), + args.project_dir) != 0: + LOG.error('Secondary importance file not imported. ' + 'Falling back to default ranking.') + + self._setup_tables(args.config, args.reverse_only) if args.continue_at is None or args.continue_at == 'load-data': LOG.warning('Initialise tables') diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py index f6ebe90d..cb620d41 100644 --- a/nominatim/tools/database_import.py +++ b/nominatim/tools/database_import.py @@ -75,6 +75,11 @@ def setup_database_skeleton(dsn: str, rouser: Optional[str] = None) -> None: with conn.cursor() as cur: cur.execute('CREATE EXTENSION IF NOT EXISTS hstore') cur.execute('CREATE EXTENSION IF NOT EXISTS postgis') + + postgis_version = conn.postgis_version_tuple() + if postgis_version[0] >= 3: + cur.execute('CREATE EXTENSION IF NOT EXISTS postgis_raster') + conn.commit() _require_version('PostGIS', diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py index 8c1e9d9b..c50493cc 100644 --- a/nominatim/tools/refresh.py +++ b/nominatim/tools/refresh.py @@ -15,7 +15,7 @@ from pathlib import Path from psycopg2 import sql as pysql from nominatim.config import Configuration -from nominatim.db.connection import Connection +from nominatim.db.connection import Connection, connect from nominatim.db.utils import execute_file from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.version import version_str @@ -146,6 +146,25 @@ def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = F return 0 +def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int: + """ Replaces the secondary importance raster data table with new data. + + Returns 0 if all was well and 1 if the raster SQL file could not + be found. Throws an exception if there was an error reading the file. + """ + datafile = data_path / 'secondary_importance.sql.gz' + if not datafile.exists(): + return 1 + + with connect(dsn) as conn: + postgis_version = conn.postgis_version_tuple() + if postgis_version[0] < 3: + LOG.error('PostGIS version is too old for using OSM raster data.') + return 2 + + execute_file(dsn, datafile, ignore_errors=ignore_errors) + + return 0 def recompute_importance(conn: Connection) -> None: """ Recompute wikipedia links and importance for all entries in placex. @@ -157,7 +176,7 @@ def recompute_importance(conn: Connection) -> None: cur.execute(""" UPDATE placex SET (wikipedia, importance) = (SELECT wikipedia, importance - FROM compute_importance(extratags, country_code, osm_type, osm_id)) + FROM compute_importance(extratags, country_code, osm_type, osm_id, centroid)) """) cur.execute(""" UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance diff --git a/test/bdd/api/search/params.feature b/test/bdd/api/search/params.feature index 300948a9..af83bd33 100644 --- a/test/bdd/api/search/params.feature +++ b/test/bdd/api/search/params.feature @@ -68,14 +68,15 @@ Feature: Search queries | 0 | Then there are duplicates + @fail-legacy Scenario: Search with bounded viewbox in right area - When sending json search query "bar" with address + When sending json search query "post" with address | bounded | viewbox | | 1 | 9,47,10,48 | Then result addresses contain | ID | town | | 0 | Vaduz | - When sending json search query "bar" with address + When sending json search query "post" with address | bounded | viewbox | | 1 | 9.49712,47.17122,9.52605,47.16242 | Then result addresses contain @@ -118,18 +119,18 @@ Feature: Search queries Then result has centroid in 9.49712,47.16242,9.52605,47.17122 Scenario: Prefer results within viewbox - When sending json search query "Gässle" with address - | accept-language | - | en | - Then result addresses contain - | ID | town | - | 0 | Balzers | When sending json search query "Gässle" with address | accept-language | viewbox | | en | 9.52413,47.10759,9.53140,47.10539 | Then result addresses contain | ID | village | | 0 | Triesen | + When sending json search query "Gässle" with address + | accept-language | viewbox | + | en | 9.45949,47.08421,9.54094,47.05466 | + Then result addresses contain + | ID | town | + | 0 | Balzers | Scenario: viewboxes cannot be points When sending json search query "foo" diff --git a/test/bdd/steps/nominatim_environment.py b/test/bdd/steps/nominatim_environment.py index e7234788..1feafd75 100644 --- a/test/bdd/steps/nominatim_environment.py +++ b/test/bdd/steps/nominatim_environment.py @@ -201,19 +201,21 @@ class NominatimEnvironment: self.api_db_done = True if not self._reuse_or_drop_db(self.api_test_db): - testdata = Path('__file__') / '..' / '..' / 'testdb' - self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve()) + testdata = (Path(__file__) / '..' / '..' / '..' / 'testdb').resolve() + self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata) + simp_file = Path(self.website_dir.name) / 'secondary_importance.sql.gz' + simp_file.symlink_to(testdata / 'secondary_importance.sql.gz') try: self.run_nominatim('import', '--osm-file', str(self.api_test_file)) - self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve())) + self.run_nominatim('add-data', '--tiger-data', str(testdata / 'tiger')) self.run_nominatim('freeze') if self.tokenizer == 'legacy': - phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve()) + phrase_file = str(testdata / 'specialphrases_testdb.sql') run_script(['psql', '-d', self.api_test_db, '-f', phrase_file]) else: - csv_path = str((testdata / 'full_en_phrases_test.csv').resolve()) + csv_path = str(testdata / 'full_en_phrases_test.csv') self.run_nominatim('special-phrases', '--import-from-csv', csv_path) except: self.db_drop_database(self.api_test_db) diff --git a/test/python/cli/test_cmd_import.py b/test/python/cli/test_cmd_import.py index 737c4e5c..d098e27e 100644 --- a/test/python/cli/test_cmd_import.py +++ b/test/python/cli/test_cmd_import.py @@ -40,6 +40,7 @@ class TestCliImportWithDb: mock_func_factory(nominatim.data.country_info, 'setup_country_tables'), mock_func_factory(nominatim.tools.database_import, 'import_osm_data'), mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'), + mock_func_factory(nominatim.tools.refresh, 'import_secondary_importance'), mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'), mock_func_factory(nominatim.tools.database_import, 'load_data'), mock_func_factory(nominatim.tools.database_import, 'create_tables'), diff --git a/test/python/cli/test_cmd_refresh.py b/test/python/cli/test_cmd_refresh.py index 7f44765b..f3f93f0f 100644 --- a/test/python/cli/test_cmd_refresh.py +++ b/test/python/cli/test_cmd_refresh.py @@ -71,6 +71,18 @@ class TestRefresh: assert self.call_nominatim('refresh', '--wiki-data') == 1 + def test_refresh_secondary_importance_file_not_found(self): + assert self.call_nominatim('refresh', '--secondary-importance') == 1 + + + def test_refresh_secondary_importance_new_table(self, mock_func_factory): + mocks = [mock_func_factory(nominatim.tools.refresh, 'import_secondary_importance'), + mock_func_factory(nominatim.tools.refresh, 'create_functions')] + + assert self.call_nominatim('refresh', '--secondary-importance') == 0 + assert mocks[0].called == 1 + assert mocks[1].called == 1 + def test_refresh_importance_computed_after_wiki_import(self, monkeypatch): calls = [] diff --git a/test/python/tools/test_refresh.py b/test/python/tools/test_refresh.py index ac52aa36..c6be4fe7 100644 --- a/test/python/tools/test_refresh.py +++ b/test/python/tools/test_refresh.py @@ -17,6 +17,21 @@ def test_refresh_import_wikipedia_not_existing(dsn): assert refresh.import_wikipedia_articles(dsn, Path('.')) == 1 +def test_refresh_import_secondary_importance_non_existing(dsn): + assert refresh.import_secondary_importance(dsn, Path('.')) == 1 + +def test_refresh_import_secondary_importance_testdb(dsn, src_dir, temp_db_conn, temp_db_cursor): + temp_db_cursor.execute('CREATE EXTENSION postgis') + + if temp_db_conn.postgis_version_tuple()[0] < 3: + assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') > 0 + else: + temp_db_cursor.execute('CREATE EXTENSION postgis_raster') + assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') == 0 + + assert temp_db_conn.table_exists('secondary_importance') + + @pytest.mark.parametrize("replace", (True, False)) def test_refresh_import_wikipedia(dsn, src_dir, table_factory, temp_db_cursor, replace): if replace: @@ -34,6 +49,7 @@ def test_recompute_importance(placex_table, table_factory, temp_db_conn, temp_db temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE, country_code varchar(2), osm_type varchar(1), osm_id BIGINT, + centroid GEOMETRY, OUT importance FLOAT, OUT wikipedia TEXT) AS $$ SELECT 0.1::float, 'foo'::text $$ LANGUAGE SQL""") diff --git a/test/testdb/secondary_importance.sql.gz b/test/testdb/secondary_importance.sql.gz new file mode 100644 index 00000000..e9c115d1 Binary files /dev/null and b/test/testdb/secondary_importance.sql.gz differ