From: Sarah Hoffmann Date: Tue, 4 Aug 2020 12:31:19 +0000 (+0200) Subject: Merge pull request #1865 from mtmail/how-to-import-test-db X-Git-Tag: v3.6.0~98 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/f29dc7d7ac61ad4408655a8d195d8424de13ebc9?hp=01b009ff24494408b8b568bcc0f6c4ad4156b9ce Merge pull request #1865 from mtmail/how-to-import-test-db test/README.md - more instructions how to import test db --- diff --git a/VAGRANT.md b/VAGRANT.md index 4c8eb724..b10a5ac7 100644 --- a/VAGRANT.md +++ b/VAGRANT.md @@ -160,9 +160,9 @@ You can configure/download other Vagrant boxes from [https://app.vagrantup.com/b Let's say you have a Postgres database named `nominatim_it` on server `your-server.com` and port `5432`. The Postgres username is `postgres`. You can edit `settings/local.php` and point Nominatim to it. - pgsql://postgres@your-server.com:5432/nominatim_it + pgsql:host=your-server.com;port=5432;user=postgres;dbname=nominatim_it -No data import necessary or restarting necessary. +No data import or restarting necessary. If the Postgres installation is behind a firewall, you can try diff --git a/nominatim/indexer/__init__.py b/nominatim/indexer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nominatim/indexer/progress.py b/nominatim/indexer/progress.py new file mode 100644 index 00000000..456d3eae --- /dev/null +++ b/nominatim/indexer/progress.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. +# Copyright (C) 2020 Sarah Hoffmann + +import logging +from datetime import datetime + +log = logging.getLogger() + +class ProgressLogger(object): + """ Tracks and prints progress for the indexing process. + `name` is the name of the indexing step being tracked. + `total` sets up the total number of items that need processing. + `log_interval` denotes the interval in seconds at which progres + should be reported. + """ + + def __init__(self, name, total, log_interval=1): + self.name = name + self.total_places = total + self.done_places = 0 + self.rank_start_time = datetime.now() + self.next_info = 100 if log.isEnabledFor(logging.INFO) else total + 1 + + def add(self, num=1): + """ Mark `num` places as processed. Print a log message if the + logging is at least info and the log interval has past. + """ + self.done_places += num + + if self.done_places >= self.next_info: + now = datetime.now() + done_time = (now - self.rank_start_time).total_seconds() + places_per_sec = self.done_places / done_time + eta = (self.total_places - self.done_places)/places_per_sec + + log.info("Done {} in {} @ {:.3f} per second - {} ETA (seconds): {:.2f}" + .format(self.done_places, int(done_time), + places_per_sec, self.name, eta)) + + self.next_info += int(places_per_sec) + + def done(self): + """ Print final staticstics about the progress. + """ + rank_end_time = datetime.now() + diff_seconds = (rank_end_time-self.rank_start_time).total_seconds() + + log.warning("Done {}/{} in {} @ {:.3f} per second - FINISHED {}\n".format( + self.done_places, self.total_places, int(diff_seconds), + self.done_places/diff_seconds, self.name)) diff --git a/nominatim/nominatim.py b/nominatim/nominatim.py index b29bf343..e8600ca8 100755 --- a/nominatim/nominatim.py +++ b/nominatim/nominatim.py @@ -32,6 +32,8 @@ import psycopg2 from psycopg2.extras import wait_select import select +from indexer.progress import ProgressLogger + log = logging.getLogger() def make_connection(options, asynchronous=False): @@ -55,24 +57,19 @@ class RankRunner(object): def name(self): return "rank {}".format(self.rank) - def sql_index_sectors(self): - return """SELECT geometry_sector, count(*) FROM placex + def sql_count_objects(self): + return """SELECT count(*) FROM placex WHERE rank_search = {} and indexed_status > 0 - GROUP BY geometry_sector - ORDER BY geometry_sector""".format(self.rank) + """.format(self.rank) - def sql_nosector_places(self): + def sql_get_objects(self): return """SELECT place_id FROM placex WHERE indexed_status > 0 and rank_search = {} ORDER BY geometry_sector""".format(self.rank) - def sql_sector_places(self): - return """SELECT place_id FROM placex - WHERE indexed_status > 0 and rank_search = {} - and geometry_sector = %s""".format(self.rank) - - def sql_index_place(self): - return "UPDATE placex SET indexed_status = 0 WHERE place_id = %s" + def sql_index_place(self, ids): + return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\ + .format(','.join((str(i) for i in ids))) class InterpolationRunner(object): @@ -83,25 +80,19 @@ class InterpolationRunner(object): def name(self): return "interpolation lines (location_property_osmline)" - def sql_index_sectors(self): - return """SELECT geometry_sector, count(*) FROM location_property_osmline - WHERE indexed_status > 0 - GROUP BY geometry_sector - ORDER BY geometry_sector""" + def sql_count_objects(self): + return """SELECT count(*) FROM location_property_osmline + WHERE indexed_status > 0""" - def sql_nosector_places(self): + def sql_get_objects(self): return """SELECT place_id FROM location_property_osmline WHERE indexed_status > 0 ORDER BY geometry_sector""" - def sql_sector_places(self): - return """SELECT place_id FROM location_property_osmline - WHERE indexed_status > 0 and geometry_sector = %s - ORDER BY geometry_sector""" - - def sql_index_place(self): + def sql_index_place(self, ids): return """UPDATE location_property_osmline - SET indexed_status = 0 WHERE place_id = %s""" + SET indexed_status = 0 WHERE place_id IN ({})"""\ + .format(','.join((str(i) for i in ids))) class DBConnection(object): @@ -210,83 +201,48 @@ class Indexer(object): self.index(RankRunner(rank)) if self.maxrank == 30: - self.index(InterpolationRunner()) + self.index(InterpolationRunner(), 20) - self.index(RankRunner(self.maxrank)) + self.index(RankRunner(self.maxrank), 20) - def index(self, obj): + def index(self, obj, batch=1): """ Index a single rank or table. `obj` describes the SQL to use - for indexing. + for indexing. `batch` describes the number of objects that + should be processed with a single SQL statement """ log.warning("Starting {}".format(obj.name())) - cur = self.conn.cursor(name='main') - cur.execute(obj.sql_index_sectors()) + cur = self.conn.cursor() + cur.execute(obj.sql_count_objects()) - total_tuples = 0 - for r in cur: - total_tuples += r[1] - log.debug("Total number of rows; {}".format(total_tuples)) + total_tuples = cur.fetchone()[0] + log.debug("Total number of rows: {}".format(total_tuples)) - cur.scroll(0, mode='absolute') + cur.close() next_thread = self.find_free_thread() - done_tuples = 0 - rank_start_time = datetime.now() - - sector_sql = obj.sql_sector_places() - index_sql = obj.sql_index_place() - min_grouped_tuples = total_tuples - len(self.threads) * 1000 + progress = ProgressLogger(obj.name(), total_tuples) - next_info = 100 if log.isEnabledFor(logging.INFO) else total_tuples + 1 + cur = self.conn.cursor(name='places') + cur.execute(obj.sql_get_objects()) - for r in cur: - sector = r[0] - - # Should we do the remaining ones together? - do_all = done_tuples > min_grouped_tuples - - pcur = self.conn.cursor(name='places') - - if do_all: - pcur.execute(obj.sql_nosector_places()) - else: - pcur.execute(sector_sql, (sector, )) - - for place in pcur: - place_id = place[0] - log.debug("Processing place {}".format(place_id)) - thread = next(next_thread) - - thread.perform(index_sql, (place_id,)) - done_tuples += 1 + while True: + places = [p[0] for p in cur.fetchmany(batch)] + if len(places) == 0: + break - if done_tuples >= next_info: - now = datetime.now() - done_time = (now - rank_start_time).total_seconds() - tuples_per_sec = done_tuples / done_time - log.info("Done {} in {} @ {:.3f} per second - {} ETA (seconds): {:.2f}" - .format(done_tuples, int(done_time), - tuples_per_sec, obj.name(), - (total_tuples - done_tuples)/tuples_per_sec)) - next_info += int(tuples_per_sec) + log.debug("Processing places: {}".format(places)) + thread = next(next_thread) - pcur.close() - - if do_all: - break + thread.perform(obj.sql_index_place(places)) + progress.add(len(places)) cur.close() for t in self.threads: t.wait() - rank_end_time = datetime.now() - diff_seconds = (rank_end_time-rank_start_time).total_seconds() - - log.warning("Done {}/{} in {} @ {:.3f} per second - FINISHED {}\n".format( - done_tuples, total_tuples, int(diff_seconds), - done_tuples/diff_seconds, obj.name())) + progress.done() def find_free_thread(self): """ Generator that returns the next connection that is free for diff --git a/sql/partition-functions.src.sql b/sql/partition-functions.src.sql index 34b4d390..17db9c16 100644 --- a/sql/partition-functions.src.sql +++ b/sql/partition-functions.src.sql @@ -10,6 +10,28 @@ CREATE TYPE nearfeaturecentr AS ( centroid GEOMETRY ); + -- feature intersects geoemtry + -- for areas and linestrings they must touch at least along a line +CREATE OR REPLACE FUNCTION is_relevant_geometry(de9im TEXT, geom_type TEXT) +RETURNS BOOLEAN +AS $$ +BEGIN + IF substring(de9im from 1 for 2) != 'FF' THEN + RETURN TRUE; + END IF; + + IF geom_type = 'ST_Point' THEN + RETURN substring(de9im from 4 for 1) = '0'; + END IF; + + IF geom_type in ('ST_LineString', 'ST_MultiLineString') THEN + RETURN substring(de9im from 4 for 1) = '1'; + END IF; + + RETURN substring(de9im from 4 for 1) = '2'; +END +$$ LANGUAGE plpgsql IMMUTABLE; + create or replace function getNearFeatures(in_partition INTEGER, feature GEOMETRY, maxrank INTEGER, isin_tokens INT[]) RETURNS setof nearfeaturecentr AS $$ DECLARE r nearfeaturecentr%rowtype; @@ -20,7 +42,8 @@ BEGIN FOR r IN SELECT place_id, keywords, rank_address, rank_search, min(ST_Distance(feature, centroid)) as distance, isguess, postcode, centroid FROM location_area_large_-partition- - WHERE ST_Intersects(geometry, feature) + WHERE geometry && feature + AND is_relevant_geometry(ST_Relate(geometry, feature), ST_GeometryType(feature)) AND rank_search < maxrank AND rank_address < maxrank GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid ORDER BY rank_address, isin_tokens && keywords desc, isguess asc, diff --git a/test/bdd/api/search/queries.feature b/test/bdd/api/search/queries.feature index 57ae2c20..b521c171 100644 --- a/test/bdd/api/search/queries.feature +++ b/test/bdd/api/search/queries.feature @@ -23,7 +23,6 @@ Feature: Search queries | type | value | | house_number | 86 | | road | Schellingstraße | - | neighbourhood | Auenviertel | | suburb | Eilbek | | postcode | 22089 | | city | Hamburg | @@ -38,7 +37,6 @@ Feature: Search queries | type | value | | house_number | 73 | | road | Schellingstraße | - | neighbourhood | Auenviertel | | suburb | Eilbek | | postcode | 22089 | | city | Hamburg | @@ -183,4 +181,4 @@ Feature: Search queries When sending json search query "22nd Street Southwest , Huron" Then results contain | class | type | - | highway | residential | \ No newline at end of file + | highway | residential | diff --git a/test/bdd/db/import/addressing.feature b/test/bdd/db/import/addressing.feature index f4297f30..ddb7b438 100644 --- a/test/bdd/db/import/addressing.feature +++ b/test/bdd/db/import/addressing.feature @@ -21,6 +21,99 @@ Feature: Address computation | W1 | W10 | 10 | | W1 | W11 | 10 | + + Scenario: Roads following a boundary should contain both states + Given the grid + | 1 | | | 2 | | 3 | + | | | 8 | 7 | | | + | 4 | | | 5 | | 6 | + And the named places + | osm | class | type | geometry | + | W1 | highway | road | 2, 7, 8 | + And the named places + | osm | class | type | admin | geometry | + | W10 | boundary | administrative | 5 | (1, 2, 5, 4, 1) | + | W11 | boundary | administrative | 5 | (2, 3, 6, 5, 2) | + When importing + Then place_addressline contains + | object | address | cached_rank_address | + | W1 | W10 | 10 | + | W1 | W11 | 10 | + + Scenario: Roads should not contain boundaries they touch in a end point + Given the grid + | 1 | | | 2 | | 3 | + | | 7 | | 8 | | | + | 4 | | | 5 | | 6 | + And the named places + | osm | class | type | geometry | + | W1 | highway | road | 7, 8 | + And the named places + | osm | class | type | admin | geometry | + | W10 | boundary | administrative | 5 | (1, 2, 8, 5, 4, 1) | + | W11 | boundary | administrative | 5 | (2, 3, 6, 5, 8, 2) | + When importing + Then place_addressline contains + | object | address | cached_rank_address | + | W1 | W10 | 10 | + Then place_addressline doesn't contain + | object | address | + | W1 | W11 | + + Scenario: Roads should not contain boundaries they touch in a end point + Given the grid + | 1 | | | 2 | | 3 | + | | 7 | | 8 | | | + | 4 | | 9 | 5 | | 6 | + And the named places + | osm | class | type | geometry | + | W1 | highway | road | 7, 8, 9 | + And the named places + | osm | class | type | admin | geometry | + | W10 | boundary | administrative | 5 | (1, 2, 8, 5, 4, 1) | + | W11 | boundary | administrative | 5 | (2, 3, 6, 5, 8, 2) | + When importing + Then place_addressline contains + | object | address | cached_rank_address | + | W1 | W10 | 10 | + Then place_addressline doesn't contain + | object | address | + | W1 | W11 | + + Scenario: Locality points should contain all boundaries they touch + Given the grid + | 1 | | | 2 | | 3 | + | | | | 8 | | | + | 4 | | | 5 | | 6 | + And the named places + | osm | class | type | geometry | + | N1 | place | locality | 8 | + And the named places + | osm | class | type | admin | geometry | + | W10 | boundary | administrative | 5 | (1, 2, 8, 5, 4, 1) | + | W11 | boundary | administrative | 5 | (2, 3, 6, 5, 8, 2) | + When importing + Then place_addressline contains + | object | address | cached_rank_address | + | N1 | W10 | 10 | + | N1 | W11 | 10 | + + Scenario: Areas should not contain boundaries they touch + Given the grid + | 1 | | | 2 | | 3 | + | | | | | | | + | 4 | | | 5 | | 6 | + And the named places + | osm | class | type | geometry | + | W1 | landuse | industrial | (1, 2, 5, 4, 1) | + And the named places + | osm | class | type | admin | geometry | + | W10 | boundary | administrative | 5 | (2, 3, 6, 5, 2) | + When importing + Then place_addressline doesn't contain + | object | address | + | W1 | W10 | + Scenario: buildings with only addr:postcodes do not appear in the address of a way Given the scene admin-areas And the named places