From: Sarah Hoffmann
Date: Fri, 21 Jan 2022 09:17:58 +0000 (+0100)
Subject: Merge pull request #2589 from lonvia/clean-housenumbers
X-Git-Tag: v4.1.0~93
X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/f6ec8d2e33d99fb0497c89f8e423a0f1ea3dad7c?hp=86588419fb1c3fffe131c0e8d99ecea3c77d67c5

Merge pull request #2589 from lonvia/clean-housenumbers

Add command for cleaning up word table
---

diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index 23d640d7..f326c3ca 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -309,12 +309,20 @@ jobs:
                   NOMINATIM_REPLICATION_MAX_DIFF=1 nominatim replication --once
               working-directory: /home/nominatim/nominatim-project
 
+            - name: Clean up database
+              run: nominatim refresh --postcodes --word-tokens
+              working-directory: /home/nominatim/nominatim-project
+
             - name: Run reverse-only import
               run : |
                   echo 'NOMINATIM_DATABASE_DSN="pgsql:dbname=reverse"' >> .env
                   nominatim import --osm-file ../test.pbf --reverse-only --no-updates
               working-directory: /home/nominatim/data-env-reverse
 
-            - name: Check reverse import
+            - name: Check reverse-only import
               run: nominatim admin --check-database
               working-directory: /home/nominatim/data-env-reverse
+
+            - name: Clean up database (reverse-only import)
+              run: nominatim refresh --postcodes --word-tokens
+              working-directory: /home/nominatim/data-env-reverse
diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py
index 4df283f8..b8a88b6d 100644
--- a/nominatim/clicmd/refresh.py
+++ b/nominatim/clicmd/refresh.py
@@ -39,6 +39,8 @@ class UpdateRefresh:
         group = parser.add_argument_group('Data arguments')
         group.add_argument('--postcodes', action='store_true',
                            help='Update postcode centroid table')
+        group.add_argument('--word-tokens', action='store_true',
+                           help='Clean up search terms')
         group.add_argument('--word-counts', action='store_true',
                            help='Compute frequency of full-word search terms')
         group.add_argument('--address-levels', action='store_true',
@@ -76,6 +78,11 @@ class UpdateRefresh:
                 LOG.error("The place table doesn't exist. "
                           "Postcode updates on a frozen database are not possible.")
 
+        if args.word_tokens:
+            LOG.warning('Updating word tokens')
+            tokenizer = self._get_tokenizer(args.config)
+            tokenizer.update_word_tokens()
+
        if args.word_counts:
             LOG.warning('Recompute word statistics')
             self._get_tokenizer(args.config).update_statistics()
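
The new --word-tokens switch makes the clean-up available on demand: running
"nominatim refresh --word-tokens" from the project directory prunes stale
tokens, and the CI steps above combine it with --postcodes into a single
refresh call. The switch itself only dispatches to the configured tokenizer's
update_word_tokens() hook, so what actually gets cleaned up is
tokenizer-specific, as the following changes show.
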
+ """ + with connect(self.dsn) as conn: + if not conn.table_exists('search_name'): + return + with conn.cursor(name="hnr_counter") as cur: + cur.execute("""SELECT word_id, word_token FROM word + WHERE type = 'H' + AND NOT EXISTS(SELECT * FROM search_name + WHERE ARRAY[word.word_id] && name_vector) + AND (char_length(word_token) > 6 + OR word_token not similar to '\\d+') + """) + candidates = {token: wid for wid, token in cur} + with conn.cursor(name="hnr_counter") as cur: + cur.execute("""SELECT housenumber FROM placex + WHERE housenumber is not null + AND (char_length(housenumber) > 6 + OR housenumber not similar to '\\d+') + """) + for row in cur: + for hnr in row[0].split(';'): + candidates.pop(hnr, None) + LOG.info("There are %s outdated housenumbers.", len(candidates)) + if candidates: + with conn.cursor() as cur: + cur.execute("""DELETE FROM word WHERE word_id = any(%s)""", + (list(candidates.values()), )) + conn.commit() + + + + def update_word_tokens(self): + """ Remove unused tokens. + """ + LOG.warning("Cleaning up housenumber tokens.") + self._cleanup_housenumbers() + LOG.warning("Tokenizer house-keeping done.") + + def name_analyzer(self): """ Create a new analyzer for tokenizing names and queries using this tokinzer. Analyzers are context managers and should diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 551b0536..7ce6b242 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -211,6 +211,13 @@ class LegacyTokenizer(AbstractTokenizer): cur.drop_table("word_frequencies") conn.commit() + + def update_word_tokens(self): + """ No house-keeping implemented for the legacy tokenizer. + """ + LOG.info("No tokenizer clean-up available.") + + def name_analyzer(self): """ Create a new analyzer for tokenizing names and queries using this tokinzer. 
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index 551b0536..7ce6b242 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -211,6 +211,13 @@ class LegacyTokenizer(AbstractTokenizer):
             cur.drop_table("word_frequencies")
             conn.commit()
 
+
+    def update_word_tokens(self):
+        """ No house-keeping implemented for the legacy tokenizer.
+        """
+        LOG.info("No tokenizer clean-up available.")
+
+
     def name_analyzer(self):
         """ Create a new analyzer for tokenizing names and queries
             using this tokenizer. Analyzers are context managers and should
diff --git a/test/python/cli/conftest.py b/test/python/cli/conftest.py
index ea45f2a1..420740cf 100644
--- a/test/python/cli/conftest.py
+++ b/test/python/cli/conftest.py
@@ -30,6 +30,7 @@ class DummyTokenizer:
         self.update_sql_functions_called = False
         self.finalize_import_called = False
         self.update_statistics_called = False
+        self.update_word_tokens_called = False
 
     def update_sql_functions(self, *args):
         self.update_sql_functions_called = True
@@ -40,6 +41,9 @@ class DummyTokenizer:
     def update_statistics(self):
         self.update_statistics_called = True
 
+    def update_word_tokens(self):
+        self.update_word_tokens_called = True
+
 
 @pytest.fixture
 def cli_call(src_dir):
diff --git a/test/python/cli/test_cmd_refresh.py b/test/python/cli/test_cmd_refresh.py
index e6dce8b3..b6281c7a 100644
--- a/test/python/cli/test_cmd_refresh.py
+++ b/test/python/cli/test_cmd_refresh.py
@@ -39,6 +39,11 @@ class TestRefresh:
 
         assert self.tokenizer_mock.update_statistics_called
 
+    def test_refresh_word_tokens(self):
+        assert self.call_nominatim('refresh', '--word-tokens') == 0
+        assert self.tokenizer_mock.update_word_tokens_called
+
+
     def test_refresh_postcodes(self, mock_func_factory, place_table):
         func_mock = mock_func_factory(nominatim.tools.postcodes, 'update_postcodes')
         idx_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_postcodes')
diff --git a/test/python/mock_icu_word_table.py b/test/python/mock_icu_word_table.py
index f5d89e4f..a7363958 100644
--- a/test/python/mock_icu_word_table.py
+++ b/test/python/mock_icu_word_table.py
@@ -58,6 +58,14 @@ class MockIcuWordTable:
         self.conn.commit()
 
 
+    def add_housenumber(self, word_id, word_token):
+        with self.conn.cursor() as cur:
+            cur.execute("""INSERT INTO word (word_id, word_token, type)
+                           VALUES (%s, %s, 'H')
+                        """, (word_id, word_token))
+        self.conn.commit()
+
+
     def count(self):
         with self.conn.cursor() as cur:
             return cur.scalar("SELECT count(*) FROM word")
@@ -68,6 +76,11 @@
             return cur.scalar("SELECT count(*) FROM word WHERE type = 'S'")
 
 
+    def count_housenumbers(self):
+        with self.conn.cursor() as cur:
+            return cur.scalar("SELECT count(*) FROM word WHERE type = 'H'")
+
+
     def get_special(self):
         with self.conn.cursor() as cur:
             cur.execute("SELECT word_token, info, word FROM word WHERE type = 'S'")
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index a3839365..372df9d2 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -9,6 +9,7 @@
 Tests for ICU tokenizer.
""" import shutil import yaml +import itertools import pytest @@ -554,3 +555,69 @@ class TestPlaceAddress: assert 'addr' not in info + +class TestUpdateWordTokens: + + @pytest.fixture(autouse=True) + def setup(self, tokenizer_factory, table_factory, placex_table, word_table): + table_factory('search_name', 'place_id BIGINT, name_vector INT[]') + self.tok = tokenizer_factory() + + + @pytest.fixture + def search_entry(self, temp_db_cursor): + place_id = itertools.count(1000) + + def _insert(*args): + temp_db_cursor.execute("INSERT INTO search_name VALUES (%s, %s)", + (next(place_id), list(args))) + + return _insert + + + @pytest.mark.parametrize('hnr', ('1a', '1234567', '34 5')) + def test_remove_unused_housenumbers(self, word_table, hnr): + word_table.add_housenumber(1000, hnr) + + assert word_table.count_housenumbers() == 1 + self.tok.update_word_tokens() + assert word_table.count_housenumbers() == 0 + + + def test_keep_unused_numeral_housenumbers(self, word_table): + word_table.add_housenumber(1000, '5432') + + assert word_table.count_housenumbers() == 1 + self.tok.update_word_tokens() + assert word_table.count_housenumbers() == 1 + + + def test_keep_housenumbers_from_search_name_table(self, word_table, search_entry): + word_table.add_housenumber(9999, '5432a') + word_table.add_housenumber(9991, '9 a') + search_entry(123, 9999, 34) + + assert word_table.count_housenumbers() == 2 + self.tok.update_word_tokens() + assert word_table.count_housenumbers() == 1 + + + def test_keep_housenumbers_from_placex_table(self, word_table, placex_table): + word_table.add_housenumber(9999, '5432a') + word_table.add_housenumber(9990, '34z') + placex_table.add(housenumber='34z') + placex_table.add(housenumber='25432a') + + assert word_table.count_housenumbers() == 2 + self.tok.update_word_tokens() + assert word_table.count_housenumbers() == 1 + + + def test_keep_housenumbers_from_placex_table_hnr_list(self, word_table, placex_table): + word_table.add_housenumber(9991, '9 b') + word_table.add_housenumber(9990, '34z') + placex_table.add(housenumber='9 a;9 b;9 c') + + assert word_table.count_housenumbers() == 2 + self.tok.update_word_tokens() + assert word_table.count_housenumbers() == 1 diff --git a/test/python/tokenizer/test_legacy.py b/test/python/tokenizer/test_legacy.py index 4addb282..0e46f1dc 100644 --- a/test/python/tokenizer/test_legacy.py +++ b/test/python/tokenizer/test_legacy.py @@ -257,6 +257,13 @@ def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_ search_name_count > 0""") > 0 +def test_update_word_tokens(tokenizer_factory): + tok = tokenizer_factory() + + # This is a noop and should just pass. + tok.update_word_tokens() + + def test_normalize(analyzer): assert analyzer.normalize('TEsT') == 'test'