From bb5de9b955e9ff676ea4d3c73cdfa94c60854857 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 13 Mar 2024 15:10:25 +0100 Subject: [PATCH] extend word statistics to address index Word frequency in names is not sufficient to interpolate word frequency in the address because names of towns, states etc. are much more frequently used than, say street names. --- nominatim/api/search/query.py | 1 + nominatim/clicmd/refresh.py | 3 +- nominatim/clicmd/setup.py | 2 +- nominatim/tokenizer/icu_tokenizer.py | 101 ++++++++++++++++++++++----- 4 files changed, 88 insertions(+), 19 deletions(-) diff --git a/nominatim/api/search/query.py b/nominatim/api/search/query.py index 333722fe..bd91c2ec 100644 --- a/nominatim/api/search/query.py +++ b/nominatim/api/search/query.py @@ -102,6 +102,7 @@ class Token(ABC): lookup_word: str is_indexed: bool + addr_count: int = 1 @abstractmethod def get_category(self) -> Tuple[str, str]: diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py index afafe4a8..343fe48d 100644 --- a/nominatim/clicmd/refresh.py +++ b/nominatim/clicmd/refresh.py @@ -110,7 +110,8 @@ class UpdateRefresh: if args.word_counts: LOG.warning('Recompute word statistics') - self._get_tokenizer(args.config).update_statistics(args.config) + self._get_tokenizer(args.config).update_statistics(args.config, + threads=args.threads or 1) if args.address_levels: LOG.warning('Updating address levels') diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py index 2fd8b141..ccd6bd78 100644 --- a/nominatim/clicmd/setup.py +++ b/nominatim/clicmd/setup.py @@ -168,7 +168,7 @@ class SetupAll: tokenizer.finalize_import(args.config) LOG.warning('Recompute word counts') - tokenizer.update_statistics(args.config) + tokenizer.update_statistics(args.config, threads=num_threads) webdir = args.project_dir / 'website' LOG.warning('Setup website at %s', webdir) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index c1821d7e..251f4da5 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -104,7 +104,7 @@ class ICUTokenizer(AbstractTokenizer): self.init_from_project(config) - def update_statistics(self, config: Configuration) -> None: + def update_statistics(self, config: Configuration, threads: int = 2) -> None: """ Recompute frequencies for all name words. """ with connect(self.dsn) as conn: @@ -112,22 +112,89 @@ class ICUTokenizer(AbstractTokenizer): return with conn.cursor() as cur: - LOG.info('Computing word frequencies') - cur.drop_table('word_frequencies') - cur.execute("""CREATE TEMP TABLE word_frequencies AS - SELECT unnest(name_vector) as id, count(*) - FROM search_name GROUP BY id""") - cur.execute('CREATE INDEX ON word_frequencies(id)') - LOG.info('Update word table with recomputed frequencies') - cur.drop_table('tmp_word') - cur.execute("""CREATE TABLE tmp_word AS - SELECT word_id, word_token, type, word, - (CASE WHEN wf.count is null THEN info - ELSE info || jsonb_build_object('count', wf.count) - END) as info - FROM word LEFT JOIN word_frequencies wf - ON word.word_id = wf.id""") - cur.drop_table('word_frequencies') + cur.execute('ANALYSE search_name') + if threads > 1: + cur.execute('SET max_parallel_workers_per_gather TO %s', + (min(threads, 6),)) + + if conn.server_version_tuple() < (12, 0): + LOG.info('Computing word frequencies') + cur.drop_table('word_frequencies') + cur.drop_table('addressword_frequencies') + cur.execute("""CREATE TEMP TABLE word_frequencies AS + SELECT unnest(name_vector) as id, count(*) + FROM search_name GROUP BY id""") + cur.execute('CREATE INDEX ON word_frequencies(id)') + cur.execute("""CREATE TEMP TABLE addressword_frequencies AS + SELECT unnest(nameaddress_vector) as id, count(*) + FROM search_name GROUP BY id""") + cur.execute('CREATE INDEX ON addressword_frequencies(id)') + cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER, + INOUT info JSONB) + AS $$ + DECLARE rec RECORD; + BEGIN + IF info is null THEN + info = '{}'::jsonb; + END IF; + FOR rec IN SELECT count FROM word_frequencies WHERE id = wid + LOOP + info = info || jsonb_build_object('count', rec.count); + END LOOP; + FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid + LOOP + info = info || jsonb_build_object('addr_count', rec.count); + END LOOP; + IF info = '{}'::jsonb THEN + info = null; + END IF; + END; + $$ LANGUAGE plpgsql IMMUTABLE; + """) + LOG.info('Update word table with recomputed frequencies') + cur.drop_table('tmp_word') + cur.execute("""CREATE TABLE tmp_word AS + SELECT word_id, word_token, type, word, + word_freq_update(word_id, info) as info + FROM word + """) + cur.drop_table('word_frequencies') + cur.drop_table('addressword_frequencies') + else: + LOG.info('Computing word frequencies') + cur.drop_table('word_frequencies') + cur.execute(""" + CREATE TEMP TABLE word_frequencies AS + WITH word_freq AS MATERIALIZED ( + SELECT unnest(name_vector) as id, count(*) + FROM search_name GROUP BY id), + addr_freq AS MATERIALIZED ( + SELECT unnest(nameaddress_vector) as id, count(*) + FROM search_name GROUP BY id) + SELECT coalesce(a.id, w.id) as id, + (CASE WHEN w.count is null THEN '{}'::JSONB + ELSE jsonb_build_object('count', w.count) END + || + CASE WHEN a.count is null THEN '{}'::JSONB + ELSE jsonb_build_object('addr_count', a.count) END) as info + FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id; + """) + cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)') + cur.execute('ANALYSE word_frequencies') + LOG.info('Update word table with recomputed frequencies') + cur.drop_table('tmp_word') + cur.execute("""CREATE TABLE tmp_word AS + SELECT word_id, word_token, type, word, + (CASE WHEN wf.info is null THEN word.info + ELSE coalesce(word.info, '{}'::jsonb) || wf.info + END) as info + FROM word LEFT JOIN word_frequencies wf + ON word.word_id = wf.id + """) + cur.drop_table('word_frequencies') + + with conn.cursor() as cur: + cur.execute('SET max_parallel_workers_per_gather TO 0') sqlp = SQLPreprocessor(conn, config) sqlp.run_string(conn, -- 2.43.2