git.openstreetmap.org Git - nominatim.git/commitdiff
extend word statistics to address index
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 13 Mar 2024 14:10:25 +0000 (15:10 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Mon, 18 Mar 2024 10:25:48 +0000 (11:25 +0100)
Word frequency in names is not sufficient to interpolate word
frequency in the address because names of towns, states etc. are
much more frequently used than, say, street names.

nominatim/api/search/query.py
nominatim/clicmd/refresh.py
nominatim/clicmd/setup.py
nominatim/tokenizer/icu_tokenizer.py

index 333722fe44ffa94ed3816d8832b04fbc0b552e10..bd91c2ece72eac23dbda429e88979f826888b0c1 100644 (file)
@@ -102,6 +102,7 @@ class Token(ABC):
     lookup_word: str
     is_indexed: bool
 
+    addr_count: int = 1
 
     @abstractmethod
     def get_category(self) -> Tuple[str, str]:
index afafe4a8305441bfa42cab62ec317c69fd513fc2..343fe48d204468ce495f8340db1285391096e81b 100644 (file)
@@ -110,7 +110,8 @@ class UpdateRefresh:
 
         if args.word_counts:
             LOG.warning('Recompute word statistics')
-            self._get_tokenizer(args.config).update_statistics(args.config)
+            self._get_tokenizer(args.config).update_statistics(args.config,
+                                                               threads=args.threads or 1)
 
         if args.address_levels:
             LOG.warning('Updating address levels')
index 2fd8b141a86c85be4eb22e78a649145a07ca9c06..ccd6bd788ed315a45bc8d2e7b2d872bdb64c8de5 100644 (file)
@@ -168,7 +168,7 @@ class SetupAll:
         tokenizer.finalize_import(args.config)
 
         LOG.warning('Recompute word counts')
-        tokenizer.update_statistics(args.config)
+        tokenizer.update_statistics(args.config, threads=num_threads)
 
         webdir = args.project_dir / 'website'
         LOG.warning('Setup website at %s', webdir)
index c1821d7edc7b88b2aa1f95797be2ddfce0ee0c85..251f4da5df3cbe7319a622d2b97d16415ff7f5a4 100644 (file)
@@ -104,7 +104,7 @@ class ICUTokenizer(AbstractTokenizer):
         self.init_from_project(config)
 
 
-    def update_statistics(self, config: Configuration) -> None:
+    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
@@ -112,22 +112,89 @@ class ICUTokenizer(AbstractTokenizer):
                 return
 
             with conn.cursor() as cur:
-                LOG.info('Computing word frequencies')
-                cur.drop_table('word_frequencies')
-                cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                 SELECT unnest(name_vector) as id, count(*)
-                                 FROM search_name GROUP BY id""")
-                cur.execute('CREATE INDEX ON word_frequencies(id)')
-                LOG.info('Update word table with recomputed frequencies')
-                cur.drop_table('tmp_word')
-                cur.execute("""CREATE TABLE tmp_word AS
-                                SELECT word_id, word_token, type, word,
-                                       (CASE WHEN wf.count is null THEN info
-                                          ELSE info || jsonb_build_object('count', wf.count)
-                                        END) as info
-                                FROM word LEFT JOIN word_frequencies wf
-                                  ON word.word_id = wf.id""")
-                cur.drop_table('word_frequencies')
+                cur.execute('ANALYSE search_name')
+                if threads > 1:
+                    cur.execute('SET max_parallel_workers_per_gather TO %s',
+                                (min(threads, 6),))
+
+                if conn.server_version_tuple() < (12, 0):
+                    LOG.info('Computing word frequencies')
+                    cur.drop_table('word_frequencies')
+                    cur.drop_table('addressword_frequencies')
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute('CREATE INDEX ON word_frequencies(id)')
+                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
+                                     SELECT unnest(nameaddress_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
+                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
+                                                                               INOUT info JSONB)
+                                   AS $$
+                                   DECLARE rec RECORD;
+                                   BEGIN
+                                   IF info is null THEN
+                                     info = '{}'::jsonb;
+                                   END IF;
+                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
+                                   LOOP
+                                     info = info || jsonb_build_object('count', rec.count);
+                                   END LOOP;
+                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
+                                   LOOP
+                                     info = info || jsonb_build_object('addr_count', rec.count);
+                                   END LOOP;
+                                   IF info = '{}'::jsonb THEN
+                                     info = null;
+                                   END IF;
+                                   END;
+                                   $$ LANGUAGE plpgsql IMMUTABLE;
+                                """)
+                    LOG.info('Update word table with recomputed frequencies')
+                    cur.drop_table('tmp_word')
+                    cur.execute("""CREATE TABLE tmp_word AS
+                                    SELECT word_id, word_token, type, word,
+                                           word_freq_update(word_id, info) as info
+                                    FROM word
+                                """)
+                    cur.drop_table('word_frequencies')
+                    cur.drop_table('addressword_frequencies')
+                else:
+                    LOG.info('Computing word frequencies')
+                    cur.drop_table('word_frequencies')
+                    cur.execute("""
+                      CREATE TEMP TABLE word_frequencies AS
+                      WITH word_freq AS MATERIALIZED (
+                               SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id),
+                           addr_freq AS MATERIALIZED (
+                               SELECT unnest(nameaddress_vector) as id, count(*)
+                                     FROM search_name GROUP BY id)
+                      SELECT coalesce(a.id, w.id) as id,
+                             (CASE WHEN w.count is null THEN '{}'::JSONB
+                                  ELSE jsonb_build_object('count', w.count) END
+                              ||
+                              CASE WHEN a.count is null THEN '{}'::JSONB
+                                  ELSE jsonb_build_object('addr_count', a.count) END) as info
+                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
+                      """)
+                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
+                    cur.execute('ANALYSE word_frequencies')
+                    LOG.info('Update word table with recomputed frequencies')
+                    cur.drop_table('tmp_word')
+                    cur.execute("""CREATE TABLE tmp_word AS
+                                    SELECT word_id, word_token, type, word,
+                                           (CASE WHEN wf.info is null THEN word.info
+                                            ELSE coalesce(word.info, '{}'::jsonb) || wf.info
+                                            END) as info
+                                    FROM word LEFT JOIN word_frequencies wf
+                                         ON word.word_id = wf.id
+                                """)
+                    cur.drop_table('word_frequencies')
+
+            with conn.cursor() as cur:
+                cur.execute('SET max_parallel_workers_per_gather TO 0')
 
             sqlp = SQLPreprocessor(conn, config)
             sqlp.run_string(conn,