git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
author Sarah Hoffmann <lonvia@denofr.de>
Mon, 18 Mar 2024 13:37:25 +0000 (14:37 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Mon, 18 Mar 2024 13:37:25 +0000 (14:37 +0100)
16 files changed:
docs/admin/Import.md
nominatim/api/search/db_search_builder.py
nominatim/api/search/db_search_fields.py
nominatim/api/search/icu_tokenizer.py
nominatim/api/search/legacy_tokenizer.py
nominatim/api/search/query.py
nominatim/clicmd/refresh.py
nominatim/clicmd/setup.py
nominatim/tokenizer/base.py
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/legacy_tokenizer.py
test/python/api/search/test_api_search_query.py
test/python/api/search/test_db_search_builder.py
test/python/api/search/test_token_assignment.py
test/python/cli/conftest.py
test/python/tokenizer/test_icu.py

index b31066d34799f5b3f355fe57e7561aa6aa9bac92..884dd44d5ee8042493999007f18cc7ee04d5743c 100644 (file)
@@ -228,7 +228,7 @@ to load the OSM data into the PostgreSQL database. This step is very demanding
 in terms of RAM usage. osm2pgsql and PostgreSQL are running in parallel at 
 this point. PostgreSQL blocks at least the part of RAM that has been configured
 with the `shared_buffers` parameter during
-[PostgreSQL tuning](Installation.md#postgresql-tuning)
+[PostgreSQL tuning](Installation.md#tuning-the-postgresql-database)
 and needs some memory on top of that. osm2pgsql needs at least 2GB of RAM for
 its internal data structures, potentially more when it has to process very large
 relations. In addition it needs to maintain a cache for node locations. The size
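
As a rough illustration of the memory budget described in the paragraph above (not part of this commit or the docs), the hypothetical helper below splits total RAM between PostgreSQL's shared_buffers, osm2pgsql's fixed overhead and the node cache. Only the 2 GB osm2pgsql baseline comes from the text; the 10% headroom factor and the function itself are assumptions.

    # Hypothetical sketch only: estimate an osm2pgsql node cache size (in MB)
    # from total system RAM and the configured PostgreSQL shared_buffers.
    def suggest_osm2pgsql_cache_mb(total_ram_mb: int, shared_buffers_mb: int) -> int:
        osm2pgsql_baseline_mb = 2048       # osm2pgsql internal data structures (see above)
        headroom_mb = total_ram_mb // 10   # assumed reserve for the OS and other processes
        free_mb = total_ram_mb - shared_buffers_mb - osm2pgsql_baseline_mb - headroom_mb
        return max(0, free_mb)

    # Example: a 64 GB machine with shared_buffers = 16 GB
    # suggest_osm2pgsql_cache_mb(65536, 16384) -> 40551, i.e. roughly 40 GB left
    # for the node location cache.
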
index ef7a66b8507387630c6d0aacc5bfb2b67a08b566..97e7ac0282a79b40e7015bcd1069c8edbadeb09e 100644 (file)
@@ -226,27 +226,74 @@ class SearchBuilder:
         name_fulls = self.query.get_tokens(name, TokenType.WORD)
         if name_fulls:
             fulls_count = sum(t.count for t in name_fulls)
-            # At this point drop unindexed partials from the address.
-            # This might yield wrong results, nothing we can do about that.
-            if not partials_indexed:
-                addr_tokens = [t.token for t in addr_partials if t.is_indexed]
+            if len(name_partials) == 1:
+                penalty += min(0.5, max(0, (exp_count - 50 * fulls_count) / (2000 * fulls_count)))
+            if partials_indexed:
                 penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
-            # Any of the full names applies with all of the partials from the address
-            yield penalty, fulls_count / (2**len(addr_tokens)),\
-                  dbf.lookup_by_any_name([t.token for t in name_fulls],
-                                         addr_tokens,
-                                         fulls_count > 30000 / max(1, len(addr_tokens)))
+
+            yield penalty, fulls_count / (2**len(addr_tokens)), \
+                  self.get_full_name_ranking(name_fulls, addr_partials,
+                                             fulls_count > 30000 / max(1, len(addr_tokens)))
 
         # To catch remaining results, lookup by name and address
         # We only do this if there is a reasonable number of results expected.
         exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
         if exp_count < 10000 and all(t.is_indexed for t in name_partials.values()):
-            lookup = [dbf.FieldLookup('name_vector', list(name_partials.keys()), lookups.LookupAll)]
-            if addr_tokens:
-                lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll))
             penalty += 0.35 * max(1 if name_fulls else 0.1,
                                   5 - len(name_partials) - len(addr_tokens))
-            yield penalty, exp_count, lookup
+            yield penalty, exp_count, \
+                  self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
+
+
+    def get_name_address_ranking(self, name_tokens: List[int],
+                                 addr_partials: List[Token]) -> List[dbf.FieldLookup]:
+        """ Create a ranking expression looking up by name and address.
+        """
+        lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
+
+        addr_restrict_tokens = []
+        addr_lookup_tokens = []
+        for t in addr_partials:
+            if t.is_indexed:
+                if t.addr_count > 20000:
+                    addr_restrict_tokens.append(t.token)
+                else:
+                    addr_lookup_tokens.append(t.token)
+
+        if addr_restrict_tokens:
+            lookup.append(dbf.FieldLookup('nameaddress_vector',
+                                          addr_restrict_tokens, lookups.Restrict))
+        if addr_lookup_tokens:
+            lookup.append(dbf.FieldLookup('nameaddress_vector',
+                                          addr_lookup_tokens, lookups.LookupAll))
+
+        return lookup
+
+
+    def get_full_name_ranking(self, name_fulls: List[Token], addr_partials: List[Token],
+                              use_lookup: bool) -> List[dbf.FieldLookup]:
+        """ Create a ranking expression with full name terms and
+            additional address lookup. When 'use_lookup' is true,
+            address lookups use the index as long as the address tokens
+            do not occur too often.
+        """
+        # At this point drop unindexed partials from the address.
+        # This might yield wrong results, nothing we can do about that.
+        if use_lookup:
+            addr_restrict_tokens = []
+            addr_lookup_tokens = []
+            for t in addr_partials:
+                if t.is_indexed:
+                    if t.addr_count > 20000:
+                        addr_restrict_tokens.append(t.token)
+                    else:
+                        addr_lookup_tokens.append(t.token)
+        else:
+            addr_restrict_tokens = [t.token for t in addr_partials if t.is_indexed]
+            addr_lookup_tokens = []
+
+        return dbf.lookup_by_any_name([t.token for t in name_fulls],
+                                      addr_restrict_tokens, addr_lookup_tokens)
 
 
     def get_name_ranking(self, trange: TokenRange,
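
The single-partial penalty added above is easiest to follow with concrete numbers. The sketch below evaluates the new expression once; the values are invented for illustration and are not from the commit.

    # Worked example for the penalty applied when the name is a single partial:
    exp_count = 12000    # expected matches for the lone partial token
    fulls_count = 40     # combined count of the full-word tokens
    penalty = min(0.5, max(0, (exp_count - 50 * fulls_count) / (2000 * fulls_count)))
    # (12000 - 2000) / 80000 = 0.125: a frequent partial paired with comparatively
    # rare full words gets a moderate penalty. If exp_count <= 50 * fulls_count the
    # penalty stays 0, and it is capped at 0.5 for extreme ratios.
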
index cd5717753ba722616084dc06bbfb76dda901c0fb..7f775277e6960a7dc37d76a7181b5ad4e4273084 100644 (file)
@@ -231,16 +231,17 @@ def lookup_by_names(name_tokens: List[int], addr_tokens: List[int]) -> List[Fiel
     return lookup
 
 
-def lookup_by_any_name(name_tokens: List[int], addr_tokens: List[int],
-                       use_index_for_addr: bool) -> List[FieldLookup]:
+def lookup_by_any_name(name_tokens: List[int], addr_restrict_tokens: List[int],
+                       addr_lookup_tokens: List[int]) -> List[FieldLookup]:
     """ Create a lookup list where name tokens are looked up via index
         and only one of the name tokens must be present.
         Potential address tokens are used to restrict the search further.
     """
     lookup = [FieldLookup('name_vector', name_tokens, lookups.LookupAny)]
-    if addr_tokens:
-        lookup.append(FieldLookup('nameaddress_vector', addr_tokens,
-                                  lookups.LookupAll if use_index_for_addr else lookups.Restrict))
+    if addr_restrict_tokens:
+        lookup.append(FieldLookup('nameaddress_vector', addr_restrict_tokens, lookups.Restrict))
+    if addr_lookup_tokens:
+        lookup.append(FieldLookup('nameaddress_vector', addr_lookup_tokens, lookups.LookupAll))
 
     return lookup
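
A small sketch of how the new token split feeds lookup_by_any_name(); the token ids and counts are invented, and the list comprehensions merely mirror the get_full_name_ranking() logic earlier in the diff.

    # Illustrative only: frequent address partials (addr_count > 20000) become
    # Restrict terms, rare ones become LookupAll terms.
    addr_partials = [(201, 150000), (202, 300)]   # (token, addr_count) pairs
    addr_restrict_tokens = [t for t, c in addr_partials if c > 20000]    # -> [201]
    addr_lookup_tokens = [t for t, c in addr_partials if c <= 20000]     # -> [202]
    # lookup_by_any_name([101, 102], addr_restrict_tokens, addr_lookup_tokens)
    # then returns three FieldLookups: name_vector with LookupAny, plus one
    # nameaddress_vector entry with Restrict and one with LookupAll.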
 
index 76a1a2e5d362688d5388044b511a9d3f0ae4a13c..23cfa5a166c003a1b5638f0334d10636a335d935 100644 (file)
@@ -97,6 +97,7 @@ class ICUToken(qmod.Token):
         """ Create a ICUToken from the row of the word table.
         """
         count = 1 if row.info is None else row.info.get('count', 1)
+        addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
 
         penalty = 0.0
         if row.type == 'w':
@@ -123,7 +124,8 @@ class ICUToken(qmod.Token):
 
         return ICUToken(penalty=penalty, token=row.word_id, count=count,
                         lookup_word=lookup_word, is_indexed=True,
-                        word_token=row.word_token, info=row.info)
+                        word_token=row.word_token, info=row.info,
+                        addr_count=addr_count)
 
 
 
@@ -262,7 +264,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             if len(part.token) <= 4 and part[0].isdigit()\
                and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
-                                ICUToken(0.5, 0, 1, part.token, True, part.token, None))
+                                ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))
 
 
     def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
index 86d42a543d20ce8429770d16451e61f8b7ea1e4b..bd17706e5dff7c3fc5fd5f1d37eafe234fd809f1 100644 (file)
@@ -210,6 +210,7 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
 
         return LegacyToken(penalty=penalty, token=row.word_id,
                            count=row.search_name_count or 1,
+                           addr_count=1, # not supported
                            lookup_word=lookup_word,
                            word_token=row.word_token.strip(),
                            category=(rowclass, row.type) if rowclass is not None else None,
@@ -226,7 +227,7 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
             if len(part) <= 4 and part.isdigit()\
                and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
-                                LegacyToken(penalty=0.5, token=0, count=1,
+                                LegacyToken(penalty=0.5, token=0, count=1, addr_count=1,
                                             lookup_word=part, word_token=part,
                                             category=None, country=None,
                                             operator=None, is_indexed=True))
index 333722fe44ffa94ed3816d8832b04fbc0b552e10..a0d7add1b70118e32d628b4894a893386d09d996 100644 (file)
@@ -99,6 +99,7 @@ class Token(ABC):
     penalty: float
     token: int
     count: int
+    addr_count: int
     lookup_word: str
     is_indexed: bool
 
index afafe4a8305441bfa42cab62ec317c69fd513fc2..343fe48d204468ce495f8340db1285391096e81b 100644 (file)
@@ -110,7 +110,8 @@ class UpdateRefresh:
 
         if args.word_counts:
             LOG.warning('Recompute word statistics')
-            self._get_tokenizer(args.config).update_statistics(args.config)
+            self._get_tokenizer(args.config).update_statistics(args.config,
+                                                               threads=args.threads or 1)
 
         if args.address_levels:
             LOG.warning('Updating address levels')
index 2fd8b141a86c85be4eb22e78a649145a07ca9c06..ccd6bd788ed315a45bc8d2e7b2d872bdb64c8de5 100644 (file)
@@ -168,7 +168,7 @@ class SetupAll:
         tokenizer.finalize_import(args.config)
 
         LOG.warning('Recompute word counts')
-        tokenizer.update_statistics(args.config)
+        tokenizer.update_statistics(args.config, threads=num_threads)
 
         webdir = args.project_dir / 'website'
         LOG.warning('Setup website at %s', webdir)
index 29bcc8e196cf29cf4ef110252fbad0a5f26da2b7..12c826eb21b19da1b1ad989d4cc5ede9f0c699cf 100644 (file)
@@ -201,7 +201,7 @@ class AbstractTokenizer(ABC):
 
 
     @abstractmethod
-    def update_statistics(self, config: Configuration) -> None:
+    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
         """ Recompute any tokenizer statistics necessary for efficient lookup.
             This function is meant to be called from time to time by the user
             to improve performance. However, the tokenizer must not depend on
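
A brief usage sketch of the widened signature; the calls below are illustrative rather than taken from the commit, and 'tokenizer' and 'config' stand for the objects the CLI commands already hold.

    # Both call forms are valid after this change: the default keeps existing
    # callers working, while import and refresh can pass their thread count.
    tokenizer.update_statistics(config)              # single-threaded, as before
    tokenizer.update_statistics(config, threads=4)   # parallel frequency recount
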
index c1821d7edc7b88b2aa1f95797be2ddfce0ee0c85..251f4da5df3cbe7319a622d2b97d16415ff7f5a4 100644 (file)
@@ -104,7 +104,7 @@ class ICUTokenizer(AbstractTokenizer):
         self.init_from_project(config)
 
 
-    def update_statistics(self, config: Configuration) -> None:
+    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
@@ -112,22 +112,89 @@ class ICUTokenizer(AbstractTokenizer):
                 return
 
             with conn.cursor() as cur:
-                LOG.info('Computing word frequencies')
-                cur.drop_table('word_frequencies')
-                cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                 SELECT unnest(name_vector) as id, count(*)
-                                 FROM search_name GROUP BY id""")
-                cur.execute('CREATE INDEX ON word_frequencies(id)')
-                LOG.info('Update word table with recomputed frequencies')
-                cur.drop_table('tmp_word')
-                cur.execute("""CREATE TABLE tmp_word AS
-                                SELECT word_id, word_token, type, word,
-                                       (CASE WHEN wf.count is null THEN info
-                                          ELSE info || jsonb_build_object('count', wf.count)
-                                        END) as info
-                                FROM word LEFT JOIN word_frequencies wf
-                                  ON word.word_id = wf.id""")
-                cur.drop_table('word_frequencies')
+                cur.execute('ANALYSE search_name')
+                if threads > 1:
+                    cur.execute('SET max_parallel_workers_per_gather TO %s',
+                                (min(threads, 6),))
+
+                if conn.server_version_tuple() < (12, 0):
+                    LOG.info('Computing word frequencies')
+                    cur.drop_table('word_frequencies')
+                    cur.drop_table('addressword_frequencies')
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute('CREATE INDEX ON word_frequencies(id)')
+                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
+                                     SELECT unnest(nameaddress_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
+                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
+                                                                               INOUT info JSONB)
+                                   AS $$
+                                   DECLARE rec RECORD;
+                                   BEGIN
+                                   IF info is null THEN
+                                     info = '{}'::jsonb;
+                                   END IF;
+                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
+                                   LOOP
+                                     info = info || jsonb_build_object('count', rec.count);
+                                   END LOOP;
+                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
+                                   LOOP
+                                     info = info || jsonb_build_object('addr_count', rec.count);
+                                   END LOOP;
+                                   IF info = '{}'::jsonb THEN
+                                     info = null;
+                                   END IF;
+                                   END;
+                                   $$ LANGUAGE plpgsql IMMUTABLE;
+                                """)
+                    LOG.info('Update word table with recomputed frequencies')
+                    cur.drop_table('tmp_word')
+                    cur.execute("""CREATE TABLE tmp_word AS
+                                    SELECT word_id, word_token, type, word,
+                                           word_freq_update(word_id, info) as info
+                                    FROM word
+                                """)
+                    cur.drop_table('word_frequencies')
+                    cur.drop_table('addressword_frequencies')
+                else:
+                    LOG.info('Computing word frequencies')
+                    cur.drop_table('word_frequencies')
+                    cur.execute("""
+                      CREATE TEMP TABLE word_frequencies AS
+                      WITH word_freq AS MATERIALIZED (
+                               SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id),
+                           addr_freq AS MATERIALIZED (
+                               SELECT unnest(nameaddress_vector) as id, count(*)
+                                     FROM search_name GROUP BY id)
+                      SELECT coalesce(a.id, w.id) as id,
+                             (CASE WHEN w.count is null THEN '{}'::JSONB
+                                  ELSE jsonb_build_object('count', w.count) END
+                              ||
+                              CASE WHEN a.count is null THEN '{}'::JSONB
+                                  ELSE jsonb_build_object('addr_count', a.count) END) as info
+                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
+                      """)
+                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
+                    cur.execute('ANALYSE word_frequencies')
+                    LOG.info('Update word table with recomputed frequencies')
+                    cur.drop_table('tmp_word')
+                    cur.execute("""CREATE TABLE tmp_word AS
+                                    SELECT word_id, word_token, type, word,
+                                           (CASE WHEN wf.info is null THEN word.info
+                                            ELSE coalesce(word.info, '{}'::jsonb) || wf.info
+                                            END) as info
+                                    FROM word LEFT JOIN word_frequencies wf
+                                         ON word.word_id = wf.id
+                                """)
+                    cur.drop_table('word_frequencies')
+
+            with conn.cursor() as cur:
+                cur.execute('SET max_parallel_workers_per_gather TO 0')
 
             sqlp = SQLPreprocessor(conn, config)
             sqlp.run_string(conn,
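
To tie this back to the search-side changes earlier in the diff: after the recount, a word row's info column may carry both frequencies, and ICUToken.from_db_row() reads them back with a fallback of 1. The numbers below are made up for illustration.

    # Illustrative contents of word.info after update_statistics() has run:
    info = {'count': 35421, 'addr_count': 182007}
    # ICUToken.from_db_row() does the equivalent of:
    count = 1 if info is None else info.get('count', 1)            # -> 35421
    addr_count = 1 if info is None else info.get('addr_count', 1)  # -> 182007
    # Words that never appear in search_name keep the fallback frequency of 1.
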
index f3a00839aa2f0302ba611f0f0454f7fdf818c02e..93808cc39f3407458bb2d570d2a8740128f2c168 100644 (file)
@@ -210,7 +210,7 @@ class LegacyTokenizer(AbstractTokenizer):
             self._save_config(conn, config)
 
 
-    def update_statistics(self, _: Configuration) -> None:
+    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
         """ Recompute the frequency of full words.
         """
         with connect(self.dsn) as conn:
index fe850ce902930a817981bd42c6c549fc5bd91ec3..bfdceb4165fc984451e6ca8266a15554cc0cb2b8 100644 (file)
@@ -18,7 +18,8 @@ class MyToken(query.Token):
 
 
 def mktoken(tid: int):
-    return MyToken(3.0, tid, 1, 'foo', True)
+    return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
+                   lookup_word='foo', is_indexed=True)
 
 
 @pytest.mark.parametrize('ptype,ttype', [('NONE', 'WORD'),
index d3aea90002740d7660e12a4b210bf4cb41344c60..68f71298c6b64f10a846796562bd658fdfdf7cc3 100644 (file)
@@ -31,7 +31,9 @@ def make_query(*args):
         for end, ttype, tinfo in tlist:
             for tid, word in tinfo:
                 q.add_token(TokenRange(start, end), ttype,
-                            MyToken(0.5 if ttype == TokenType.PARTIAL else 0.0, tid, 1, word, True))
+                            MyToken(penalty=0.5 if ttype == TokenType.PARTIAL else 0.0,
+                                    token=tid, count=1, addr_count=1,
+                                    lookup_word=word, is_indexed=True))
 
 
     return q
@@ -395,14 +397,14 @@ def make_counted_searches(name_part, name_full, address_part, address_full,
     q.add_node(BreakType.END, PhraseType.NONE)
 
     q.add_token(TokenRange(0, 1), TokenType.PARTIAL,
-                MyToken(0.5, 1, name_part, 'name_part', True))
+                MyToken(0.5, 1, name_part, 1, 'name_part', True))
     q.add_token(TokenRange(0, 1), TokenType.WORD,
-                MyToken(0, 101, name_full, 'name_full', True))
+                MyToken(0, 101, name_full, 1, 'name_full', True))
     for i in range(num_address_parts):
         q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL,
-                    MyToken(0.5, 2, address_part, 'address_part', True))
+                    MyToken(0.5, 2, address_part, 1, 'address_part', True))
         q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD,
-                    MyToken(0, 102, address_full, 'address_full', True))
+                    MyToken(0, 102, address_full, 1, 'address_full', True))
 
     builder = SearchBuilder(q, SearchDetails())
 
index 54e8af14cc27fb466e58a99a5d3d7ef28657e1f6..cde8495d0bb2ce557cc9d6ecd2de24721d454f3b 100644 (file)
@@ -19,7 +19,8 @@ class MyToken(Token):
 
 def make_query(*args):
     q = QueryStruct([Phrase(args[0][1], '')])
-    dummy = MyToken(3.0, 45, 1, 'foo', True)
+    dummy = MyToken(penalty=3.0, token=45, count=1, addr_count=1,
+                    lookup_word='foo', is_indexed=True)
 
     for btype, ptype, _ in args[1:]:
         q.add_node(btype, ptype)
index 1bb393fb240613d1ed85f04d36269733c61469c8..28aba597e7de38d324ebadaa1e6ef67e62b84b82 100644 (file)
@@ -32,16 +32,16 @@ class DummyTokenizer:
         self.update_statistics_called = False
         self.update_word_tokens_called = False
 
-    def update_sql_functions(self, *args):
+    def update_sql_functions(self, *args, **kwargs):
         self.update_sql_functions_called = True
 
-    def finalize_import(self, *args):
+    def finalize_import(self, *args, **kwargs):
         self.finalize_import_called = True
 
-    def update_statistics(self, *args):
+    def update_statistics(self, *args, **kwargs):
         self.update_statistics_called = True
 
-    def update_word_tokens(self, *args):
+    def update_word_tokens(self, *args, **kwargs):
         self.update_word_tokens_called = True
 
 
index aa1afe160ca9010630b7b36502d14b173a453003..9f6eae62e3467900e11829a421a3bbdef623e211 100644 (file)
@@ -227,16 +227,20 @@ def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_conf
 def test_update_statistics(word_table, table_factory, temp_db_cursor,
                            tokenizer_factory, test_config):
     word_table.add_full_word(1000, 'hello')
+    word_table.add_full_word(1001, 'bye')
     table_factory('search_name',
-                  'place_id BIGINT, name_vector INT[]',
-                  [(12, [1000])])
+                  'place_id BIGINT, name_vector INT[], nameaddress_vector INT[]',
+                  [(12, [1000], [1001])])
     tok = tokenizer_factory()
 
     tok.update_statistics(test_config)
 
     assert temp_db_cursor.scalar("""SELECT count(*) FROM word
-                                    WHERE type = 'W' and
-                                          (info->>'count')::int > 0""") > 0
+                                    WHERE type = 'W' and word_id = 1000 and
+                                          (info->>'count')::int > 0""") == 1
+    assert temp_db_cursor.scalar("""SELECT count(*) FROM word
+                                    WHERE type = 'W' and word_id = 1001 and
+                                          (info->>'addr_count')::int > 0""") == 1
 
 
 def test_normalize_postcode(analyzer):