From 497e27bb9a36d88bfb7d1ef7ad5c6d0ec76ae8d6 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 11 Apr 2025 08:57:34 +0200 Subject: [PATCH] move partial token into a separate field in the query struct There is exactly one token to be expected and the token is usually present. --- src/nominatim_api/search/db_search_builder.py | 33 +++++++------- src/nominatim_api/search/icu_tokenizer.py | 8 +++- src/nominatim_api/search/query.py | 44 ++++++++++++------- src/nominatim_api/search/token_assignment.py | 27 ++++++++---- .../api/search/test_api_search_query.py | 9 ++-- .../api/search/test_icu_query_analyzer.py | 8 ++-- 6 files changed, 78 insertions(+), 51 deletions(-) diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py index 4dcaf14d..03563f17 100644 --- a/src/nominatim_api/search/db_search_builder.py +++ b/src/nominatim_api/search/db_search_builder.py @@ -2,7 +2,7 @@ # # This file is part of Nominatim. (https://nominatim.org) # -# Copyright (C) 2024 by the Nominatim developer community. +# Copyright (C) 2025 by the Nominatim developer community. # For a full list of authors see the git log. """ Conversion from token assignment to an abstract DB search. @@ -296,26 +296,27 @@ class SearchBuilder: while todo: neglen, pos, rank = heapq.heappop(todo) + # partial node + partial = self.query.nodes[pos].partial + if partial is not None: + if pos + 1 < trange.end: + penalty = rank.penalty + partial.penalty \ + + PENALTY_WORDCHANGE[self.query.nodes[pos + 1].btype] + heapq.heappush(todo, (neglen - 1, pos + 1, + dbf.RankedTokens(penalty, rank.tokens))) + else: + ranks.append(dbf.RankedTokens(rank.penalty + partial.penalty, + rank.tokens)) + # full words for tlist in self.query.nodes[pos].starting: - if tlist.ttype in (qmod.TOKEN_PARTIAL, qmod.TOKEN_WORD): + if tlist.ttype == qmod.TOKEN_WORD: if tlist.end < trange.end: chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype] - if tlist.ttype == qmod.TOKEN_PARTIAL: - penalty = rank.penalty + chgpenalty \ - + max(t.penalty for t in tlist.tokens) + for t in tlist.tokens: heapq.heappush(todo, (neglen - 1, tlist.end, - dbf.RankedTokens(penalty, rank.tokens))) - else: - for t in tlist.tokens: - heapq.heappush(todo, (neglen - 1, tlist.end, - rank.with_token(t, chgpenalty))) + rank.with_token(t, chgpenalty))) elif tlist.end == trange.end: - if tlist.ttype == qmod.TOKEN_PARTIAL: - ranks.append(dbf.RankedTokens(rank.penalty - + max(t.penalty for t in tlist.tokens), - rank.tokens)) - else: - ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens) + ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens) if len(ranks) >= 10: # Too many variants, bail out and only add # Worst-case Fallback: sum of penalty of partials diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 1bd0030d..85b0a9f0 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -2,7 +2,7 @@ # # This file is part of Nominatim. (https://nominatim.org) # -# Copyright (C) 2024 by the Nominatim developer community. +# Copyright (C) 2025 by the Nominatim developer community. # For a full list of authors see the git log. """ Implementation of query analysis for the ICU tokenizer. 
@@ -280,7 +280,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): for repl in node.starting: if repl.end == tlist.end and repl.ttype != qmod.TOKEN_HOUSENUMBER: repl.add_penalty(0.5 - tlist.tokens[0].penalty) - elif tlist.ttype not in (qmod.TOKEN_COUNTRY, qmod.TOKEN_PARTIAL): + elif tlist.ttype != qmod.TOKEN_COUNTRY: norm = ' '.join(n.term_normalized for n in query.nodes[i + 1:tlist.end + 1] if n.btype != qmod.BREAK_TOKEN) if not norm: @@ -293,6 +293,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]: yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info'] for i, node in enumerate(query.nodes): + if node.partial is not None: + t = cast(ICUToken, node.partial) + yield [qmod.TOKEN_PARTIAL, str(i), str(i + 1), t.token, + t.word_token, t.lookup_word, t.penalty, t.count, t.info] for tlist in node.starting: for token in tlist.tokens: t = cast(ICUToken, token) diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py index 07bb685b..8c5983c4 100644 --- a/src/nominatim_api/search/query.py +++ b/src/nominatim_api/search/query.py @@ -2,7 +2,7 @@ # # This file is part of Nominatim. (https://nominatim.org) # -# Copyright (C) 2024 by the Nominatim developer community. +# Copyright (C) 2025 by the Nominatim developer community. # For a full list of authors see the git log. """ Datastructures for a tokenized query. @@ -192,6 +192,14 @@ class QueryNode: """ starting: List[TokenList] = dataclasses.field(default_factory=list) + """ List of all full tokens starting at this node. + """ + partial: Optional[Token] = None + """ Base token going to the next node. + May be None when the query has parts for which no words are known. + Note that the query may still be parsable when there are other + types of tokens spanning over the gap. + """ def adjust_break(self, btype: BreakType, penalty: float) -> None: """ Change the break type and penalty for this node. @@ -269,33 +277,37 @@ class QueryStruct: be added to, then the token is silently dropped. """ snode = self.nodes[trange.start] - full_phrase = snode.btype in (BREAK_START, BREAK_PHRASE)\ - and self.nodes[trange.end].btype in (BREAK_PHRASE, BREAK_END) - if _phrase_compatible_with(snode.ptype, ttype, full_phrase): - tlist = snode.get_tokens(trange.end, ttype) - if tlist is None: - snode.starting.append(TokenList(trange.end, ttype, [token])) - else: - tlist.append(token) + if ttype == TOKEN_PARTIAL: + assert snode.partial is None + if _phrase_compatible_with(snode.ptype, TOKEN_PARTIAL, False): + snode.partial = token + else: + full_phrase = snode.btype in (BREAK_START, BREAK_PHRASE)\ + and self.nodes[trange.end].btype in (BREAK_PHRASE, BREAK_END) + if _phrase_compatible_with(snode.ptype, ttype, full_phrase): + tlist = snode.get_tokens(trange.end, ttype) + if tlist is None: + snode.starting.append(TokenList(trange.end, ttype, [token])) + else: + tlist.append(token) def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]: """ Get the list of tokens of a given type, spanning the given nodes. The nodes must exist. If no tokens exist, an empty list is returned. + + Cannot be used to get the partial token. """ + assert ttype != TOKEN_PARTIAL return self.nodes[trange.start].get_tokens(trange.end, ttype) or [] def get_partials_list(self, trange: TokenRange) -> List[Token]: """ Create a list of partial tokens between the given nodes. 
- The list is composed of the first token of type PARTIAL - going to the subsequent node. Such PARTIAL tokens are - assumed to exist. """ - return [next(iter(self.get_tokens(TokenRange(i, i+1), TOKEN_PARTIAL))) - for i in range(trange.start, trange.end)] + return list(filter(None, (self.nodes[i].partial for i in range(trange.start, trange.end)))) def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]: - """ Iterator over all token lists in the query. + """ Iterator over all token lists except partial tokens in the query. """ for i, node in enumerate(self.nodes): for tlist in node.starting: @@ -308,6 +320,8 @@ class QueryStruct: debugging. """ for node in self.nodes: + if node.partial is not None and node.partial.token == token: + return f"[P]{node.partial.lookup_word}" for tlist in node.starting: for t in tlist.tokens: if t.token == token: diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py index 8d25aa8f..a7a2e8d5 100644 --- a/src/nominatim_api/search/token_assignment.py +++ b/src/nominatim_api/search/token_assignment.py @@ -409,11 +409,22 @@ def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment node = query.nodes[state.end_pos] for tlist in node.starting: - newstate = state.advance(tlist.ttype, tlist.end, node.btype) - if newstate is not None: - if newstate.end_pos == query.num_token_slots(): - if newstate.recheck_sequence(): - log().var_dump('Assignment', newstate) - yield from newstate.get_assignments(query) - elif not newstate.is_final(): - todo.append(newstate) + yield from _append_state_to_todo( + query, todo, + state.advance(tlist.ttype, tlist.end, node.btype)) + + if node.partial is not None: + yield from _append_state_to_todo( + query, todo, + state.advance(qmod.TOKEN_PARTIAL, state.end_pos + 1, node.btype)) + + +def _append_state_to_todo(query: qmod.QueryStruct, todo: List[_TokenSequence], + newstate: Optional[_TokenSequence]) -> Iterator[TokenAssignment]: + if newstate is not None: + if newstate.end_pos == query.num_token_slots(): + if newstate.recheck_sequence(): + log().var_dump('Assignment', newstate) + yield from newstate.get_assignments(query) + elif not newstate.is_final(): + todo.append(newstate) diff --git a/test/python/api/search/test_api_search_query.py b/test/python/api/search/test_api_search_query.py index c9e8de93..e54da1e9 100644 --- a/test/python/api/search/test_api_search_query.py +++ b/test/python/api/search/test_api_search_query.py @@ -44,7 +44,6 @@ def test_phrase_incompatible(ptype): def test_query_node_empty(qnode): - assert not qnode.has_tokens(3, query.TOKEN_PARTIAL) assert qnode.get_tokens(3, query.TOKEN_WORD) is None @@ -57,7 +56,6 @@ def test_query_node_with_content(qnode): assert qnode.has_tokens(2, query.TOKEN_PARTIAL) assert qnode.has_tokens(2, query.TOKEN_WORD) - assert qnode.get_tokens(3, query.TOKEN_PARTIAL) is None assert qnode.get_tokens(2, query.TOKEN_COUNTRY) is None assert len(qnode.get_tokens(2, query.TOKEN_PARTIAL)) == 2 assert len(qnode.get_tokens(2, query.TOKEN_WORD)) == 1 @@ -101,7 +99,6 @@ def test_query_struct_incompatible_token(): q.add_token(query.TokenRange(0, 1), query.TOKEN_PARTIAL, mktoken(1)) q.add_token(query.TokenRange(1, 2), query.TOKEN_COUNTRY, mktoken(100)) - assert q.get_tokens(query.TokenRange(0, 1), query.TOKEN_PARTIAL) == [] assert len(q.get_tokens(query.TokenRange(1, 2), query.TOKEN_COUNTRY)) == 1 @@ -113,7 +110,7 @@ def test_query_struct_amenity_single_word(): q.add_token(query.TokenRange(0, 1), query.TOKEN_NEAR_ITEM, 
mktoken(2)) q.add_token(query.TokenRange(0, 1), query.TOKEN_QUALIFIER, mktoken(3)) - assert len(q.get_tokens(query.TokenRange(0, 1), query.TOKEN_PARTIAL)) == 1 + assert q.nodes[0].partial.token == 1 assert len(q.get_tokens(query.TokenRange(0, 1), query.TOKEN_NEAR_ITEM)) == 1 assert len(q.get_tokens(query.TokenRange(0, 1), query.TOKEN_QUALIFIER)) == 0 @@ -128,10 +125,10 @@ def test_query_struct_amenity_two_words(): q.add_token(query.TokenRange(*trange), query.TOKEN_NEAR_ITEM, mktoken(2)) q.add_token(query.TokenRange(*trange), query.TOKEN_QUALIFIER, mktoken(3)) - assert len(q.get_tokens(query.TokenRange(0, 1), query.TOKEN_PARTIAL)) == 1 + assert q.nodes[0].partial.token == 1 assert len(q.get_tokens(query.TokenRange(0, 1), query.TOKEN_NEAR_ITEM)) == 0 assert len(q.get_tokens(query.TokenRange(0, 1), query.TOKEN_QUALIFIER)) == 1 - assert len(q.get_tokens(query.TokenRange(1, 2), query.TOKEN_PARTIAL)) == 1 + assert q.nodes[1].partial.token == 1 assert len(q.get_tokens(query.TokenRange(1, 2), query.TOKEN_NEAR_ITEM)) == 0 assert len(q.get_tokens(query.TokenRange(1, 2), query.TOKEN_QUALIFIER)) == 1 diff --git a/test/python/api/search/test_icu_query_analyzer.py b/test/python/api/search/test_icu_query_analyzer.py index 72535026..39ec8fd6 100644 --- a/test/python/api/search/test_icu_query_analyzer.py +++ b/test/python/api/search/test_icu_query_analyzer.py @@ -69,8 +69,8 @@ async def test_single_phrase_with_unknown_terms(conn): assert query.source[0].text == 'foo bar' assert query.num_token_slots() == 2 - assert len(query.nodes[0].starting) == 1 - assert not query.nodes[1].starting + assert query.nodes[0].partial.token == 1 + assert query.nodes[1].partial is None @pytest.mark.asyncio @@ -103,8 +103,8 @@ async def test_splitting_in_transliteration(conn): @pytest.mark.asyncio -@pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W', 'w']), - ('3', ['H', 'W', 'w'])]) +@pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W']), + ('3', ['H', 'W'])]) async def test_penalty_postcodes_and_housenumbers(conn, term, order): ana = await tok.create_query_analyzer(conn) -- 2.39.5
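
Illustration (not part of the patch): the central change is that each QueryNode now carries its single expected partial token in a dedicated `partial` field instead of keeping it inside the per-node `starting` token lists. Below is a minimal, self-contained sketch of the new access pattern; `Token` and `QueryNode` here are hypothetical stand-ins, not the real `nominatim_api` classes, but `get_partials_list` mirrors the filter-based implementation from the diff.

import dataclasses
from typing import List, Optional

@dataclasses.dataclass
class Token:
    token: int
    penalty: float

@dataclasses.dataclass
class QueryNode:
    # full-word token lists stay in 'starting'; the single partial token
    # moves into its own field and is None when the word is unknown
    starting: List[object] = dataclasses.field(default_factory=list)
    partial: Optional[Token] = None

def get_partials_list(nodes: List[QueryNode], start: int, end: int) -> List[Token]:
    # mirrors the new implementation: nodes without a known partial are skipped
    return list(filter(None, (nodes[i].partial for i in range(start, end))))

nodes = [QueryNode(partial=Token(1, 0.0)),
         QueryNode(),                       # unknown word: no partial token
         QueryNode(partial=Token(3, 0.1))]
assert [t.token for t in get_partials_list(nodes, 0, 3)] == [1, 3]

This is also why the updated tests check `q.nodes[0].partial.token` directly rather than querying a PARTIAL token list.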
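
The db_search_builder change follows the same idea: since at most one partial token exists per node, the expansion loop reads `node.partial` and adds its penalty directly instead of taking max() over a PARTIAL token list. Continuing the stand-in types from the sketch above, the worst-case fallback mentioned in the context (sum of the penalties of the partials over a range) reduces to a plain sum; this is a simplified illustration, not the real builder code.

def partial_penalty_sum(nodes: List[QueryNode], start: int, end: int) -> float:
    # accumulate the penalty of the single partial token of every node in the range
    return sum(node.partial.penalty for node in nodes[start:end]
               if node.partial is not None)

print(partial_penalty_sum(nodes, 0, 3))  # 0.1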