From 497e27bb9a36d88bfb7d1ef7ad5c6d0ec76ae8d6 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 11 Apr 2025 08:57:34 +0200 Subject: [PATCH] move partial token into a separate field in the query struct There is exactly one token to be expected and the token is usually present. --- src/nominatim_api/search/db_search_builder.py | 33 +++++++------- src/nominatim_api/search/icu_tokenizer.py | 8 +++- src/nominatim_api/search/query.py | 44 ++++++++++++------- src/nominatim_api/search/token_assignment.py | 27 ++++++++---- .../api/search/test_api_search_query.py | 9 ++-- .../api/search/test_icu_query_analyzer.py | 8 ++-- 6 files changed, 78 insertions(+), 51 deletions(-) diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py index 4dcaf14d..03563f17 100644 --- a/src/nominatim_api/search/db_search_builder.py +++ b/src/nominatim_api/search/db_search_builder.py @@ -2,7 +2,7 @@ # # This file is part of Nominatim. (https://nominatim.org) # -# Copyright (C) 2024 by the Nominatim developer community. +# Copyright (C) 2025 by the Nominatim developer community. # For a full list of authors see the git log. """ Conversion from token assignment to an abstract DB search. @@ -296,26 +296,27 @@ class SearchBuilder: while todo: neglen, pos, rank = heapq.heappop(todo) + # partial node + partial = self.query.nodes[pos].partial + if partial is not None: + if pos + 1 < trange.end: + penalty = rank.penalty + partial.penalty \ + + PENALTY_WORDCHANGE[self.query.nodes[pos + 1].btype] + heapq.heappush(todo, (neglen - 1, pos + 1, + dbf.RankedTokens(penalty, rank.tokens))) + else: + ranks.append(dbf.RankedTokens(rank.penalty + partial.penalty, + rank.tokens)) + # full words for tlist in self.query.nodes[pos].starting: - if tlist.ttype in (qmod.TOKEN_PARTIAL, qmod.TOKEN_WORD): + if tlist.ttype == qmod.TOKEN_WORD: if tlist.end < trange.end: chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype] - if tlist.ttype == qmod.TOKEN_PARTIAL: - penalty = rank.penalty + chgpenalty \ - + max(t.penalty for t in tlist.tokens) + for t in tlist.tokens: heapq.heappush(todo, (neglen - 1, tlist.end, - dbf.RankedTokens(penalty, rank.tokens))) - else: - for t in tlist.tokens: - heapq.heappush(todo, (neglen - 1, tlist.end, - rank.with_token(t, chgpenalty))) + rank.with_token(t, chgpenalty))) elif tlist.end == trange.end: - if tlist.ttype == qmod.TOKEN_PARTIAL: - ranks.append(dbf.RankedTokens(rank.penalty - + max(t.penalty for t in tlist.tokens), - rank.tokens)) - else: - ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens) + ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens) if len(ranks) >= 10: # Too many variants, bail out and only add # Worst-case Fallback: sum of penalty of partials diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 1bd0030d..85b0a9f0 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -2,7 +2,7 @@ # # This file is part of Nominatim. (https://nominatim.org) # -# Copyright (C) 2024 by the Nominatim developer community. +# Copyright (C) 2025 by the Nominatim developer community. # For a full list of authors see the git log. """ Implementation of query analysis for the ICU tokenizer. 
@@ -280,7 +280,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): for repl in node.starting: if repl.end == tlist.end and repl.ttype != qmod.TOKEN_HOUSENUMBER: repl.add_penalty(0.5 - tlist.tokens[0].penalty) - elif tlist.ttype not in (qmod.TOKEN_COUNTRY, qmod.TOKEN_PARTIAL): + elif tlist.ttype != qmod.TOKEN_COUNTRY: norm = ' '.join(n.term_normalized for n in query.nodes[i + 1:tlist.end + 1] if n.btype != qmod.BREAK_TOKEN) if not norm: @@ -293,6 +293,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]: yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info'] for i, node in enumerate(query.nodes): + if node.partial is not None: + t = cast(ICUToken, node.partial) + yield [qmod.TOKEN_PARTIAL, str(i), str(i + 1), t.token, + t.word_token, t.lookup_word, t.penalty, t.count, t.info] for tlist in node.starting: for token in tlist.tokens: t = cast(ICUToken, token) diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py index 07bb685b..8c5983c4 100644 --- a/src/nominatim_api/search/query.py +++ b/src/nominatim_api/search/query.py @@ -2,7 +2,7 @@ # # This file is part of Nominatim. (https://nominatim.org) # -# Copyright (C) 2024 by the Nominatim developer community. +# Copyright (C) 2025 by the Nominatim developer community. # For a full list of authors see the git log. """ Datastructures for a tokenized query. @@ -192,6 +192,14 @@ class QueryNode: """ starting: List[TokenList] = dataclasses.field(default_factory=list) + """ List of all full tokens starting at this node. + """ + partial: Optional[Token] = None + """ Base token going to the next node. + May be None when the query has parts for which no words are known. + Note that the query may still be parsable when there are other + types of tokens spanning over the gap. + """ def adjust_break(self, btype: BreakType, penalty: float) -> None: """ Change the break type and penalty for this node. @@ -269,33 +277,37 @@ class QueryStruct: be added to, then the token is silently dropped. """ snode = self.nodes[trange.start] - full_phrase = snode.btype in (BREAK_START, BREAK_PHRASE)\ - and self.nodes[trange.end].btype in (BREAK_PHRASE, BREAK_END) - if _phrase_compatible_with(snode.ptype, ttype, full_phrase): - tlist = snode.get_tokens(trange.end, ttype) - if tlist is None: - snode.starting.append(TokenList(trange.end, ttype, [token])) - else: - tlist.append(token) + if ttype == TOKEN_PARTIAL: + assert snode.partial is None + if _phrase_compatible_with(snode.ptype, TOKEN_PARTIAL, False): + snode.partial = token + else: + full_phrase = snode.btype in (BREAK_START, BREAK_PHRASE)\ + and self.nodes[trange.end].btype in (BREAK_PHRASE, BREAK_END) + if _phrase_compatible_with(snode.ptype, ttype, full_phrase): + tlist = snode.get_tokens(trange.end, ttype) + if tlist is None: + snode.starting.append(TokenList(trange.end, ttype, [token])) + else: + tlist.append(token) def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]: """ Get the list of tokens of a given type, spanning the given nodes. The nodes must exist. If no tokens exist, an empty list is returned. + + Cannot be used to get the partial token. """ + assert ttype != TOKEN_PARTIAL return self.nodes[trange.start].get_tokens(trange.end, ttype) or [] def get_partials_list(self, trange: TokenRange) -> List[Token]: """ Create a list of partial tokens between the given nodes. 
- The list is composed of the first token of type PARTIAL - going to the subsequent node. Such PARTIAL tokens are - assumed to exist. """ - return [next(iter(self.get_tokens(TokenRange(i, i+1), TOKEN_PARTIAL))) - for i in range(trange.start, trange.end)] + return list(filter(None, (self.nodes[i].partial for i in range(trange.start, trange.end)))) def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]: - """ Iterator over all token lists in the query. + """ Iterator over all token lists except partial tokens in the query. """ for i, node in enumerate(self.nodes): for tlist in node.starting: @@ -308,6 +320,8 @@ class QueryStruct: debugging. """ for node in self.nodes: + if node.partial is not None and node.partial.token == token: + return f"[P]{node.partial.lookup_word}" for tlist in node.starting: for t in tlist.tokens: if t.token == token: diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py index 8d25aa8f..a7a2e8d5 100644 --- a/src/nominatim_api/search/token_assignment.py +++ b/src/nominatim_api/search/token_assignment.py @@ -409,11 +409,22 @@ def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment node = query.nodes[state.end_pos] for tlist in node.starting: - newstate = state.advance(tlist.ttype, tlist.end, node.btype) - if newstate is not None: - if newstate.end_pos == query.num_token_slots(): - if newstate.recheck_sequence(): - log().var_dump('Assignment', newstate) - yield from newstate.get_assignments(query) - elif not newstate.is_final(): - todo.append(newstate) + yield from _append_state_to_todo( + query, todo, + state.advance(tlist.ttype, tlist.end, node.btype)) + + if node.partial is not None: + yield from _append_state_to_todo( + query, todo, + state.advance(qmod.TOKEN_PARTIAL, state.end_pos + 1, node.btype)) + + +def _append_state_to_todo(query: qmod.QueryStruct, todo: List[_TokenSequence], + newstate: Optional[_TokenSequence]) -> Iterator[TokenAssignment]: + if newstate is not None: + if newstate.end_pos == query.num_token_slots(): + if newstate.recheck_sequence(): + log().var_dump('Assignment', newstate) + yield from newstate.get_assignments(query) + elif not newstate.is_final(): + todo.append(newstate) diff --git a/test/python/api/search/test_api_search_query.py b/test/python/api/search/test_api_search_query.py index c9e8de93..e54da1e9 100644 --- a/test/python/api/search/test_api_search_query.py +++ b/test/python/api/search/test_api_search_query.py @@ -44,7 +44,6 @@ def test_phrase_incompatible(ptype): def test_query_node_empty(qnode): - assert not qnode.has_tokens(3, query.TOKEN_PARTIAL) assert qnode.get_tokens(3, query.TOKEN_WORD) is None @@ -57,7 +56,6 @@ def test_query_node_with_content(qnode): assert qnode.has_tokens(2, query.TOKEN_PARTIAL) assert qnode.has_tokens(2, query.TOKEN_WORD) - assert qnode.get_tokens(3, query.TOKEN_PARTIAL) is None assert qnode.get_tokens(2, query.TOKEN_COUNTRY) is None assert len(qnode.get_tokens(2, query.TOKEN_PARTIAL)) == 2 assert len(qnode.get_tokens(2, query.TOKEN_WORD)) == 1 @@ -101,7 +99,6 @@ def test_query_struct_incompatible_token(): q.add_token(query.TokenRange(0, 1), query.TOKEN_PARTIAL, mktoken(1)) q.add_token(query.TokenRange(1, 2), query.TOKEN_COUNTRY, mktoken(100)) - assert q.get_tokens(query.TokenRange(0, 1), query.TOKEN_PARTIAL) == [] assert len(q.get_tokens(query.TokenRange(1, 2), query.TOKEN_COUNTRY)) == 1 @@ -113,7 +110,7 @@ def test_query_struct_amenity_single_word(): q.add_token(query.TokenRange(0, 1), query.TOKEN_NEAR_ITEM, 
mktoken(2)) q.add_token(query.TokenRange(0, 1), query.TOKEN_QUALIFIER, mktoken(3)) - assert len(q.get_tokens(query.TokenRange(0, 1), query.TOKEN_PARTIAL)) == 1 + assert q.nodes[0].partial.token == 1 assert len(q.get_tokens(query.TokenRange(0, 1), query.TOKEN_NEAR_ITEM)) == 1 assert len(q.get_tokens(query.TokenRange(0, 1), query.TOKEN_QUALIFIER)) == 0 @@ -128,10 +125,10 @@ def test_query_struct_amenity_two_words(): q.add_token(query.TokenRange(*trange), query.TOKEN_NEAR_ITEM, mktoken(2)) q.add_token(query.TokenRange(*trange), query.TOKEN_QUALIFIER, mktoken(3)) - assert len(q.get_tokens(query.TokenRange(0, 1), query.TOKEN_PARTIAL)) == 1 + assert q.nodes[0].partial.token == 1 assert len(q.get_tokens(query.TokenRange(0, 1), query.TOKEN_NEAR_ITEM)) == 0 assert len(q.get_tokens(query.TokenRange(0, 1), query.TOKEN_QUALIFIER)) == 1 - assert len(q.get_tokens(query.TokenRange(1, 2), query.TOKEN_PARTIAL)) == 1 + assert q.nodes[1].partial.token == 1 assert len(q.get_tokens(query.TokenRange(1, 2), query.TOKEN_NEAR_ITEM)) == 0 assert len(q.get_tokens(query.TokenRange(1, 2), query.TOKEN_QUALIFIER)) == 1 diff --git a/test/python/api/search/test_icu_query_analyzer.py b/test/python/api/search/test_icu_query_analyzer.py index 72535026..39ec8fd6 100644 --- a/test/python/api/search/test_icu_query_analyzer.py +++ b/test/python/api/search/test_icu_query_analyzer.py @@ -69,8 +69,8 @@ async def test_single_phrase_with_unknown_terms(conn): assert query.source[0].text == 'foo bar' assert query.num_token_slots() == 2 - assert len(query.nodes[0].starting) == 1 - assert not query.nodes[1].starting + assert query.nodes[0].partial.token == 1 + assert query.nodes[1].partial is None @pytest.mark.asyncio @@ -103,8 +103,8 @@ async def test_splitting_in_transliteration(conn): @pytest.mark.asyncio -@pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W', 'w']), - ('3', ['H', 'W', 'w'])]) +@pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W']), + ('3', ['H', 'W'])]) async def test_penalty_postcodes_and_housenumbers(conn, term, order): ana = await tok.create_query_analyzer(conn) -- 2.39.5
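
Illustration (not part of the patch): the central change is that each QueryNode now carries its single expected partial token in a dedicated `partial` field instead of keeping it inside the per-node `starting` token lists. Below is a minimal, self-contained sketch of the new access pattern; `Token` and `QueryNode` here are hypothetical stand-ins, not the real `nominatim_api` classes, but `get_partials_list` mirrors the filter-based implementation from the diff.

import dataclasses
from typing import List, Optional

@dataclasses.dataclass
class Token:
    token: int
    penalty: float

@dataclasses.dataclass
class QueryNode:
    # full-word token lists stay in 'starting'; the single partial token
    # moves into its own field and is None when the word is unknown
    starting: List[object] = dataclasses.field(default_factory=list)
    partial: Optional[Token] = None

def get_partials_list(nodes: List[QueryNode], start: int, end: int) -> List[Token]:
    # mirrors the new implementation: nodes without a known partial are skipped
    return list(filter(None, (nodes[i].partial for i in range(start, end))))

nodes = [QueryNode(partial=Token(1, 0.0)),
         QueryNode(),                       # unknown word: no partial token
         QueryNode(partial=Token(3, 0.1))]
assert [t.token for t in get_partials_list(nodes, 0, 3)] == [1, 3]

This is also why the updated tests check `q.nodes[0].partial.token` directly rather than querying a PARTIAL token list.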
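
The db_search_builder change follows the same idea: since at most one partial token exists per node, the expansion loop reads `node.partial` and adds its penalty directly instead of taking max() over a PARTIAL token list. Continuing the stand-in types from the sketch above, the worst-case fallback mentioned in the context (sum of the penalties of the partials over a range) reduces to a plain sum; this is a simplified illustration, not the real builder code.

def partial_penalty_sum(nodes: List[QueryNode], start: int, end: int) -> float:
    # accumulate the penalty of the single partial token of every node in the range
    return sum(node.partial.penalty for node in nodes[start:end]
               if node.partial is not None)

print(partial_penalty_sum(nodes, 0, 3))  # 0.1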