git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
author Sarah Hoffmann <lonvia@denofr.de>
Thu, 17 Apr 2025 13:18:29 +0000 (15:18 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Thu, 17 Apr 2025 13:18:29 +0000 (15:18 +0200)
src/nominatim_api/search/db_search_builder.py
src/nominatim_api/search/icu_tokenizer.py
src/nominatim_db/tokenizer/icu_tokenizer.py

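A change that recurs across all three files: query.get_partials_list() is
replaced by a query.iter_partials() generator, backed by a new per-node
"partial" attribute (at most one partial token per query node). The sketch
below is inferred from the call sites in this diff alone; the real
signatures and node indexing in Nominatim may differ.

    # Hedged sketch; names, types and indexing are assumptions inferred
    # from the call sites, not the actual Nominatim implementation.
    from typing import Iterator, List, Optional

    class QueryNode:
        partial: Optional['Token'] = None   # new model: one partial per node

    class QueryStruct:
        nodes: List[QueryNode]

        def get_partials_list(self, trange) -> List['Token']:
            # old style: materializes the partials of a range as a list
            return [n.partial for n in self.nodes[trange.start:trange.end]
                    if n.partial is not None]

        def iter_partials(self, trange) -> Iterator['Token']:
            # new style: lazily yields the same tokens, no temporary list
            for n in self.nodes[trange.start:trange.end]:
                if n.partial is not None:
                    yield n.partial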
index 0292335eb918391c296cb8d05735aeb82e5ea501,43e384e676ae999403870d4f12a41e5cb3399dd7..de85cefa3b81a7d16c154d5b35ad89cee6c66006
@@@ -2,7 -2,7 +2,7 @@@
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
- # Copyright (C) 2024 by the Nominatim developer community.
+ # Copyright (C) 2025 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
  Conversion from token assignment to an abstract DB search.
@@@ -146,7 -146,7 +146,7 @@@ class SearchBuilder
              if address:
                  sdata.lookups = [dbf.FieldLookup('nameaddress_vector',
                                                   [t.token for r in address
-                                                   for t in self.query.get_partials_list(r)],
+                                                   for t in self.query.iter_partials(r)],
                                                   lookups.Restrict)]
              yield dbs.PostcodeSearch(penalty, sdata)
  
          expected_count = sum(t.count for t in hnrs)
  
          partials = {t.token: t.addr_count for trange in address
-                     for t in self.query.get_partials_list(trange)}
+                     for t in self.query.iter_partials(trange)}
  
          if not partials:
              # can happen when none of the partials is indexed
              are and tries to find a lookup that optimizes index use.
          """
          penalty = 0.0  # extra penalty
-         name_partials = {t.token: t for t in self.query.get_partials_list(name)}
+         name_partials = {t.token: t for t in self.query.iter_partials(name)}
  
-         addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
+         addr_partials = [t for r in address for t in self.query.iter_partials(r)]
          addr_tokens = list({t.token for t in addr_partials})
  
          exp_count = min(t.count for t in name_partials.values()) / (3**(len(name_partials) - 1))
          if name_fulls:
              fulls_count = sum(t.count for t in name_fulls)
  
 -            if fulls_count < 50000 or addr_count < 50000:
 +            if fulls_count < 80000 or addr_count < 50000:
                  yield penalty, fulls_count / (2**len(addr_tokens)), \
                      self.get_full_name_ranking(name_fulls, addr_partials,
                                                 fulls_count > 30000 / max(1, len(addr_tokens)))
          ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls]
          ranks.sort(key=lambda r: r.penalty)
          # Fallback, sum of penalty for partials
-         name_partials = self.query.get_partials_list(trange)
-         default = sum(t.penalty for t in name_partials) + 0.2
+         default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
          return dbf.FieldRanking(db_field, default, ranks)
  
      def get_addr_ranking(self, trange: qmod.TokenRange) -> dbf.FieldRanking:
  
          while todo:
              neglen, pos, rank = heapq.heappop(todo)
+             # partial node
+             partial = self.query.nodes[pos].partial
+             if partial is not None:
+                 if pos + 1 < trange.end:
+                     penalty = rank.penalty + partial.penalty \
+                               + PENALTY_WORDCHANGE[self.query.nodes[pos + 1].btype]
+                     heapq.heappush(todo, (neglen - 1, pos + 1,
+                                    dbf.RankedTokens(penalty, rank.tokens)))
+                 else:
+                     ranks.append(dbf.RankedTokens(rank.penalty + partial.penalty,
+                                                   rank.tokens))
+             # full words
              for tlist in self.query.nodes[pos].starting:
-                 if tlist.ttype in (qmod.TOKEN_PARTIAL, qmod.TOKEN_WORD):
+                 if tlist.ttype == qmod.TOKEN_WORD:
                      if tlist.end < trange.end:
                          chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype]
-                         if tlist.ttype == qmod.TOKEN_PARTIAL:
-                             penalty = rank.penalty + chgpenalty \
-                                       + max(t.penalty for t in tlist.tokens)
+                         for t in tlist.tokens:
                              heapq.heappush(todo, (neglen - 1, tlist.end,
-                                                   dbf.RankedTokens(penalty, rank.tokens)))
-                         else:
-                             for t in tlist.tokens:
-                                 heapq.heappush(todo, (neglen - 1, tlist.end,
-                                                       rank.with_token(t, chgpenalty)))
+                                                   rank.with_token(t, chgpenalty)))
                      elif tlist.end == trange.end:
-                         if tlist.ttype == qmod.TOKEN_PARTIAL:
-                             ranks.append(dbf.RankedTokens(rank.penalty
-                                                           + max(t.penalty for t in tlist.tokens),
-                                                           rank.tokens))
-                         else:
-                             ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
-                         if len(ranks) >= 10:
-                             # Too many variants, bail out and only add
-                             # Worst-case Fallback: sum of penalty of partials
-                             name_partials = self.query.get_partials_list(trange)
-                             default = sum(t.penalty for t in name_partials) + 0.2
-                             ranks.append(dbf.RankedTokens(rank.penalty + default, []))
-                             # Bail out of outer loop
-                             todo.clear()
-                             break
+                         ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
+             if len(ranks) >= 10:
+                 # Too many variants: bail out and add only the
+                 # worst-case fallback (sum of the partials' penalties).
+                 default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
+                 ranks.append(dbf.RankedTokens(rank.penalty + default, []))
+                 # Bail out of outer loop
+                 break
  
          ranks.sort(key=lambda r: len(r.tokens))
          default = ranks[0].penalty + 0.3
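
The rewritten loop in get_addr_ranking() above is a best-first expansion
over token paths: the heap is keyed on negative path length so that more
complete readings are popped first, each node contributes its single
partial token plus any full-word tokens, and the search bails out with a
worst-case fallback once ten complete rankings exist. A standalone sketch
of the pattern, using simplified stand-in types rather than the actual
Nominatim classes:

    # Standalone sketch of the best-first ranking pattern; RankedTokens
    # and the edge representation are stand-ins, not Nominatim's API.
    import heapq
    from dataclasses import dataclass, field
    from typing import Dict, List, Tuple

    @dataclass(order=True)
    class RankedTokens:
        penalty: float
        tokens: List[int] = field(compare=False, default_factory=list)

    def rank_range(edges: Dict[int, List[Tuple[int, int, float]]],
                   start: int, end: int,
                   max_ranks: int = 10) -> List[RankedTokens]:
        """edges[pos] lists (next_pos, token, penalty) steps; collect
           complete paths from start to end, cheapest variants first."""
        ranks: List[RankedTokens] = []
        todo = [(0, start, RankedTokens(0.0))]
        while todo:
            neglen, pos, rank = heapq.heappop(todo)
            for next_pos, token, penalty in edges.get(pos, []):
                new = RankedTokens(rank.penalty + penalty,
                                   rank.tokens + [token])
                if next_pos == end:
                    ranks.append(new)
                elif next_pos < end:
                    # more negative length = longer path = popped earlier
                    heapq.heappush(todo, (neglen - 1, next_pos, new))
            if len(ranks) >= max_ranks:
                break   # enough complete variants, stop expanding
        ranks.sort(key=lambda r: len(r.tokens))
        return ranks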
index cc5b6cf098c1c00bdc0e30ebf8a4d44ccc1f640f,35171344f870b8b4ce7c20f125c58049759fa21f..1cb34f72311ee1244d33c2bd184a8e24d82674c4
@@@ -2,7 -2,7 +2,7 @@@
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
- # Copyright (C) 2024 by the Nominatim developer community.
+ # Copyright (C) 2025 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
  Implementation of query analysis for the ICU tokenizer.
@@@ -166,12 -166,6 +166,12 @@@ class ICUQueryAnalyzer(AbstractQueryAna
          log().section('Analyze query (using ICU tokenizer)')
          for func in self.preprocessors:
              phrases = func(phrases)
 +
 +        if len(phrases) == 1 \
 +                and phrases[0].text.count(' ') > 3 \
 +                and max(len(s) for s in phrases[0].text.split()) < 3:
 +            normalized = []
 +
          query = qmod.QueryStruct(phrases)
  
          log().var_dump('Normalized query', query.source)
      def rerank_tokens(self, query: qmod.QueryStruct) -> None:
          """ Add penalties to tokens that depend on presence of other token.
          """
-         for i, node, tlist in query.iter_token_lists():
-             if tlist.ttype == qmod.TOKEN_POSTCODE:
-                 tlen = len(cast(ICUToken, tlist.tokens[0]).word_token)
-                 for repl in node.starting:
-                     if repl.end == tlist.end and repl.ttype != qmod.TOKEN_POSTCODE \
-                        and (repl.ttype != qmod.TOKEN_HOUSENUMBER or tlen > 4):
-                         repl.add_penalty(0.39)
-             elif (tlist.ttype == qmod.TOKEN_HOUSENUMBER
-                   and len(tlist.tokens[0].lookup_word) <= 3):
-                 if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
-                     for repl in node.starting:
-                         if repl.end == tlist.end and repl.ttype != qmod.TOKEN_HOUSENUMBER:
-                             repl.add_penalty(0.5 - tlist.tokens[0].penalty)
-             elif tlist.ttype not in (qmod.TOKEN_COUNTRY, qmod.TOKEN_PARTIAL):
-                 norm = ' '.join(n.term_normalized for n in query.nodes[i + 1:tlist.end + 1]
-                                 if n.btype != qmod.BREAK_TOKEN)
-                 if not norm:
-                     # Can happen when the token only covers a partial term
-                     norm = query.nodes[i + 1].term_normalized
-                 for token in tlist.tokens:
-                     cast(ICUToken, token).rematch(norm)
+         for start, end, tlist in query.iter_tokens_by_edge():
+             if len(tlist) > 1:
+                 # If it looks like a Postcode, give preference.
+                 if qmod.TOKEN_POSTCODE in tlist:
+                     for ttype, tokens in tlist.items():
+                         if ttype != qmod.TOKEN_POSTCODE and \
+                                (ttype != qmod.TOKEN_HOUSENUMBER or
+                                 start + 1 > end or
+                                 len(query.nodes[end].term_lookup) > 4):
+                             for token in tokens:
+                                 token.penalty += 0.39
+                 # If it looks like a simple housenumber, prefer that.
+                 if qmod.TOKEN_HOUSENUMBER in tlist:
+                     hnr_lookup = tlist[qmod.TOKEN_HOUSENUMBER][0].lookup_word
+                     if len(hnr_lookup) <= 3 and any(c.isdigit() for c in hnr_lookup):
+                         penalty = 0.5 - tlist[qmod.TOKEN_HOUSENUMBER][0].penalty
+                         for ttype, tokens in tlist.items():
+                             if ttype != qmod.TOKEN_HOUSENUMBER:
+                                 for token in tokens:
+                                     token.penalty += penalty
+             # rerank tokens against the normalized form
+             norm = ' '.join(n.term_normalized for n in query.nodes[start + 1:end + 1]
+                             if n.btype != qmod.BREAK_TOKEN)
+             if not norm:
+                 # Can happen when the token only covers a partial term
+                 norm = query.nodes[start + 1].term_normalized
+             for ttype, tokens in tlist.items():
+                 if ttype != qmod.TOKEN_COUNTRY:
+                     for token in tokens:
+                         cast(ICUToken, token).rematch(norm)
  
  
  def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
      yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
      for i, node in enumerate(query.nodes):
+         if node.partial is not None:
+             t = cast(ICUToken, node.partial)
+             yield [qmod.TOKEN_PARTIAL, str(i), str(i + 1), t.token,
+                    t.word_token, t.lookup_word, t.penalty, t.count, t.info]
          for tlist in node.starting:
              for token in tlist.tokens:
                  t = cast(ICUToken, token)
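
The reworked rerank_tokens() above groups competing token readings by the
query span they cover and shifts penalties between them: readings that
compete with a plausible postcode get +0.39, and readings that compete
with a short, digit-bearing housenumber end up at least 0.5 above it. A
hedged sketch of the housenumber rule with a minimal stand-in Token type
(the real ICUToken and token-type constants carry more state):

    # Minimal stand-ins; the real tokens and type constants live in
    # nominatim_api.search.query and the ICU tokenizer module.
    from dataclasses import dataclass
    from typing import Dict, List

    @dataclass
    class Token:
        lookup_word: str
        penalty: float = 0.0

    def prefer_simple_housenumber(tlist: Dict[str, List[Token]]) -> None:
        """If the span reads as a short, digit-bearing housenumber,
           penalize all competing readings of the same span."""
        hnrs = tlist.get('HOUSENUMBER')
        if not hnrs:
            return
        lookup = hnrs[0].lookup_word
        if len(lookup) <= 3 and any(c.isdigit() for c in lookup):
            extra = 0.5 - hnrs[0].penalty
            for ttype, tokens in tlist.items():
                if ttype != 'HOUSENUMBER':
                    for token in tokens:
                        token.penalty += extra

    # a span that reads either as housenumber '12' or as an ordinary word:
    span = {'HOUSENUMBER': [Token('12', 0.1)], 'WORD': [Token('12')]}
    prefer_simple_housenumber(span)
    assert abs(span['WORD'][0].penalty - 0.4) < 1e-9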
index 19b838639ab0e557a7cba97fbe5e012a9bf81b70,889bf5315e960dbd8c0f1834a1d128f629ce5df4..3fa867df550ac550659e873951c8dc9a6e913e2f
@@@ -2,7 -2,7 +2,7 @@@
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
- # Copyright (C) 2024 by the Nominatim developer community.
+ # Copyright (C) 2025 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
  Tokenizer implementing normalisation as used before Nominatim 4 but using
@@@ -12,7 -12,6 +12,6 @@@ from typing import Optional, Sequence, 
                     Dict, Set, Iterable
  import itertools
  import logging
- from pathlib import Path
  
  from psycopg.types.json import Jsonb
  from psycopg import sql as pysql
@@@ -38,10 -37,10 +37,10 @@@ WORD_TYPES = (('country_names', 'C')
                ('housenumbers', 'H'))
  
  
- def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
+ def create(dsn: str) -> 'ICUTokenizer':
      """ Create a new instance of the tokenizer provided by this module.
      """
-     return ICUTokenizer(dsn, data_dir)
+     return ICUTokenizer(dsn)
  
  
  class ICUTokenizer(AbstractTokenizer):
@@@ -50,9 -49,8 +49,8 @@@
          normalization routines in Nominatim 3.
      """
  
-     def __init__(self, dsn: str, data_dir: Path) -> None:
+     def __init__(self, dsn: str) -> None:
          self.dsn = dsn
-         self.data_dir = data_dir
          self.loader: Optional[ICURuleLoader] = None
  
      def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
                                         as info
                                  FROM word LEFT JOIN word_frequencies wf
                                       ON word.word_id = wf.id
 +                                ORDER BY word_id
                              """)
                  drop_tables(conn, 'word_frequencies')
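
The functional change in this last hunk is the added ORDER BY word_id,
which makes the word/word_frequencies join come back in a deterministic
order. A minimal psycopg 3 sketch of reading such a join in stable order;
the DSN and the column selection are assumptions based on the fragment
above, not the full table layout:

    # Hedged sketch only; adjust the column list to the actual word table.
    import psycopg

    def iter_word_ids(dsn: str):
        with psycopg.connect(dsn) as conn, conn.cursor() as cur:
            cur.execute("""SELECT word.word_id
                           FROM word LEFT JOIN word_frequencies wf
                                ON word.word_id = wf.id
                           ORDER BY word_id""")
            yield from cur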