git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
author Sarah Hoffmann <lonvia@denofr.de>
Thu, 17 Apr 2025 13:18:29 +0000 (15:18 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Thu, 17 Apr 2025 13:18:29 +0000 (15:18 +0200)
src/nominatim_api/search/db_search_builder.py
src/nominatim_api/search/icu_tokenizer.py
src/nominatim_db/tokenizer/icu_tokenizer.py

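A change that recurs across all three files: query.get_partials_list() is
replaced by a query.iter_partials() generator, backed by a new per-node
"partial" attribute (at most one partial token per query node). The sketch
below is inferred from the call sites in this diff alone; the real
signatures and node indexing in Nominatim may differ.

    # Hedged sketch; names, types and indexing are assumptions inferred
    # from the call sites, not the actual Nominatim implementation.
    from typing import Iterator, List, Optional

    class QueryNode:
        partial: Optional['Token'] = None   # new model: one partial per node

    class QueryStruct:
        nodes: List[QueryNode]

        def get_partials_list(self, trange) -> List['Token']:
            # old style: materializes the partials of a range as a list
            return [n.partial for n in self.nodes[trange.start:trange.end]
                    if n.partial is not None]

        def iter_partials(self, trange) -> Iterator['Token']:
            # new style: lazily yields the same tokens, no temporary list
            for n in self.nodes[trange.start:trange.end]:
                if n.partial is not None:
                    yield n.partial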
index 0292335eb918391c296cb8d05735aeb82e5ea501,43e384e676ae999403870d4f12a41e5cb3399dd7..de85cefa3b81a7d16c154d5b35ad89cee6c66006
@@@ -2,7 -2,7 +2,7 @@@
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
- # Copyright (C) 2024 by the Nominatim developer community.
+ # Copyright (C) 2025 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
  Conversion from token assignment to an abstract DB search.
@@@ -146,7 -146,7 +146,7 @@@ class SearchBuilder
              if address:
                  sdata.lookups = [dbf.FieldLookup('nameaddress_vector',
                                                   [t.token for r in address
-                                                   for t in self.query.get_partials_list(r)],
+                                                   for t in self.query.iter_partials(r)],
                                                   lookups.Restrict)]
              yield dbs.PostcodeSearch(penalty, sdata)
  
          expected_count = sum(t.count for t in hnrs)
  
          partials = {t.token: t.addr_count for trange in address
-                     for t in self.query.get_partials_list(trange)}
+                     for t in self.query.iter_partials(trange)}
  
          if not partials:
              # can happen when none of the partials is indexed
              are and tries to find a lookup that optimizes index use.
          """
          penalty = 0.0  # extra penalty
-         name_partials = {t.token: t for t in self.query.get_partials_list(name)}
+         name_partials = {t.token: t for t in self.query.iter_partials(name)}
  
-         addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
+         addr_partials = [t for r in address for t in self.query.iter_partials(r)]
          addr_tokens = list({t.token for t in addr_partials})
  
          exp_count = min(t.count for t in name_partials.values()) / (3**(len(name_partials) - 1))
          if name_fulls:
              fulls_count = sum(t.count for t in name_fulls)
  
 -            if fulls_count < 50000 or addr_count < 50000:
 +            if fulls_count < 80000 or addr_count < 50000:
                  yield penalty, fulls_count / (2**len(addr_tokens)), \
                      self.get_full_name_ranking(name_fulls, addr_partials,
                                                 fulls_count > 30000 / max(1, len(addr_tokens)))
          ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls]
          ranks.sort(key=lambda r: r.penalty)
          # Fallback, sum of penalty for partials
-         name_partials = self.query.get_partials_list(trange)
-         default = sum(t.penalty for t in name_partials) + 0.2
+         default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
          return dbf.FieldRanking(db_field, default, ranks)
  
      def get_addr_ranking(self, trange: qmod.TokenRange) -> dbf.FieldRanking:
  
          while todo:
              neglen, pos, rank = heapq.heappop(todo)
+             # partial node
+             partial = self.query.nodes[pos].partial
+             if partial is not None:
+                 if pos + 1 < trange.end:
+                     penalty = rank.penalty + partial.penalty \
+                               + PENALTY_WORDCHANGE[self.query.nodes[pos + 1].btype]
+                     heapq.heappush(todo, (neglen - 1, pos + 1,
+                                    dbf.RankedTokens(penalty, rank.tokens)))
+                 else:
+                     ranks.append(dbf.RankedTokens(rank.penalty + partial.penalty,
+                                                   rank.tokens))
+             # full words
              for tlist in self.query.nodes[pos].starting:
-                 if tlist.ttype in (qmod.TOKEN_PARTIAL, qmod.TOKEN_WORD):
+                 if tlist.ttype == qmod.TOKEN_WORD:
                      if tlist.end < trange.end:
                          chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype]
-                         if tlist.ttype == qmod.TOKEN_PARTIAL:
-                             penalty = rank.penalty + chgpenalty \
-                                       + max(t.penalty for t in tlist.tokens)
+                         for t in tlist.tokens:
                              heapq.heappush(todo, (neglen - 1, tlist.end,
-                                                   dbf.RankedTokens(penalty, rank.tokens)))
-                         else:
-                             for t in tlist.tokens:
-                                 heapq.heappush(todo, (neglen - 1, tlist.end,
-                                                       rank.with_token(t, chgpenalty)))
+                                                   rank.with_token(t, chgpenalty)))
                      elif tlist.end == trange.end:
-                         if tlist.ttype == qmod.TOKEN_PARTIAL:
-                             ranks.append(dbf.RankedTokens(rank.penalty
-                                                           + max(t.penalty for t in tlist.tokens),
-                                                           rank.tokens))
-                         else:
-                             ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
-                         if len(ranks) >= 10:
-                             # Too many variants, bail out and only add
-                             # Worst-case Fallback: sum of penalty of partials
-                             name_partials = self.query.get_partials_list(trange)
-                             default = sum(t.penalty for t in name_partials) + 0.2
-                             ranks.append(dbf.RankedTokens(rank.penalty + default, []))
-                             # Bail out of outer loop
-                             todo.clear()
-                             break
+                         ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
+             if len(ranks) >= 10:
+                 # Too many variants: bail out and add only the
+                 # worst-case fallback (sum of the partials' penalties).
+                 default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
+                 ranks.append(dbf.RankedTokens(rank.penalty + default, []))
+                 # Bail out of outer loop
+                 break
  
          ranks.sort(key=lambda r: len(r.tokens))
          default = ranks[0].penalty + 0.3
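
The rewritten loop in get_addr_ranking() above is a best-first expansion
over token paths: the heap is keyed on negative path length so that more
complete readings are popped first, each node contributes its single
partial token plus any full-word tokens, and the search bails out with a
worst-case fallback once ten complete rankings exist. A standalone sketch
of the pattern, using simplified stand-in types rather than the actual
Nominatim classes:

    # Standalone sketch of the best-first ranking pattern; RankedTokens
    # and the edge representation are stand-ins, not Nominatim's API.
    import heapq
    from dataclasses import dataclass, field
    from typing import Dict, List, Tuple

    @dataclass(order=True)
    class RankedTokens:
        penalty: float
        tokens: List[int] = field(compare=False, default_factory=list)

    def rank_range(edges: Dict[int, List[Tuple[int, int, float]]],
                   start: int, end: int,
                   max_ranks: int = 10) -> List[RankedTokens]:
        """edges[pos] lists (next_pos, token, penalty) steps; collect
           complete paths from start to end, cheapest variants first."""
        ranks: List[RankedTokens] = []
        todo = [(0, start, RankedTokens(0.0))]
        while todo:
            neglen, pos, rank = heapq.heappop(todo)
            for next_pos, token, penalty in edges.get(pos, []):
                new = RankedTokens(rank.penalty + penalty,
                                   rank.tokens + [token])
                if next_pos == end:
                    ranks.append(new)
                elif next_pos < end:
                    # more negative length = longer path = popped earlier
                    heapq.heappush(todo, (neglen - 1, next_pos, new))
            if len(ranks) >= max_ranks:
                break   # enough complete variants, stop expanding
        ranks.sort(key=lambda r: len(r.tokens))
        return ranks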
index cc5b6cf098c1c00bdc0e30ebf8a4d44ccc1f640f,35171344f870b8b4ce7c20f125c58049759fa21f..1cb34f72311ee1244d33c2bd184a8e24d82674c4
@@@ -2,7 -2,7 +2,7 @@@
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
- # Copyright (C) 2024 by the Nominatim developer community.
+ # Copyright (C) 2025 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
  Implementation of query analysis for the ICU tokenizer.
@@@ -166,12 -166,6 +166,12 @@@ class ICUQueryAnalyzer(AbstractQueryAna
          log().section('Analyze query (using ICU tokenizer)')
          for func in self.preprocessors:
              phrases = func(phrases)
 +
 +        if len(phrases) == 1 \
 +                and phrases[0].text.count(' ') > 3 \
 +                and max(len(s) for s in phrases[0].text.split()) < 3:
 +            normalized = []
 +
          query = qmod.QueryStruct(phrases)
  
          log().var_dump('Normalized query', query.source)
      def rerank_tokens(self, query: qmod.QueryStruct) -> None:
          """ Add penalties to tokens that depend on presence of other token.
          """
-         for i, node, tlist in query.iter_token_lists():
-             if tlist.ttype == qmod.TOKEN_POSTCODE:
-                 tlen = len(cast(ICUToken, tlist.tokens[0]).word_token)
-                 for repl in node.starting:
-                     if repl.end == tlist.end and repl.ttype != qmod.TOKEN_POSTCODE \
-                        and (repl.ttype != qmod.TOKEN_HOUSENUMBER or tlen > 4):
-                         repl.add_penalty(0.39)
-             elif (tlist.ttype == qmod.TOKEN_HOUSENUMBER
-                   and len(tlist.tokens[0].lookup_word) <= 3):
-                 if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
-                     for repl in node.starting:
-                         if repl.end == tlist.end and repl.ttype != qmod.TOKEN_HOUSENUMBER:
-                             repl.add_penalty(0.5 - tlist.tokens[0].penalty)
-             elif tlist.ttype not in (qmod.TOKEN_COUNTRY, qmod.TOKEN_PARTIAL):
-                 norm = ' '.join(n.term_normalized for n in query.nodes[i + 1:tlist.end + 1]
-                                 if n.btype != qmod.BREAK_TOKEN)
-                 if not norm:
-                     # Can happen when the token only covers a partial term
-                     norm = query.nodes[i + 1].term_normalized
-                 for token in tlist.tokens:
-                     cast(ICUToken, token).rematch(norm)
+         for start, end, tlist in query.iter_tokens_by_edge():
+             if len(tlist) > 1:
+                 # If it looks like a Postcode, give preference.
+                 if qmod.TOKEN_POSTCODE in tlist:
+                     for ttype, tokens in tlist.items():
+                         if ttype != qmod.TOKEN_POSTCODE and \
+                                (ttype != qmod.TOKEN_HOUSENUMBER or
+                                 start + 1 > end or
+                                 len(query.nodes[end].term_lookup) > 4):
+                             for token in tokens:
+                                 token.penalty += 0.39
+                 # If it looks like a simple housenumber, prefer that.
+                 if qmod.TOKEN_HOUSENUMBER in tlist:
+                     hnr_lookup = tlist[qmod.TOKEN_HOUSENUMBER][0].lookup_word
+                     if len(hnr_lookup) <= 3 and any(c.isdigit() for c in hnr_lookup):
+                         penalty = 0.5 - tlist[qmod.TOKEN_HOUSENUMBER][0].penalty
+                         for ttype, tokens in tlist.items():
+                             if ttype != qmod.TOKEN_HOUSENUMBER:
+                                 for token in tokens:
+                                     token.penalty += penalty
+             # rerank tokens against the normalized form
+             norm = ' '.join(n.term_normalized for n in query.nodes[start + 1:end + 1]
+                             if n.btype != qmod.BREAK_TOKEN)
+             if not norm:
+                 # Can happen when the token only covers a partial term
+                 norm = query.nodes[start + 1].term_normalized
+             for ttype, tokens in tlist.items():
+                 if ttype != qmod.TOKEN_COUNTRY:
+                     for token in tokens:
+                         cast(ICUToken, token).rematch(norm)
  
  
  def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
      yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
      for i, node in enumerate(query.nodes):
+         if node.partial is not None:
+             t = cast(ICUToken, node.partial)
+             yield [qmod.TOKEN_PARTIAL, str(i), str(i + 1), t.token,
+                    t.word_token, t.lookup_word, t.penalty, t.count, t.info]
          for tlist in node.starting:
              for token in tlist.tokens:
                  t = cast(ICUToken, token)
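
The reworked rerank_tokens() above groups competing token readings by the
query span they cover and shifts penalties between them: readings that
compete with a plausible postcode get +0.39, and readings that compete
with a short, digit-bearing housenumber end up at least 0.5 above it. A
hedged sketch of the housenumber rule with a minimal stand-in Token type
(the real ICUToken and token-type constants carry more state):

    # Minimal stand-ins; the real tokens and type constants live in
    # nominatim_api.search.query and the ICU tokenizer module.
    from dataclasses import dataclass
    from typing import Dict, List

    @dataclass
    class Token:
        lookup_word: str
        penalty: float = 0.0

    def prefer_simple_housenumber(tlist: Dict[str, List[Token]]) -> None:
        """If the span reads as a short, digit-bearing housenumber,
           penalize all competing readings of the same span."""
        hnrs = tlist.get('HOUSENUMBER')
        if not hnrs:
            return
        lookup = hnrs[0].lookup_word
        if len(lookup) <= 3 and any(c.isdigit() for c in lookup):
            extra = 0.5 - hnrs[0].penalty
            for ttype, tokens in tlist.items():
                if ttype != 'HOUSENUMBER':
                    for token in tokens:
                        token.penalty += extra

    # a span that reads either as housenumber '12' or as an ordinary word:
    span = {'HOUSENUMBER': [Token('12', 0.1)], 'WORD': [Token('12')]}
    prefer_simple_housenumber(span)
    assert abs(span['WORD'][0].penalty - 0.4) < 1e-9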
index 19b838639ab0e557a7cba97fbe5e012a9bf81b70,889bf5315e960dbd8c0f1834a1d128f629ce5df4..3fa867df550ac550659e873951c8dc9a6e913e2f
@@@ -2,7 -2,7 +2,7 @@@
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
- # Copyright (C) 2024 by the Nominatim developer community.
+ # Copyright (C) 2025 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
  Tokenizer implementing normalisation as used before Nominatim 4 but using
@@@ -12,7 -12,6 +12,6 @@@ from typing import Optional, Sequence, 
                     Dict, Set, Iterable
  import itertools
  import logging
- from pathlib import Path
  
  from psycopg.types.json import Jsonb
  from psycopg import sql as pysql
@@@ -38,10 -37,10 +37,10 @@@ WORD_TYPES = (('country_names', 'C')
                ('housenumbers', 'H'))
  
  
- def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
+ def create(dsn: str) -> 'ICUTokenizer':
      """ Create a new instance of the tokenizer provided by this module.
      """
-     return ICUTokenizer(dsn, data_dir)
+     return ICUTokenizer(dsn)
  
  
  class ICUTokenizer(AbstractTokenizer):
@@@ -50,9 -49,8 +49,8 @@@
          normalization routines in Nominatim 3.
      """
  
-     def __init__(self, dsn: str, data_dir: Path) -> None:
+     def __init__(self, dsn: str) -> None:
          self.dsn = dsn
-         self.data_dir = data_dir
          self.loader: Optional[ICURuleLoader] = None
  
      def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
                                         as info
                                  FROM word LEFT JOIN word_frequencies wf
                                       ON word.word_id = wf.id
 +                                ORDER BY word_id
                              """)
                  drop_tables(conn, 'word_frequencies')
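
The functional change in this last hunk is the added ORDER BY word_id,
which makes the word/word_frequencies join come back in a deterministic
order. A minimal psycopg 3 sketch of reading such a join in stable order;
the DSN and the column selection are assumptions based on the fragment
above, not the full table layout:

    # Hedged sketch only; adjust the column list to the actual word table.
    import psycopg

    def iter_word_ids(dsn: str):
        with psycopg.connect(dsn) as conn, conn.cursor() as cur:
            cur.execute("""SELECT word.word_id
                           FROM word LEFT JOIN word_frequencies wf
                                ON word.word_id = wf.id
                           ORDER BY word_id""")
            yield from cur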