#
# This file is part of Nominatim. (https://nominatim.org)
#
- # Copyright (C) 2024 by the Nominatim developer community.
+ # Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Conversion from token assignment to an abstract DB search.
if address:
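+ # Restrict the postcode search by the partial terms of the address.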
sdata.lookups = [dbf.FieldLookup('nameaddress_vector',
[t.token for r in address
- for t in self.query.get_partials_list(r)],
+ for t in self.query.iter_partials(r)],
lookups.Restrict)]
yield dbs.PostcodeSearch(penalty, sdata)
expected_count = sum(t.count for t in hnrs)
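+ # Map each partial address token to its address count.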
partials = {t.token: t.addr_count for trange in address
- for t in self.query.get_partials_list(trange)}
+ for t in self.query.iter_partials(trange)}
if not partials:
# can happen when none of the partials is indexed
are and tries to find a lookup that optimizes index use.
"""
penalty = 0.0 # extra penalty
- name_partials = {t.token: t for t in self.query.get_partials_list(name)}
+ name_partials = {t.token: t for t in self.query.iter_partials(name)}
- addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
+ addr_partials = [t for r in address for t in self.query.iter_partials(r)]
addr_tokens = list({t.token for t in addr_partials})
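+ # Expected number of matches: the rarest partial bounds the result size and
+ # every further partial is assumed to cut it by a factor of three (with
+ # hypothetical counts of 12000 and 90000 this gives 12000 / 3 = 4000).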
exp_count = min(t.count for t in name_partials.values()) / (3**(len(name_partials) - 1))
if name_fulls:
fulls_count = sum(t.count for t in name_fulls)
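+ # Searching via full names is only promising when the candidate sets stay
+ # below these empirically chosen thresholds.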
- if fulls_count < 50000 or addr_count < 50000:
+ if fulls_count < 80000 or addr_count < 50000:
yield penalty, fulls_count / (2**len(addr_tokens)), \
self.get_full_name_ranking(name_fulls, addr_partials,
fulls_count > 30000 / max(1, len(addr_tokens)))
ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls]
ranks.sort(key=lambda r: r.penalty)
# Fallback: sum of the penalties of all partials
- name_partials = self.query.get_partials_list(trange)
- default = sum(t.penalty for t in name_partials) + 0.2
+ default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
return dbf.FieldRanking(db_field, default, ranks)
def get_addr_ranking(self, trange: qmod.TokenRange) -> dbf.FieldRanking:
while todo:
neglen, pos, rank = heapq.heappop(todo)
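+ # 'neglen' is the negated number of tokens in the chain, so the heap
+ # always pops the longest chains first.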
+ # partial node
+ partial = self.query.nodes[pos].partial
+ if partial is not None:
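+ # Either extend the current chain with the partial or, when the end
+ # of the range is reached, finalize it.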
+ if pos + 1 < trange.end:
+ penalty = rank.penalty + partial.penalty \
+ + PENALTY_WORDCHANGE[self.query.nodes[pos + 1].btype]
+ heapq.heappush(todo, (neglen - 1, pos + 1,
+ dbf.RankedTokens(penalty, rank.tokens)))
+ else:
+ ranks.append(dbf.RankedTokens(rank.penalty + partial.penalty,
+ rank.tokens))
+ # full words
for tlist in self.query.nodes[pos].starting:
- if tlist.ttype in (qmod.TOKEN_PARTIAL, qmod.TOKEN_WORD):
+ if tlist.ttype == qmod.TOKEN_WORD:
if tlist.end < trange.end:
chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype]
- if tlist.ttype == qmod.TOKEN_PARTIAL:
- penalty = rank.penalty + chgpenalty \
- + max(t.penalty for t in tlist.tokens)
+ for t in tlist.tokens:
heapq.heappush(todo, (neglen - 1, tlist.end,
- dbf.RankedTokens(penalty, rank.tokens)))
- else:
- for t in tlist.tokens:
- heapq.heappush(todo, (neglen - 1, tlist.end,
- rank.with_token(t, chgpenalty)))
+ rank.with_token(t, chgpenalty)))
elif tlist.end == trange.end:
- if tlist.ttype == qmod.TOKEN_PARTIAL:
- ranks.append(dbf.RankedTokens(rank.penalty
- + max(t.penalty for t in tlist.tokens),
- rank.tokens))
- else:
- ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
- if len(ranks) >= 10:
- # Too many variants, bail out and only add
- # Worst-case Fallback: sum of penalty of partials
- name_partials = self.query.get_partials_list(trange)
- default = sum(t.penalty for t in name_partials) + 0.2
- ranks.append(dbf.RankedTokens(rank.penalty + default, []))
- # Bail out of outer loop
- todo.clear()
- break
+ ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
+
+ if len(ranks) >= 10:
+ # Too many variants, bail out and add only the
+ # worst-case fallback: the sum of the partials' penalties.
+ default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
+ ranks.append(dbf.RankedTokens(rank.penalty + default, []))
+ # Bail out of outer loop
+ break
ranks.sort(key=lambda r: len(r.tokens))
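+ # Rankings with the fewest tokens sort first; their penalty plus 0.3
+ # becomes the default penalty.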
default = ranks[0].penalty + 0.3
#
# This file is part of Nominatim. (https://nominatim.org)
#
- # Copyright (C) 2024 by the Nominatim developer community.
+ # Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of query analysis for the ICU tokenizer.
log().section('Analyze query (using ICU tokenizer)')
for func in self.preprocessors:
phrases = func(phrases)
+
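+ # Special case: a single phrase consisting of many very short words
+ # (at most two characters each).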
+ if len(phrases) == 1 \
+ and phrases[0].text.count(' ') > 3 \
+ and max(len(s) for s in phrases[0].text.split()) < 3:
+ normalized = []
+
query = qmod.QueryStruct(phrases)
log().var_dump('Normalized query', query.source)
def rerank_tokens(self, query: qmod.QueryStruct) -> None:
""" Add penalties to tokens that depend on presence of other token.
"""
- for i, node, tlist in query.iter_token_lists():
- if tlist.ttype == qmod.TOKEN_POSTCODE:
- tlen = len(cast(ICUToken, tlist.tokens[0]).word_token)
- for repl in node.starting:
- if repl.end == tlist.end and repl.ttype != qmod.TOKEN_POSTCODE \
- and (repl.ttype != qmod.TOKEN_HOUSENUMBER or tlen > 4):
- repl.add_penalty(0.39)
- elif (tlist.ttype == qmod.TOKEN_HOUSENUMBER
- and len(tlist.tokens[0].lookup_word) <= 3):
- if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
- for repl in node.starting:
- if repl.end == tlist.end and repl.ttype != qmod.TOKEN_HOUSENUMBER:
- repl.add_penalty(0.5 - tlist.tokens[0].penalty)
- elif tlist.ttype not in (qmod.TOKEN_COUNTRY, qmod.TOKEN_PARTIAL):
- norm = ' '.join(n.term_normalized for n in query.nodes[i + 1:tlist.end + 1]
- if n.btype != qmod.BREAK_TOKEN)
- if not norm:
- # Can happen when the token only covers a partial term
- norm = query.nodes[i + 1].term_normalized
- for token in tlist.tokens:
- cast(ICUToken, token).rematch(norm)
+ for start, end, tlist in query.iter_tokens_by_edge():
+ if len(tlist) > 1:
+ # If it looks like a postcode, give it preference.
+ if qmod.TOKEN_POSTCODE in tlist:
+ for ttype, tokens in tlist.items():
+ if ttype != qmod.TOKEN_POSTCODE and \
+ (ttype != qmod.TOKEN_HOUSENUMBER or
+ start + 1 > end or
+ len(query.nodes[end].term_lookup) > 4):
+ for token in tokens:
+ token.penalty += 0.39
+
+ # If it looks like a simple housenumber, prefer that.
+ if qmod.TOKEN_HOUSENUMBER in tlist:
+ hnr_lookup = tlist[qmod.TOKEN_HOUSENUMBER][0].lookup_word
+ if len(hnr_lookup) <= 3 and any(c.isdigit() for c in hnr_lookup):
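+ # Penalise all competing readings relative to the housenumber's own penalty.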
+ penalty = 0.5 - tlist[qmod.TOKEN_HOUSENUMBER][0].penalty
+ for ttype, tokens in tlist.items():
+ if ttype != qmod.TOKEN_HOUSENUMBER:
+ for token in tokens:
+ token.penalty += penalty
+
+ # rerank tokens against the normalized form
+ norm = ' '.join(n.term_normalized for n in query.nodes[start + 1:end + 1]
+ if n.btype != qmod.BREAK_TOKEN)
+ if not norm:
+ # Can happen when the token only covers a partial term
+ norm = query.nodes[start + 1].term_normalized
+ for ttype, tokens in tlist.items():
+ if ttype != qmod.TOKEN_COUNTRY:
+ for token in tokens:
+ cast(ICUToken, token).rematch(norm)
def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
for i, node in enumerate(query.nodes):
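+ # Partial tokens are now stored directly on the query nodes.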
+ if node.partial is not None:
+ t = cast(ICUToken, node.partial)
+ yield [qmod.TOKEN_PARTIAL, str(i), str(i + 1), t.token,
+ t.word_token, t.lookup_word, t.penalty, t.count, t.info]
for tlist in node.starting:
for token in tlist.tokens:
t = cast(ICUToken, token)
#
# This file is part of Nominatim. (https://nominatim.org)
#
- # Copyright (C) 2024 by the Nominatim developer community.
+ # Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
Dict, Set, Iterable
import itertools
import logging
- from pathlib import Path
from psycopg.types.json import Jsonb
from psycopg import sql as pysql
('housenumbers', 'H'))
- def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
+ def create(dsn: str) -> 'ICUTokenizer':
""" Create a new instance of the tokenizer provided by this module.
"""
- return ICUTokenizer(dsn, data_dir)
+ return ICUTokenizer(dsn)
class ICUTokenizer(AbstractTokenizer):
normalization routines in Nominatim 3.
"""
- def __init__(self, dsn: str, data_dir: Path) -> None:
+ def __init__(self, dsn: str) -> None:
self.dsn = dsn
- self.data_dir = data_dir
self.loader: Optional[ICURuleLoader] = None
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
as info
FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id
+ ORDER BY word_id
""")
drop_tables(conn, 'word_frequencies')