nominatim/api/search/query.py

   1 # SPDX-License-Identifier: GPL-3.0-or-later
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2023 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Datastructures for a tokenized query.
   9 """
  10 from typing import List, Tuple, Optional, NamedTuple
  11 from abc import ABC, abstractmethod
  12 import dataclasses
  13 import enum
  14
  15 class BreakType(enum.Enum):
  16     """ Type of break between tokens.
  17     """
  18     START = '<'
  19     """ Begin of the query. """
  20     END = '>'
  21     """ End of the query. """
  22     PHRASE = ','
  23     """ Break between two phrases. """
  24     WORD = ' '
  25     """ Break between words. """
  26     PART = '-'
  27     """ Break inside a word, for example a hyphen or apostrophe. """
  28     TOKEN = '`'
  29     """ Break created as a result of tokenization.
  30         This may happen in languages without spaces between words.
  31     """
  32
  33
  34 class TokenType(enum.Enum):
  35     """ Type of token.
  36     """
  37     WORD = enum.auto()
  38     """ Full name of a place. """
  39     PARTIAL = enum.auto()
  40     """ Word term without breaks, does not necessarily represent a full name. """
  41     HOUSENUMBER = enum.auto()
  42     """ Housenumber term. """
  43     POSTCODE = enum.auto()
  44     """ Postal code term. """
  45     COUNTRY = enum.auto()
  46     """ Country name or reference. """
  47     QUALIFIER = enum.auto()
  48     """ Special term used together with name (e.g. _Hotel_ Bellevue). """
  49     CATEGORY = enum.auto()
  50     """ Special term used as searchable object(e.g. supermarket in ...). """
  51
  52
  53 class PhraseType(enum.Enum):
  54     """ Designation of a phrase.
  55     """
  56     NONE = 0
  57     """ No specific designation (i.e. source is free-form query). """
  58     AMENITY = enum.auto()
  59     """ Contains name or type of a POI. """
  60     STREET = enum.auto()
  61     """ Contains a street name optionally with a housenumber. """
  62     CITY = enum.auto()
  63     """ Contains the postal city. """
  64     COUNTY = enum.auto()
  65     """ Contains the equivalent of a county. """
  66     STATE = enum.auto()
  67     """ Contains a state or province. """
  68     POSTCODE = enum.auto()
  69     """ Contains a postal code. """
  70     COUNTRY = enum.auto()
  71     """ Contains the country name or code. """
  72
  73     def compatible_with(self, ttype: TokenType) -> bool:
  74         """ Check if the given token type can be used with the phrase type.
  75         """
  76         if self == PhraseType.NONE:
  77             return True
  78         if self == PhraseType.AMENITY:
  79             return ttype in (TokenType.WORD, TokenType.PARTIAL,
  80                              TokenType.QUALIFIER, TokenType.CATEGORY)
  81         if self == PhraseType.STREET:
  82             return ttype in (TokenType.WORD, TokenType.PARTIAL, TokenType.HOUSENUMBER)
  83         if self == PhraseType.POSTCODE:
  84             return ttype == TokenType.POSTCODE
  85         if self == PhraseType.COUNTRY:
  86             return ttype == TokenType.COUNTRY
  87
  88         return ttype in (TokenType.WORD, TokenType.PARTIAL)
  89
  90
  91 @dataclasses.dataclass
  92 class Token(ABC):
  93     """ Base type for tokens.
  94         Specific query analyzers must implement the concrete token class.
  95     """
  96
  97     penalty: float
  98     token: int
  99     count: int
 100     lookup_word: str
 101     is_indexed: bool
 102
 103
 104     @abstractmethod
 105     def get_category(self) -> Tuple[str, str]:
 106         """ Return the category restriction for qualifier terms and
 107             category objects.
 108         """
 109
 110
 111 class TokenRange(NamedTuple):
 112     """ Indexes of query nodes over which a token spans.
 113     """
 114     start: int
 115     end: int
 116
 117
 118 @dataclasses.dataclass
 119 class TokenList:
 120     """ List of all tokens of a given type going from one breakpoint to another.
 121     """
 122     end: int
 123     ttype: TokenType
 124     tokens: List[Token]
 125
 126
 127 @dataclasses.dataclass
 128 class QueryNode:
 129     """ A node of the querry representing a break between terms.
 130     """
 131     btype: BreakType
 132     ptype: PhraseType
 133     starting: List[TokenList] = dataclasses.field(default_factory=list)
 134
 135     def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
 136         """ Check if there are tokens of the given types ending at the
 137             given node.
 138         """
 139         return any(tl.end == end and tl.ttype in ttypes for tl in self.starting)
 140
 141
 142     def get_tokens(self, end: int, ttype: TokenType) -> Optional[List[Token]]:
 143         """ Get the list of tokens of the given type starting at this node
 144             and ending at the node 'end'. Returns 'None' if no such
 145             tokens exist.
 146         """
 147         return next((t.tokens for t in self.starting if t.end == end and t.ttype == ttype), None)
 148
 149
 150 @dataclasses.dataclass
 151 class Phrase:
 152     """ A normalized query part. Phrases may be typed which means that
 153         they then represent a specific part of the address.
 154     """
 155     ptype: PhraseType
 156     text: str
 157
 158
 159 class QueryStruct:
 160     """ A tokenized search query together with the normalized source
 161         from which the tokens have been parsed.
 162
 163         The query contains a list of nodes that represent the breaks
 164         between words. Tokens span between nodes, which don't necessarily
 165         need to be direct neighbours. Thus the query is represented as a
 166         directed acyclic graph.
 167
 168         When created, a query contains a single node: the start of the
 169         query. Further nodes can be added by appending to 'nodes'.
 170     """
 171
 172     def __init__(self, source: List[Phrase]) -> None:
 173         self.source = source
 174         self.nodes: List[QueryNode] = \
 175             [QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)]
 176
 177
 178     def num_token_slots(self) -> int:
 179         """ Return the length of the query in vertice steps.
 180         """
 181         return len(self.nodes) - 1
 182
 183
 184     def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
 185         """ Append a new break node with the given break type.
 186             The phrase type denotes the type for any tokens starting
 187             at the node.
 188         """
 189         self.nodes.append(QueryNode(btype, ptype))
 190
 191
 192     def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
 193         """ Add a token to the query. 'start' and 'end' are the indexes of the
 194             nodes from which to which the token spans. The indexes must exist
 195             and are expected to be in the same phrase.
 196             'ttype' denotes the type of the token and 'token' the token to
 197             be inserted.
 198
 199             If the token type is not compatible with the phrase it should
 200             be added to, then the token is silently dropped.
 201         """
 202         snode = self.nodes[trange.start]
 203         if snode.ptype.compatible_with(ttype):
 204             tlist = snode.get_tokens(trange.end, ttype)
 205             if tlist is None:
 206                 snode.starting.append(TokenList(trange.end, ttype, [token]))
 207             else:
 208                 tlist.append(token)
 209
 210
 211     def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
 212         """ Get the list of tokens of a given type, spanning the given
 213             nodes. The nodes must exist. If no tokens exist, an
 214             empty list is returned.
 215         """
 216         return self.nodes[trange.start].get_tokens(trange.end, ttype) or []
 217
 218
 219     def get_partials_list(self, trange: TokenRange) -> List[Token]:
 220         """ Create a list of partial tokens between the given nodes.
 221             The list is composed of the first token of type PARTIAL
 222             going to the subsequent node. Such PARTIAL tokens are
 223             assumed to exist.
 224         """
 225         return [next(iter(self.get_tokens(TokenRange(i, i+1), TokenType.PARTIAL)))
 226                           for i in range(trange.start, trange.end)]
 227
 228
 229     def find_lookup_word_by_id(self, token: int) -> str:
 230         """ Find the first token with the given token ID and return
 231             its lookup word. Returns 'None' if no such token exists.
 232             The function is very slow and must only be used for
 233             debugging.
 234         """
 235         for node in self.nodes:
 236             for tlist in node.starting:
 237                 for t in tlist.tokens:
 238                     if t.token == token:
 239                         return f"[{tlist.ttype.name[0]}]{t.lookup_word}"
 240         return 'None'