src/nominatim_db/tokenizer/base.py

   1 # SPDX-License-Identifier: GPL-3.0-or-later
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2024 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Abstract class definitions for tokenizers. These base classes are here
   9 mainly for documentation purposes.
  10 """
  11 from abc import ABC, abstractmethod
  12 from typing import List, Tuple, Dict, Any, Optional, Iterable
  13 from pathlib import Path
  14
  15 from ..typing import Protocol
  16 from ..config import Configuration
  17 from ..db.connection import Connection
  18 from ..data.place_info import PlaceInfo
  19
  20
  21 class AbstractAnalyzer(ABC):
  22     """ The analyzer provides the functions for analysing names and building
  23         the token database.
  24
  25         Analyzers are instantiated on a per-thread base. Access to global data
  26         structures must be synchronised accordingly.
  27     """
  28
  29     def __enter__(self) -> 'AbstractAnalyzer':
  30         return self
  31
  32     def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
  33         self.close()
  34
  35     @abstractmethod
  36     def close(self) -> None:
  37         """ Free all resources used by the analyzer.
  38         """
  39
  40     @abstractmethod
  41     def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
  42         """ Return token information for the given list of words.
  43
  44             The function is used for testing and debugging only
  45             and does not need to be particularly efficient.
  46
  47             Arguments:
  48                 words: A list of words to look up the tokens for.
  49                        If a word starts with # it is assumed to be a full name
  50                        otherwise is a partial term.
  51
  52             Returns:
  53                 The function returns the list of all tuples that could be
  54                     found for the given words. Each list entry is a tuple of
  55                     (original word, word token, word id).
  56         """
  57
  58     @abstractmethod
  59     def normalize_postcode(self, postcode: str) -> str:
  60         """ Convert the postcode to its standardized form.
  61
  62             This function must yield exactly the same result as the SQL function
  63             `token_normalized_postcode()`.
  64
  65             Arguments:
  66                 postcode: The postcode to be normalized.
  67
  68             Returns:
  69                 The given postcode after normalization.
  70         """
  71
  72     @abstractmethod
  73     def update_postcodes_from_db(self) -> None:
  74         """ Update the tokenizer's postcode tokens from the current content
  75             of the `location_postcode` table.
  76         """
  77
  78     @abstractmethod
  79     def update_special_phrases(self,
  80                                phrases: Iterable[Tuple[str, str, str, str]],
  81                                should_replace: bool) -> None:
  82         """ Update the tokenizer's special phrase tokens from the given
  83             list of special phrases.
  84
  85             Arguments:
  86                 phrases: The new list of special phrases. Each entry is
  87                          a tuple of (phrase, class, type, operator).
  88                 should_replace: If true, replace the current list of phrases.
  89                                 When false, just add the given phrases to the
  90                                 ones that already exist.
  91         """
  92
  93     @abstractmethod
  94     def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
  95         """ Add the given names to the tokenizer's list of country tokens.
  96
  97             Arguments:
  98                 country_code: two-letter country code for the country the names
  99                               refer to.
 100                 names: Dictionary of name type to name.
 101         """
 102
 103     @abstractmethod
 104     def process_place(self, place: PlaceInfo) -> Any:
 105         """ Extract tokens for the given place and compute the
 106             information to be handed to the PL/pgSQL processor for building
 107             the search index.
 108
 109             Arguments:
 110                 place: Place information retrieved from the database.
 111
 112             Returns:
 113                 A JSON-serialisable structure that will be handed into
 114                     the database via the `token_info` field.
 115         """
 116
 117
 118 class AbstractTokenizer(ABC):
 119     """ The tokenizer instance is the central instance of the tokenizer in
 120         the system. There will only be a single instance of the tokenizer
 121         active at any time.
 122     """
 123
 124     @abstractmethod
 125     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
 126         """ Set up a new tokenizer for the database.
 127
 128             The function should copy all necessary data into the project
 129             directory or save it in the property table to make sure that
 130             the tokenizer remains stable over updates.
 131
 132             Arguments:
 133               config: Read-only object with configuration options.
 134
 135               init_db: When set to False, then initialisation of database
 136                 tables should be skipped. This option is only required for
 137                 migration purposes and can be safely ignored by custom
 138                 tokenizers.
 139         """
 140
 141     @abstractmethod
 142     def init_from_project(self, config: Configuration) -> None:
 143         """ Initialise the tokenizer from an existing database setup.
 144
 145             The function should load all previously saved configuration from
 146             the project directory and/or the property table.
 147
 148             Arguments:
 149               config: Read-only object with configuration options.
 150         """
 151
 152     @abstractmethod
 153     def finalize_import(self, config: Configuration) -> None:
 154         """ This function is called at the very end of an import when all
 155             data has been imported and indexed. The tokenizer may create
 156             at this point any additional indexes and data structures needed
 157             during query time.
 158
 159             Arguments:
 160               config: Read-only object with configuration options.
 161         """
 162
 163     @abstractmethod
 164     def update_sql_functions(self, config: Configuration) -> None:
 165         """ Update the SQL part of the tokenizer. This function is called
 166             automatically on migrations or may be called explicitly by the
 167             user through the `nominatim refresh --functions` command.
 168
 169             The tokenizer must only update the code of the tokenizer. The
 170             data structures or data itself must not be changed by this function.
 171
 172             Arguments:
 173               config: Read-only object with configuration options.
 174         """
 175
 176     @abstractmethod
 177     def check_database(self, config: Configuration) -> Optional[str]:
 178         """ Check that the database is set up correctly and ready for being
 179             queried.
 180
 181             Arguments:
 182               config: Read-only object with configuration options.
 183
 184             Returns:
 185               If an issue was found, return an error message with the
 186                   description of the issue as well as hints for the user on
 187                   how to resolve the issue. If everything is okay, return `None`.
 188         """
 189
 190     @abstractmethod
 191     def update_statistics(self, config: Configuration, threads: int = 1) -> None:
 192         """ Recompute any tokenizer statistics necessary for efficient lookup.
 193             This function is meant to be called from time to time by the user
 194             to improve performance. However, the tokenizer must not depend on
 195             it to be called in order to work.
 196         """
 197
 198     @abstractmethod
 199     def update_word_tokens(self) -> None:
 200         """ Do house-keeping on the tokenizers internal data structures.
 201             Remove unused word tokens, resort data etc.
 202         """
 203
 204     @abstractmethod
 205     def name_analyzer(self) -> AbstractAnalyzer:
 206         """ Create a new analyzer for tokenizing names and queries
 207             using this tokinzer. Analyzers are context managers and should
 208             be used accordingly:
 209
 210             ```
 211             with tokenizer.name_analyzer() as analyzer:
 212                 analyser.tokenize()
 213             ```
 214
 215             When used outside the with construct, the caller must ensure to
 216             call the close() function before destructing the analyzer.
 217         """
 218
 219     @abstractmethod
 220     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
 221         """ Return a list of the most frequent full words in the database.
 222
 223             Arguments:
 224               conn: Open connection to the database which may be used to
 225                     retrieve the words.
 226               num: Maximum number of words to return.
 227         """
 228
 229
 230 class TokenizerModule(Protocol):
 231     """ Interface that must be exported by modules that implement their
 232         own tokenizer.
 233     """
 234
 235     def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
 236         """ Factory for new tokenizers.
 237         """