#
# This file is part of Nominatim. (https://nominatim.org)
#
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Abstract class definitions for tokenizers. These base classes are here
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable
-from pathlib import Path
from ..typing import Protocol
from ..config import Configuration
from ..db.connection import Connection
from ..data.place_info import PlaceInfo
+
class AbstractAnalyzer(ABC):
""" The analyzer provides the functions for analysing names and building
the token database.
def __enter__(self) -> 'AbstractAnalyzer':
return self
-
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
self.close()
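
    # Illustrative note: because __enter__/__exit__ are implemented above,
    # an analyzer is best used as a context manager, which guarantees that
    # close() runs even when an error occurs. `tokenizer` here stands for
    # any AbstractTokenizer implementation:
    #
    #     with tokenizer.name_analyzer() as analyzer:
    #         analyzer.update_postcodes_from_db()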
-
@abstractmethod
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
-
@abstractmethod
def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
(original word, word token, word id).
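
            Example (illustrative only; the exact tokens and ids depend on
            the tokenizer implementation and the database content):

                analyzer.get_word_token_info(['rose', 'lane'])
                # -> [('rose', 'rose', 1023), ('lane', 'lane', 77)]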
"""
-
@abstractmethod
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to its standardized form.
The given postcode after normalization.
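
            Example (illustrative; the actual normalization rules are
            implementation-defined):

                analyzer.normalize_postcode(' ec1a 1bb ')
                # -> 'EC1A 1BB'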
"""
-
@abstractmethod
def update_postcodes_from_db(self) -> None:
""" Update the tokenizer's postcode tokens from the current content
of the `location_postcode` table.
"""
-
@abstractmethod
    def update_special_phrases(self,
                               phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Update the tokenizer's special phrase tokens from the given
            list of special phrases.

            Arguments:
                phrases: The new list of special phrases. Each entry is
                         a tuple of (phrase, class, type, operator).
                should_replace: If true, replace the current list of phrases
                                with the new list. If false, just add the
                                given phrases to the ones that already exist.
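
            Example (illustrative phrase tuple, using '-' for "no
            operator"):

                analyzer.update_special_phrases(
                    [('bus stop', 'highway', 'bus_stop', '-')],
                    should_replace=True)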
"""
-
@abstractmethod
def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
""" Add the given names to the tokenizer's list of country tokens.
names: Dictionary of name type to name.
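
            Example (illustrative):

                analyzer.add_country_names('de', {'name': 'Deutschland',
                                                  'name:en': 'Germany'})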
"""
-
@abstractmethod
def process_place(self, place: PlaceInfo) -> Any:
""" Extract tokens for the given place and compute the
"""
-
class AbstractTokenizer(ABC):
""" The tokenizer instance is the central instance of the tokenizer in
        the system. There will only be a single instance of the tokenizer
        active at any time.
"""
-
@abstractmethod
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from an existing database setup.
config: Read-only object with configuration options.
"""
-
@abstractmethod
def finalize_import(self, config: Configuration) -> None:
""" This function is called at the very end of an import when all
config: Read-only object with configuration options.
"""
-
@abstractmethod
def update_sql_functions(self, config: Configuration) -> None:
""" Update the SQL part of the tokenizer. This function is called
config: Read-only object with configuration options.
"""
-
@abstractmethod
def check_database(self, config: Configuration) -> Optional[str]:
""" Check that the database is set up correctly and ready for being
how to resolve the issue. If everything is okay, return `None`.
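
            Example of a caller acting on the result (illustrative;
            UsageError stands in for whatever error handling the caller
            uses):

                msg = tokenizer.check_database(config)
                if msg is not None:
                    raise UsageError(msg)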
"""
-
@abstractmethod
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute any tokenizer statistics necessary for efficient lookup.
it to be called in order to work.
"""
-
@abstractmethod
def update_word_tokens(self) -> None:
""" Do house-keeping on the tokenizers internal data structures.
Remove unused word tokens, resort data etc.
"""
-
@abstractmethod
def name_analyzer(self) -> AbstractAnalyzer:
""" Create a new analyzer for tokenizing names and queries
call the close() function before destructing the analyzer.
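
            Typical use (illustrative, with `place` being a PlaceInfo):

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)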
"""
-
@abstractmethod
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the most frequent full words in the database.
own tokenizer.
"""
- def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
+ def create(self, dsn: str) -> AbstractTokenizer:
""" Factory for new tokenizers.
"""