#
# This file is part of Nominatim. (https://nominatim.org)
#
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Abstract class definitions for tokenizers. These base classes are here
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable
-from pathlib import Path
from ..typing import Protocol
from ..config import Configuration
from ..db.connection import Connection
from ..data.place_info import PlaceInfo
+
class AbstractAnalyzer(ABC):
""" The analyzer provides the functions for analysing names and building
the token database.
def __enter__(self) -> 'AbstractAnalyzer':
return self
-
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
self.close()
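
    # Illustrative note: because __enter__/__exit__ are implemented above,
    # an analyzer is best used as a context manager, which guarantees that
    # close() runs even when an error occurs. `tokenizer` here stands for
    # any AbstractTokenizer implementation:
    #
    #     with tokenizer.name_analyzer() as analyzer:
    #         analyzer.update_postcodes_from_db()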
-
@abstractmethod
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
-
@abstractmethod
def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
(original word, word token, word id).
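
            Example (illustrative only; the exact tokens and ids depend on
            the tokenizer implementation and the database content):

                analyzer.get_word_token_info(['rose', 'lane'])
                # -> [('rose', 'rose', 1023), ('lane', 'lane', 77)]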
"""
-
@abstractmethod
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to its standardized form.
The given postcode after normalization.
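
            Example (illustrative; the actual normalization rules are
            implementation-defined):

                analyzer.normalize_postcode(' ec1a 1bb ')
                # -> 'EC1A 1BB'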
"""
-
@abstractmethod
def update_postcodes_from_db(self) -> None:
""" Update the tokenizer's postcode tokens from the current content
of the `location_postcode` table.
"""
-
@abstractmethod
    def update_special_phrases(self,
                               phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Update the tokenizer's special phrase tokens from the given
            list of special phrases.

            Arguments:
                phrases: The new list of special phrases. Each entry is
                         a tuple of (phrase, class, type, operator).
                should_replace: If true, replace the current list of phrases
                                with the new list. If false, just add the
                                given phrases to the ones that already exist.
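
            Example (illustrative phrase tuple, using '-' for "no
            operator"):

                analyzer.update_special_phrases(
                    [('bus stop', 'highway', 'bus_stop', '-')],
                    should_replace=True)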
"""
-
@abstractmethod
def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
""" Add the given names to the tokenizer's list of country tokens.
names: Dictionary of name type to name.
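
            Example (illustrative):

                analyzer.add_country_names('de', {'name': 'Deutschland',
                                                  'name:en': 'Germany'})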
"""
-
@abstractmethod
def process_place(self, place: PlaceInfo) -> Any:
""" Extract tokens for the given place and compute the
"""
-
class AbstractTokenizer(ABC):
""" The tokenizer instance is the central instance of the tokenizer in
        the system. There will only be a single instance of the tokenizer
        active at any time.
"""
-
@abstractmethod
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from an existing database setup.
config: Read-only object with configuration options.
"""
-
@abstractmethod
def finalize_import(self, config: Configuration) -> None:
""" This function is called at the very end of an import when all
config: Read-only object with configuration options.
"""
-
@abstractmethod
def update_sql_functions(self, config: Configuration) -> None:
""" Update the SQL part of the tokenizer. This function is called
config: Read-only object with configuration options.
"""
-
@abstractmethod
def check_database(self, config: Configuration) -> Optional[str]:
""" Check that the database is set up correctly and ready for being
how to resolve the issue. If everything is okay, return `None`.
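
            Example of a caller acting on the result (illustrative;
            UsageError stands in for whatever error handling the caller
            uses):

                msg = tokenizer.check_database(config)
                if msg is not None:
                    raise UsageError(msg)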
"""
-
@abstractmethod
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute any tokenizer statistics necessary for efficient lookup.
it to be called in order to work.
"""
-
@abstractmethod
def update_word_tokens(self) -> None:
""" Do house-keeping on the tokenizers internal data structures.
Remove unused word tokens, resort data etc.
"""
-
@abstractmethod
def name_analyzer(self) -> AbstractAnalyzer:
""" Create a new analyzer for tokenizing names and queries
call the close() function before destructing the analyzer.
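
            Typical use (illustrative, with `place` being a PlaceInfo):

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)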
"""
-
@abstractmethod
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the most frequent full words in the database.
own tokenizer.
"""
- def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
+ def create(self, dsn: str) -> AbstractTokenizer:
""" Factory for new tokenizers.
"""