From: Sarah Hoffmann <lonvia@denofr.de>
Date: Tue, 12 Jul 2022 21:15:19 +0000 (+0200)
Subject: add type hints for sanitizers
X-Git-Tag: v4.1.0~10^2~17
X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/62eedbb8f6ddb7928022d799a1b1fea62844faf8?hp=5617bffe2fcf527365bcd801304180e1cc07cb98

add type hints for sanitizers
---

diff --git a/nominatim/data/postcode_format.py b/nominatim/data/postcode_format.py
index 99065965..dad35b7a 100644
--- a/nominatim/data/postcode_format.py
+++ b/nominatim/data/postcode_format.py
@@ -79,7 +79,7 @@ class PostcodeFormatter:
         self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
 
 
-    def get_matcher(self, country_code: str) -> Optional[CountryPostcodeMatcher]:
+    def get_matcher(self, country_code: Optional[str]) -> Optional[CountryPostcodeMatcher]:
         """ Return the CountryPostcodeMatcher for the given country.
             Returns None if the country doesn't have a postcode and the
             default matcher if there is no specific matcher configured for
@@ -88,10 +88,12 @@ class PostcodeFormatter:
         if country_code in self.country_without_postcode:
             return None
 
+        assert country_code is not None
+
         return self.country_matcher.get(country_code, self.default_matcher)
 
 
-    def match(self, country_code: str, postcode: str) -> Optional[Match[str]]:
+    def match(self, country_code: Optional[str], postcode: str) -> Optional[Match[str]]:
         """ Match the given postcode against the postcode pattern for this
             matcher. Returns a `re.Match` object if the country has a pattern
             and the match was successful or None if the match failed.
@@ -99,6 +101,8 @@ class PostcodeFormatter:
         if country_code in self.country_without_postcode:
             return None
 
+        assert country_code is not None
+
         return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
 
 
diff --git a/nominatim/tokenizer/place_sanitizer.py b/nominatim/tokenizer/place_sanitizer.py
index 913b363c..3f548e06 100644
--- a/nominatim/tokenizer/place_sanitizer.py
+++ b/nominatim/tokenizer/place_sanitizer.py
@@ -8,100 +8,13 @@
 Handler for cleaning name and address tags in place information before it
 is handed to the token analysis.
 """
+from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
 import importlib
 
 from nominatim.errors import UsageError
 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
-
-class PlaceName:
-    """ A searchable name for a place together with properties.
-        Every name object saves the name proper and two basic properties:
-        * 'kind' describes the name of the OSM key used without any suffixes
-          (i.e. the part after the colon removed)
-        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
-          is the part of the key after the first colon.
-        In addition to that, the name may have arbitrary additional attributes.
-        Which attributes are used, depends on the token analyser.
-    """
-
-    def __init__(self, name, kind, suffix):
-        self.name = name
-        self.kind = kind
-        self.suffix = suffix
-        self.attr = {}
-
-
-    def __repr__(self):
-        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
-
-
-    def clone(self, name=None, kind=None, suffix=None, attr=None):
-        """ Create a deep copy of the place name, optionally with the
-            given parameters replaced. In the attribute list only the given
-            keys are updated. The list is not replaced completely.
-            In particular, the function cannot to be used to remove an
-            attribute from a place name.
-        """
-        newobj = PlaceName(name or self.name,
-                           kind or self.kind,
-                           suffix or self.suffix)
-
-        newobj.attr.update(self.attr)
-        if attr:
-            newobj.attr.update(attr)
-
-        return newobj
-
-
-    def set_attr(self, key, value):
-        """ Add the given property to the name. If the property was already
-            set, then the value is overwritten.
-        """
-        self.attr[key] = value
-
-
-    def get_attr(self, key, default=None):
-        """ Return the given property or the value of 'default' if it
-            is not set.
-        """
-        return self.attr.get(key, default)
-
-
-    def has_attr(self, key):
-        """ Check if the given attribute is set.
-        """
-        return key in self.attr
-
-
-class _ProcessInfo:
-    """ Container class for information handed into to handler functions.
-        The 'names' and 'address' members are mutable. A handler must change
-        them by either modifying the lists place or replacing the old content
-        with a new list.
-    """
-
-    def __init__(self, place):
-        self.place = place
-        self.names = self._convert_name_dict(place.name)
-        self.address = self._convert_name_dict(place.address)
-
-
-    @staticmethod
-    def _convert_name_dict(names):
-        """ Convert a dictionary of names into a list of PlaceNames.
-            The dictionary key is split into the primary part of the key
-            and the suffix (the part after an optional colon).
-        """
-        out = []
-
-        if names:
-            for key, value in names.items():
-                parts = key.split(':', 1)
-                out.append(PlaceName(value.strip(),
-                                     parts[0].strip(),
-                                     parts[1].strip() if len(parts) > 1 else None))
-
-        return out
+from nominatim.tokenizer.sanitizers.base import SanitizerHandler, ProcessInfo, PlaceName
+from nominatim.data.place_info import PlaceInfo
 
 
 class PlaceSanitizer:
@@ -109,24 +22,24 @@ class PlaceSanitizer:
         names and address before they are used by the token analysers.
     """
 
-    def __init__(self, rules):
-        self.handlers = []
+    def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]]) -> None:
+        self.handlers: List[Callable[[ProcessInfo], None]] = []
 
         if rules:
             for func in rules:
                 if 'step' not in func:
                     raise UsageError("Sanitizer rule is missing the 'step' attribute.")
                 module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
-                handler_module = importlib.import_module(module_name)
+                handler_module: SanitizerHandler = importlib.import_module(module_name)
                 self.handlers.append(handler_module.create(SanitizerConfig(func)))
 
 
-    def process_names(self, place):
+    def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
         """ Extract a sanitized list of names and address parts from the
             given place. The function returns a tuple
             (list of names, list of address names)
         """
-        obj = _ProcessInfo(place)
+        obj = ProcessInfo(place)
 
         for func in self.handlers:
             func(obj)
diff --git a/nominatim/tokenizer/sanitizers/base.py b/nominatim/tokenizer/sanitizers/base.py
new file mode 100644
index 00000000..f2e1bc41
--- /dev/null
+++ b/nominatim/tokenizer/sanitizers/base.py
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Common data types and protocols for sanitizers.
+"""
+from typing import Optional, Dict, List, Mapping, Callable
+
+from typing_extensions import Protocol, Final
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+from nominatim.data.place_info import PlaceInfo
+
+class PlaceName:
+    """ A searchable name for a place together with properties.
+        Every name object saves the name proper and two basic properties:
+        * 'kind' describes the name of the OSM key used without any suffixes
+          (i.e. the part after the colon removed)
+        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
+          is the part of the key after the first colon.
+        In addition to that, the name may have arbitrary additional attributes.
+        Which attributes are used, depends on the token analyser.
+    """
+
+    def __init__(self, name: str, kind: str, suffix: Optional[str]):
+        self.name = name
+        self.kind = kind
+        self.suffix = suffix
+        self.attr: Dict[str, str] = {}
+
+
+    def __repr__(self) -> str:
+        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
+
+
+    def clone(self, name: Optional[str] = None,
+              kind: Optional[str] = None,
+              suffix: Optional[str] = None,
+              attr: Optional[Mapping[str, str]] = None) -> 'PlaceName':
+        """ Create a deep copy of the place name, optionally with the
+            given parameters replaced. In the attribute list only the given
+            keys are updated. The list is not replaced completely.
+            In particular, the function cannot to be used to remove an
+            attribute from a place name.
+        """
+        newobj = PlaceName(name or self.name,
+                           kind or self.kind,
+                           suffix or self.suffix)
+
+        newobj.attr.update(self.attr)
+        if attr:
+            newobj.attr.update(attr)
+
+        return newobj
+
+
+    def set_attr(self, key: str, value: str) -> None:
+        """ Add the given property to the name. If the property was already
+            set, then the value is overwritten.
+        """
+        self.attr[key] = value
+
+
+    def get_attr(self, key: str, default: Optional[str] = None) -> Optional[str]:
+        """ Return the given property or the value of 'default' if it
+            is not set.
+        """
+        return self.attr.get(key, default)
+
+
+    def has_attr(self, key: str) -> bool:
+        """ Check if the given attribute is set.
+        """
+        return key in self.attr
+
+
+class ProcessInfo:
+    """ Container class for information handed into to handler functions.
+        The 'names' and 'address' members are mutable. A handler must change
+        them by either modifying the lists place or replacing the old content
+        with a new list.
+    """
+
+    def __init__(self, place: PlaceInfo):
+        self.place: Final = place
+        self.names = self._convert_name_dict(place.name)
+        self.address = self._convert_name_dict(place.address)
+
+
+    @staticmethod
+    def _convert_name_dict(names: Optional[Mapping[str, str]]) -> List[PlaceName]:
+        """ Convert a dictionary of names into a list of PlaceNames.
+            The dictionary key is split into the primary part of the key
+            and the suffix (the part after an optional colon).
+        """
+        out = []
+
+        if names:
+            for key, value in names.items():
+                parts = key.split(':', 1)
+                out.append(PlaceName(value.strip(),
+                                     parts[0].strip(),
+                                     parts[1].strip() if len(parts) > 1 else None))
+
+        return out
+
+
+class SanitizerHandler(Protocol):
+    """ Protocol for sanitizer modules.
+    """
+
+    def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
+        """
+        A sanitizer must define a single function `create`. It takes the
+        dictionary with the configuration information for the sanitizer and
+        returns a function that transforms name and address.
+        """
diff --git a/nominatim/tokenizer/sanitizers/clean_housenumbers.py b/nominatim/tokenizer/sanitizers/clean_housenumbers.py
index c229716f..5df057d0 100644
--- a/nominatim/tokenizer/sanitizers/clean_housenumbers.py
+++ b/nominatim/tokenizer/sanitizers/clean_housenumbers.py
@@ -24,11 +24,15 @@ Arguments:
                      or a list of strings, where each string is a regular
                      expression that must match the full house number value.
 """
+from typing import Callable, Iterator, List
 import re
 
+from nominatim.tokenizer.sanitizers.base import ProcessInfo, PlaceName
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+
 class _HousenumberSanitizer:
 
-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
         self.filter_kind = config.get_filter_kind('housenumber')
         self.split_regexp = config.get_delimiter()
 
@@ -37,13 +41,13 @@ class _HousenumberSanitizer:
 
 
 
-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
         if not obj.address:
             return
 
-        new_address = []
+        new_address: List[PlaceName] = []
         for item in obj.address:
-            if self.filter_kind(item):
+            if self.filter_kind(item.kind):
                 if self._treat_as_name(item.name):
                     obj.names.append(item.clone(kind='housenumber'))
                 else:
@@ -56,7 +60,7 @@ class _HousenumberSanitizer:
         obj.address = new_address
 
 
-    def sanitize(self, value):
+    def sanitize(self, value: str) -> Iterator[str]:
         """ Extract housenumbers in a regularized format from an OSM value.
 
             The function works as a generator that yields all valid housenumbers
@@ -67,16 +71,15 @@ class _HousenumberSanitizer:
                 yield from self._regularize(hnr)
 
 
-    @staticmethod
-    def _regularize(hnr):
+    def _regularize(self, hnr: str) -> Iterator[str]:
         yield hnr
 
 
-    def _treat_as_name(self, housenumber):
+    def _treat_as_name(self, housenumber: str) -> bool:
         return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)
 
 
-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
     """ Create a housenumber processing function.
     """
 
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index 05e90ca1..cabacff4 100644
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -20,11 +20,15 @@ Arguments:
                         objects that have no country assigned. These are always
                         assumed to have no postcode.
 """
+from typing import Callable, Optional, Tuple
+
 from nominatim.data.postcode_format import PostcodeFormatter
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 
 class _PostcodeSanitizer:
 
-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
         self.convert_to_address = config.get_bool('convert-to-address', True)
         self.matcher = PostcodeFormatter()
 
@@ -33,7 +37,7 @@ class _PostcodeSanitizer:
             self.matcher.set_default_pattern(default_pattern)
 
 
-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
         if not obj.address:
             return
 
@@ -52,7 +56,7 @@ class _PostcodeSanitizer:
                 postcode.set_attr('variant', formatted[1])
 
 
-    def scan(self, postcode, country):
+    def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
         """ Check the postcode for correct formatting and return the
             normalized version. Returns None if the postcode does not
             correspond to the oficial format of the given country.
@@ -61,13 +65,15 @@ class _PostcodeSanitizer:
         if match is None:
             return None
 
+        assert country is not None
+
         return self.matcher.normalize(country, match),\
                ' '.join(filter(lambda p: p is not None, match.groups()))
 
 
 
 
-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
     """ Create a housenumber processing function.
     """
 
diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py
index ce5ce1eb..fd05848b 100644
--- a/nominatim/tokenizer/sanitizers/config.py
+++ b/nominatim/tokenizer/sanitizers/config.py
@@ -7,20 +7,28 @@
 """
 Configuration for Sanitizers.
 """
+from typing import Sequence, Optional, Pattern, Callable, Any, TYPE_CHECKING
 from collections import UserDict
 import re
 
 from nominatim.errors import UsageError
 
-class SanitizerConfig(UserDict):
+# working around missing generics in Python < 3.8
+# See https://github.com/python/typing/issues/60#issuecomment-869757075
+if TYPE_CHECKING:
+    _BaseUserDict = UserDict[str, Any]
+else:
+    _BaseUserDict = UserDict
+
+class SanitizerConfig(_BaseUserDict):
     """ Dictionary with configuration options for a sanitizer.
 
-        In addition to the usualy dictionary function, the class provides
+        In addition to the usual dictionary function, the class provides
         accessors to standard sanatizer options that are used by many of the
         sanitizers.
     """
 
-    def get_string_list(self, param, default=tuple()):
+    def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
         """ Extract a configuration parameter as a string list.
             If the parameter value is a simple string, it is returned as a
             one-item list. If the parameter value does not exist, the given
@@ -44,7 +52,7 @@ class SanitizerConfig(UserDict):
         return values
 
 
-    def get_bool(self, param, default=None):
+    def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
         """ Extract a configuration parameter as a boolean.
             The parameter must be one of the yaml boolean values or an
             user error will be raised. If `default` is given, then the parameter
@@ -58,7 +66,7 @@ class SanitizerConfig(UserDict):
         return value
 
 
-    def get_delimiter(self, default=',;'):
+    def get_delimiter(self, default: str = ',;') -> Pattern[str]:
         """ Return the 'delimiter' parameter in the configuration as a
             compiled regular expression that can be used to split the names on the
             delimiters. The regular expression makes sure that the resulting names
@@ -76,7 +84,7 @@ class SanitizerConfig(UserDict):
         return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
 
 
-    def get_filter_kind(self, *default):
+    def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
         """ Return a filter function for the name kind from the 'filter-kind'
             config parameter. The filter functions takes a name item and returns
             True when the item passes the filter.
@@ -93,4 +101,4 @@ class SanitizerConfig(UserDict):
 
         regexes = [re.compile(regex) for regex in filters]
 
-        return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
+        return lambda name: any(regex.fullmatch(name) for regex in regexes)
diff --git a/nominatim/tokenizer/sanitizers/split_name_list.py b/nominatim/tokenizer/sanitizers/split_name_list.py
index c9db0a9d..7d0667b4 100644
--- a/nominatim/tokenizer/sanitizers/split_name_list.py
+++ b/nominatim/tokenizer/sanitizers/split_name_list.py
@@ -11,13 +11,18 @@ Arguments:
     delimiters: Define the set of characters to be used for
                 splitting the list. (default: ',;')
 """
-def create(config):
+from typing import Callable
+
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
     """ Create a name processing function that splits name values with
         multiple values into their components.
     """
     regexp = config.get_delimiter()
 
-    def _process(obj):
+    def _process(obj: ProcessInfo) -> None:
         if not obj.names:
             return
 
diff --git a/nominatim/tokenizer/sanitizers/strip_brace_terms.py b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
index f8cdd035..119d5693 100644
--- a/nominatim/tokenizer/sanitizers/strip_brace_terms.py
+++ b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
@@ -9,12 +9,17 @@ This sanitizer creates additional name variants for names that have
 addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
 only the main name part with the bracket part removed.
 """
+from typing import Callable
 
-def create(_):
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+
+
+def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
     """ Create a name processing function that creates additional name variants
         for bracket addendums.
     """
-    def _process(obj):
+    def _process(obj: ProcessInfo) -> None:
         """ Add variants for names that have a bracket extension.
         """
         if obj.names:
diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
index d3413c1a..6d6430f0 100644
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -30,13 +30,17 @@ Arguments:
           any analyzer tagged) is retained. (default: replace)
 
 """
+from typing import Callable, Dict, Optional, List
+
 from nominatim.data import country_info
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 
 class _AnalyzerByLanguage:
     """ Processor for tagging the language of names in a place.
     """
 
-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
         self.filter_kind = config.get_filter_kind()
         self.replace = config.get('mode', 'replace') != 'append'
         self.whitelist = config.get('whitelist')
@@ -44,8 +48,8 @@ class _AnalyzerByLanguage:
         self._compute_default_languages(config.get('use-defaults', 'no'))
 
 
-    def _compute_default_languages(self, use_defaults):
-        self.deflangs = {}
+    def _compute_default_languages(self, use_defaults: str) -> None:
+        self.deflangs: Dict[Optional[str], List[str]] = {}
 
         if use_defaults in ('mono', 'all'):
             for ccode, clangs in country_info.iterate('languages'):
@@ -56,21 +60,21 @@ class _AnalyzerByLanguage:
                         self.deflangs[ccode] = clangs
 
 
-    def _suffix_matches(self, suffix):
+    def _suffix_matches(self, suffix: str) -> bool:
         if self.whitelist is None:
             return len(suffix) in (2, 3) and suffix.islower()
 
         return suffix in self.whitelist
 
 
-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
         if not obj.names:
             return
 
         more_names = []
 
         for name in (n for n in obj.names
-                     if not n.has_attr('analyzer') and self.filter_kind(n)):
+                     if not n.has_attr('analyzer') and self.filter_kind(n.kind)):
             if name.suffix:
                 langs = [name.suffix] if self._suffix_matches(name.suffix) else None
             else:
@@ -88,7 +92,7 @@ class _AnalyzerByLanguage:
         obj.names.extend(more_names)
 
 
-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
     """ Create a function that sets the analyzer property depending on the
         language of the tag.
     """
diff --git a/test/python/tokenizer/sanitizers/test_sanitizer_config.py b/test/python/tokenizer/sanitizers/test_sanitizer_config.py
index 02794776..0dbbc7a0 100644
--- a/test/python/tokenizer/sanitizers/test_sanitizer_config.py
+++ b/test/python/tokenizer/sanitizers/test_sanitizer_config.py
@@ -82,32 +82,32 @@ def test_create_split_regex_empty_delimiter():
 def test_create_kind_filter_no_params(inp):
     filt = SanitizerConfig().get_filter_kind()
 
-    assert filt(PlaceName('something', inp, ''))
+    assert filt(inp)
 
 
 @pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
 def test_create_kind_filter_custom_regex_positive(kind):
     filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
 
-    assert filt(PlaceName('something', kind, ''))
+    assert filt(kind)
 
 
 @pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
 def test_create_kind_filter_custom_regex_negative(kind):
     filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
 
-    assert not filt(PlaceName('something', kind, ''))
+    assert not filt(kind)
 
 
 @pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
 def test_create_kind_filter_many_positive(kind):
     filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
 
-    assert filt(PlaceName('something', kind, ''))
+    assert filt(kind)
 
 
 @pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
 def test_create_kind_filter_many_negative(kind):
     filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
 
-    assert not filt(PlaceName('something', kind, ''))
+    assert not filt(kind)