Merge pull request #2757 from lonvia/filter-postcodes

[nominatim.git] / nominatim / tokenizer / sanitizers / tag_analyzer_by_language.py
diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py

index c98c825da8d66922fee20904c9813630f790b827..9a99d127728290264c7762f7c76fefb7177f3267 100644 (file)
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -1,8 +1,35 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
  """
-Name processor for tagging the langauge of the name
-"""
-import re
+This sanitizer sets the `analyzer` property depending on the
+language of the tag. The language is taken from the suffix of the name.
+If a name already has an analyzer tagged, then this is kept.
+
+Arguments:
+
+    filter-kind: Restrict the names the sanitizer should be applied to
+                 to the given tags. The parameter expects a list of
+                 regular expressions which are matched against 'kind'.
+                 Note that a match against the full string is expected.
+    whitelist: Restrict the set of languages that should be tagged.
+               Expects a list of acceptable suffixes. When unset,
+               all 2- and 3-letter lower-case codes are accepted.
+    use-defaults:  Configure what happens when the name has no suffix.
+                   When set to 'all', a variant is created for
+                   each of the default languages in the country
+                   the feature is in. When set to 'mono', a variant is
+                   only created when exactly one language is spoken
+                   in the country. The default is to do nothing with
+                   the default languages of a country.
+    mode: Define how the variants are created. May be 'replace' or
+          'append'. When set to 'append' the original name (without
+          any analyzer tagged) is retained. (default: replace)
  
+"""
  from nominatim.tools import country_info
  
  class _AnalyzerByLanguage:
@@ -10,35 +37,23 @@ class _AnalyzerByLanguage:
      """
  
      def __init__(self, config):
-        if 'filter-kind' in config:
-            self.regexes = [re.compile(regex) for regex in config['filter-kind']]
-        else:
-            self.regexes = None
-
-        self.use_defaults = config.get('use-defaults', 'no')
-        if self.use_defaults not in ('mono', 'all'):
-            self.use_defaults = False
-
+        self.filter_kind = config.get_filter_kind()
          self.replace = config.get('mode', 'replace') != 'append'
          self.whitelist = config.get('whitelist')
  
-        # Compute the languages to use when no suffix is given.
-        self.deflangs = {}
-        for ccode, prop in country_info.iterate():
-            clangs = prop['languages']
-            if len(clangs) == 1 or self.use_defaults == 'all':
-                if self.whitelist:
-                    self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
-                else:
-                    self.deflangs[ccode] = clangs
-
+        self._compute_default_languages(config.get('use-defaults', 'no'))
  
  
-    def _kind_matches(self, kind):
-        if self.regexes is None:
-            return True
+    def _compute_default_languages(self, use_defaults):
+        self.deflangs = {}
  
-        return any(regex.search(kind) for regex in self.regexes)
+        if use_defaults in ('mono', 'all'):
+            for ccode, clangs in country_info.iterate('languages'):
+                if len(clangs) == 1 or use_defaults == 'all':
+                    if self.whitelist:
+                        self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
+                    else:
+                        self.deflangs[ccode] = clangs
  
  
      def _suffix_matches(self, suffix):
@@ -55,14 +70,12 @@ class _AnalyzerByLanguage:
          more_names = []
  
          for name in (n for n in obj.names
-                     if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
+                     if not n.has_attr('analyzer') and self.filter_kind(n)):
              if name.suffix:
                  langs = [name.suffix] if self._suffix_matches(name.suffix) else None
              else:
-                if self.use_defaults:
-                    langs = self.deflangs.get(obj.place.country_code)
-                    if self.use_defaults == 'mono' and len(langs) > 1:
-                        langs = None
+                langs = self.deflangs.get(obj.place.country_code)
+
  
              if langs:
                  if self.replace:
@@ -77,24 +90,6 @@ class _AnalyzerByLanguage:
  
  def create(config):
      """ Create a function that sets the analyzer property depending on the
-        language of the tag. The language is taken from the suffix.
-
-        To restrict the set of languages that should be tagged, use
-        'whitelist'. A list of acceptable suffixes. When unset, all 2- and
-        3-letter codes are accepted.
-
-        'use-defaults' configures what happens when the name has no suffix
-        with a language tag. When set to 'all', a variant is created for
-        each on the spoken languages in the country the feature is in. When
-        set to 'mono', a variant is created, when only one language is spoken
-        in the country. The default is, to do nothing with the default languages
-        of a country.
-
-        'mode' hay be 'replace' (the default) or 'append' and configures if
-        the original name (without any analyzer tagged) is retained.
-
-        With 'filter-kind' the set of names the sanitizer should be applied
-        to can be retricted to the given patterns of 'kind'. It expects a
-        list of regular expression to be matched against 'kind'.
+        language of the tag.
      """
      return _AnalyzerByLanguage(config)