+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
"""
-Name processor for tagging the langauge of the name
-"""
-import re
+This sanitizer sets the `analyzer` property depending on the
+language of the tag. The language is taken from the suffix of the name.
+If a name already has an analyzer tagged, then that analyzer is kept.
+
+Arguments:
+
+ filter-kind: Restrict the names the sanitizer should be applied to
+ to the given tags. The parameter expects a list of
+ regular expressions which are matched against 'kind'.
+ Note that a match against the full string is expected.
+ whitelist: Restrict the set of languages that should be tagged.
+ Expects a list of acceptable suffixes. When unset,
+ all 2- and 3-letter lower-case codes are accepted.
+ use-defaults: Configure what happens when the name has no suffix.
+ When set to 'all', a variant is created for
+ each of the default languages in the country
+ the feature is in. When set to 'mono', a variant is
+ only created when exactly one language is spoken
+ in the country. The default is to do nothing with
+ the default languages of a country.
+ mode: Define how the variants are created. May be 'replace' or
+ 'append'. When set to 'append', the original name (without
+ any analyzer tagged) is retained. (default: 'replace')
+"""
from nominatim.tools import country_info
class _AnalyzerByLanguage:
"""
def __init__(self, config):
- if 'filter-kind' in config:
- self.regexes = [re.compile(regex) for regex in config['filter-kind']]
- else:
- self.regexes = None
-
- self.use_defaults = config.get('use-defaults', 'no')
- if self.use_defaults not in ('mono', 'all'):
- self.use_defaults = False
-
+ self.filter_kind = config.get_filter_kind()  # later called once per name as self.filter_kind(n); presumably built from 'filter-kind' - confirm in the config class
self.replace = config.get('mode', 'replace') != 'append'
self.whitelist = config.get('whitelist')
- # Compute the languages to use when no suffix is given.
- self.deflangs = {}
- for ccode, prop in country_info.iterate():
- clangs = prop['languages']
- if len(clangs) == 1 or self.use_defaults == 'all':
- if self.whitelist:
- self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
- else:
- self.deflangs[ccode] = clangs
-
+ self._compute_default_languages(config.get('use-defaults', 'no'))  # fills self.deflangs; must run after self.whitelist is set because the helper reads it
- def _kind_matches(self, kind):
- if self.regexes is None:
- return True
+ def _compute_default_languages(self, use_defaults):  # builds self.deflangs: country code -> languages used for names that carry no suffix
+ self.deflangs = {}
- return any(regex.search(kind) for regex in self.regexes)
+ if use_defaults in ('mono', 'all'):  # any other value (e.g. the default 'no') leaves the mapping empty
+ for ccode, clangs in country_info.iterate('languages'):
+ if len(clangs) == 1 or use_defaults == 'all':  # 'mono' keeps only countries with exactly one language
+ if self.whitelist:
+ self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]  # drop languages that are not whitelisted
+ else:
+ self.deflangs[ccode] = clangs
def _suffix_matches(self, suffix):
more_names = []
for name in (n for n in obj.names
- if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
+ if not n.has_attr('analyzer') and self.filter_kind(n)):
if name.suffix:
langs = [name.suffix] if self._suffix_matches(name.suffix) else None
else:
- if self.use_defaults:
- langs = self.deflangs.get(obj.place.country_code)
- if self.use_defaults == 'mono' and len(langs) > 1:
- langs = None
+ langs = self.deflangs.get(obj.place.country_code)
+
if langs:
if self.replace:
def create(config):
""" Create a function that sets the analyzer property depending on the
- language of the tag. The language is taken from the suffix.
-
- To restrict the set of languages that should be tagged, use
- 'whitelist'. A list of acceptable suffixes. When unset, all 2- and
- 3-letter codes are accepted.
-
- 'use-defaults' configures what happens when the name has no suffix
- with a language tag. When set to 'all', a variant is created for
- each on the spoken languages in the country the feature is in. When
- set to 'mono', a variant is created, when only one language is spoken
- in the country. The default is, to do nothing with the default languages
- of a country.
-
- 'mode' hay be 'replace' (the default) or 'append' and configures if
- the original name (without any analyzer tagged) is retained.
-
- With 'filter-kind' the set of names the sanitizer should be applied
- to can be retricted to the given patterns of 'kind'. It expects a
- list of regular expression to be matched against 'kind'.
+ language of the tag.
"""
return _AnalyzerByLanguage(config)