nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py

   1 """
   2 Name processor for tagging the language of the name
   3 """
   4 import re
   5
   6 from nominatim.tools import country_info
   7
   8 class _AnalyzerByLanguage:
   9     """ Processor for tagging the language of names in a place.
  10     """
  11
  12     def __init__(self, config):
  13         if 'filter-kind' in config:
  14             self.regexes = [re.compile(regex) for regex in config['filter-kind']]
  15         else:
  16             self.regexes = None
  17
  18         self.use_defaults = config.get('use-defaults', 'no')
  19         if self.use_defaults not in ('mono', 'all'):
  20             self.use_defaults = False
  21
  22         self.replace = config.get('mode', 'replace') != 'append'
  23         self.whitelist = config.get('whitelist')
  24
  25         # Compute the languages to use when no suffix is given.
  26         self.deflangs = {}
  27         for ccode, prop in country_info.iterate():
  28             clangs = prop['languages']
  29             if len(clangs) == 1 or self.use_defaults == 'all':
  30                 if self.whitelist:
  31                     self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
  32                 else:
  33                     self.deflangs[ccode] = clangs
  34
  35
  36
  37     def _kind_matches(self, kind):
  38         if self.regexes is None:
  39             return True
  40
  41         return any(regex.search(kind) for regex in self.regexes)
  42
  43
  44     def _suffix_matches(self, suffix):
  45         if self.whitelist is None:
  46             return len(suffix) in (2, 3) and suffix.islower()
  47
  48         return suffix in self.whitelist
  49
  50
  51     def __call__(self, obj):
  52         if not obj.names:
  53             return
  54
  55         more_names = []
  56
  57         for name in (n for n in obj.names
  58                      if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
  59             if name.suffix:
  60                 langs = [name.suffix] if self._suffix_matches(name.suffix) else None
  61             else:
  62                 if self.use_defaults:
  63                     langs = self.deflangs.get(obj.place.country_code)
  64                     if self.use_defaults == 'mono' and len(langs) > 1:
  65                         langs = None
  66
  67             if langs:
  68                 if self.replace:
  69                     name.set_attr('analyzer', langs[0])
  70                 else:
  71                     more_names.append(name.clone(attr={'analyzer': langs[0]}))
  72
  73                 more_names.extend(name.clone(attr={'analyzer': l}) for l in langs[1:])
  74
  75         obj.names.extend(more_names)
  76
  77
  78 def create(config):
  79     """ Create a function that sets the analyzer property depending on the
  80         language of the tag. The language is taken from the suffix.
  81
  82         To restrict the set of languages that should be tagged, use
  83         'whitelist'. A list of acceptable suffixes. When unset, all 2- and
  84         3-letter codes are accepted.
  85
  86         'use-defaults' configures what happens when the name has no suffix
  87         with a language tag. When set to 'all', a variant is created for
  88         each on the spoken languages in the country the feature is in. When
  89         set to 'mono', a variant is created, when only one language is spoken
  90         in the country. The default is, to do nothing with the default languages
  91         of a country.
  92
  93         'mode' hay be 'replace' (the default) or 'append' and configures if
  94         the original name (without any analyzer tagged) is retained.
  95
  96         With 'filter-kind' the set of names the sanitizer should be applied
  97         to can be retricted to the given patterns of 'kind'. It expects a
  98         list of regular expression to be matched against 'kind'.
  99     """
 100     return _AnalyzerByLanguage(config)