]> git.openstreetmap.org Git - nominatim.git/commitdiff
apply variants by languages
authorSarah Hoffmann <lonvia@denofr.de>
Tue, 5 Oct 2021 15:18:10 +0000 (17:18 +0200)
committerSarah Hoffmann <lonvia@denofr.de>
Wed, 6 Oct 2021 09:09:54 +0000 (11:09 +0200)
Adds a tagger for names by language so that the analyzer of that
language is used. Variants are thus applied only to names in the
specific language and only to name tags, no longer to
reference-like tags.

nominatim/tokenizer/icu_rule_loader.py
nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py [new file with mode: 0644]
nominatim/tokenizer/token_analysis/generic.py
nominatim/tools/country_info.py
settings/country_settings.yaml
settings/icu_tokenizer.yaml
test/bdd/db/query/normalization.feature
test/python/tokenizer/token_analysis/test_generic.py

index 361b67d46c8411eb0ec2d4da28f6c84ce1573cc0..b3e9c4c7d3daa6e49c5cb3452a15ee5936487d82 100644 (file)
@@ -11,6 +11,7 @@ from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
+import nominatim.tools.country_info
 
 LOG = logging.getLogger()
 
@@ -38,6 +39,9 @@ class ICURuleLoader:
         rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                               config='TOKENIZER_CONFIG')
 
+        # Make sure country information is available to analyzers and sanitizers.
+        nominatim.tools.country_info.setup_country_config(config)
+
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
         self.analysis_rules = _get_section(rules, 'token-analysis')
diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
new file mode 100644 (file)
index 0000000..c98c825
--- /dev/null
@@ -0,0 +1,100 @@
+"""
+Name processor for tagging the language of the name
+"""
+import re
+
+from nominatim.tools import country_info
+
class _AnalyzerByLanguage:
    """ Processor for tagging the language of names in a place.

        Sets the 'analyzer' attribute on name entries whose language can be
        determined, either from an explicit language suffix on the name or
        from the languages spoken in the country the place belongs to.
    """

    def __init__(self, config):
        # Optional regexes restricting which name kinds get tagged.
        if 'filter-kind' in config:
            self.regexes = [re.compile(regex) for regex in config['filter-kind']]
        else:
            self.regexes = None

        # 'use-defaults' may be 'mono', 'all' or disabled (False).
        self.use_defaults = config.get('use-defaults', 'no')
        if self.use_defaults not in ('mono', 'all'):
            self.use_defaults = False

        # 'replace' mode tags the original name, 'append' adds tagged copies.
        self.replace = config.get('mode', 'replace') != 'append'
        self.whitelist = config.get('whitelist')

        # Compute the languages to use when no suffix is given.
        self.deflangs = {}
        for ccode, prop in country_info.iterate():
            clangs = prop['languages']
            if len(clangs) == 1 or self.use_defaults == 'all':
                if self.whitelist:
                    self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
                else:
                    self.deflangs[ccode] = clangs


    def _kind_matches(self, kind):
        """ Check if the name kind passes the 'filter-kind' restriction. """
        if self.regexes is None:
            return True

        return any(regex.search(kind) for regex in self.regexes)


    def _suffix_matches(self, suffix):
        """ Check if the suffix is an acceptable language code. """
        if self.whitelist is None:
            # Without a whitelist accept any lower-case 2- or 3-letter code.
            return len(suffix) in (2, 3) and suffix.islower()

        return suffix in self.whitelist


    def __call__(self, obj):
        """ Tag all matching names of the given place object. """
        if not obj.names:
            return

        more_names = []

        for name in (n for n in obj.names
                     if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
            if name.suffix:
                # Explicit language suffix on the name wins.
                langs = [name.suffix] if self._suffix_matches(name.suffix) else None
            elif self.use_defaults:
                # Fall back to the languages of the country, if known.
                langs = self.deflangs.get(obj.place.country_code)
                # BUG FIX: guard against an unknown country (langs is None
                # here) before taking len() in 'mono' mode.
                if langs and self.use_defaults == 'mono' and len(langs) > 1:
                    langs = None
            else:
                # BUG FIX: 'langs' was previously left unbound on this path,
                # causing a NameError for suffix-less names when
                # 'use-defaults' is disabled.
                langs = None

            if langs:
                if self.replace:
                    name.set_attr('analyzer', langs[0])
                else:
                    more_names.append(name.clone(attr={'analyzer': langs[0]}))

                more_names.extend(name.clone(attr={'analyzer': l}) for l in langs[1:])

        obj.names.extend(more_names)
+
+
def create(config):
    """ Create a function that sets the analyzer property depending on the
        language of the tag. The language is taken from the suffix.

        To restrict the set of languages that should be tagged, use
        'whitelist', a list of acceptable suffixes. When unset, all 2- and
        3-letter lower-case codes are accepted.

        'use-defaults' configures what happens when the name has no suffix
        with a language tag. When set to 'all', a variant is created for
        each of the languages spoken in the country the feature is in. When
        set to 'mono', a variant is created only when a single language is
        spoken in the country. The default is to do nothing with the default
        languages of a country.

        'mode' may be 'replace' (the default) or 'append' and configures if
        the original name (without any analyzer tagged) is retained.

        With 'filter-kind' the set of names the sanitizer should be applied
        to can be restricted to the given patterns of 'kind'. It expects a
        list of regular expressions to be matched against 'kind'.
    """
    return _AnalyzerByLanguage(config)
index c904d87d4eb18e15143b12935891955930dd5113..b8cfde3997640788fa36a686d65d63c4367a6e32 100644 (file)
@@ -18,7 +18,19 @@ ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
def configure(rules, normalization_rules):
    """ Extract and preprocess the configuration for this module.
    """
    replacements, chars = _get_variant_config(rules.get('variants'),
                                              normalization_rules)

    return {'replacements': replacements,
            'chars': chars,
            'variant_only': rules.get('mode', '') == 'variant-only'}
+
+
+def _get_variant_config(rules, normalization_rules):
+    """ Convert the variant definition from the configuration into
+        replacement sets.
+    """
     immediate = defaultdict(list)
     chars = set()
 
@@ -41,8 +53,7 @@ def configure(rules, normalization_rules):
             immediate[variant.source].append(replstr)
             chars.update(variant.source)
 
-    return {'replacements': list(immediate.items()),
-            'chars': ''.join(chars)}
+    return list(immediate.items()), ''.join(chars)
 
 
 class _VariantMaker:
@@ -144,11 +155,15 @@ class GenericTokenAnalysis:
 
     def __init__(self, to_ascii, config):
         self.to_ascii = to_ascii
+        self.variant_only = config['variant_only']
 
         # Set up datrie
-        self.replacements = datrie.Trie(config['chars'])
-        for src, repllist in config['replacements']:
-            self.replacements[src] = repllist
+        if config['replacements']:
+            self.replacements = datrie.Trie(config['chars'])
+            for src, repllist in config['replacements']:
+                self.replacements[src] = repllist
+        else:
+            self.replacements = None
 
 
     def get_variants_ascii(self, norm_name):
@@ -159,45 +174,51 @@ class GenericTokenAnalysis:
         partials = ['']
 
         startpos = 0
-        pos = 0
-        force_space = False
-        while pos < len(baseform):
-            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
-                                                               (None, None))
-            if full is not None:
-                done = baseform[startpos:pos]
-                partials = [v + done + r
-                            for v, r in itertools.product(partials, repl)
-                            if not force_space or r.startswith(' ')]
-                if len(partials) > 128:
-                    # If too many variants are produced, they are unlikely
-                    # to be helpful. Only use the original term.
-                    startpos = 0
-                    break
-                startpos = pos + len(full)
-                if full[-1] == ' ':
-                    startpos -= 1
-                    force_space = True
-                pos = startpos
-            else:
-                pos += 1
-                force_space = False
+        if self.replacements is not None:
+            pos = 0
+            force_space = False
+            while pos < len(baseform):
+                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
+                                                                   (None, None))
+                if full is not None:
+                    done = baseform[startpos:pos]
+                    partials = [v + done + r
+                                for v, r in itertools.product(partials, repl)
+                                if not force_space or r.startswith(' ')]
+                    if len(partials) > 128:
+                        # If too many variants are produced, they are unlikely
+                        # to be helpful. Only use the original term.
+                        startpos = 0
+                        break
+                    startpos = pos + len(full)
+                    if full[-1] == ' ':
+                        startpos -= 1
+                        force_space = True
+                    pos = startpos
+                else:
+                    pos += 1
+                    force_space = False
 
         # No variants detected? Fast return.
         if startpos == 0:
+            if self.variant_only:
+                return []
+
             trans_name = self.to_ascii.transliterate(norm_name).strip()
             return [trans_name] if trans_name else []
 
-        return self._compute_result_set(partials, baseform[startpos:])
+        return self._compute_result_set(partials, baseform[startpos:],
+                                        norm_name if self.variant_only else '')
 
 
-    def _compute_result_set(self, partials, prefix):
+    def _compute_result_set(self, partials, prefix, exclude):
         results = set()
 
         for variant in partials:
-            vname = variant + prefix
-            trans_name = self.to_ascii.transliterate(vname[1:-1]).strip()
-            if trans_name:
-                results.add(trans_name)
+            vname = (variant + prefix)[1:-1].strip()
+            if vname != exclude:
+                trans_name = self.to_ascii.transliterate(vname).strip()
+                if trans_name:
+                    results.add(trans_name)
 
         return list(results)
index e04a8693f116bccd6d7e609de0c463b74170e46a..635d15840a84b8197efb9f5cb358344a78a0c2b9 100644 (file)
@@ -13,12 +13,21 @@ class _CountryInfo:
     def __init__(self):
         self._info = {}
 
+
     def load(self, config):
         """ Load the country properties from the configuration files,
             if they are not loaded yet.
         """
         if not self._info:
             self._info = config.load_sub_configuration('country_settings.yaml')
+            # Convert languages into a list for simpler handling.
+            for prop in self._info.values():
+                if 'languages' not in prop:
+                    prop['languages'] = []
+                elif not isinstance(prop['languages'], list):
+                    prop['languages'] = [x.strip()
+                                         for x in prop['languages'].split(',')]
+
 
     def items(self):
         """ Return tuples of (country_code, property dict) as iterable.
@@ -36,6 +45,12 @@ def setup_country_config(config):
     _COUNTRY_INFO.load(config)
 
 
def iterate():
    """ Iterate over country code and properties.

        Returns an iterable of (country_code, property dict) tuples from
        the globally loaded country configuration.
    """
    return _COUNTRY_INFO.items()
+
+
 def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
     """ Create and populate the tables with basic static data that provides
         the background for geocoding. Data is assumed to not yet exist.
@@ -50,10 +65,7 @@ def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
                 partition = 0
             else:
                 partition = props.get('partition')
-            if ',' in (props.get('languages', ',') or ','):
-                lang = None
-            else:
-                lang = props['languages']
+            lang = props['languages'][0] if len(props['languages']) == 1 else None
             params.append((ccode, partition, lang))
 
     with connect(dsn) as conn:
index 77b137a1b8019fcfce1facbee8c9f7fc32891e94..dcbb1847f8fd1d7158d4dae8122081346e061e34 100644 (file)
@@ -171,7 +171,7 @@ bt:
 #  (Bouvet Island)
 bv:
     partition: 185
-    languages: no
+    languages: "no"
 
 # Botswana (Botswana)
 bw:
@@ -1006,7 +1006,7 @@ si:
 #  (Svalbard and Jan Mayen)
 sj:
     partition: 197
-    languages: no
+    languages: "no"
 
 # Slovakia (Slovensko)
 sk:
index d070adcbbd649122aa5a37c621271b7d8635cb01..41760c49e0fbd2122d2f1e7fd1966fc4278d1975 100644 (file)
@@ -27,36 +27,160 @@ transliteration:
 sanitizers:
     - step: split-name-list
     - step: strip-brace-terms
+    - step: tag-analyzer-by-language
+      filter-kind: [".*name.*"]
+      whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
+      use-defaults: all
+      mode: append
 token-analysis:
     - analyzer: generic
+    - id: bg
+      analyzer: generic
+      mode: variant-only
       variants:
           - !include icu-rules/variants-bg.yaml
+    - id: ca
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-ca.yaml
+    - id: cs
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-cs.yaml
+    - id: da
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-da.yaml
+    - id: de
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-de.yaml
+    - id: el
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-el.yaml
+    - id: en
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-en.yaml
+    - id: es
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-es.yaml
+    - id: et
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-et.yaml
+    - id: eu
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-eu.yaml
+    - id: fi
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-fi.yaml
+    - id: fr
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-fr.yaml
+    - id: gl
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-gl.yaml
+    - id: hu
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-hu.yaml
+    - id: it
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-it.yaml
+    - id: ja
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-ja.yaml
+    - id: mg
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-mg.yaml
+    - id: ms
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-ms.yaml
+    - id: nl
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-nl.yaml
+    - id: no
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-no.yaml
+    - id: pl
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-pl.yaml
+    - id: pt
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-pt.yaml
+    - id: ro
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-ro.yaml
+    - id: ru
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-ru.yaml
+    - id: sk
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-sk.yaml
+    - id: sl
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-sl.yaml
+    - id: sv
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-sv.yaml
+    - id: tr
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-tr.yaml
+    - id: uk
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-uk.yaml
+    - id: vi
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-vi.yaml
index b8a760f99bd0bc03e127c14ae60de5f93fdf0290..deaa635e0b190733a99306795d2bef779f5caaa0 100644 (file)
@@ -52,7 +52,7 @@ Feature: Import and search of names
 
     Scenario: Special characters in name
         Given the places
-          | osm | class | type      | name |
+          | osm | class | type      | name+name:de |
           | N1  | place | locality  | Jim-Knopf-Straße |
           | N2  | place | locality  | Smith/Weston |
           | N3  | place | locality  | space mountain |
index f0ce4208e288afbe9fdce86f255abd6220c69de2..a9b09ea43fecd51a60d15a83cc0e7cfdd764b675 100644 (file)
@@ -40,7 +40,7 @@ def cfgfile(def_config, tmp_path):
 
 
def get_normalized_variants(proc, name):
    normalized = proc.normalizer.transliterate(name).strip()
    return proc.analysis[None].get_variants_ascii(normalized)
 
 
 def test_variants_empty(cfgfile):
@@ -99,6 +99,6 @@ def test_search_normalized(cfgfile):
     config = cfgfile('~street => s,st', 'master => mstr')
     proc = ICURuleLoader(config).make_token_analysis()
 
-    assert proc.get_search_normalized('Master Street') == 'master street'
-    assert proc.get_search_normalized('Earnes St') == 'earnes st'
-    assert proc.get_search_normalized('Nostreet') == 'nostreet'
+    assert proc.search.transliterate('Master Street').strip() == 'master street'
+    assert proc.search.transliterate('Earnes St').strip() == 'earnes st'
+    assert proc.search.transliterate('Nostreet').strip() == 'nostreet'