git.openstreetmap.org Git - nominatim.git/commitdiff
move abbreviation computation into import phase
author Sarah Hoffmann <lonvia@denofr.de>
Fri, 28 May 2021 20:06:13 +0000 (22:06 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Sun, 4 Jul 2021 08:28:20 +0000 (10:28 +0200)
This adds precomputation of abbreviated terms for names at import time and
removes the abbreviation of terms in the query. The basic import works but
still needs thorough testing as well as speed improvements during import.

New dependency on the Python library datrie.
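
The core of the change: spelling variants of a name (including its abbreviations)
are now generated once at import time and written to the word table, instead of
rewriting abbreviations in every incoming query. Below is a minimal sketch of that
expansion step, not code from this commit, assuming replacement pairs already
normalised to the ' full ' -> [' variant ', ...] shape that
ICURuleLoader.get_replacement_pairs() produces (the real logic is
ICUNameProcessor.get_variants_ascii() further down):

import itertools
import datrie

# Hypothetical replacement pairs; the real ones come from the tokenizer config.
replacements = [(' strasse ', [' strasse ', ' str ']),
                (' weg ', [' weg '])]

# datrie needs the alphabet of all characters used in the keys.
charset = ''.join({c for full, repls in replacements
                   for c in full + ''.join(repls)})
trie = datrie.Trie(charset)
for full, repls in replacements:
    trie[full] = repls

def variants(name):
    """ Expand a normalised name into all spelling variants. """
    base = ' ' + name + ' '
    out, start, pos = [''], 0, 0
    while pos < len(base):
        # longest replacement key starting at the current position, if any
        full, repls = trie.longest_prefix_item(base[pos:], (None, None))
        if full is not None:
            done = base[start:pos]
            out = [v + done + r for v, r in itertools.product(out, repls)]
            start = pos + len(full)
            pos = start
        else:
            pos += 1
    return [(v + base[start:pos]).strip() for v in out]

print(variants('bauweg strasse'))   # ['bauweg strasse', 'bauweg str']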

lib-php/tokenizer/legacy_icu_tokenizer.php
lib-sql/tokenizer/legacy_icu_tokenizer.sql
nominatim/tokenizer/icu_name_processor.py [new file with mode: 0644]
nominatim/tokenizer/icu_rule_loader.py [new file with mode: 0644]
nominatim/tokenizer/legacy_icu_tokenizer.py
nominatim/tokenizer/legacy_tokenizer.py
nominatim/tools/database_import.py
settings/legacy_icu_tokenizer.yaml [new file with mode: 0644]
test/python/test_tokenizer_icu_name_processor.py [new file with mode: 0644]
test/python/test_tokenizer_icu_rule_loader.py [new file with mode: 0644]

diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php
index 09cfe70fbf661a3e0440531310a45fb1fabbfab7..92dd727283019ea3454b20ee7232f0234f583b0c 100644 (file)
@@ -47,9 +47,7 @@ class Tokenizer
 
     private function makeStandardWord($sTerm)
     {
-        $sNorm = ' '.$this->oTransliterator->transliterate($sTerm).' ';
-
-        return trim(str_replace(CONST_Abbreviations[0], CONST_Abbreviations[1], $sNorm));
+        return trim($this->oTransliterator->transliterate(' '.$sTerm.' '));
     }
 
 
@@ -90,6 +88,7 @@ class Tokenizer
         foreach ($aPhrases as $iPhrase => $oPhrase) {
             $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
             $sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
+            Debug::printVar('Phrase', $sPhrase);
             if (strlen($sPhrase) > 0) {
                 $aWords = explode(' ', $sPhrase);
                 Tokenizer::addTokens($aTokens, $aWords);
diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
index 8fd0ede40e87f5560fbfe16471879deaf6491e18..686137de5f11a5bbdeb350350340c91a508f93da 100644 (file)
@@ -87,25 +87,48 @@ $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 --------------- private functions ----------------------------------------------
 
-CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT)
-  RETURNS INTEGER
+CREATE OR REPLACE FUNCTION getorcreate_full_word(norm_term TEXT, lookup_terms TEXT[],
+                                                 OUT full_token INT,
+                                                 OUT partial_tokens INT[])
   AS $$
 DECLARE
-  return_id INTEGER;
+  partial_terms TEXT[] = '{}'::TEXT[];
+  term TEXT;
+  term_id INTEGER;
   term_count INTEGER;
 BEGIN
-  SELECT min(word_id), max(search_name_count) INTO return_id, term_count
-    FROM word WHERE word_token = lookup_term and class is null and type is null;
+  SELECT min(word_id) INTO full_token
+    FROM word WHERE word = norm_term and class is null and country_code is null;
 
-  IF return_id IS NULL THEN
-    return_id := nextval('seq_word');
-    INSERT INTO word (word_id, word_token, search_name_count)
-      VALUES (return_id, lookup_term, 0);
-  ELSEIF left(lookup_term, 1) = ' ' and term_count > {{ max_word_freq }} THEN
-    return_id := 0;
+  IF full_token IS NULL THEN
+    full_token := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, word, search_name_count)
+      SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term;
   END IF;
 
-  RETURN return_id;
+  FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
+    term := trim(term);
+    IF NOT (ARRAY[term] <@ partial_terms) THEN
+      partial_terms := partial_terms || term;
+    END IF;
+  END LOOP;
+
+  partial_tokens := '{}'::INT[];
+  FOR term IN SELECT unnest(partial_terms) LOOP
+    SELECT min(word_id), max(search_name_count) INTO term_id, term_count
+      FROM word WHERE word_token = term and class is null and country_code is null;
+
+    IF term_id IS NULL THEN
+      term_id := nextval('seq_word');
+      term_count := 0;
+      INSERT INTO word (word_id, word_token, search_name_count)
+        VALUES (term_id, term, 0);
+    END IF;
+
+    IF term_count < {{ max_word_freq }} THEN
+      partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
+    END IF;
+  END LOOP;
 END;
 $$
 LANGUAGE plpgsql;
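
The reworked getorcreate_full_word() receives the normalised name together with
its precomputed lookup variants and returns one full-word token plus the
partial-word tokens in a single round trip. A hedged usage sketch from the
Python side with example values only (the commit's actual caller is
_compute_name_tokens() in legacy_icu_tokenizer.py further down):

import psycopg2

conn = psycopg2.connect('dbname=nominatim')   # assumed DSN
with conn.cursor() as cur:
    # norm_term plus its transliterated lookup variants; values are examples only
    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                ('bauwegstraße', ['bauweg strasse', 'bauweg str']))
    full_token, partial_tokens = cur.fetchone()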
diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py
new file mode 100644 (file)
index 0000000..0e71799
--- /dev/null
@@ -0,0 +1,111 @@
+"""
+Processor for names that are imported into the database based on the
+ICU library.
+"""
+import json
+import itertools
+
+from icu import Transliterator
+import datrie
+
+from nominatim.db.properties import set_property, get_property
+
+DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
+DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
+DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
+DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
+
+
+class ICUNameProcessorRules:
+    """ Data object that saves the rules needed for the name processor.
+
+        The rules can either be initialised through an ICURuleLoader or
+        be loaded from a database when a connection is given.
+    """
+    def __init__(self, loader=None, conn=None):
+        if loader is not None:
+            self.norm_rules = loader.get_normalization_rules()
+            self.trans_rules = loader.get_transliteration_rules()
+            self.replacements = loader.get_replacement_pairs()
+            self.search_rules = loader.get_search_rules()
+        elif conn is not None:
+            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+            self.replacements = json.loads(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
+            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
+        else:
+            assert False, "Parameter loader or conn required."
+
+        # Compute the set of characters used in the replacement list.
+        # We need this later when computing the tree.
+        chars = set()
+        for full, repl in self.replacements:
+            chars.update(full)
+            for word in repl:
+                chars.update(word)
+        self.replacement_charset = ''.join(chars)
+
+
+    def save_rules(self, conn):
+        """ Save the rules in the property table of the given database.
+            The rules can be loaded again by handing a connection to the
+            constructor of the class.
+        """
+        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
+        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
+        set_property(conn, DBCFG_IMPORT_REPLACEMENTS, json.dumps(self.replacements))
+        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
+
+
+class ICUNameProcessor:
+
+    def __init__(self, rules):
+        self.normalizer = Transliterator.createFromRules("icu_normalization",
+                                                         rules.norm_rules)
+        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
+                                                       rules.trans_rules)
+        self.search = Transliterator.createFromRules("icu_search",
+                                                     rules.search_rules)
+
+        self.replacements = datrie.Trie(rules.replacement_charset)
+        for full, repl in rules.replacements:
+            self.replacements[full] = repl
+
+
+    def get_normalized(self, name):
+        """ Normalize the given name, i.e. remove all elements not relevant
+            for search.
+        """
+        return self.normalizer.transliterate(name)
+
+    def get_variants_ascii(self, norm_name):
+        """ Compute the spelling variants for the given normalized name
+            and transliterate the result.
+        """
+        baseform = ' ' + norm_name + ' '
+        variants = ['']
+
+        startpos = 0
+        pos = 0
+        while pos < len(baseform):
+            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
+                                                               (None, None))
+            if full is not None:
+                done = baseform[startpos:pos]
+                variants = [v + done + r for v, r in itertools.product(variants, repl)]
+                startpos = pos + len(full)
+                pos = startpos
+            else:
+                pos += 1
+
+        if startpos == 0:
+            return [self.to_ascii.transliterate(norm_name)]
+
+        return [self.to_ascii.transliterate(v + baseform[startpos:pos]).strip() for v in variants]
+
+
+    def get_search_normalized(self, name):
+        """ Return the normalized version of the name (including transliteration)
+            to be applied at search time.
+        """
+        return self.search.transliterate(name)
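
The processor keeps the import-time pipeline (get_normalized() followed by
get_variants_ascii() for filling the word table) separate from the search-time
normalisation (get_search_normalized()). A hedged usage sketch assuming the
shipped settings/legacy_icu_tokenizer.yaml; the concrete variants depend
entirely on the configured rules:

from pathlib import Path

from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules

rules = ICUNameProcessorRules(loader=ICURuleLoader(Path('settings/legacy_icu_tokenizer.yaml')))
proc = ICUNameProcessor(rules)

norm = proc.get_normalized('Bauwegstraße')               # import-time normal form
variants = proc.get_variants_ascii(norm)                 # transliterated variants for the word table
query_form = proc.get_search_normalized('Bauwegstraße')  # form applied at query time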
diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
new file mode 100644 (file)
index 0000000..3b72116
--- /dev/null
@@ -0,0 +1,161 @@
+"""
+Helper class to create ICU rules from a configuration file.
+"""
+import io
+import yaml
+import logging
+from collections import defaultdict
+import itertools
+
+from icu import Transliterator
+
+from nominatim.errors import UsageError
+
+LOG = logging.getLogger()
+
+
+class ICURuleLoader:
+    """ Compiler for ICU rules from a tokenizer configuration file.
+    """
+
+    def __init__(self, configfile):
+        self.configfile = configfile
+
+        if configfile.suffix == '.yaml':
+            self._load_from_yaml()
+        else:
+            raise UsageError("Unknown format of tokenizer configuration.")
+
+
+    def get_search_rules(self):
+        """ Returns the ICU rules to be used during search.
+            The rules combine normalization, compound decomposition (including
+            abbreviated compounds) and transliteration.
+        """
+        # First apply the normalization rules.
+        rules = io.StringIO()
+        rules.write(self.normalization_rules)
+
+        # For all compound suffixes: add them in their full and any abbreviated form.
+        suffixes = set()
+        for suffix in self.compound_suffixes:
+            suffixes.add(suffix)
+            suffixes.update(self.abbreviations.get(suffix, []))
+
+        for suffix in sorted(suffixes, key=lambda x:len(x), reverse=True):
+            rules.write("'{0} ' > ' {0} ';".format(suffix))
+
+        # Finally add transliteration.
+        rules.write(self.transliteration_rules)
+        return rules.getvalue()
+
+    def get_normalization_rules(self):
+        """ Return rules for normalisation of a term.
+        """
+        return self.normalization_rules
+
+    def get_transliteration_rules(self):
+        """ Return the rules for converting a string into its asciii representation.
+        """
+        return self.transliteration_rules
+
+    def get_replacement_pairs(self):
+        """ Returns the list of possible compound decompositions with
+            application of abbreviations included.
+            The result is a list of pairs: the first item is the sequence to
+            replace, the second is a list of replacements.
+        """
+        synonyms = defaultdict(set)
+
+        for full, abbr in self.abbreviations.items():
+            key = ' ' + full + ' '
+            # Entries in the abbreviation list always apply to full words:
+            synonyms[key].update((' ' + a + ' ' for a in abbr))
+            # Replacements are optional, so add a noop
+            synonyms[key].add(key)
+
+        # Entries in the compound list expand to themselves and to
+        # abbreviations.
+        for suffix in self.compound_suffixes:
+            keyset = synonyms[suffix + ' ']
+            keyset.add(' ' + suffix + ' ')
+            keyset.update((' ' + a + ' ' for a in self.abbreviations.get(suffix, [])))
+            # The terms the entries are shortened to need to be decompounded as well.
+            for abbr in self.abbreviations.get(suffix, []):
+                synonyms[abbr + ' '].add(' ' + abbr + ' ')
+
+        # sort the resulting list by descending length (longer matches are preferred).
+        sorted_keys = sorted(synonyms.keys(), key=lambda x: len(x), reverse=True)
+
+        return [(k, list(synonyms[k])) for k in sorted_keys]
+
+
+    def _load_from_yaml(self):
+        rules = yaml.load(self.configfile.read_text())
+
+        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
+        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
+        self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
+        self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))
+
+
+    def _get_section(self, rules, section):
+        """ Get the section named 'section' from the rules. If the section does
+            not exist, raise a usage error with a meaningful message.
+        """
+        if section not in rules:
+            LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
+                      section, str(self.configfile))
+            raise UsageError("Syntax error in tokenizer configuration file.")
+
+        return rules[section]
+
+
+    def _cfg_to_icu_rules(self, rules, section):
+        """ Load an ICU ruleset from the given section. If the section is a
+            simple string, it is interpreted as a file name and the rules are
+            loaded verbatim from the given file. The filename is expected to be
+            relative to the tokenizer rule file. If the section is a list then
+            each line is assumed to be a rule. All rules are concatenated and returned.
+        """
+        content = self._get_section(rules, section)
+
+        if isinstance(content, str):
+            return (self.configfile.parent / content).read_text().replace('\n', ' ')
+
+        return ';'.join(content) + ';'
+
+
+    def _parse_compound_suffix_list(self, rules):
+        if not rules:
+            self.compound_suffixes = set()
+            return
+
+        norm = Transliterator.createFromRules("rule_loader_normalization",
+                                              self.normalization_rules)
+
+        # Make sure all suffixes are in their normalised form.
+        self.compound_suffixes = set((norm.transliterate(s) for s in rules))
+
+
+    def _parse_abbreviation_list(self, rules):
+        self.abbreviations = defaultdict(list)
+
+        if not rules:
+            return
+
+        norm = Transliterator.createFromRules("rule_loader_normalization",
+                                              self.normalization_rules)
+
+        for rule in rules:
+            parts = rule.split('=>')
+            if len(parts) != 2:
+                LOG.fatal("Syntax error in abbreviation section, line: %s", rule)
+                raise UsageError("Syntax error in tokenizer configuration file.")
+
+            # Make sure all terms match the normalised version.
+            fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(','))
+            abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))
+
+            for full, abbr in itertools.product(fullterms, abbrterms):
+                self.abbreviations[full].append(abbr)
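
get_search_rules() chains normalisation, compound-suffix splitting and
transliteration into one ICU ruleset that can be compiled directly into a
transliterator. A hedged sketch with a hypothetical config file; the expected
outputs mirror test_get_search_rules() at the end of this commit:

from pathlib import Path

from icu import Transliterator

from nominatim.tokenizer.icu_rule_loader import ICURuleLoader

loader = ICURuleLoader(Path('test_config.yaml'))   # hypothetical config like the one in the tests
trans = Transliterator.createFromRules("search", loader.get_search_rules())

# With the test configuration, compound suffixes are split off and abbreviations kept:
#   " Baumstraße " -> " baum straße "
#   " Baumstr "    -> " baum str "
print(trans.transliterate(" Baumstraße "))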
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 689318d7e87bc79024b5b0c645db4252f19b62ae..eb8502377f1c9e0a65a5b7d2808e3e6c9ab226dd 100644 (file)
@@ -18,11 +18,11 @@ import psycopg2.extras
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 
-DBCFG_NORMALIZATION = "tokenizer_normalization"
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
-DBCFG_TRANSLITERATION = "tokenizer_transliteration"
-DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
+DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
 
@@ -41,9 +41,9 @@ class LegacyICUTokenizer:
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
-        self.normalization = None
-        self.transliteration = None
-        self.abbreviations = None
+        self.naming_rules = None
+        self.term_normalization = None
+        self.max_word_frequency = None
 
 
     def init_new_db(self, config, init_db=True):
@@ -55,14 +55,14 @@ class LegacyICUTokenizer:
         if config.TOKENIZER_CONFIG:
             cfgfile = Path(config.TOKENIZER_CONFIG)
         else:
-            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
+            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
 
-        rules = json.loads(cfgfile.read_text())
-        self._load_transliteration(rules['normalization'], cfgfile.parent)
-        self.abbreviations = rules["abbreviations"]
-        self.normalization = config.TERM_NORMALIZATION
+        loader = ICURuleLoader(cfgfile)
+        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.term_normalization = config.TERM_NORMALIZATION
+        self.max_word_frequency = config.MAX_WORD_FREQUENCY
 
-        self._install_php(config)
+        self._install_php(config.lib_dir.php)
         self._save_config(config)
 
         if init_db:
@@ -70,19 +70,13 @@ class LegacyICUTokenizer:
             self._init_db_tables(config)
 
 
-    def _load_transliteration(self, rules, cfg_path):
-        if isinstance(rules, str):
-            self.transliteration = (cfg_path / rules).read_text().replace('\n', ' ')
-        else:
-            self.transliteration = ';'.join(rules) + ';'
-
     def init_from_project(self):
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
-            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
-            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
-            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
+            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
+            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
     def finalize_import(self, config):
@@ -132,26 +126,20 @@ class LegacyICUTokenizer:
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        norm = Transliterator.createFromRules("normalizer", self.normalization)
-        trans = Transliterator.createFromRules("trans", self.transliteration)
-        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
+        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 
 
-    def _install_php(self, config):
+    def _install_php(self, phpdir):
         """ Install the php script for the tokenizer.
         """
-        abbr_inverse = list(zip(*self.abbreviations))
         php_file = self.data_dir / "tokenizer.php"
         php_file.write_text(dedent("""\
             <?php
-            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
-            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
-            @define('CONST_Transliteration', "{0.transliteration}");
-            @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
-            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
-            """.format(self, config,
-                       "','".join(abbr_inverse[0]),
-                       "','".join(abbr_inverse[1]))))
+            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
+            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
+            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
+            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
+            """.format(self, phpdir)))
 
 
     def _save_config(self, config):
@@ -159,10 +147,10 @@ class LegacyICUTokenizer:
             database as database properties.
         """
         with connect(self.dsn) as conn:
-            set_property(conn, DBCFG_NORMALIZATION, self.normalization)
+            self.naming_rules.save_rules(conn)
+
             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
-            set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
-            set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
+            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 
 
     def _init_db_tables(self, config):
@@ -178,15 +166,14 @@ class LegacyICUTokenizer:
 
             # get partial words and their frequencies
             words = Counter()
-            with self.name_analyzer() as analyzer:
-                with conn.cursor(name="words") as cur:
-                    cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+            name_proc = ICUNameProcessor(self.naming_rules)
+            with conn.cursor(name="words") as cur:
+                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
 
-                    for name, cnt in cur:
-                        term = analyzer.make_standard_word(name)
-                        if term:
-                            for word in term.split():
-                                words[word] += cnt
+                for name, cnt in cur:
+                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
+                        for term in word.split():
+                            words[term] += cnt
 
             # copy them back into the word table
             copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
@@ -208,12 +195,10 @@ class LegacyICUNameAnalyzer:
         normalization.
     """
 
-    def __init__(self, dsn, normalizer, transliterator, abbreviations):
+    def __init__(self, dsn, name_proc):
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
-        self.normalizer = normalizer
-        self.transliterator = transliterator
-        self.abbreviations = abbreviations
+        self.name_processor = name_proc
 
         self._cache = _TokenCache()
 
@@ -248,9 +233,9 @@ class LegacyICUNameAnalyzer:
         tokens = {}
         for word in words:
             if word.startswith('#'):
-                tokens[word] = ' ' + self.make_standard_word(word[1:])
+                tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
             else:
-                tokens[word] = self.make_standard_word(word)
+                tokens[word] = self.name_processor.get_normalized(word)
 
         with conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
@@ -263,12 +248,6 @@ class LegacyICUNameAnalyzer:
         return [(k, v, ids[v]) for k, v in tokens.items()]
 
 
-    def normalize(self, phrase):
-        """ Normalize the given phrase, i.e. remove all properties that
-            are irrelevant for search.
-        """
-        return self.normalizer.transliterate(phrase)
-
     @staticmethod
     def normalize_postcode(postcode):
         """ Convert the postcode to a standardized form.
@@ -279,27 +258,12 @@ class LegacyICUNameAnalyzer:
         return postcode.strip().upper()
 
 
-    @functools.lru_cache(maxsize=1024)
-    def make_standard_word(self, name):
-        """ Create the normalised version of the input.
-        """
-        norm = ' ' + self.transliterator.transliterate(name) + ' '
-        for full, abbr in self.abbreviations:
-            if full in norm:
-                norm = norm.replace(full, abbr)
-
-        return norm.strip()
-
-
     def _make_standard_hnr(self, hnr):
         """ Create a normalised version of a housenumber.
 
             This function takes minor shortcuts on transliteration.
         """
-        if hnr.isdigit():
-            return hnr
-
-        return self.transliterator.transliterate(hnr)
+        return self.name_processor.get_search_normalized(hnr)
 
     def update_postcodes_from_db(self):
         """ Update postcode tokens in the word table from the location_postcode
@@ -325,7 +289,7 @@ class LegacyICUNameAnalyzer:
                 else:
                     copystr.write(postcode)
                     copystr.write('\t ')
-                    copystr.write(self.transliterator.transliterate(postcode))
+                    copystr.write(self.name_processor.get_search_normalized(postcode))
                     copystr.write('\tplace\tpostcode\t0\n')
 
             if to_delete:
@@ -344,7 +308,7 @@ class LegacyICUNameAnalyzer:
     def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
         """
-        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
@@ -362,7 +326,7 @@ class LegacyICUNameAnalyzer:
             if to_add:
                 copystr = io.StringIO()
                 for word, cls, typ, oper in to_add:
-                    term = self.make_standard_word(word)
+                    term = self.name_processor.get_search_normalized(word)
                     if term:
                         copystr.write(word)
                         copystr.write('\t ')
@@ -395,15 +359,11 @@ class LegacyICUNameAnalyzer:
     def add_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
-        full_names = set((self.make_standard_word(n) for n in names))
-        full_names.discard('')
-        self._add_normalized_country_names(country_code, full_names)
-
+        word_tokens = set()
+        for name in self._compute_full_names(names):
+            if name:
+                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
 
-    def _add_normalized_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
-        """
-        word_tokens = set((' ' + name for name in names))
         with self.conn.cursor() as cur:
             # Get existing names
             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
@@ -429,14 +389,13 @@ class LegacyICUNameAnalyzer:
         names = place.get('name')
 
         if names:
-            full_names = self._compute_full_names(names)
+            fulls, partials = self._compute_name_tokens(names)
 
-            token_info.add_names(self.conn, full_names)
+            token_info.add_names(fulls, partials)
 
             country_feature = place.get('country_feature')
             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self._add_normalized_country_names(country_feature.lower(),
-                                                   full_names)
+                self.add_country_names(country_feature.lower(), names)
 
         address = place.get('address')
 
@@ -449,38 +408,60 @@ class LegacyICUNameAnalyzer:
                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                     hnrs.append(value)
                 elif key == 'street':
-                    token_info.add_street(self.conn, self.make_standard_word(value))
+                    token_info.add_street(*self._compute_name_tokens({'name': value}))
                 elif key == 'place':
-                    token_info.add_place(self.conn, self.make_standard_word(value))
+                    token_info.add_place(*self._compute_name_tokens({'name': value}))
                 elif not key.startswith('_') and \
                      key not in ('country', 'full'):
-                    addr_terms.append((key, self.make_standard_word(value)))
+                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))
 
             if hnrs:
                 hnrs = self._split_housenumbers(hnrs)
                 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 
             if addr_terms:
-                token_info.add_address_terms(self.conn, addr_terms)
+                token_info.add_address_terms(addr_terms)
 
         return token_info.data
 
 
+    def _compute_name_tokens(self, names):
+        """ Computes the full name and partial name tokens for the given
+            dictionary of names.
+        """
+        full_names = self._compute_full_names(names)
+        full_tokens = set()
+        partial_tokens = set()
+
+        for name in full_names:
+            norm_name = self.name_processor.get_normalized(name)
+            full, part = self._cache.names.get(norm_name, (None, None))
+            if full is None:
+                variants = self.name_processor.get_variants_ascii(norm_name)
+                with self.conn.cursor() as cur:
+                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
+                                (norm_name, variants))
+                    full, part = cur.fetchone()
+
+                self._cache.names[norm_name] = (full, part)
+
+            full_tokens.add(full)
+            partial_tokens.update(part)
+
+        return full_tokens, partial_tokens
+
+
     def _compute_full_names(self, names):
         """ Return the set of all full name word ids to be used with the
             given dictionary of names.
         """
         full_names = set()
         for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
-            word = self.make_standard_word(name)
-            if word:
-                full_names.add(word)
+            full_names.add(name.strip())
 
-                brace_split = name.split('(', 2)
-                if len(brace_split) > 1:
-                    word = self.make_standard_word(brace_split[0])
-                    if word:
-                        full_names.add(word)
+            brace_idx = name.find('(')
+            if brace_idx >= 0:
+                full_names.add(name[:brace_idx].strip())
 
         return full_names
 
@@ -492,7 +473,7 @@ class LegacyICUNameAnalyzer:
             postcode = self.normalize_postcode(postcode)
 
             if postcode not in self._cache.postcodes:
-                term = self.make_standard_word(postcode)
+                term = self.name_processor.get_search_normalized(postcode)
                 if not term:
                     return
 
@@ -508,6 +489,7 @@ class LegacyICUNameAnalyzer:
                                 """, (' ' + term, postcode))
                 self._cache.postcodes.add(postcode)
 
+
     @staticmethod
     def _split_housenumbers(hnrs):
         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
@@ -530,7 +512,7 @@ class _TokenInfo:
     """ Collect token information to be sent back to the database.
     """
     def __init__(self, cache):
-        self.cache = cache
+        self._cache = cache
         self.data = {}
 
     @staticmethod
@@ -538,86 +520,44 @@ class _TokenInfo:
         return '{%s}' % ','.join((str(s) for s in tokens))
 
 
-    def add_names(self, conn, names):
+    def add_names(self, fulls, partials):
         """ Adds token information for the normalised names.
         """
-        # Start with all partial names
-        terms = set((part for ns in names for part in ns.split()))
-        # Add the full names
-        terms.update((' ' + n for n in names))
-
-        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
+        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 
 
     def add_housenumbers(self, conn, hnrs):
         """ Extract housenumber information from a list of normalised
             housenumbers.
         """
-        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
+        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
         self.data['hnr'] = ';'.join(hnrs)
 
 
-    def add_street(self, conn, street):
+    def add_street(self, fulls, partials):
         """ Add addr:street match terms.
         """
-        if not street:
-            return
-
-        term = ' ' + street
+        if fulls:
+            self.data['street'] = self._mk_array(fulls)
 
-        tid = self.cache.names.get(term)
 
-        if tid is None:
-            with conn.cursor() as cur:
-                cur.execute("""SELECT word_id FROM word
-                                WHERE word_token = %s
-                                      and class is null and type is null""",
-                            (term, ))
-                if cur.rowcount > 0:
-                    tid = cur.fetchone()[0]
-                    self.cache.names[term] = tid
-
-        if tid is not None:
-            self.data['street'] = '{%d}' % tid
-
-
-    def add_place(self, conn, place):
+    def add_place(self, fulls, partials):
         """ Add addr:place search and match terms.
         """
-        if not place:
-            return
-
-        partial_ids = self.cache.get_term_tokens(conn, place.split())
-        tid = self.cache.get_term_tokens(conn, [' ' + place])
-
-        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
-        self.data['place_match'] = '{%s}' % tid[0]
+        if fulls:
+            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
+            self.data['place_match'] = self._mk_array(fulls)
 
 
-    def add_address_terms(self, conn, terms):
+    def add_address_terms(self, terms):
         """ Add additional address terms.
         """
         tokens = {}
 
-        for key, value in terms:
-            if not value:
-                continue
-            partial_ids = self.cache.get_term_tokens(conn, value.split())
-            term = ' ' + value
-            tid = self.cache.names.get(term)
-
-            if tid is None:
-                with conn.cursor() as cur:
-                    cur.execute("""SELECT word_id FROM word
-                                    WHERE word_token = %s
-                                          and class is null and type is null""",
-                                (term, ))
-                    if cur.rowcount > 0:
-                        tid = cur.fetchone()[0]
-                        self.cache.names[term] = tid
-
-            tokens[key] = [self._mk_array(partial_ids),
-                           '{%s}' % ('' if tid is None else str(tid))]
+        for key, fulls, partials in terms:
+            if fulls:
+                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
+                               self._mk_array(fulls)]
 
         if tokens:
             self.data['addr'] = tokens
@@ -635,32 +575,6 @@ class _TokenCache:
         self.housenumbers = {}
 
 
-    def get_term_tokens(self, conn, terms):
-        """ Get token ids for a list of terms, looking them up in the database
-            if necessary.
-        """
-        tokens = []
-        askdb = []
-
-        for term in terms:
-            token = self.names.get(term)
-            if token is None:
-                askdb.append(term)
-            elif token != 0:
-                tokens.append(token)
-
-        if askdb:
-            with conn.cursor() as cur:
-                cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
-                            (askdb, ))
-                for term, tid in cur:
-                    self.names[term] = tid
-                    if tid != 0:
-                        tokens.append(tid)
-
-        return tokens
-
-
     def get_hnr_tokens(self, conn, terms):
         """ Get token ids for a list of housenumbers, looking them up in the
             database if necessary.
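
With the analyzer now operating on precomputed tokens, process_place() hands
back ready-made token ids instead of normalised terms. A hedged sketch of the
new result shape, assuming an existing project database; the array contents
shown in the comment are purely illustrative:

from pathlib import Path

from nominatim.tokenizer.legacy_icu_tokenizer import LegacyICUTokenizer

tokenizer = LegacyICUTokenizer('dbname=nominatim', Path('project/tokenizer'))  # assumed dsn and data dir
tokenizer.init_from_project()

place = {'name': {'name': 'Bauwegstraße'},
         'address': {'street': 'Hauptstrasse', 'housenumber': '3'}}

with tokenizer.name_analyzer() as analyzer:
    info = analyzer.process_place(place)

# info now contains PostgreSQL array literals of token ids, roughly:
#   {'names': '{…}', 'hnr_tokens': '{…}', 'hnr': '3', 'street': '{…}'}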
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index d6fbc2cda6987bc18d708970ed2d2e767c7ac8b2..bb37115bf814054eccf12863c926db756f420024 100644 (file)
@@ -404,7 +404,7 @@ class LegacyNameAnalyzer:
                             FROM unnest(%s)n) y
                       WHERE NOT EXISTS(SELECT * FROM word
                                        WHERE word_token = lookup_token and country_code = %s))
-                """, (country_code, names, country_code))
+                """, (country_code, list(names.values()), country_code))
 
 
     def process_place(self, place):
@@ -422,7 +422,7 @@ class LegacyNameAnalyzer:
 
             country_feature = place.get('country_feature')
             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self.add_country_names(country_feature.lower(), list(names.values()))
+                self.add_country_names(country_feature.lower(), names)
 
         address = place.get('address')
 
diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py
index 28a10ebeb742f907f4b4efc1c3b2abe9925d40fa..efbf2ec80c0c771dbea2e71390c5ba184b782280 100644 (file)
@@ -272,15 +272,15 @@ def create_country_names(conn, tokenizer, languages=None):
 
         with tokenizer.name_analyzer() as analyzer:
             for code, name in cur:
-                names = [code]
+                names = {'countrycode' : code}
                 if code == 'gb':
-                    names.append('UK')
+                    names['short_name'] = 'UK'
                 if code == 'us':
-                    names.append('United States')
+                    names['short_name'] = 'United States'
 
                 # country names (only in languages as provided)
                 if name:
-                    names.extend((v for k, v in name.items() if _include_key(k)))
+                    names.update(((k, v) for k, v in name.items() if _include_key(k)))
 
                 analyzer.add_country_names(code, names)
 
diff --git a/settings/legacy_icu_tokenizer.yaml b/settings/legacy_icu_tokenizer.yaml
new file mode 100644 (file)
index 0000000..34cd8b0
--- /dev/null
@@ -0,0 +1,116 @@
+normalization:
+    - ":: NFD ()"
+    - "[[:Nonspacing Mark:] [:Cf:]] >"
+    - ":: lower ()"
+    - "ß > 'ss'" # German szet is unimbigiously equal to double ss
+    - "[[:Punctuation:][:Space:]]+ > ' '"
+    - ":: NFC ()"
+transliteration: icu_transliteration.rules
+compound_suffixes:
+    # Danish
+    - hal
+    - hallen
+    - hallerne
+    # German
+    - berg
+    - brücke
+    - fabrik
+    - gasse
+    - graben
+    - haus
+    - höhle
+    - hütte
+    - kapelle
+    - kogel
+    - pfad
+    - platz
+    - quelle
+    - spitze
+    - stiege
+    - strasse
+    - teich
+    - universität
+    - wald
+    - weg
+    - wiese
+    # Dutch
+    - gracht
+    - laan
+    - markt
+    - plein
+    - straat
+    - vliet
+    - weg
+    # Norwegian
+    - vei
+    - veien
+    - veg
+    - vegen
+    - gate
+    - gaten
+    - gata
+    - plass
+    - plassen
+    - sving
+    - svingen
+    # Finnish
+    - alue
+    - asema
+    - aukio
+    - kaari
+    - katu
+    - kuja
+    - kylä
+    - penger
+    - polku
+    - puistikko
+    - puisto
+    - raitti
+    - ranta
+    - rinne
+    - taival
+    - tie
+    - tori
+    - väylä
+    # Swedish
+    - väg
+    - vägen
+    - gatan
+    - gata
+    - gränd
+    - gränden
+    - stig
+    - stigen
+    - plats
+    - platsen
+abbreviations:
+    # German
+    - am => a
+    - an der => a d
+    - allgemeines krankenhaus => akh
+    - altstoffsammelzentrum => asz
+    - auf der => a d
+    - bach => b
+    - bad => b
+    - bahnhof => bhf,bf
+    - berg => bg
+    - bezirk => bez
+    - brücke => br
+    - burg => bg
+    - chaussee => ch
+    - deutsche,deutscher,deutsches => dt
+    - dorf => df
+    - doktor => dr
+    - fachhochschule => fh
+    - Freiwillige Feuerwehr => ff
+    - sankt => st
+    - strasse => str
+    - weg => wg
+    # English
+    - alley => al
+    - beach => bch
+    - street => st
+    - road => rd
+    - bridge => brdg
+
+
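
Each abbreviation entry follows the pattern 'full1,full2 => abbr1,abbr2' and is
expanded by ICURuleLoader._parse_abbreviation_list() into the cross product of
full and abbreviated terms (each term additionally run through the normalisation
rules, omitted here). A hedged illustration of the expansion:

import itertools

rule = "deutsche,deutscher,deutsches => dt"
full, abbr = (part.split(',') for part in rule.split('=>'))
pairs = list(itertools.product((t.strip() for t in full),
                               (t.strip() for t in abbr)))
# [('deutsche', 'dt'), ('deutscher', 'dt'), ('deutsches', 'dt')]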
diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/test_tokenizer_icu_name_processor.py
new file mode 100644 (file)
index 0000000..9c09bcc
--- /dev/null
@@ -0,0 +1,60 @@
+"""
+Tests for import name normalisation and variant generation.
+"""
+from textwrap import dedent
+
+import pytest
+
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
+
+from nominatim.errors import UsageError
+
+@pytest.fixture
+def cfgfile(tmp_path, suffix='.yaml'):
+    def _create_config(suffixes, abbr):
+        content = dedent("""\
+        normalization:
+            - ":: NFD ()"
+            - "[[:Nonspacing Mark:] [:Cf:]] >"
+            - ":: lower ()"
+            - "[[:Punctuation:][:Space:]]+ > ' '"
+            - ":: NFC ()"
+        transliteration:
+            - "::  Latin ()"
+        """)
+        content += "compound_suffixes:\n"
+        content += '\n'.join(("    - " + s for s in suffixes)) + '\n'
+        content += "abbreviations:\n"
+        content += '\n'.join(("    - " + s for s in abbr)) + '\n'
+        fpath = tmp_path / ('test_config' + suffix)
+        fpath.write_text(dedent(content))
+        return fpath
+
+    return _create_config
+
+
+def test_simple_variants(cfgfile):
+    fpath = cfgfile(['strasse', 'straße', 'weg'],
+                    ['strasse,straße => str',
+                     'prospekt => pr'])
+
+    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
+    proc = ICUNameProcessor(rules)
+
+    assert set(proc.get_normalized_variants("Bauwegstraße")) \
+            == {'bauweg straße', 'bauweg str'}
+    assert proc.get_normalized_variants("Bauwegstr") == ['bauweg str']
+    assert proc.get_normalized_variants("holzweg") == ['holz weg']
+    assert proc.get_normalized_variants("hallo") == ['hallo']
+
+
+def test_multiple_replacements(cfgfile):
+    fpath = cfgfile([], ['saint => s,st', 'street => st'])
+
+    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
+    proc = ICUNameProcessor(rules)
+
+    assert set(proc.get_normalized_variants("Saint Johns Street")) == \
+            {'saint johns street', 's johns street', 'st johns street',
+             'saint johns st', 's johns st', 'st johns st'}
diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py
new file mode 100644 (file)
index 0000000..d89e13b
--- /dev/null
@@ -0,0 +1,75 @@
+"""
+Tests for converting a config file to ICU rules.
+"""
+import pytest
+from textwrap import dedent
+
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.errors import UsageError
+
+from icu import Transliterator
+
+@pytest.fixture
+def cfgfile(tmp_path, suffix='.yaml'):
+    def _create_config(suffixes, abbr):
+        content = dedent("""\
+        normalization:
+            - ":: NFD ()"
+            - "[[:Nonspacing Mark:] [:Cf:]] >"
+            - ":: lower ()"
+            - "[[:Punctuation:][:Space:]]+ > ' '"
+            - ":: NFC ()"
+        transliteration:
+            - "::  Latin ()"
+        """)
+        content += "compound_suffixes:\n"
+        content += '\n'.join(("    - " + s for s in suffixes)) + '\n'
+        content += "abbreviations:\n"
+        content += '\n'.join(("    - " + s for s in abbr)) + '\n'
+        fpath = tmp_path / ('test_config' + suffix)
+        fpath.write_text(dedent(content))
+        return fpath
+
+    return _create_config
+
+def test_missing_normalization(tmp_path):
+    fpath = tmp_path / ('test_config.yaml')
+    fpath.write_text(dedent("""\
+        normalizatio:
+            - ":: NFD ()"
+        """))
+
+    with pytest.raises(UsageError):
+        ICURuleLoader(fpath)
+
+
+def test_get_search_rules(cfgfile):
+    fpath = cfgfile(['strasse', 'straße', 'weg'],
+                    ['strasse,straße => str',
+                     'prospekt => pr'])
+
+    loader = ICURuleLoader(fpath)
+
+    rules = loader.get_search_rules()
+    trans = Transliterator.createFromRules("test", rules)
+
+    assert trans.transliterate(" Baumstraße ") == " baum straße "
+    assert trans.transliterate(" Baumstrasse ") == " baum strasse "
+    assert trans.transliterate(" Baumstr ") == " baum str "
+    assert trans.transliterate(" Baumwegstr ") == " baumweg str "
+    assert trans.transliterate(" Αθήνα ") == " athēna "
+    assert trans.transliterate(" проспект ") == " prospekt "
+
+
+def test_get_synonym_pairs(cfgfile):
+    fpath = cfgfile(['Weg', 'Strasse'],
+                    ['Strasse => str,st'])
+
+    loader = ICURuleLoader(fpath)
+
+    repl = loader.get_replacement_pairs()
+
+    assert repl == [(' strasse ', {' strasse ', ' str ', ' st '}),
+                    ('strasse ', {' strasse ', ' str ', ' st '}),
+                    ('weg ', {' weg '})]
+