From: Sarah Hoffmann Date: Fri, 28 May 2021 20:06:13 +0000 (+0200) Subject: move abbreviation computation into import phase X-Git-Tag: v4.0.0~58^2~21 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/8413075249e1bb2832df4edd0f66d61f77fb9f99 move abbreviation computation into import phase This adds precomputation of abbreviated terms for names and removes abbreviation of terms in the query. Basic import works but still needs some thorough testing as well as speed improvements during import. New dependency for python library datrie. --- diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index 09cfe70f..92dd7272 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -47,9 +47,7 @@ class Tokenizer private function makeStandardWord($sTerm) { - $sNorm = ' '.$this->oTransliterator->transliterate($sTerm).' '; - - return trim(str_replace(CONST_Abbreviations[0], CONST_Abbreviations[1], $sNorm)); + return trim($this->oTransliterator->transliterate(' '.$sTerm.' ')); } @@ -90,6 +88,7 @@ class Tokenizer foreach ($aPhrases as $iPhrase => $oPhrase) { $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase()); $sPhrase = $this->makeStandardWord($oPhrase->getPhrase()); + Debug::printVar('Phrase', $sPhrase); if (strlen($sPhrase) > 0) { $aWords = explode(' ', $sPhrase); Tokenizer::addTokens($aTokens, $aWords); diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql index 8fd0ede4..686137de 100644 --- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql @@ -87,25 +87,48 @@ $$ LANGUAGE SQL IMMUTABLE STRICT; --------------- private functions ---------------------------------------------- -CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT) - RETURNS INTEGER +CREATE OR REPLACE FUNCTION getorcreate_full_word(norm_term TEXT, lookup_terms TEXT[], + OUT full_token INT, + OUT partial_tokens INT[]) AS $$ DECLARE - return_id INTEGER; + partial_terms TEXT[] = '{}'::TEXT[]; + term TEXT; + term_id INTEGER; term_count INTEGER; BEGIN - SELECT min(word_id), max(search_name_count) INTO return_id, term_count - FROM word WHERE word_token = lookup_term and class is null and type is null; + SELECT min(word_id) INTO full_token + FROM word WHERE word = norm_term and class is null and country_code is null; - IF return_id IS NULL THEN - return_id := nextval('seq_word'); - INSERT INTO word (word_id, word_token, search_name_count) - VALUES (return_id, lookup_term, 0); - ELSEIF left(lookup_term, 1) = ' ' and term_count > {{ max_word_freq }} THEN - return_id := 0; + IF full_token IS NULL THEN + full_token := nextval('seq_word'); + INSERT INTO word (word_id, word_token, word, search_name_count) + SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term; END IF; - RETURN return_id; + FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP + term := trim(term); + IF NOT (ARRAY[term] <@ partial_terms) THEN + partial_terms := partial_terms || term; + END IF; + END LOOP; + + partial_tokens := '{}'::INT[]; + FOR term IN SELECT unnest(partial_terms) LOOP + SELECT min(word_id), max(search_name_count) INTO term_id, term_count + FROM word WHERE word_token = term and class is null and country_code is null; + + IF term_id IS NULL THEN + term_id := nextval('seq_word'); + term_count := 0; + INSERT INTO word (word_id, word_token, search_name_count) + VALUES (term_id, term, 
0);
+      END IF;
+
+      IF term_count < {{ max_word_freq }} THEN
+        partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
+      END IF;
+    END LOOP;
 END;
 $$ LANGUAGE plpgsql;
diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py
new file mode 100644
index 00000000..0e717995
--- /dev/null
+++ b/nominatim/tokenizer/icu_name_processor.py
@@ -0,0 +1,111 @@
+"""
+Processor for names that are imported into the database based on the
+ICU library.
+"""
+import json
+import itertools
+
+from icu import Transliterator
+import datrie
+
+from nominatim.db.properties import set_property, get_property
+
+DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
+DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
+DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
+DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
+
+
+class ICUNameProcessorRules:
+    """ Data object that saves the rules needed for the name processor.
+
+        The rules can either be initialised through an ICURuleLoader or
+        be loaded from a database when a connection is given.
+    """
+    def __init__(self, loader=None, conn=None):
+        if loader is not None:
+            self.norm_rules = loader.get_normalization_rules()
+            self.trans_rules = loader.get_transliteration_rules()
+            self.replacements = loader.get_replacement_pairs()
+            self.search_rules = loader.get_search_rules()
+        elif conn is not None:
+            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+            self.replacements = json.loads(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
+            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
+        else:
+            assert False, "Parameter loader or conn required."
+
+        # Compute the set of characters used in the replacement list.
+        # We need this later when building the trie.
+        chars = set()
+        for full, repl in self.replacements:
+            chars.update(full)
+            for word in repl:
+                chars.update(word)
+        self.replacement_charset = ''.join(chars)
+
+
+    def save_rules(self, conn):
+        """ Save the rules in the property table of the given database.
+            The rules can be loaded again by passing a connection to the
+            constructor of the class.
+        """
+        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
+        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
+        set_property(conn, DBCFG_IMPORT_REPLACEMENTS, json.dumps(self.replacements))
+        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
+
+
+class ICUNameProcessor:
+
+    def __init__(self, rules):
+        self.normalizer = Transliterator.createFromRules("icu_normalization",
+                                                         rules.norm_rules)
+        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
+                                                       rules.trans_rules)
+        self.search = Transliterator.createFromRules("icu_search",
+                                                     rules.search_rules)
+
+        self.replacements = datrie.Trie(rules.replacement_charset)
+        for full, repl in rules.replacements:
+            self.replacements[full] = repl
+
+
+    def get_normalized(self, name):
+        """ Normalize the given name, i.e. remove all elements not relevant
+            for search.
+        """
+        return self.normalizer.transliterate(name)
+
+    def get_variants_ascii(self, norm_name):
+        """ Compute the spelling variants for the given normalized name
+            and transliterate the result.
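+
+            For example, given the compound suffix 'strasse' and the
+            abbreviation 'strasse => str', the normalized name 'bauwegstrasse'
+            yields the variants 'bauweg strasse' and 'bauweg str'.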
+ """ + baseform = ' ' + norm_name + ' ' + variants = [''] + + startpos = 0 + pos = 0 + while pos < len(baseform): + full, repl = self.replacements.longest_prefix_item(baseform[pos:], + (None, None)) + if full is not None: + done = baseform[startpos:pos] + variants = [v + done + r for v, r in itertools.product(variants, repl)] + startpos = pos + len(full) + pos = startpos + else: + pos += 1 + + if startpos == 0: + return [self.to_ascii.transliterate(norm_name)] + + return [self.to_ascii.transliterate(v + baseform[startpos:pos]).strip() for v in variants] + + + def get_search_normalized(self, name): + """ Return the normalized version of the name (including transliteration) + to be applied at search time. + """ + return self.search.transliterate(name) diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py new file mode 100644 index 00000000..3b721169 --- /dev/null +++ b/nominatim/tokenizer/icu_rule_loader.py @@ -0,0 +1,161 @@ +""" +Helper class to create ICU rules from a configuration file. +""" +import io +import yaml +import logging +from collections import defaultdict +import itertools + +from icu import Transliterator + +from nominatim.errors import UsageError + +LOG = logging.getLogger() + + +class ICURuleLoader: + """ Compiler for ICU rules from a tokenizer configuration file. + """ + + def __init__(self, configfile): + self.configfile = configfile + + if configfile.suffix == '.yaml': + self._load_from_yaml() + else: + raise UsageError("Unknown format of tokenizer configuration.") + + + def get_search_rules(self): + """ Returns the ICU rules to be used during search. + The rules combine normalization, compound decomposition (including + abbreviated compounds) and transliteration. + """ + # First apply the normalization rules. + rules = io.StringIO() + rules.write(self.normalization_rules) + + # For all compound suffixes: add them in their full and any abbreviated form. + suffixes = set() + for suffix in self.compound_suffixes: + suffixes.add(suffix) + suffixes.update(self.abbreviations.get(suffix, [])) + + for suffix in sorted(suffixes, key=lambda x:len(x), reverse=True): + rules.write("'{0} ' > ' {0} ';".format(suffix)) + + # Finally add transliteration. + rules.write(self.transliteration_rules) + return rules.getvalue() + + def get_normalization_rules(self): + """ Return rules for normalisation of a term. + """ + return self.normalization_rules + + def get_transliteration_rules(self): + """ Return the rules for converting a string into its asciii representation. + """ + return self.transliteration_rules + + def get_replacement_pairs(self): + """ Returns the list of possible compound decompositions with + application of abbreviations included. + The result is a list of pairs: the first item is the sequence to + replace, the second is a list of replacements. + """ + synonyms = defaultdict(set) + + for full, abbr in self.abbreviations.items(): + key = ' ' + full + ' ' + # Entries in the abbreviation list always apply to full words: + synonyms[key].update((' ' + a + ' ' for a in abbr)) + # Replacements are optional, so add a noop + synonyms[key].add(key) + + # Entries in the compound list expand to themselves and to + # abbreviations. + for suffix in self.compound_suffixes: + keyset = synonyms[suffix + ' '] + keyset.add(' ' + suffix + ' ') + keyset.update((' ' + a + ' ' for a in self.abbreviations.get(suffix, []))) + # The terms the entries are shortended to, need to be decompunded as well. 
+            for abbr in self.abbreviations.get(suffix, []):
+                synonyms[abbr + ' '].add(' ' + abbr + ' ')
+
+        # Sort the resulting list by descending length (longer matches are preferred).
+        sorted_keys = sorted(synonyms.keys(), key=lambda x: len(x), reverse=True)
+
+        return [(k, list(synonyms[k])) for k in sorted_keys]
+
+
+    def _load_from_yaml(self):
+        rules = yaml.load(self.configfile.read_text())
+
+        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
+        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
+        self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
+        self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))
+
+
+    def _get_section(self, rules, section):
+        """ Get the section named 'section' from the rules. If the section does
+            not exist, raise a usage error with a meaningful message.
+        """
+        if section not in rules:
+            LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
+                      section, str(self.configfile))
+            raise UsageError("Syntax error in tokenizer configuration file.")
+
+        return rules[section]
+
+
+    def _cfg_to_icu_rules(self, rules, section):
+        """ Load an ICU ruleset from the given section. If the section is a
+            simple string, it is interpreted as a file name and the rules are
+            loaded verbatim from the given file. The filename is expected to be
+            relative to the tokenizer rule file. If the section is a list then
+            each line is assumed to be a rule. All rules are concatenated and returned.
+        """
+        content = self._get_section(rules, section)
+
+        if isinstance(content, str):
+            return (self.configfile.parent / content).read_text().replace('\n', ' ')
+
+        return ';'.join(content) + ';'
+
+
+    def _parse_compound_suffix_list(self, rules):
+        if not rules:
+            self.compound_suffixes = set()
+            return
+
+        norm = Transliterator.createFromRules("rule_loader_normalization",
+                                              self.normalization_rules)
+
+        # Make sure all suffixes are in their normalised form.
+        self.compound_suffixes = set((norm.transliterate(s) for s in rules))
+
+
+    def _parse_abbreviation_list(self, rules):
+        self.abbreviations = defaultdict(list)
+
+        if not rules:
+            return
+
+        norm = Transliterator.createFromRules("rule_loader_normalization",
+                                              self.normalization_rules)
+
+        for rule in rules:
+            parts = rule.split('=>')
+            if len(parts) != 2:
+                LOG.fatal("Syntax error in abbreviation section, line: %s", rule)
+                raise UsageError("Syntax error in tokenizer configuration file.")
+
+            # Make sure all terms match the normalised version.
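+            # e.g. the rule 'Strasse => str,st' results in
+            #      self.abbreviations['strasse'] = ['str', 'st']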
+ fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(',')) + abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(',')) + + for full, abbr in itertools.product(fullterms, abbrterms): + self.abbreviations[full].append(abbr) diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 689318d7..eb850237 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -18,11 +18,11 @@ import psycopg2.extras from nominatim.db.connection import connect from nominatim.db.properties import set_property, get_property from nominatim.db.sql_preprocessor import SQLPreprocessor +from nominatim.tokenizer.icu_rule_loader import ICURuleLoader +from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules -DBCFG_NORMALIZATION = "tokenizer_normalization" DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq" -DBCFG_TRANSLITERATION = "tokenizer_transliteration" -DBCFG_ABBREVIATIONS = "tokenizer_abbreviations" +DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization" LOG = logging.getLogger() @@ -41,9 +41,9 @@ class LegacyICUTokenizer: def __init__(self, dsn, data_dir): self.dsn = dsn self.data_dir = data_dir - self.normalization = None - self.transliteration = None - self.abbreviations = None + self.naming_rules = None + self.term_normalization = None + self.max_word_frequency = None def init_new_db(self, config, init_db=True): @@ -55,14 +55,14 @@ class LegacyICUTokenizer: if config.TOKENIZER_CONFIG: cfgfile = Path(config.TOKENIZER_CONFIG) else: - cfgfile = config.config_dir / 'legacy_icu_tokenizer.json' + cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml' - rules = json.loads(cfgfile.read_text()) - self._load_transliteration(rules['normalization'], cfgfile.parent) - self.abbreviations = rules["abbreviations"] - self.normalization = config.TERM_NORMALIZATION + loader = ICURuleLoader(cfgfile) + self.naming_rules = ICUNameProcessorRules(loader=loader) + self.term_normalization = config.TERM_NORMALIZATION + self.max_word_frequency = config.MAX_WORD_FREQUENCY - self._install_php(config) + self._install_php(config.lib_dir.php) self._save_config(config) if init_db: @@ -70,19 +70,13 @@ class LegacyICUTokenizer: self._init_db_tables(config) - def _load_transliteration(self, rules, cfg_path): - if isinstance(rules, str): - self.transliteration = (cfg_path / rules).read_text().replace('\n', ' ') - else: - self.transliteration = ';'.join(rules) + ';' - def init_from_project(self): """ Initialise the tokenizer from the project directory. """ with connect(self.dsn) as conn: - self.normalization = get_property(conn, DBCFG_NORMALIZATION) - self.transliteration = get_property(conn, DBCFG_TRANSLITERATION) - self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS)) + self.naming_rules = ICUNameProcessorRules(conn=conn) + self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION) + self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ) def finalize_import(self, config): @@ -132,26 +126,20 @@ class LegacyICUTokenizer: Analyzers are not thread-safe. You need to instantiate one per thread. 
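+            Typical use is as a context manager, for example
+            "with tokenizer.name_analyzer() as analyzer: ..." in
+            tools/database_import.create_country_names().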
""" - norm = Transliterator.createFromRules("normalizer", self.normalization) - trans = Transliterator.createFromRules("trans", self.transliteration) - return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations) + return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) - def _install_php(self, config): + def _install_php(self, phpdir): """ Install the php script for the tokenizer. """ - abbr_inverse = list(zip(*self.abbreviations)) php_file = self.data_dir / "tokenizer.php" php_file.write_text(dedent("""\ 1: - word = self.make_standard_word(brace_split[0]) - if word: - full_names.add(word) + brace_idx = name.find('(') + if brace_idx >= 0: + full_names.add(name[:brace_idx].strip()) return full_names @@ -492,7 +473,7 @@ class LegacyICUNameAnalyzer: postcode = self.normalize_postcode(postcode) if postcode not in self._cache.postcodes: - term = self.make_standard_word(postcode) + term = self.name_processor.get_search_normalized(postcode) if not term: return @@ -508,6 +489,7 @@ class LegacyICUNameAnalyzer: """, (' ' + term, postcode)) self._cache.postcodes.add(postcode) + @staticmethod def _split_housenumbers(hnrs): if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]: @@ -530,7 +512,7 @@ class _TokenInfo: """ Collect token information to be sent back to the database. """ def __init__(self, cache): - self.cache = cache + self._cache = cache self.data = {} @staticmethod @@ -538,86 +520,44 @@ class _TokenInfo: return '{%s}' % ','.join((str(s) for s in tokens)) - def add_names(self, conn, names): + def add_names(self, fulls, partials): """ Adds token information for the normalised names. """ - # Start with all partial names - terms = set((part for ns in names for part in ns.split())) - # Add the full names - terms.update((' ' + n for n in names)) - - self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms)) + self.data['names'] = self._mk_array(itertools.chain(fulls, partials)) def add_housenumbers(self, conn, hnrs): """ Extract housenumber information from a list of normalised housenumbers. """ - self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs)) + self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs)) self.data['hnr'] = ';'.join(hnrs) - def add_street(self, conn, street): + def add_street(self, fulls, partials): """ Add addr:street match terms. """ - if not street: - return - - term = ' ' + street + if fulls: + self.data['street'] = self._mk_array(fulls) - tid = self.cache.names.get(term) - if tid is None: - with conn.cursor() as cur: - cur.execute("""SELECT word_id FROM word - WHERE word_token = %s - and class is null and type is null""", - (term, )) - if cur.rowcount > 0: - tid = cur.fetchone()[0] - self.cache.names[term] = tid - - if tid is not None: - self.data['street'] = '{%d}' % tid - - - def add_place(self, conn, place): + def add_place(self, fulls, partials): """ Add addr:place search and match terms. """ - if not place: - return - - partial_ids = self.cache.get_term_tokens(conn, place.split()) - tid = self.cache.get_term_tokens(conn, [' ' + place]) - - self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid)) - self.data['place_match'] = '{%s}' % tid[0] + if fulls: + self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials)) + self.data['place_match'] = self._mk_array(fulls) - def add_address_terms(self, conn, terms): + def add_address_terms(self, terms): """ Add additional address terms. 
""" tokens = {} - for key, value in terms: - if not value: - continue - partial_ids = self.cache.get_term_tokens(conn, value.split()) - term = ' ' + value - tid = self.cache.names.get(term) - - if tid is None: - with conn.cursor() as cur: - cur.execute("""SELECT word_id FROM word - WHERE word_token = %s - and class is null and type is null""", - (term, )) - if cur.rowcount > 0: - tid = cur.fetchone()[0] - self.cache.names[term] = tid - - tokens[key] = [self._mk_array(partial_ids), - '{%s}' % ('' if tid is None else str(tid))] + for key, fulls, partials in terms: + if fulls: + tokens[key] = [self._mk_array(itertools.chain(fulls, partials)), + self._mk_array(fulls)] if tokens: self.data['addr'] = tokens @@ -635,32 +575,6 @@ class _TokenCache: self.housenumbers = {} - def get_term_tokens(self, conn, terms): - """ Get token ids for a list of terms, looking them up in the database - if necessary. - """ - tokens = [] - askdb = [] - - for term in terms: - token = self.names.get(term) - if token is None: - askdb.append(term) - elif token != 0: - tokens.append(token) - - if askdb: - with conn.cursor() as cur: - cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term", - (askdb, )) - for term, tid in cur: - self.names[term] = tid - if tid != 0: - tokens.append(tid) - - return tokens - - def get_hnr_tokens(self, conn, terms): """ Get token ids for a list of housenumbers, looking them up in the database if necessary. diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index d6fbc2cd..bb37115b 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -404,7 +404,7 @@ class LegacyNameAnalyzer: FROM unnest(%s)n) y WHERE NOT EXISTS(SELECT * FROM word WHERE word_token = lookup_token and country_code = %s)) - """, (country_code, names, country_code)) + """, (country_code, list(names.values()), country_code)) def process_place(self, place): @@ -422,7 +422,7 @@ class LegacyNameAnalyzer: country_feature = place.get('country_feature') if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self.add_country_names(country_feature.lower(), list(names.values())) + self.add_country_names(country_feature.lower(), names) address = place.get('address') diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py index 28a10ebe..efbf2ec8 100644 --- a/nominatim/tools/database_import.py +++ b/nominatim/tools/database_import.py @@ -272,15 +272,15 @@ def create_country_names(conn, tokenizer, languages=None): with tokenizer.name_analyzer() as analyzer: for code, name in cur: - names = [code] + names = {'countrycode' : code} if code == 'gb': - names.append('UK') + names['short_name'] = 'UK' if code == 'us': - names.append('United States') + names['short_name'] = 'United States' # country names (only in languages as provided) if name: - names.extend((v for k, v in name.items() if _include_key(k))) + names.update(((k, v) for k, v in name.items() if _include_key(k))) analyzer.add_country_names(code, names) diff --git a/settings/legacy_icu_tokenizer.yaml b/settings/legacy_icu_tokenizer.yaml new file mode 100644 index 00000000..34cd8b0b --- /dev/null +++ b/settings/legacy_icu_tokenizer.yaml @@ -0,0 +1,116 @@ +normalization: + - ":: NFD ()" + - "[[:Nonspacing Mark:] [:Cf:]] >" + - ":: lower ()" + - "ß > 'ss'" # German szet is unimbigiously equal to double ss + - "[[:Punctuation:][:Space:]]+ > ' '" + - ":: NFC ()" +transliteration: icu_transliteration.rules +compound_suffixes: + # 
Danish + - hal + - hallen + - hallerne + # German + - berg + - brücke + - fabrik + - gasse + - graben + - haus + - höhle + - hütte + - kapelle + - kogel + - pfad + - platz + - quelle + - spitze + - stiege + - strasse + - teich + - universität + - wald + - weg + - wiese + # Dutch + - gracht + - laan + - markt + - plein + - straat + - vliet + - weg + # Norwegian + - vei + - veien + - veg + - vegen + - gate + - gaten + - gata + - plass + - plassen + - sving + - svingen + # Finnish + - alue + - asema + - aukio + - kaari + - katu + - kuja + - kylä + - penger + - polku + - puistikko + - puisto + - raitti + - ranta + - rinne + - taival + - tie + - tori + - väylä + # Swedish + - väg + - vägen + - gatan + - gata + - gränd + - gränden + - stig + - stigen + - plats + - platsen +abbreviations: + # German + - am => a + - an der => a d + - allgemeines krankenhaus => akh + - altstoffsammelzentrum => asz + - auf der => a d + - bach => b + - bad => b + - bahnhof => bhf,bf + - berg => bg + - bezirk => bez + - brücke => br + - burg => bg + - chaussee => ch + - deutsche,deutscher,deutsches => dt + - dorf => df + - doktor => dr + - fachhochschule => fh + - Freiwillige Feuerwehr => ff + - sankt => st + - strasse => str + - weg => wg + # English + - alley => al + - beach => bch + - street => st + - road => rd + - bridge => brdg + + diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/test_tokenizer_icu_name_processor.py new file mode 100644 index 00000000..9c09bccc --- /dev/null +++ b/test/python/test_tokenizer_icu_name_processor.py @@ -0,0 +1,60 @@ +""" +Tests for import name normalisation and variant generation. +""" +from textwrap import dedent + +import pytest + +from nominatim.tokenizer.icu_rule_loader import ICURuleLoader +from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules + +from nominatim.errors import UsageError + +@pytest.fixture +def cfgfile(tmp_path, suffix='.yaml'): + def _create_config(suffixes, abbr): + content = dedent("""\ + normalization: + - ":: NFD ()" + - "[[:Nonspacing Mark:] [:Cf:]] >" + - ":: lower ()" + - "[[:Punctuation:][:Space:]]+ > ' '" + - ":: NFC ()" + transliteration: + - ":: Latin ()" + """) + content += "compound_suffixes:\n" + content += '\n'.join((" - " + s for s in suffixes)) + '\n' + content += "abbreviations:\n" + content += '\n'.join((" - " + s for s in abbr)) + '\n' + fpath = tmp_path / ('test_config' + suffix) + fpath.write_text(dedent(content)) + return fpath + + return _create_config + + +def test_simple_variants(cfgfile): + fpath = cfgfile(['strasse', 'straße', 'weg'], + ['strasse,straße => str', + 'prospekt => pr']) + + rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) + proc = ICUNameProcessor(rules) + + assert set(proc.get_normalized_variants("Bauwegstraße")) \ + == {'bauweg straße', 'bauweg str'} + assert proc.get_normalized_variants("Bauwegstr") == ['bauweg str'] + assert proc.get_normalized_variants("holzweg") == ['holz weg'] + assert proc.get_normalized_variants("hallo") == ['hallo'] + + +def test_multiple_replacements(cfgfile): + fpath = cfgfile([], ['saint => s,st', 'street => st']) + + rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) + proc = ICUNameProcessor(rules) + + assert set(proc.get_normalized_variants("Saint Johns Street")) == \ + {'saint johns street', 's johns street', 'st johns street', + 'saint johns st', 's johns st', 'st johns st'} diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py new file mode 100644 index 
00000000..d89e13b5 --- /dev/null +++ b/test/python/test_tokenizer_icu_rule_loader.py @@ -0,0 +1,75 @@ +""" +Tests for converting a config file to ICU rules. +""" +import pytest +from textwrap import dedent + +from nominatim.tokenizer.icu_rule_loader import ICURuleLoader +from nominatim.errors import UsageError + +from icu import Transliterator + +@pytest.fixture +def cfgfile(tmp_path, suffix='.yaml'): + def _create_config(suffixes, abbr): + content = dedent("""\ + normalization: + - ":: NFD ()" + - "[[:Nonspacing Mark:] [:Cf:]] >" + - ":: lower ()" + - "[[:Punctuation:][:Space:]]+ > ' '" + - ":: NFC ()" + transliteration: + - ":: Latin ()" + """) + content += "compound_suffixes:\n" + content += '\n'.join((" - " + s for s in suffixes)) + '\n' + content += "abbreviations:\n" + content += '\n'.join((" - " + s for s in abbr)) + '\n' + fpath = tmp_path / ('test_config' + suffix) + fpath.write_text(dedent(content)) + return fpath + + return _create_config + +def test_missing_normalization(tmp_path): + fpath = tmp_path / ('test_config.yaml') + fpath.write_text(dedent("""\ + normalizatio: + - ":: NFD ()" + """)) + + with pytest.raises(UsageError): + ICURuleLoader(fpath) + + +def test_get_search_rules(cfgfile): + fpath = cfgfile(['strasse', 'straße', 'weg'], + ['strasse,straße => str', + 'prospekt => pr']) + + loader = ICURuleLoader(fpath) + + rules = loader.get_search_rules() + trans = Transliterator.createFromRules("test", rules) + + assert trans.transliterate(" Baumstraße ") == " baum straße " + assert trans.transliterate(" Baumstrasse ") == " baum strasse " + assert trans.transliterate(" Baumstr ") == " baum str " + assert trans.transliterate(" Baumwegstr ") == " baumweg str " + assert trans.transliterate(" Αθήνα ") == " athēna " + assert trans.transliterate(" проспект ") == " prospekt " + + +def test_get_synonym_pairs(cfgfile): + fpath = cfgfile(['Weg', 'Strasse'], + ['Strasse => str,st']) + + loader = ICURuleLoader(fpath) + + repl = loader.get_replacement_pairs() + + assert repl == [(' strasse ', {' strasse ', ' str ', ' st '}), + ('strasse ', {' strasse ', ' str ', ' st '}), + ('weg ', {' weg '})] +
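
The new import-time pieces above are meant to be wired together roughly as follows. This is an illustrative sketch only, not part of the diff; the path 'my_tokenizer.yaml' stands in for a rule file in the yaml format introduced by settings/legacy_icu_tokenizer.yaml.

    from pathlib import Path

    from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
    from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules

    # Compile normalization, transliteration and replacement rules from the yaml file.
    loader = ICURuleLoader(Path('my_tokenizer.yaml'))

    # Bundle the rules; save_rules(conn) persists them so that later runs can
    # restore them with ICUNameProcessorRules(conn=conn).
    rules = ICUNameProcessorRules(loader=loader)

    # One processor per analyzer; its constructor builds the datrie over the replacement pairs.
    proc = ICUNameProcessor(rules)

    norm_name = proc.get_normalized("Bauwegstraße")          # normalization only
    variants = proc.get_variants_ascii(norm_name)            # precomputed spelling variants
    query_term = proc.get_search_normalized("Bauwegstraße")  # form used at query time

With the test configuration above (compound suffixes 'strasse', 'straße', 'weg' and the abbreviation 'strasse,straße => str'), the computed variants for 'Bauwegstraße' are 'bauweg straße' and 'bauweg str'.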