From 62828fc5c11a1c332e18206b181b84928980319b Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 24 Jun 2021 20:02:07 +0200 Subject: [PATCH] switch to a more flexible variant description format The new format combines compound splitting and abbreviation. It also allows restricting rules to additional conditions (like language or region). This latter ability is not used yet. --- CMakeLists.txt | 4 +- nominatim/tokenizer/icu_name_processor.py | 43 +- nominatim/tokenizer/icu_rule_loader.py | 170 +- nominatim/tokenizer/icu_variants.py | 80 + settings/legacy_icu_tokenizer.yaml | 1699 ++++++++--------- .../test_tokenizer_icu_name_processor.py | 22 +- test/python/test_tokenizer_icu_rule_loader.py | 225 ++- test/python/test_tokenizer_legacy_icu.py | 5 +- 8 files changed, 1201 insertions(+), 1047 deletions(-) create mode 100644 nominatim/tokenizer/icu_variants.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 34e3ea78..38551da7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,6 +258,6 @@ install(FILES settings/env.defaults settings/import-address.style settings/import-full.style settings/import-extratags.style - settings/legacy_icu_tokenizer.json - settings/icu_transliteration.rules + settings/legacy_icu_tokenizer.yaml + settings/icu-rules/extended-unicode-to-asccii.yaml DESTINATION ${NOMINATIM_CONFIGDIR}) diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py index ed0a20d9..1888a716 100644 --- a/nominatim/tokenizer/icu_name_processor.py +++ b/nominatim/tokenizer/icu_name_processor.py @@ -2,13 +2,14 @@ Processor for names that are imported into the database based on the ICU library. 
""" -import json +from collections import defaultdict import itertools from icu import Transliterator import datrie from nominatim.db.properties import set_property, get_property +from nominatim.tokenizer import icu_variants as variants DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation" DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration" @@ -31,20 +32,12 @@ class ICUNameProcessorRules: elif conn is not None: self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) - self.replacements = json.loads(get_property(conn, DBCFG_IMPORT_REPLACEMENTS)) + self.replacements = \ + variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS)) self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES) else: assert False, "Parameter loader or conn required." - # Compute the set of characters used in the replacement list. - # We need this later when computing the tree. - chars = set() - for full, repl in self.replacements: - chars.update(full) - for word in repl: - chars.update(word) - self.replacement_charset = ''.join(chars) - def save_rules(self, conn): """ Save the rules in the property table of the given database. @@ -53,7 +46,8 @@ class ICUNameProcessorRules: """ set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules) set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules) - set_property(conn, DBCFG_IMPORT_REPLACEMENTS, json.dumps(self.replacements)) + set_property(conn, DBCFG_IMPORT_REPLACEMENTS, + variants.pickle_variant_set(self.replacements)) set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules) @@ -70,9 +64,16 @@ class ICUNameProcessor: self.search = Transliterator.createFromRules("icu_search", rules.search_rules) - self.replacements = datrie.Trie(rules.replacement_charset) - for full, repl in rules.replacements: - self.replacements[full] = repl + # Intermediate reorder by source. Also compute required character set. 
+ immediate = defaultdict(list) + chars = set() + for variant in rules.replacements: + immediate[variant.source].append(variant) + chars.update(variant.source) + # Then copy to datrie + self.replacements = datrie.Trie(''.join(chars)) + for src, repllist in immediate.items(): + self.replacements[src] = repllist def get_normalized(self, name): @@ -85,8 +86,8 @@ class ICUNameProcessor: """ Compute the spelling variants for the given normalized name and transliterate the result. """ - baseform = ' ' + norm_name + ' ' - variants = [''] + baseform = '^ ' + norm_name + ' ^' + partials = [''] startpos = 0 pos = 0 @@ -95,7 +96,8 @@ class ICUNameProcessor: (None, None)) if full is not None: done = baseform[startpos:pos] - variants = [v + done + r for v, r in itertools.product(variants, repl)] + partials = [v + done + r.replacement + for v, r in itertools.product(partials, repl)] startpos = pos + len(full) pos = startpos else: @@ -108,8 +110,9 @@ class ICUNameProcessor: if trans_name: results.append(trans_name) else: - for variant in variants: - trans_name = self.to_ascii.transliterate(variant + baseform[startpos:pos]).strip() + for variant in partials: + name = variant[1:] + baseform[startpos:-1] + trans_name = self.to_ascii.transliterate(name).strip() if trans_name: results.append(trans_name) diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py index ddb17ae7..d3141bf7 100644 --- a/nominatim/tokenizer/icu_rule_loader.py +++ b/nominatim/tokenizer/icu_rule_loader.py @@ -3,14 +3,15 @@ Helper class to create ICU rules from a configuration file. 
""" import io import logging -from collections import defaultdict import itertools from pathlib import Path +import re import yaml from icu import Transliterator from nominatim.errors import UsageError +import nominatim.tokenizer.icu_variants as variants LOG = logging.getLogger() @@ -31,14 +32,25 @@ def _flatten_yaml_list(content): return output +class VariantRule: + """ Saves a single variant expansion. + + An expansion consists of the normalized replacement term and + a dicitonary of properties that describe when the expansion applies. + """ + + def __init__(self, replacement, properties): + self.replacement = replacement + self.properties = properties or {} + + class ICURuleLoader: """ Compiler for ICU rules from a tokenizer configuration file. """ def __init__(self, configfile): self.configfile = configfile - self.compound_suffixes = set() - self.abbreviations = defaultdict() + self.variants = set() if configfile.suffix == '.yaml': self._load_from_yaml() @@ -74,35 +86,7 @@ class ICURuleLoader: The result is a list of pairs: the first item is the sequence to replace, the second is a list of replacements. """ - synonyms = defaultdict(set) - - # First add entries for compound decomposition. - for suffix in self.compound_suffixes: - variants = (suffix + ' ', ' ' + suffix + ' ') - for key in variants: - synonyms[key].update(variants) - - for full, abbr in self.abbreviations.items(): - key = ' ' + full + ' ' - # Entries in the abbreviation list always apply to full words: - synonyms[key].update((' ' + a + ' ' for a in abbr)) - # Replacements are optional, so add a noop - synonyms[key].add(key) - - if full in self.compound_suffixes: - # Full word abbreviating to compunded version. - synonyms[key].update((a + ' ' for a in abbr)) - - key = full + ' ' - # Uncompunded suffix abbrevitating to decompounded version. - synonyms[key].update((' ' + a + ' ' for a in abbr)) - # Uncompunded suffix abbrevitating to compunded version. 
- synonyms[key].update((a + ' ' for a in abbr)) - - # sort the resulting list by descending length (longer matches are prefered). - sorted_keys = sorted(synonyms.keys(), key=len, reverse=True) - - return [(k, list(synonyms[k])) for k in sorted_keys] + return self.variants def _yaml_include_representer(self, loader, node): value = loader.construct_scalar(node) @@ -122,8 +106,7 @@ class ICURuleLoader: self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization') self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration') - self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes')) - self._parse_abbreviation_list(self._get_section(rules, 'abbreviations')) + self._parse_variant_list(self._get_section(rules, 'variants')) def _get_section(self, rules, section): @@ -153,38 +136,111 @@ class ICURuleLoader: return ';'.join(_flatten_yaml_list(content)) + ';' + def _parse_variant_list(self, rules): + self.variants.clear() - def _parse_compound_suffix_list(self, rules): if not rules: - self.compound_suffixes = set() return - norm = Transliterator.createFromRules("rule_loader_normalization", - self.normalization_rules) + rules = _flatten_yaml_list(rules) - # Make sure all suffixes are in their normalised form. - self.compound_suffixes = set((norm.transliterate(s) for s in rules)) + vmaker = _VariantMaker(self.normalization_rules) + properties = [] + for section in rules: + # Create the property field and deduplicate against existing + # instances. 
+ props = variants.ICUVariantProperties.from_rules(section) + for existing in properties: + if existing == props: + props = existing + break + else: + properties.append(props) - def _parse_abbreviation_list(self, rules): - self.abbreviations = defaultdict(list) + for rule in (section.get('words') or []): + self.variants.update(vmaker.compute(rule, props)) - if not rules: - return - norm = Transliterator.createFromRules("rule_loader_normalization", - self.normalization_rules) +class _VariantMaker: + """ Generator for all necessary ICUVariants from a single variant rule. + + All text in rules is normalized to make sure the variants match later. + """ - for rule in rules: - parts = rule.split('=>') - if len(parts) != 2: - LOG.fatal("Syntax error in abbreviation section, line: %s", rule) - raise UsageError("Syntax error in tokenizer configuration file.") + def __init__(self, norm_rules): + self.norm = Transliterator.createFromRules("rule_loader_normalization", + norm_rules) - # Make sure all terms match the normalised version. - fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(',')) - abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(',')) - for full, abbr in itertools.product(fullterms, abbrterms): - if full and abbr: - self.abbreviations[full].append(abbr) + def compute(self, rule, props): + """ Generator for all ICUVariant tuples from a single variant rule. 
+ """ + parts = re.split(r'(\|)?([=-])>', rule) + if len(parts) != 4: + raise UsageError("Syntax error in variant rule: " + rule) + + decompose = parts[1] is None + src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')] + repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(',')) + + # If the source should be kept, add a 1:1 replacement + if parts[2] == '-': + for src in src_terms: + if src: + for froms, tos in _create_variants(*src, src[0], decompose): + yield variants.ICUVariant(froms, tos, props) + + for src, repl in itertools.product(src_terms, repl_terms): + if src and repl: + for froms, tos in _create_variants(*src, repl, decompose): + yield variants.ICUVariant(froms, tos, props) + + + def _parse_variant_word(self, name): + name = name.strip() + match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name) + if match is None or (match.group(1) == '~' and match.group(3) == '~'): + raise UsageError("Invalid variant word descriptor '{}'".format(name)) + norm_name = self.norm.transliterate(match.group(2)) + if not norm_name: + return None + + return norm_name, match.group(1), match.group(3) + + +_FLAG_MATCH = {'^': '^ ', + '$': ' ^', + '': ' '} + + +def _create_variants(src, preflag, postflag, repl, decompose): + if preflag == '~': + postfix = _FLAG_MATCH[postflag] + # suffix decomposition + src = src + postfix + repl = repl + postfix + + yield src, repl + yield ' ' + src, ' ' + repl + + if decompose: + yield src, ' ' + repl + yield ' ' + src, repl + elif postflag == '~': + # prefix decomposition + prefix = _FLAG_MATCH[preflag] + src = prefix + src + repl = prefix + repl + + yield src, repl + yield src + ' ', repl + ' ' + + if decompose: + yield src, repl + ' ' + yield src + ' ', repl + else: + prefix = _FLAG_MATCH[preflag] + postfix = _FLAG_MATCH[postflag] + + yield prefix + src + postfix, prefix + repl + postfix diff --git a/nominatim/tokenizer/icu_variants.py b/nominatim/tokenizer/icu_variants.py new file mode 100644 index 
00000000..e4348eab --- /dev/null +++ b/nominatim/tokenizer/icu_variants.py @@ -0,0 +1,80 @@ +""" +Data structures for saving variant expansions for ICU tokenizer. +""" +from collections import namedtuple +import json + +from nominatim.errors import UsageError + +_ICU_VARIANT_PORPERTY_FIELDS = ['lang'] + +def _get_strtuple_prop(rules, field): + """ Return the given field of the rules dictionary as a tuple of strings. + + If the field is not defined or empty, returns None. If the field is + a single string, it is converted into a tuple with a single element. + If the field is a list of strings, return as a string tuple. + Raise a usage error in all other cases. + """ + value = rules.get(field) + + if not value: + return None + + if isinstance(value, str): + return (value,) + + if not isinstance(value, list) or any(not isinstance(x, str) for x in value): + raise UsageError("YAML variant property '{}' should be a list.".format(field)) + + return tuple(value) + + +class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORPERTY_FIELDS, + defaults=(None, )*len(_ICU_VARIANT_PORPERTY_FIELDS))): + """ Data container for saving properties that describe when a variant + should be applied. + + Property instances are hashable. + """ + @classmethod + def from_rules(cls, rules): + """ Create a new property type from a generic dictionary. + + The function only takes into account the properties that are + understood presently and ignores all others. + """ + return cls(lang=_get_strtuple_prop(rules, 'lang')) + + +ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties']) + +def pickle_variant_set(variants): + """ Serializes an iterable of variant rules to a string. + """ + # Create a list of property sets. So they don't need to be duplicated + properties = {} + pid = 1 + for variant in variants: + if variant.properties not in properties: + properties[variant.properties] = pid + pid += 1 + + # Convert the variants into a simple list. 
+ variants = [(v.source, v.replacement, properties[v.properties]) for v in variants] + + # Convert everything to json. + return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()}, + 'variants': variants}) + + +def unpickle_variant_set(variant_string): + """ Deserializes a variant string that was previously created with + pickle_variant_set() into a set of ICUVariants. + """ + data = json.loads(variant_string) + + properties = {int(k): ICUVariantProperties(**v) for k, v in data['properties'].items()} + print(properties) + + return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants'])) diff --git a/settings/legacy_icu_tokenizer.yaml b/settings/legacy_icu_tokenizer.yaml index 402893fc..a3f1c027 100644 --- a/settings/legacy_icu_tokenizer.yaml +++ b/settings/legacy_icu_tokenizer.yaml @@ -15,880 +15,825 @@ transliteration: - ":: lower ()" - "[[:Punctuation:][:Space:]]+ > ' '" - ":: NFC ()" -compound_suffixes: - # Danish - - hal - - hallen - - hallerne - # German - - berg - - brücke - - fabrik - - gasse - - graben - - haus - - höhle - - hütte - - kapelle - - kogel - - markt - - pfad - - platz - - quelle - - spitze - - stiege - - strasse - - teich - - universität - - wald - - weg - - wiese - # Dutch - - gracht - - laan - - markt - - plein - - straat - - vliet - - weg - # Norwegian - - vei - - veien - - veg - - vegen - - gate - - gaten - - gata - - plass - - plassen - - sving - - svingen - # Finnish - - alue - - asema - - aukio - - kaari - - katu - - kuja - - kylä - - penger - - polku - - puistikko - - puisto - - raitti - - ranta - - rinne - - taival - - tie - - tori - - väylä - # Swedish - - väg - - vägen - - gatan - - gata - - gränd - - gränden - - stig - - stigen - - plats - - platsen -abbreviations: - - acceso => acces - - access => accs - - acequia => aceq - - air force base => afb - - air national guard base => angb - - alameda => alam - - ale => al - - alea => al - - aleea => al - - aleja => al - - alejach => al - - 
aleje => al - - aleji => al - - all => al - - allee => al - - alley => al - - alleyway => alwy - - ally => al - - alqueria => alque - - alue => al - - aly => al - - am => a - - amble => ambl - - an der => a d - - andador => andad - - angosta => angta - - apartamentos => aptos - - apartments => apts - - apch => app - - apeadero => apdro - - approach => app - - arboleda => arb - - arcade => arc - - arrabal => arral - - arroyo => arry - - arterial => artl - - artery => arty - - auf der => a d - - aukio => auk - - autopista => auto - - autovia => autov - - avd => av - - avda => av - - ave => av - - avenida => av - - avenue => av - - avinguda => av - - b dul => bd - - back => bk - - bad => b - - bahnhof => bf - - bajada => bjada - - balneario => balnr - - banan => ba - - banda => b - - barranco => branc - - barranquil => bqllo - - barriada => barda - - barrio => bo - - barro => bo - - basin => basn - - bda => b - - bdge => bri - - bdul => bd - - bdwy => bway - - beach => bch - - berg => bg - - bgm => bgm - - bhf => bf - - bldngs => bldgs - - block => blk - - blok => bl - - bloque => blque - - blv => bd - - blvd => bd - - boardwalk => bwlk - - boulevard => bd - - boundary => bdy - - brace => br - - brazal => brzal - - brdg => bri - - break => brk - - bridge => bri - - broadway => bway - - broeder => br - - brücke => br - - buildings => bldgs - - bul => bd - - bulev => bd - - bulevar => bd - - bulevard => bd - - bulevardu => bd - - bulevardul => bd - - bulievard => bd - - bulvar => bl - - bulvari => bl - - burg => bg - - burgemeester => bg - - burgermeister => bgm - - business => bus - - buu dien => bd - - bvd => bd - - bypa => byp - - bypass => byp - - byway => bywy - - c le => c - - cadde => cd - - caddesi => cd - - calle => c - - callej => cjon - - calleja => cllja - - callejon => cjon - - callejuela => cjla - - callizo => cllzo - - calzada => czada - - camino => cno - - camino hondo => c h - - camino nuevo => c n - - camino viejo => c v - - camping => campg - - campo 
=> c po - - can cu khong quan => cckq - - cantera => cantr - - cantina => canti - - canton => cant - - cao dang => cd - - caravan => cvn - - carrer => c - - carrera => cra - - carrero => cro - - carretera => ctra - - carreterin => ctrin - - carretil => crtil - - carril => crril - - caserio => csrio - - cau ldhc bo => clb - - caus => cway - - causeway => cway - - ce => cv - - cen => ctr - - center => ctr - - centre => ctr - - centreway => cnwy - - cesta => c - - chalet => chlet - - chase => ch - - che => ch - - chemin => ch - - cinturon => cint - - circle => cir - - circuit => cct - - circunvalacion => ccvcn - - circus => crcs - - city => cty - - cl => c - - cllon => cjon - - close => c - - cmno => cno - - cobertizo => cbtiz - - colonia => col - - commandant => cmdt - - common => comm - - community => comm - - complejo => compj - - concourse => con - - cong truong => ct - - cong ty => cty - - cong ty co phyn => ctcp - - cong vien => cv - - cong vien van hoa => cvvh - - conjunto => cjto - - convento => cnvto - - cooperativa => coop - - copse => cps - - corner => cnr - - corral => crral - - corralillo => crrlo - - corredor => crrdo - - corso => c so - - corte => c te - - cortijo => crtjo - - costanilla => cstan - - costera => coste - - cottages => cotts - - county => co - - county route => cr - - cours => crs - - court => ct - - courtyard => ctyd - - cove => cv - - creek => cr - - cres => cr - - crescent => cr - - crest => crst - - crk => cr - - croft => cft - - cross => crss - - crossing => xing - - crossroad => crd - - crossway => cowy - - crsg => xing - - crt => ct - - csac => cds - - cswy => cway - - cty cp => ctcp - - cuadra => cuadr - - cuesta => custa - - cul de sac => cds - - cutting => cutt - - ddhi hoc => dh - - ddhi lo => dl - - dehesa => dhsa - - demarcacion => demar - - deviation => devn - - diagonal => diag - - diseminado => disem - - distributor => dstr - - doctor => dr - - dokter => dr - - doktor => d r - - dolna => dln - - dolne => dln - - dolny => 
dln - - dominee => ds - - dorf => df - - dotsient => dots - - drive => dr - - driveway => dvwy - - druga => 2 - - drugi => 2 - - drugie => 2 - - drv => dr - - drwy => dvwy - - duong => d - - duong sat => ds - - duza => dz - - duze => dz - - duzy => dz - - east => e - - edificio => edifc - - elbow => elb - - empresa => empr - - entrada => entd - - entrance => ent - - errepidea => err - - escalera => esca - - escalinata => escal - - espalda => eslda - - esplanade => esp - - estacion => estcn - - estate => est - - estrada => estda - - etorbidea => etorb - - explanada => expla - - expressway => exp - - expy => exp - - extension => ex - - extramuros => extrm - - extrarradio => extrr - - fabrica => fca - - fairway => fawy - - faubourg => fg - - fbrca => fca - - ferry => fy - - fire track => ftrk - - firetrail => fit - - follow => folw - - fondamenta => f ta - - footway => ftwy - - foreshore => fshr - - formation => form - - fort => ft - - freeway => fwy - - front => frnt - - frontage => frtg - - frwy => fwy - - fundacul => fdc - - fundatura => fnd - - ga => g - - galeria => gale - - gamla => gla - - garden => gdn - - gardens => gdn - - gata => g - - gatan => g - - gate => g - - gaten => g - - gdns => gdn - - gebroeders => gebr - - generaal => gen - - gienieral => ghien - - glade => gl - - gld => gl - - glde => gl - - glorieta => gta - - gorna => gn - - gorne => gn - - gorny => gn - - gracht => gr - - grad => ghr - - gran via => g v - - grand => gr - - granden => gr - - grange => gra - - granja => granj - - green => gn - - grn => gn - - gro => gr - - grosse => gr - - grosser => gr - - grosses => gr - - ground => grnd - - grove => gr - - gt => g - - gte => g - - gully => gly - - hauptbahnhof => hbf - - heights => hts - - heiligen => hl - - hgts => hts - - high school => hs - - highroad => hrd - - highway => hwy - - hipodromo => hipod - - hird => hrd - - hospital => hosp - - house => ho - - hse => ho - - i => 1 - - ii => 2 - - iii => 3 - - im => 1 - - impasse => imp - - in 
=> 1 - - in der => i d - - industrial => ind - - ingenieur => ir - - interchange => intg - - international => intl - - intr => int - - intrarea => int - - island => is - - jardin => jdin - - jonkheer => jhr - - junction => jnc - - k s => ks - - kaari => kri - - kalea => k - - kanunnik => kan - - kapitan => kap - - kardinaal => kard - - katu => k - - khach sdhn => ks - - khu cong nghiep => kcn - - khu du lich => kdl - - khu nghi mat => knm - - kleine => kl - - kleiner => kl - - kleines => kl - - kolo => k - - kolonel => kol - - kolonia => kol - - koning => kon - - koningin => kon - - kort e => k - - kri => kri - - kte => k - - kuja => kj - - kvartal => kv - - kyla => kl - - laan => ln - - ladera => ldera - - landing => ldg - - lane => ln - - laneway => lnwy - - lange => l - - largo => l go - - lille => ll - - line => ln - - link => lk - - lit => lt - - little => lt - - llanura => llnra - - lookout => lkt - - loop => lp - - low => lr - - lower => lr - - luitenant => luit - - lwr => lr - - m te => mt - - m tele => mt - - maantee => mt - - mala => ml - - male => ml - - malecon => malec - - maly => ml - - manor => mnr - - mansions => mans - - market => mkt - - markt => mkt - - mazowiecka => maz - - mazowiecki => maz - - mazowieckie => maz - - meadows => mdws - - meander => mr - - medical => med - - meester => mr - - mercado => merc - - mevrouw => mevr - - mews => m - - miasto => m - - middle => mid - - middle school => ms - - mile => mi - - military => mil - - mirador => mrdor - - mitropolit => mit - - mkt => mkt - - mndr => mr - - mnt => mt - - monasterio => mtrio - - monseigneur => mgr - - mont => mt - - motorway => mwy - - mount => mt - - mountain => mtn - - mtwy => mwy - - muelle => muell - - municipal => mun - - muntele => mt - - museum => mus - - mw => m - - na => n - - namesti => nam - - namestie => nam - - national park => np - - national recreation area => nra - - national wildlife refuge area => nwra - - nha hat => nh - - nha thi dzu => ntd - - nha tho => nt - 
- nordre => ndr - - norra => n - - north => n - - north east => ne - - north west => n - - northeast => ne - - northwest => n - - nowa => n - - nowe => n - - nowy => n - - nucleo => ncleo - - nw => n - - oa => o - - ob => o - - obere => o - - oberer => o - - oberes => o - - olv => olv - - onze lieve vrouw e => olv - - osiedle => os - - osiedlu => os - - ostra => o - - outlook => otlk - - p k => pk - - p ta => pta - - p zza => p za - - palacio => palac - - pantano => pant - - parade => pde - - paraje => praje - - park => pk - - parklands => pkld - - parkway => pwy - - parque => pque - - particular => parti - - partida => ptda - - pas => ps - - pasadizo => pzo - - pasaje => psaje - - paseo => po - - paseo maritimo => psmar - - pasillo => psllo - - pass => ps - - passage => ps - - passatge => ptge - - passeig => pg - - pastoor => past - - pathway => pway - - penger => pgr - - pfad => p - - pgr => pgr - - ph => p - - phi truong => pt - - phuong => p - - phwy => pway - - piata => pta - - piaz => p za - - piazza => p za - - piazzale => p le - - piazzetta => pta - - pierwsza => 1 - - pierwsze => 1 - - pierwszy => 1 - - pike => pk - - pko => pko - - pkwy => pwy - - pky => pwy - - plac => pl - - placa => pl - - place => pl - - placem => pl - - placu => pl - - plass => pl - - plassen => pl - - plateau => plat - - plats => pl - - platsen => pl - - platz => pl - - plaza => pl - - plazoleta => pzta - - plazuela => plzla - - plein => pln - - pln => pln - - ploshchad => pl - - plz => pl - - plza => pl - - pnt => pt - - poblado => pbdo - - pocket => pkt - - point => pt - - poligono => polig - - poligono industrial => pgind - - polku => p - - ponte => p te - - porta => pta - - portal => prtal - - portico => prtco - - portillo => ptilo - - prazuela => przla - - precinct => pct - - pres => pres - - president => pres - - prins => pr - - prinses => pr - - professor => prof - - profiesor => prof - - prolongacion => prol - - promenade => prom - - psge => ps - - pueblo => pblo - - puente 
=> pnte - - puerta => pta - - puerto => pto - - puistikko => pko - - puisto => ps - - punto kilometrico => pk - - pza => pl - - quadrangle => qdgl - - quadrant => qdrt - - quai => qu - - quan => q - - quay => qy - - quays => qys - - qucyng truong => qt - - quelle => qu - - quoc lo => ql - - raitti => r - - rambla => rbla - - ramble => rmbl - - rampla => rampa - - range => rnge - - ranta => rt - - rdhp hat => rh - - reach => rch - - reservation => res - - reserve => res - - reservoir => res - - residencial => resid - - retreat => rtt - - rhein => rh - - ribera => rbra - - ridge => rdge - - ridgeway => rgwy - - right of way => rowy - - rincon => rcon - - rinconada => rcda - - rinne => rn - - rise => ri - - riv => r - - river => r - - riverway => rvwy - - riviera => rvra - - road => rd - - roads => rds - - roadside => rdsd - - roadway => rdwy - - ronde => rnde - - rosebowl => rsbl - - rotary => rty - - rotonda => rtda - - round => rnd - - route => rt - - rte => rt - - rue => r - - rvr => r - - sa => s - - saint => st - - sainte => ste - - salizada => s da - - san => s - - san bay => sb - - san bay quoc te => sbqt - - san van dong => svd - - sanatorio => sanat - - sankt => st - - santa => sta - - santuario => santu - - sector => sect - - sendera => sedra - - sendero => send - - serviceway => swy - - shunt => shun - - shwy => sh - - siding => sdng - - sielo => s - - sint => st - - slope => slpe - - sodra => s - - sok => sk - - sokagi => sk - - sokak => sk - - sondre => sdr - - soseaua => sos - - sound => snd - - south => s - - south east => se - - south west => sw - - south-east => se - - south-west => sw - - southeast => se - - southwest => sw - - spl => sp - - splaiul => sp - - spodnja => sp - - spodnje => sp - - spodnji => sp - - square => sq - - srednja => sr - - srednje => sr - - srednji => sr - - stara => st - - stare => st - - stary => st - - state highway => sh - - state route => sr - - station => stn - - stazione => staz - - ste => ste - - steenweg => stwg - - 
sth => s - - stig => st - - stigen => st - - store => st - - str la => sdla - - stra => st - - straat => st - - strada => st - - strada comunale => sc - - strada provinciale => sp - - strada regionale => sr - - strada statale => ss - - stradela => sdla - - strand => st - - strasse => str - - street => st - - strip => strp - - stwg => stwg - - subida => sbida - - subway => sbwy - - sveta => sv - - sveti => sv - - svieti => sv - - sving => sv - - svingen => sv - - svwy => swy - - taival => tvl - - tanav => tn - - tce => ter - - tcty => tct - - terr => ter - - terrace => ter - - thanh pho => tp - - thfr => thor - - thi trzn => tt - - thi xa => tx - - thoroughfare => thor - - tie => t - - tieu hoc => th - - tinh lo => tl - - tollway => tlwy - - tong cong ty => tct - - tori => tr - - torrente => trrnt - - towers => twrs - - township => twp - - tpke => tpk - - track => trk - - trail => trl - - trailer => trlr - - transito => trans - - transversal => trval - - trasera => tras - - travesia => trva - - triangle => tri - - trung hoc co so => thcs - - trung hoc pho thong => thpt - - trung tam => tt - - trung tam thuong mdhi => tttm - - trunkway => tkwy - - trzeci => 3 - - trzecia => 3 - - trzecie => 3 - - tunnel => tun - - turn => tn - - turnpike => tpk - - tvl => tvl - - ulica => ul - - ulice => ul - - ulicy => ul - - ulitsa => ul - - underpass => upas - - university => univ - - untere => u - - unterer => u - - unteres => u - - upper => up - - upr => up - - urbanizacion => urb - - utca => u - - v d => vd - - va => v - - vag => v - - vagen => v - - vale => v - - van => v - - van de => vd - - varf => vf - - varful => vf - - vastra => v - - vayla => vla - - vdct => via - - vecindario => vecin - - vei => v - - veien => v - - velika => v - - velike => v - - veliki => v - - veliko => v - - vereda => vreda - - via => v - - viad => via - - viaduct => via - - viaducto => vcto - - viale => v le - - vicolo => v lo - - vien bcyo tang => vbt - - view => vw - - villas => vlls - - virf => 
vf - - virful => vf - - vista => vsta - - viviendas => vvdas - - vkhod => vkh - - vla => vla - - vliet => vlt - - vlt => vlt - - vn => v - - vuon quoc gia => vqg - - walk => wlk - - walkway => wkwy - - way => wy - - west => w - - wharf => whrf - - wielka => wlk - - wielki => wlk - - wielkie => wlk - - wielkopolska => wlkp - - wielkopolski => wlkp - - wielkopolskie => wlkp - - wojewodztwie => woj - - wojewodztwo => woj - - yard => yd - - zgornja => zg - - zgornje => zg - - zgornji => zg - - zhilishchien komplieks => zh k - - zum => z +variants: + - words: + - ~hal => hal + - ~hallen => hallen + - ~hallerne => hallerne + - ~fabrik => fabrik + - ~gasse => gasse + - ~graben => graben + - ~haus => haus + - ~höhle => höhle + - ~hütte => hütte + - ~kapelle => kapelle + - ~kogel => kogel + - ~spitze => spitze + - ~stiege => stiege + - ~teich => teich + - ~universität => universität + - ~wald => wald + - ~weg => weg + - ~wiese => wiese + - ~veg => veg + - ~vegen => vegen + - ~asema => asema + - ~väylä => väylä + - acceso -> acces + - access -> accs + - acequia -> aceq + - air force base -> afb + - air national guard base -> angb + - alameda -> alam + - ale -> al + - alea -> al + - aleea -> al + - aleja -> al + - alejach -> al + - aleje -> al + - aleji -> al + - all -> al + - allee -> al + - alley -> al + - alleyway -> alwy + - ally -> al + - alqueria -> alque + - ~alue -> al + - aly -> al + - am -> a + - amble -> ambl + - an der -> a d + - andador -> andad + - angosta -> angta + - apartamentos -> aptos + - apartments -> apts + - apch -> app + - apeadero -> apdro + - approach -> app + - arboleda -> arb + - arcade -> arc + - arrabal -> arral + - arroyo -> arry + - arterial -> artl + - artery -> arty + - auf der -> a d + - ~aukio -> auk + - autopista -> auto + - autovia -> autov + - avd -> av + - avda -> av + - ave -> av + - avenida -> av + - avenue -> av + - avinguda -> av + - b dul -> bd + - back -> bk + - bad -> b + - bahnhof -> bf + - bajada -> bjada + - balneario -> balnr 
+ - banan -> ba + - banda -> b + - barranco -> branc + - barranquil -> bqllo + - barriada -> barda + - barrio -> bo + - barro -> bo + - basin -> basn + - bda -> b + - bdge -> bri + - bdul -> bd + - bdwy -> bway + - beach -> bch + - ~berg -> bg + - bgm -> bgm + - bhf -> bf + - bldngs -> bldgs + - block -> blk + - blok -> bl + - bloque -> blque + - blv -> bd + - blvd -> bd + - boardwalk -> bwlk + - boulevard -> bd + - boundary -> bdy + - brace -> br + - brazal -> brzal + - brdg -> bri + - break -> brk + - bridge -> bri + - broadway -> bway + - broeder -> br + - ~brücke -> br + - buildings -> bldgs + - bul -> bd + - bulev -> bd + - bulevar -> bd + - bulevard -> bd + - bulevardu -> bd + - bulevardul -> bd + - bulievard -> bd + - bulvar -> bl + - bulvari -> bl + - burg -> bg + - burgemeester -> bg + - burgermeister -> bgm + - business -> bus + - buu dien -> bd + - bvd -> bd + - bypa -> byp + - bypass -> byp + - byway -> bywy + - c le -> c + - cadde -> cd + - caddesi -> cd + - calle -> c + - callej -> cjon + - calleja -> cllja + - callejon -> cjon + - callejuela -> cjla + - callizo -> cllzo + - calzada -> czada + - camino -> cno + - camino hondo -> c h + - camino nuevo -> c n + - camino viejo -> c v + - camping -> campg + - campo -> c po + - can cu khong quan -> cckq + - cantera -> cantr + - cantina -> canti + - canton -> cant + - cao dang -> cd + - caravan -> cvn + - carrer -> c + - carrera -> cra + - carrero -> cro + - carretera -> ctra + - carreterin -> ctrin + - carretil -> crtil + - carril -> crril + - caserio -> csrio + - cau ldhc bo -> clb + - caus -> cway + - causeway -> cway + - ce -> cv + - cen -> ctr + - center -> ctr + - centre -> ctr + - centreway -> cnwy + - cesta -> c + - chalet -> chlet + - chase -> ch + - che -> ch + - chemin -> ch + - cinturon -> cint + - circle -> cir + - circuit -> cct + - circunvalacion -> ccvcn + - circus -> crcs + - city -> cty + - cl -> c + - cllon -> cjon + - close -> c + - cmno -> cno + - cobertizo -> cbtiz + - colonia -> col + 
- commandant -> cmdt + - common -> comm + - community -> comm + - complejo -> compj + - concourse -> con + - cong truong -> ct + - cong ty -> cty + - cong ty co phyn -> ctcp + - cong vien -> cv + - cong vien van hoa -> cvvh + - conjunto -> cjto + - convento -> cnvto + - cooperativa -> coop + - copse -> cps + - corner -> cnr + - corral -> crral + - corralillo -> crrlo + - corredor -> crrdo + - corso -> c so + - corte -> c te + - cortijo -> crtjo + - costanilla -> cstan + - costera -> coste + - cottages -> cotts + - county -> co + - county route -> cr + - cours -> crs + - court -> ct + - courtyard -> ctyd + - cove -> cv + - creek -> cr + - cres -> cr + - crescent -> cr + - crest -> crst + - crk -> cr + - croft -> cft + - cross -> crss + - crossing -> xing + - crossroad -> crd + - crossway -> cowy + - crsg -> xing + - crt -> ct + - csac -> cds + - cswy -> cway + - cty cp -> ctcp + - cuadra -> cuadr + - cuesta -> custa + - cul de sac -> cds + - cutting -> cutt + - ddhi hoc -> dh + - ddhi lo -> dl + - dehesa -> dhsa + - demarcacion -> demar + - deviation -> devn + - diagonal -> diag + - diseminado -> disem + - distributor -> dstr + - doctor -> dr + - dokter -> dr + - doktor -> d r + - dolna -> dln + - dolne -> dln + - dolny -> dln + - dominee -> ds + - dorf -> df + - dotsient -> dots + - drive -> dr + - driveway -> dvwy + - druga -> 2 + - drugi -> 2 + - drugie -> 2 + - drv -> dr + - drwy -> dvwy + - duong -> d + - duong sat -> ds + - duza -> dz + - duze -> dz + - duzy -> dz + - east -> e + - edificio -> edifc + - elbow -> elb + - empresa -> empr + - entrada -> entd + - entrance -> ent + - errepidea -> err + - escalera -> esca + - escalinata -> escal + - espalda -> eslda + - esplanade -> esp + - estacion -> estcn + - estate -> est + - estrada -> estda + - etorbidea -> etorb + - explanada -> expla + - expressway -> exp + - expy -> exp + - extension -> ex + - extramuros -> extrm + - extrarradio -> extrr + - fabrica -> fca + - fairway -> fawy + - faubourg -> fg + - fbrca -> 
fca + - ferry -> fy + - fire track -> ftrk + - firetrail -> fit + - follow -> folw + - fondamenta -> f ta + - footway -> ftwy + - foreshore -> fshr + - formation -> form + - fort -> ft + - freeway -> fwy + - front -> frnt + - frontage -> frtg + - frwy -> fwy + - fundacul -> fdc + - fundatura -> fnd + - ga -> g + - galeria -> gale + - gamla -> gla + - garden -> gdn + - gardens -> gdn + - ~gata -> g + - ~gatan -> g + - ~gate -> g + - ~gaten -> g + - gdns -> gdn + - gebroeders -> gebr + - generaal -> gen + - gienieral -> ghien + - glade -> gl + - gld -> gl + - glde -> gl + - glorieta -> gta + - gorna -> gn + - gorne -> gn + - gorny -> gn + - ~gracht -> gr + - grad -> ghr + - gran via -> g v + - ~gränd -> gr + - ~gränden -> gr + - grange -> gra + - granja -> granj + - green -> gn + - grn -> gn + - gro -> gr + - grosse -> gr + - grosser -> gr + - grosses -> gr + - ground -> grnd + - grove -> gr + - gt -> g + - gte -> g + - gully -> gly + - hauptbahnhof -> hbf + - heights -> hts + - heiligen -> hl + - hgts -> hts + - high school -> hs + - highroad -> hrd + - highway -> hwy + - hipodromo -> hipod + - hird -> hrd + - hospital -> hosp + - house -> ho + - hse -> ho + - i -> 1 + - ii -> 2 + - iii -> 3 + - im -> i + - impasse -> imp + - in -> i + - in der -> i d + - industrial -> ind + - ingenieur -> ir + - interchange -> intg + - international -> intl + - intr -> int + - intrarea -> int + - island -> is + - jardin -> jdin + - jonkheer -> jhr + - junction -> jnc + - k s -> ks + - ~kaari -> kri + - kalea -> k + - kanunnik -> kan + - kapitan -> kap + - kardinaal -> kard + - ~katu -> k + - khach sdhn -> ks + - khu cong nghiep -> kcn + - khu du lich -> kdl + - khu nghi mat -> knm + - kleine -> kl + - kleiner -> kl + - kleines -> kl + - kolo -> k + - kolonel -> kol + - kolonia -> kol + - koning -> kon + - koningin -> kon + - kort e -> k + - kri -> kri + - kte -> k + - ~kuja -> kj + - kvartal -> kv + - ~kylä -> kl + - ~laan -> ln + - ladera -> ldera + - landing -> ldg + - lane -> ln 
+ - laneway -> lnwy + - lange -> l + - largo -> l go + - lille -> ll + - line -> ln + - link -> lk + - lit -> lt + - little -> lt + - llanura -> llnra + - lookout -> lkt + - loop -> lp + - low -> lr + - lower -> lr + - luitenant -> luit + - lwr -> lr + - m te -> mt + - m tele -> mt + - maantee -> mt + - mala -> ml + - male -> ml + - malecon -> malec + - maly -> ml + - manor -> mnr + - mansions -> mans + - market -> mkt + - ~markt -> mkt + - mazowiecka -> maz + - mazowiecki -> maz + - mazowieckie -> maz + - meadows -> mdws + - meander -> mr + - medical -> med + - meester -> mr + - mercado -> merc + - mevrouw -> mevr + - mews -> m + - miasto -> m + - middle -> mid + - middle school -> ms + - mile -> mi + - military -> mil + - mirador -> mrdor + - mitropolit -> mit + - mkt -> mkt + - mndr -> mr + - mnt -> mt + - monasterio -> mtrio + - monseigneur -> mgr + - mont -> mt + - motorway -> mwy + - mount -> mt + - mountain -> mtn + - mtwy -> mwy + - muelle -> muell + - municipal -> mun + - muntele -> mt + - museum -> mus + - mw -> m + - na -> n + - namesti -> nam + - namestie -> nam + - national park -> np + - national recreation area -> nra + - national wildlife refuge area -> nwra + - nha hat -> nh + - nha thi dzu -> ntd + - nha tho -> nt + - nordre -> ndr + - norra -> n + - north -> n + - north east -> ne + - north west -> n + - northeast -> ne + - northwest -> n + - nowa -> n + - nowe -> n + - nowy -> n + - nucleo -> ncleo + - nw -> n + - oa -> o + - ob -> o + - obere -> o + - oberer -> o + - oberes -> o + - olv -> olv + - onze lieve vrouw e -> olv + - osiedle -> os + - osiedlu -> os + - ostra -> o + - outlook -> otlk + - p k -> pk + - p ta -> pta + - p zza -> p za + - palacio -> palac + - pantano -> pant + - parade -> pde + - paraje -> praje + - park -> pk + - parklands -> pkld + - parkway -> pwy + - parque -> pque + - particular -> parti + - partida -> ptda + - pas -> ps + - pasadizo -> pzo + - pasaje -> psaje + - paseo -> po + - paseo maritimo -> psmar + - pasillo -> 
psllo + - pass -> ps + - passage -> ps + - passatge -> ptge + - passeig -> pg + - pastoor -> past + - pathway -> pway + - ~penger -> pgr + - ~pfad -> p + - pgr -> pgr + - ph -> p + - phi truong -> pt + - phuong -> p + - phwy -> pway + - piata -> pta + - piaz -> p za + - piazza -> p za + - piazzale -> p le + - piazzetta -> pta + - pierwsza -> 1 + - pierwsze -> 1 + - pierwszy -> 1 + - pike -> pk + - pko -> pko + - pkwy -> pwy + - pky -> pwy + - plac -> pl + - placa -> pl + - place -> pl + - placem -> pl + - placu -> pl + - ~plass -> pl + - ~plassen -> pl + - plateau -> plat + - ~plats -> pl + - ~platsen -> pl + - ~platz -> pl + - plaza -> pl + - plazoleta -> pzta + - plazuela -> plzla + - ~plein -> pln + - pln -> pln + - ploshchad -> pl + - plz -> pl + - plza -> pl + - pnt -> pt + - poblado -> pbdo + - pocket -> pkt + - point -> pt + - poligono -> polig + - poligono industrial -> pgind + - ~polku -> p + - ponte -> p te + - porta -> pta + - portal -> prtal + - portico -> prtco + - portillo -> ptilo + - prazuela -> przla + - precinct -> pct + - pres -> pres + - president -> pres + - prins -> pr + - prinses -> pr + - professor -> prof + - profiesor -> prof + - prolongacion -> prol + - promenade -> prom + - psge -> ps + - pueblo -> pblo + - puente -> pnte + - puerta -> pta + - puerto -> pto + - ~puistikko -> pko + - ~puisto -> ps + - punto kilometrico -> pk + - pza -> pl + - quadrangle -> qdgl + - quadrant -> qdrt + - quai -> qu + - quan -> q + - quay -> qy + - quays -> qys + - qucyng truong -> qt + - ~quelle -> qu + - quoc lo -> ql + - ~raitti -> r + - rambla -> rbla + - ramble -> rmbl + - rampla -> rampa + - range -> rnge + - ~ranta -> rt + - rdhp hat -> rh + - reach -> rch + - reservation -> res + - reserve -> res + - reservoir -> res + - residencial -> resid + - retreat -> rtt + - rhein -> rh + - ribera -> rbra + - ridge -> rdge + - ridgeway -> rgwy + - right of way -> rowy + - rincon -> rcon + - rinconada -> rcda + - ~rinne -> rn + - rise -> ri + - riv -> r + - 
river -> r + - riverway -> rvwy + - riviera -> rvra + - road -> rd + - roads -> rds + - roadside -> rdsd + - roadway -> rdwy + - ronde -> rnde + - rosebowl -> rsbl + - rotary -> rty + - rotonda -> rtda + - round -> rnd + - route -> rt + - rte -> rt + - rue -> r + - rvr -> r + - sa -> s + - saint -> st + - sainte -> ste + - salizada -> s da + - san -> s + - san bay -> sb + - san bay quoc te -> sbqt + - san van dong -> svd + - sanatorio -> sanat + - sankt -> st + - santa -> sta + - santuario -> santu + - sector -> sect + - sendera -> sedra + - sendero -> send + - serviceway -> swy + - shunt -> shun + - shwy -> sh + - siding -> sdng + - sielo -> s + - sint -> st + - slope -> slpe + - sodra -> s + - sok -> sk + - sokagi -> sk + - sokak -> sk + - sondre -> sdr + - soseaua -> sos + - sound -> snd + - south -> s + - south east -> se + - south west -> sw + - south-east -> se + - south-west -> sw + - southeast -> se + - southwest -> sw + - spl -> sp + - splaiul -> sp + - spodnja -> sp + - spodnje -> sp + - spodnji -> sp + - square -> sq + - srednja -> sr + - srednje -> sr + - srednji -> sr + - stara -> st + - stare -> st + - stary -> st + - state highway -> sh + - state route -> sr + - station -> stn + - stazione -> staz + - ste -> ste + - steenweg -> stwg + - sth -> s + - ~stig -> st + - ~stigen -> st + - store -> st + - str la -> sdla + - stra -> st + - ~straat -> st + - strada -> st + - strada comunale -> sc + - strada provinciale -> sp + - strada regionale -> sr + - strada statale -> ss + - stradela -> sdla + - strand -> st + - ~strasse -> str + - street -> st + - strip -> strp + - stwg -> stwg + - subida -> sbida + - subway -> sbwy + - sveta -> sv + - sveti -> sv + - svieti -> sv + - ~sving -> sv + - ~svingen -> sv + - svwy -> swy + - ~taival -> tvl + - tanav -> tn + - tce -> ter + - tcty -> tct + - terr -> ter + - terrace -> ter + - thanh pho -> tp + - thfr -> thor + - thi trzn -> tt + - thi xa -> tx + - thoroughfare -> thor + - ~tie -> t + - tieu hoc -> th + - tinh 
lo -> tl + - tollway -> tlwy + - tong cong ty -> tct + - ~tori -> tr + - torrente -> trrnt + - towers -> twrs + - township -> twp + - tpke -> tpk + - track -> trk + - trail -> trl + - trailer -> trlr + - transito -> trans + - transversal -> trval + - trasera -> tras + - travesia -> trva + - triangle -> tri + - trung hoc co so -> thcs + - trung hoc pho thong -> thpt + - trung tam -> tt + - trung tam thuong mdhi -> tttm + - trunkway -> tkwy + - trzeci -> 3 + - trzecia -> 3 + - trzecie -> 3 + - tunnel -> tun + - turn -> tn + - turnpike -> tpk + - tvl -> tvl + - ulica -> ul + - ulice -> ul + - ulicy -> ul + - ulitsa -> ul + - underpass -> upas + - university -> univ + - untere -> u + - unterer -> u + - unteres -> u + - upper -> up + - upr -> up + - urbanizacion -> urb + - utca -> u + - v d -> vd + - va -> v + - ~väg -> v + - ~vägen -> v + - vale -> v + - van -> v + - van de -> vd + - varf -> vf + - varful -> vf + - vastra -> v + - vayla -> vla + - vdct -> via + - vecindario -> vecin + - ~vei -> v + - ~veien -> v + - velika -> v + - velike -> v + - veliki -> v + - veliko -> v + - vereda -> vreda + - via -> v + - viad -> via + - viaduct -> via + - viaducto -> vcto + - viale -> v le + - vicolo -> v lo + - vien bcyo tang -> vbt + - view -> vw + - villas -> vlls + - virf -> vf + - virful -> vf + - vista -> vsta + - viviendas -> vvdas + - vkhod -> vkh + - vla -> vla + - ~vliet -> vlt + - vlt -> vlt + - vn -> v + - vuon quoc gia -> vqg + - walk -> wlk + - walkway -> wkwy + - way -> wy + - west -> w + - wharf -> whrf + - wielka -> wlk + - wielki -> wlk + - wielkie -> wlk + - wielkopolska -> wlkp + - wielkopolski -> wlkp + - wielkopolskie -> wlkp + - wojewodztwie -> woj + - wojewodztwo -> woj + - yard -> yd + - zgornja -> zg + - zgornje -> zg + - zgornji -> zg + - zhilishchien komplieks -> zh k + - zum -> z diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/test_tokenizer_icu_name_processor.py index 68c33010..c1ad7675 100644 --- 
a/test/python/test_tokenizer_icu_name_processor.py +++ b/test/python/test_tokenizer_icu_name_processor.py @@ -12,7 +12,7 @@ from nominatim.errors import UsageError @pytest.fixture def cfgfile(tmp_path, suffix='.yaml'): - def _create_config(suffixes, abbr): + def _create_config(*variants, **kwargs): content = dedent("""\ normalization: - ":: NFD ()" @@ -25,10 +25,10 @@ def cfgfile(tmp_path, suffix='.yaml'): - ":: Latin ()" - "'🜵' > ' '" """) - content += "compound_suffixes:\n" - content += '\n'.join((" - " + s for s in suffixes)) + '\n' - content += "abbreviations:\n" - content += '\n'.join((" - " + s for s in abbr)) + '\n' + content += "variants:\n - words:\n" + content += '\n'.join((" - " + s for s in variants)) + '\n' + for k, v in kwargs.items(): + content += " {}: {}\n".format(k, v) fpath = tmp_path / ('test_config' + suffix) fpath.write_text(dedent(content)) return fpath @@ -40,9 +40,9 @@ def get_normalized_variants(proc, name): return proc.get_variants_ascii(proc.get_normalized(name)) def test_simple_variants(cfgfile): - fpath = cfgfile(['strasse', 'straße', 'weg'], - ['strasse,straße => str', - 'prospekt => pr']) + fpath = cfgfile('~strasse,~straße -> str', + '~weg => weg', + 'prospekt -> pr') rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) proc = ICUNameProcessor(rules) @@ -58,7 +58,7 @@ def test_simple_variants(cfgfile): def test_variants_empty(cfgfile): - fpath = cfgfile([], ['saint => 🜵', 'street => st']) + fpath = cfgfile('saint -> 🜵', 'street -> st') rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) proc = ICUNameProcessor(rules) @@ -69,7 +69,7 @@ def test_variants_empty(cfgfile): def test_multiple_replacements(cfgfile): - fpath = cfgfile([], ['saint => s,st', 'street => st']) + fpath = cfgfile('saint -> s,st', 'street -> st') rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) proc = ICUNameProcessor(rules) @@ -80,7 +80,7 @@ def test_multiple_replacements(cfgfile): def test_search_normalized(cfgfile): - fpath = cfgfile(['street'], 
['street => s,st', 'master => mstr']) + fpath = cfgfile('~street => s,st', 'master => mstr') rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) proc = ICUNameProcessor(rules) diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py index 161dff94..bb30dc6e 100644 --- a/test/python/test_tokenizer_icu_rule_loader.py +++ b/test/python/test_tokenizer_icu_rule_loader.py @@ -11,7 +11,7 @@ from icu import Transliterator @pytest.fixture def cfgfile(tmp_path, suffix='.yaml'): - def _create_config(suffixes, abbr): + def _create_config(*variants, **kwargs): content = dedent("""\ normalization: - ":: NFD ()" @@ -23,10 +23,10 @@ def cfgfile(tmp_path, suffix='.yaml'): - ":: Latin ()" - "[[:Punctuation:][:Space:]]+ > ' '" """) - content += "compound_suffixes:\n" - content += '\n'.join((" - " + s for s in suffixes)) + '\n' - content += "abbreviations:\n" - content += '\n'.join((" - " + s for s in abbr)) + '\n' + content += "variants:\n - words:\n" + content += '\n'.join((" - " + s for s in variants)) + '\n' + for k, v in kwargs.items(): + content += " {}: {}\n".format(k, v) fpath = tmp_path / ('test_config' + suffix) fpath.write_text(dedent(content)) return fpath @@ -39,18 +39,16 @@ def test_empty_rule_file(tmp_path): fpath.write_text(dedent("""\ normalization: transliteration: - compound_suffixes: - abbreviations: + variants: """)) rules = ICURuleLoader(fpath) assert rules.get_search_rules() == '' assert rules.get_normalization_rules() == '' assert rules.get_transliteration_rules() == '' - assert rules.get_replacement_pairs() == [] + assert list(rules.get_replacement_pairs()) == [] -CONFIG_SECTIONS = ('normalization', 'transliteration', - 'compound_suffixes', 'abbreviations') +CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants') @pytest.mark.parametrize("section", CONFIG_SECTIONS) def test_missing_normalization(tmp_path, section): @@ -63,29 +61,9 @@ def test_missing_normalization(tmp_path, section): with 
pytest.raises(UsageError): ICURuleLoader(fpath) -@pytest.mark.parametrize("abbr", ["simple", - "double => arrow => bad", - "bad = > arrow"]) -def test_bad_abbreviation_syntax(tmp_path, abbr): - fpath = tmp_path / ('test_config.yaml') - fpath.write_text(dedent("""\ - normalization: - transliteration: - compound_suffixes: - abbreviations: - - {} - """.format(abbr))) - - with pytest.raises(UsageError): - rules = ICURuleLoader(fpath) - def test_get_search_rules(cfgfile): - fpath = cfgfile(['strasse', 'straße', 'weg'], - ['strasse,straße => str', - 'prospekt => pr']) - - loader = ICURuleLoader(fpath) + loader = ICURuleLoader(cfgfile()) rules = loader.get_search_rules() trans = Transliterator.createFromRules("test", rules) @@ -100,10 +78,7 @@ def test_get_search_rules(cfgfile): def test_get_normalization_rules(cfgfile): - fpath = cfgfile(['strasse', 'straße', 'weg'], - ['strasse,straße => str']) - - loader = ICURuleLoader(fpath) + loader = ICURuleLoader(cfgfile()) rules = loader.get_normalization_rules() trans = Transliterator.createFromRules("test", rules) @@ -111,10 +86,7 @@ def test_get_normalization_rules(cfgfile): def test_get_transliteration_rules(cfgfile): - fpath = cfgfile(['strasse', 'straße', 'weg'], - ['strasse,straße => str']) - - loader = ICURuleLoader(fpath) + loader = ICURuleLoader(cfgfile()) rules = loader.get_transliteration_rules() trans = Transliterator.createFromRules("test", rules) @@ -128,8 +100,7 @@ def test_transliteration_rules_from_file(tmp_path): transliteration: - "'ax' > 'b'" - !include transliteration.yaml - compound_suffixes: - abbreviations: + variants: """)) transpath = tmp_path / ('transliteration.yaml') transpath.write_text('- "x > y"') @@ -141,53 +112,153 @@ def test_transliteration_rules_from_file(tmp_path): assert trans.transliterate(" axxt ") == " byt " -def test_get_replacement_pairs_multi_to(cfgfile): - fpath = cfgfile(['Pfad', 'Strasse'], - ['Strasse => str,st']) +class TestGetReplacements: - repl = 
ICURuleLoader(fpath).get_replacement_pairs() + @pytest.fixture(autouse=True) + def setup_cfg(self, cfgfile): + self.cfgfile = cfgfile - assert [(a, sorted(b)) for a, b in repl] == \ - [(' strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']), - ('strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']), - (' pfad ', [' pfad ', 'pfad ']), - ('pfad ', [' pfad ', 'pfad '])] + def get_replacements(self, *variants): + loader = ICURuleLoader(self.cfgfile(*variants)) + rules = loader.get_replacement_pairs() + return set((v.source, v.replacement) for v in rules) -def test_get_replacement_pairs_multi_from(cfgfile): - fpath = cfgfile([], ['saint,Sainte => st']) - repl = ICURuleLoader(fpath).get_replacement_pairs() + @pytest.mark.parametrize("variant", ['foo > bar', 'foo -> bar -> bar', + '~foo~ -> bar', 'fo~ o -> bar']) + def test_invalid_variant_description(self, variant): + with pytest.raises(UsageError): + ICURuleLoader(self.cfgfile(variant)) - assert [(a, sorted(b)) for a, b in repl] == \ - [(' sainte ', [' sainte ', ' st ']), - (' saint ', [' saint ', ' st '])] + def test_add_full(self): + repl = self.get_replacements("foo -> bar") + assert repl == {(' foo ', ' bar '), (' foo ', ' foo ')} -def test_get_replacement_pairs_cross_abbreviations(cfgfile): - fpath = cfgfile([], ['saint,Sainte => st', - 'sainte => ste']) - repl = ICURuleLoader(fpath).get_replacement_pairs() + def test_replace_full(self): + repl = self.get_replacements("foo => bar") - assert [(a, sorted(b)) for a, b in repl] == \ - [(' sainte ', [' sainte ', ' st ', ' ste ']), - (' saint ', [' saint ', ' st '])] + assert repl == {(' foo ', ' bar ')} -@pytest.mark.parametrize("abbr", ["missing to =>", - " => missing from", - "=>"]) -def test_bad_abbreviation_syntax(tmp_path, abbr): - fpath = tmp_path / ('test_config.yaml') - fpath.write_text(dedent("""\ - normalization: - transliteration: - compound_suffixes: - abbreviations: - - {} - """.format(abbr))) + def 
test_add_suffix_no_decompose(self): + repl = self.get_replacements("~berg |-> bg") + + assert repl == {('berg ', 'berg '), ('berg ', 'bg '), + (' berg ', ' berg '), (' berg ', ' bg ')} + + + def test_replace_suffix_no_decompose(self): + repl = self.get_replacements("~berg |=> bg") + + assert repl == {('berg ', 'bg '), (' berg ', ' bg ')} + + + def test_add_suffix_decompose(self): + repl = self.get_replacements("~berg -> bg") + + assert repl == {('berg ', 'berg '), ('berg ', ' berg '), + (' berg ', ' berg '), (' berg ', 'berg '), + ('berg ', 'bg '), ('berg ', ' bg '), + (' berg ', 'bg '), (' berg ', ' bg ')} + + + def test_replace_suffix_decompose(self): + repl = self.get_replacements("~berg => bg") + + assert repl == {('berg ', 'bg '), ('berg ', ' bg '), + (' berg ', 'bg '), (' berg ', ' bg ')} + + + def test_add_prefix_no_compose(self): + repl = self.get_replacements("hinter~ |-> hnt") + + assert repl == {(' hinter', ' hinter'), (' hinter ', ' hinter '), + (' hinter', ' hnt'), (' hinter ', ' hnt ')} + + + def test_replace_prefix_no_compose(self): + repl = self.get_replacements("hinter~ |=> hnt") + + assert repl == {(' hinter', ' hnt'), (' hinter ', ' hnt ')} + + + def test_add_prefix_compose(self): + repl = self.get_replacements("hinter~-> h") + + assert repl == {(' hinter', ' hinter'), (' hinter', ' hinter '), + (' hinter', ' h'), (' hinter', ' h '), + (' hinter ', ' hinter '), (' hinter ', ' hinter'), + (' hinter ', ' h '), (' hinter ', ' h')} + + + def test_replace_prefix_compose(self): + repl = self.get_replacements("hinter~=> h") + + assert repl == {(' hinter', ' h'), (' hinter', ' h '), + (' hinter ', ' h '), (' hinter ', ' h')} + + + def test_add_beginning_only(self): + repl = self.get_replacements("^Premier -> Pr") + + assert repl == {('^ premier ', '^ premier '), ('^ premier ', '^ pr ')} + + + def test_replace_beginning_only(self): + repl = self.get_replacements("^Premier => Pr") + + assert repl == {('^ premier ', '^ pr ')} + + + def 
test_add_final_only(self): + repl = self.get_replacements("road$ -> rd") + + assert repl == {(' road ^', ' road ^'), (' road ^', ' rd ^')} + + + def test_replace_final_only(self): + repl = self.get_replacements("road$ => rd") + + assert repl == {(' road ^', ' rd ^')} + + + def test_decompose_only(self): + repl = self.get_replacements("~foo -> foo") + + assert repl == {('foo ', 'foo '), ('foo ', ' foo '), + (' foo ', 'foo '), (' foo ', ' foo ')} + + + def test_add_suffix_decompose_end_only(self): + repl = self.get_replacements("~berg |-> bg", "~berg$ -> bg") + + assert repl == {('berg ', 'berg '), ('berg ', 'bg '), + (' berg ', ' berg '), (' berg ', ' bg '), + ('berg ^', 'berg ^'), ('berg ^', ' berg ^'), + ('berg ^', 'bg ^'), ('berg ^', ' bg ^'), + (' berg ^', 'berg ^'), (' berg ^', 'bg ^'), + (' berg ^', ' berg ^'), (' berg ^', ' bg ^')} + + + def test_replace_suffix_decompose_end_only(self): + repl = self.get_replacements("~berg |=> bg", "~berg$ => bg") + + assert repl == {('berg ', 'bg '), (' berg ', ' bg '), + ('berg ^', 'bg ^'), ('berg ^', ' bg ^'), + (' berg ^', 'bg ^'), (' berg ^', ' bg ^')} + - repl = ICURuleLoader(fpath).get_replacement_pairs() + def test_add_multiple_suffix(self): + repl = self.get_replacements("~berg,~burg -> bg") - assert repl == [] + assert repl == {('berg ', 'berg '), ('berg ', ' berg '), + (' berg ', ' berg '), (' berg ', 'berg '), + ('berg ', 'bg '), ('berg ', ' bg '), + (' berg ', 'bg '), (' berg ', ' bg '), + ('burg ', 'burg '), ('burg ', ' burg '), + (' burg ', ' burg '), (' burg ', 'burg '), + ('burg ', 'bg '), ('burg ', ' bg '), + (' burg ', 'bg '), (' burg ', ' bg ')} diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_legacy_icu.py index b86925ee..56c08e5a 100644 --- a/test/python/test_tokenizer_legacy_icu.py +++ b/test/python/test_tokenizer_legacy_icu.py @@ -60,13 +60,12 @@ def analyzer(tokenizer_factory, test_config, monkeypatch, monkeypatch.undo() def 
_mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',), - suffixes=('gasse', ), abbr=('street => st', )): + variants=('~gasse -> gasse', 'street => st', )): cfgfile = tmp_path / 'analyser_test_config.yaml' with cfgfile.open('w') as stream: cfgstr = {'normalization' : list(norm), 'transliteration' : list(trans), - 'compound_suffixes' : list(suffixes), - 'abbreviations' : list(abbr)} + 'variants' : [ {'words': list(variants)}]} yaml.dump(cfgstr, stream) tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgfile)) -- 2.43.2