From: Sarah Hoffmann Date: Sun, 2 May 2021 15:52:45 +0000 (+0200) Subject: add Python part for new ICU-based tokenizer X-Git-Tag: v4.0.0~91^2~6 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/f44af49df9b86c9a35db000f7a42e65a6c4307ba add Python part for new ICU-based tokenizer --- diff --git a/.pylintrc b/.pylintrc index 65f97b14..756bba19 100644 --- a/.pylintrc +++ b/.pylintrc @@ -10,4 +10,4 @@ ignored-modules=icu # closing added here because it sometimes triggers a false positive with # 'with' statements. ignored-classes=NominatimArgs,closing -disable=too-few-public-methods +disable=too-few-public-methods,duplicate-code diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql new file mode 100644 index 00000000..8fd0ede4 --- /dev/null +++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql @@ -0,0 +1,134 @@ +-- Get tokens used for searching the given place. +-- +-- These are the tokens that will be saved in the search_name table. +CREATE OR REPLACE FUNCTION token_get_name_search_tokens(info JSONB) + RETURNS INTEGER[] +AS $$ + SELECT (info->>'names')::INTEGER[] +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +-- Get tokens for matching the place name against others. +-- +-- This should usually be restricted to full name tokens. +CREATE OR REPLACE FUNCTION token_get_name_match_tokens(info JSONB) + RETURNS INTEGER[] +AS $$ + SELECT (info->>'names')::INTEGER[] +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +-- Return the housenumber tokens applicable for the place. +CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB) + RETURNS INTEGER[] +AS $$ + SELECT (info->>'hnr_tokens')::INTEGER[] +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +-- Return the housenumber in the form that it can be matched during search. +CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB) + RETURNS TEXT +AS $$ + SELECT info->>'hnr'; +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB) + RETURNS INTEGER[] +AS $$ + SELECT (info->>'street')::INTEGER[] +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB) + RETURNS INTEGER[] +AS $$ + SELECT (info->>'place_match')::INTEGER[] +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB) + RETURNS INTEGER[] +AS $$ + SELECT (info->>'place_search')::INTEGER[] +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +DROP TYPE IF EXISTS token_addresstoken CASCADE; +CREATE TYPE token_addresstoken AS ( + key TEXT, + match_tokens INT[], + search_tokens INT[] +); + +CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB) + RETURNS SETOF token_addresstoken +AS $$ + SELECT key, (value->>1)::int[] as match_tokens, + (value->>0)::int[] as search_tokens + FROM jsonb_each(info->'addr'); +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT) + RETURNS TEXT +AS $$ + SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END; +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +-- Return token info that should be saved permanently in the database. +CREATE OR REPLACE FUNCTION token_strip_info(info JSONB) + RETURNS JSONB +AS $$ + SELECT NULL::JSONB; +$$ LANGUAGE SQL IMMUTABLE STRICT; + +--------------- private functions ---------------------------------------------- + +CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT) + RETURNS INTEGER + AS $$ +DECLARE + return_id INTEGER; + term_count INTEGER; +BEGIN + SELECT min(word_id), max(search_name_count) INTO return_id, term_count + FROM word WHERE word_token = lookup_term and class is null and type is null; + + IF return_id IS NULL THEN + return_id := nextval('seq_word'); + INSERT INTO word (word_id, word_token, search_name_count) + VALUES (return_id, lookup_term, 0); + ELSEIF left(lookup_term, 1) = ' ' and term_count > {{ max_word_freq }} THEN + return_id := 0; + END IF; + + RETURN return_id; +END; +$$ +LANGUAGE plpgsql; + + +CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT) + RETURNS INTEGER + AS $$ +DECLARE + return_id INTEGER; +BEGIN + SELECT min(word_id) INTO return_id + FROM word + WHERE word_token = ' ' || lookup_term + and class = 'place' and type = 'house'; + + IF return_id IS NULL THEN + return_id := nextval('seq_word'); + INSERT INTO word (word_id, word_token, class, type, search_name_count) + VALUES (return_id, ' ' || lookup_term, 'place', 'house', 0); + END IF; + + RETURN return_id; +END; +$$ +LANGUAGE plpgsql; diff --git a/lib-sql/tokenizer/legacy_tokenizer_tables.sql b/lib-sql/tokenizer/legacy_tokenizer_tables.sql index 3410b763..937eaaa2 100644 --- a/lib-sql/tokenizer/legacy_tokenizer_tables.sql +++ b/lib-sql/tokenizer/legacy_tokenizer_tables.sql @@ -12,6 +12,8 @@ CREATE TABLE word ( CREATE INDEX idx_word_word_token ON word USING BTREE (word_token) {{db.tablespace.search_index}}; +CREATE INDEX idx_word_word ON word + USING BTREE (word) {{db.tablespace.search_index}} WHERE word is not null; GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}"; DROP SEQUENCE IF EXISTS seq_word; diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py new file mode 100644 index 00000000..09d4059e --- /dev/null +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -0,0 +1,594 @@ +""" +Tokenizer implementing normalisation as used before Nominatim 4 but using +libICU instead of the PostgreSQL module. +""" +from collections import Counter +import io +import itertools +import json +import logging +import re +from textwrap import dedent +from pathlib import Path + +from icu import Transliterator +import psycopg2.extras + +from nominatim.db.connection import connect +from nominatim.db.properties import set_property, get_property +from nominatim.db.sql_preprocessor import SQLPreprocessor + +DBCFG_NORMALIZATION = "tokenizer_normalization" +DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq" +DBCFG_TRANSLITERATION = "tokenizer_transliteration" +DBCFG_ABBREVIATIONS = "tokenizer_abbreviations" + +LOG = logging.getLogger() + +def create(dsn, data_dir): + """ Create a new instance of the tokenizer provided by this module. + """ + return LegacyICUTokenizer(dsn, data_dir) + + +class LegacyICUTokenizer: + """ This tokenizer uses libICU to covert names and queries to ASCII. + Otherwise it uses the same algorithms and data structures as the + normalization routines in Nominatm 3. + """ + + def __init__(self, dsn, data_dir): + self.dsn = dsn + self.data_dir = data_dir + self.normalization = None + self.transliteration = None + self.abbreviations = None + + + def init_new_db(self, config, init_db=True): + """ Set up a new tokenizer for the database. + + This copies all necessary data in the project directory to make + sure the tokenizer remains stable even over updates. + """ + if config.TOKENIZER_CONFIG: + cfgfile = Path(config.TOKENIZER_CONFIG) + else: + cfgfile = config.config_dir / 'legacy_icu_tokenizer.json' + + rules = json.loads(cfgfile.read_text()) + self.transliteration = ';'.join(rules['normalization']) + ';' + self.abbreviations = rules["abbreviations"] + self.normalization = config.TERM_NORMALIZATION + + self._install_php(config) + self._save_config(config) + + if init_db: + self.update_sql_functions(config) + self._init_db_tables(config) + + + def init_from_project(self): + """ Initialise the tokenizer from the project directory. + """ + with connect(self.dsn) as conn: + self.normalization = get_property(conn, DBCFG_NORMALIZATION) + self.transliteration = get_property(conn, DBCFG_TRANSLITERATION) + self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS)) + + + def finalize_import(self, config): + """ Do any required postprocessing to make the tokenizer data ready + for use. + """ + with connect(self.dsn) as conn: + sqlp = SQLPreprocessor(conn, config) + sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql') + + + def update_sql_functions(self, config): + """ Reimport the SQL functions for this tokenizer. + """ + with connect(self.dsn) as conn: + max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ) + sqlp = SQLPreprocessor(conn, config) + sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql', + max_word_freq=max_word_freq) + + + def check_database(self): + """ Check that the tokenizer is set up correctly. + """ + self.init_from_project() + + if self.normalization is None\ + or self.transliteration is None\ + or self.abbreviations is None: + return "Configuration for tokenizer 'legacy_icu' are missing." + + return None + + + def name_analyzer(self): + """ Create a new analyzer for tokenizing names and queries + using this tokinzer. Analyzers are context managers and should + be used accordingly: + + ``` + with tokenizer.name_analyzer() as analyzer: + analyser.tokenize() + ``` + + When used outside the with construct, the caller must ensure to + call the close() function before destructing the analyzer. + + Analyzers are not thread-safe. You need to instantiate one per thread. + """ + norm = Transliterator.createFromRules("normalizer", self.normalization) + trans = Transliterator.createFromRules("normalizer", self.transliteration) + return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations) + + + def _install_php(self, config): + """ Install the php script for the tokenizer. + """ + php_file = self.data_dir / "tokenizer.php" + php_file.write_text(dedent("""\ + 1 or ',' in hnrs[0] or ';' in hnrs[0]: + # split numbers if necessary + simple_list = [] + for hnr in hnrs: + simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr))) + + if len(simple_list) > 1: + hnrs = list(set(simple_list)) + else: + hnrs = simple_list + + return hnrs + + + + +class _TokenInfo: + """ Collect token information to be sent back to the database. + """ + def __init__(self, cache): + self.cache = cache + self.data = {} + + @staticmethod + def _mk_array(tokens): + return '{%s}' % ','.join((str(s) for s in tokens)) + + + def add_names(self, conn, names): + """ Adds token information for the normalised names. + """ + # Start with all partial names + terms = set((part for ns in names for part in ns.split())) + # Add partials for the full terms (TO BE REMOVED) + terms.update((n for n in names)) + # Add the full names + terms.update((' ' + n for n in names)) + + self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms)) + + + def add_housenumbers(self, conn, hnrs): + """ Extract housenumber information from a list of normalised + housenumbers. + """ + self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs)) + self.data['hnr'] = ';'.join(hnrs) + + + def add_street(self, conn, street): + """ Add addr:street match terms. + """ + if not street: + return + + term = ' ' + street + + tid = self.cache.names.get(term) + + if tid is None: + with conn.cursor() as cur: + cur.execute("""SELECT word_id FROM word + WHERE word_token = %s + and class is null and type is null""", + (term, )) + if cur.rowcount > 0: + tid = cur.fetchone()[0] + self.cache.names[term] = tid + + if tid is not None: + self.data['street'] = '{%d}' % tid + + + def add_place(self, conn, place): + """ Add addr:place search and match terms. + """ + if not place: + return + + partial_ids = self.cache.get_term_tokens(conn, place.split()) + tid = self.cache.get_term_tokens(conn, [' ' + place]) + + self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid)) + self.data['place_match'] = '{%s}' % tid[0] + + + def add_address_terms(self, conn, terms): + """ Add additional address terms. + """ + tokens = {} + + for key, value in terms: + if not value: + continue + partial_ids = self.cache.get_term_tokens(conn, value.split()) + term = ' ' + value + tid = self.cache.names.get(term) + + if tid is None: + with conn.cursor() as cur: + cur.execute("""SELECT word_id FROM word + WHERE word_token = %s + and class is null and type is null""", + (term, )) + if cur.rowcount > 0: + tid = cur.fetchone()[0] + self.cache.names[term] = tid + + tokens[key] = [self._mk_array(partial_ids), + '{%s}' % ('' if tid is None else str(tid))] + + if tokens: + self.data['addr'] = tokens + + +class _TokenCache: + """ Cache for token information to avoid repeated database queries. + + This cache is not thread-safe and needs to be instantiated per + analyzer. + """ + def __init__(self): + self.names = {} + self.postcodes = set() + self.housenumbers = {} + + + def get_term_tokens(self, conn, terms): + """ Get token ids for a list of terms, looking them up in the database + if necessary. + """ + tokens = [] + askdb = [] + + for term in terms: + token = self.names.get(term) + if token is None: + askdb.append(term) + elif token != 0: + tokens.append(token) + + if askdb: + with conn.cursor() as cur: + cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term", + (askdb, )) + for term, tid in cur: + self.names[term] = tid + if tid != 0: + tokens.append(tid) + + return tokens + + + def get_hnr_tokens(self, conn, terms): + """ Get token ids for a list of housenumbers, looking them up in the + database if necessary. + """ + tokens = [] + askdb = [] + + for term in terms: + token = self.housenumbers.get(term) + if token is None: + askdb.append(term) + else: + tokens.append(token) + + if askdb: + with conn.cursor() as cur: + cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr", + (askdb, )) + for term, tid in cur: + self.housenumbers[term] = tid + tokens.append(tid) + + return tokens diff --git a/settings/env.defaults b/settings/env.defaults index 5fbeb0a2..cf1f5108 100644 --- a/settings/env.defaults +++ b/settings/env.defaults @@ -46,6 +46,12 @@ NOMINATIM_LANGUAGES= # Changing this value requires a reimport. NOMINATIM_TERM_NORMALIZATION=":: NFD (); [[:Nonspacing Mark:] [:Cf:]] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();" +# Configuration file for the tokenizer. +# The content depends on the tokenizer used. If left empty the default settings +# for the chooseen tokenizer will be used. The configuration can only be set +# on import and not be changed afterwards. +NOMINATIM_TOKENIZER_CONFIG= + # Search in the Tiger house number data for the US. # Note: The tables must already exist or queries will throw errors. # Changing this value requires to run ./utils/setup --create-functions --setup-website. diff --git a/settings/legacy_icu_tokenizer.json b/settings/legacy_icu_tokenizer.json new file mode 100644 index 00000000..faf7be34 --- /dev/null +++ b/settings/legacy_icu_tokenizer.json @@ -0,0 +1,889 @@ +{ "normalization": [ ":: Latin ()", + ":: Ascii ()", + ":: NFD ()", + "'' >", + "[[:Nonspacing Mark:] [:Cf:]] >", + "[^[:Ascii:]] >", + ":: lower ()", + "[[:Punctuation:][:Space:]]+ > ' '", + ":: NFC ()" + ], + "abbreviations": [ + [" national wildlife refuge area ", " nwra "], + [" national recreation area ", " nra "], + [" air national guard base ", " angb "], + [" zhilishchien komplieks ", " zh k "], + [" trung tam thuong mdhi ", " tttm "], + [" poligono industrial ", " pgind "], + [" trung hoc pho thong ", " thpt "], + [" onze lieve vrouw e ", " olv "], + [" strada provinciale ", " sp "], + ["onze lieve vrouw e ", " olv "], + [" punto kilometrico ", " pk "], + [" cong vien van hoa ", " cvvh "], + [" can cu khong quan ", " cckq "], + ["strada provinciale ", " sp "], + [" strada regionale ", " sr "], + [" strada comunale ", " sc "], + ["strada regionale ", " sr "], + [" trung hoc co so ", " thcs "], + [" san bay quoc te ", " sbqt "], + [" cong ty co phyn ", " ctcp "], + [" khu cong nghiep ", " kcn "], + [" air force base ", " afb "], + [" strada statale ", " ss "], + [" vien bcyo tang ", " vbt "], + ["strada comunale ", " sc "], + [" circunvalacion ", " ccvcn "], + [" paseo maritimo ", " psmar "], + [" wielkopolskie ", " wlkp "], + [" national park ", " np "], + [" middle school ", " ms "], + [" international ", " intl "], + [" burgermeister ", " bgm "], + [" vuon quoc gia ", " vqg "], + [" qucyng truong ", " qt "], + ["strada statale ", " ss "], + [" state highway ", " sh "], + ["burgermeister ", " bgm "], + [" right of way ", " rowy "], + [" hauptbahnhof ", " hbf "], + [" apartamentos ", " aptos "], + [" wielkopolski ", " wlkp "], + [" burgemeester ", " bg "], + [" camino nuevo ", " c n "], + [" camino hondo ", " c h "], + [" urbanizacion ", " urb "], + [" camino viejo ", " c v "], + [" wielkopolska ", " wlkp "], + [" wojewodztwie ", " woj "], + [" county route ", " cr "], + [" prolongacion ", " prol "], + [" thoroughfare ", " thor "], + [" san van dong ", " svd "], + [" tong cong ty ", " tct "], + [" khu nghi mat ", " knm "], + [" nha thi dzu ", " ntd "], + [" khu du lich ", " kdl "], + [" demarcacion ", " demar "], + [" cau ldhc bo ", " clb "], + [" interchange ", " intg "], + [" distributor ", " dstr "], + [" state route ", " sr "], + [" wojewodztwo ", " woj "], + [" reservation ", " res "], + [" monseigneur ", " mgr "], + [" transversal ", " trval "], + [" extrarradio ", " extrr "], + [" high school ", " hs "], + [" mazowieckie ", " maz "], + [" residencial ", " resid "], + [" cong truong ", " ct "], + [" cooperativa ", " coop "], + [" diseminado ", " disem "], + [" barranquil ", " bqllo "], + [" fire track ", " ftrk "], + [" south east ", " se "], + [" north east ", " ne "], + [" university ", " univ "], + [" south west ", " sw "], + [" monasterio ", " mtrio "], + [" vecindario ", " vecin "], + [" carreterin ", " ctrin "], + [" callejuela ", " cjla "], + [" north-east ", " ne "], + [" south-west ", " sw "], + [" gebroeders ", " gebr "], + [" serviceway ", " swy "], + [" quadrangle ", " qdgl "], + [" commandant ", " cmdt "], + [" extramuros ", " extrm "], + [" escalinata ", " escal "], + [" north-west ", " n "], + [" bulevardul ", " bd "], + [" particular ", " parti "], + [" mazowiecka ", " maz "], + [" mazowiecki ", " maz "], + [" north west ", " n "], + [" industrial ", " ind "], + [" costanilla ", " cstan "], + [" khach sdhn ", " ks "], + [" south-east ", " se "], + [" phi truong ", " pt "], + [" expressway ", " exp "], + [" fondamenta ", " f ta "], + [" apartments ", " apts "], + [" cul de sac ", " cds "], + [" corralillo ", " crrlo "], + [" mitropolit ", " mit "], + [" etorbidea ", " etorb "], + [" ploshchad ", " pl "], + [" cobertizo ", " cbtiz "], + [" underpass ", " upas "], + [" crossroad ", " crd "], + [" fundatura ", " fnd "], + [" foreshore ", " fshr "], + [" parklands ", " pkld "], + [" esplanade ", " esp "], + [" centreway ", " cnwy "], + [" formation ", " form "], + [" explanada ", " expla "], + [" viviendas ", " vvdas "], + [" northeast ", " ne "], + [" cong vien ", " cv "], + [" northwest ", " n "], + [" buildings ", " bldgs "], + [" errepidea ", " err "], + [" extension ", " ex "], + [" municipal ", " mun "], + [" southeast ", " se "], + [" sanatorio ", " sanat "], + [" thanh pho ", " tp "], + [" firetrail ", " fit "], + [" santuario ", " santu "], + [" southwest ", " sw "], + [" autopista ", " auto "], + [" president ", " pres "], + [" rinconada ", " rcda "], + [" kardinaal ", " kard "], + [" plazoleta ", " pzta "], + [" duong sat ", " ds "], + [" trung tam ", " tt "], + [" piazzetta ", " pta "], + [" boardwalk ", " bwlk "], + [" bulievard ", " bd "], + [" luitenant ", " luit "], + [" courtyard ", " ctyd "], + [" reservoir ", " res "], + [" bulevardu ", " bd "], + [" community ", " comm "], + [" concourse ", " con "], + [" profiesor ", " prof "], + [" promenade ", " prom "], + [" gienieral ", " ghien "], + [" puistikko ", " pko "], + [" balneario ", " balnr "], + [" carretera ", " ctra "], + [" ingenieur ", " ir "], + [" boulevard ", " bd "], + [" deviation ", " devn "], + [" hipodromo ", " hipod "], + [" professor ", " prof "], + [" triangle ", " tri "], + [" dotsient ", " dots "], + [" boundary ", " bdy "], + [" salizada ", " s da "], + [" trunkway ", " tkwy "], + [" cinturon ", " cint "], + ["president ", " pres "], + [" military ", " mil "], + [" jonkheer ", " jhr "], + [" motorway ", " mwy "], + [" steenweg ", " stwg "], + [" crescent ", " cr "], + [" kanunnik ", " kan "], + [" koningin ", " kon "], + [" crossing ", " xing "], + [" callejon ", " cjon "], + [" pasadizo ", " pzo "], + [" crossway ", " cowy "], + [" cottages ", " cotts "], + [" mountain ", " mtn "], + [" business ", " bus "], + [" pierwszy ", " 1 "], + [" pierwsza ", " 1 "], + [" pierwsze ", " 1 "], + [" barriada ", " barda "], + [" entrance ", " ent "], + [" causeway ", " cway "], + [" generaal ", " gen "], + [" driveway ", " dvwy "], + [" township ", " twp "], + [" stazione ", " staz "], + [" broadway ", " bway "], + [" alleyway ", " alwy "], + [" quadrant ", " qdrt "], + [" apeadero ", " apdro "], + [" arboleda ", " arb "], + [" escalera ", " esca "], + [" rdhp hat ", " rh "], + [" transito ", " trans "], + [" ddhi hoc ", " dh "], + [" travesia ", " trva "], + [" barranco ", " branc "], + [" namestie ", " nam "], + [" viaducto ", " vcto "], + [" convento ", " cnvto "], + [" estacion ", " estcn "], + ["puistikko ", " pko "], + [" precinct ", " pct "], + [" heiligen ", " hl "], + [" edificio ", " edifc "], + [" prazuela ", " przla "], + [" thi trzn ", " tt "], + [" ridgeway ", " rgwy "], + [" riverway ", " rvwy "], + [" corredor ", " crrdo "], + [" passatge ", " ptge "], + [" junction ", " jnc "], + [" hospital ", " hosp "], + [" highroad ", " hrd "], + [" torrente ", " trrnt "], + [" avinguda ", " av "], + [" portillo ", " ptilo "], + [" diagonal ", " diag "], + [" buu dien ", " bd "], + [" alqueria ", " alque "], + [" poligono ", " polig "], + [" roadside ", " rdsd "], + [" glorieta ", " gta "], + [" fundacul ", " fdc "], + [" cao dang ", " cd "], + [" rosebowl ", " rsbl "], + [" complejo ", " compj "], + [" carretil ", " crtil "], + [" intrarea ", " int "], + [" gran via ", " g v "], + [" approach ", " app "], + [" stradela ", " sdla "], + [" conjunto ", " cjto "], + [" arterial ", " artl "], + [" plazuela ", " plzla "], + [" frontage ", " frtg "], + [" faubourg ", " fg "], + [" mansions ", " mans "], + [" turnpike ", " tpk "], + [" piazzale ", " p le "], + [" tieu hoc ", " th "], + [" bulevard ", " bd "], + [" sendera ", " sedra "], + [" cutting ", " cutt "], + [" cantina ", " canti "], + [" cantera ", " cantr "], + [" rotonda ", " rtda "], + [" pasillo ", " psllo "], + [" landing ", " ldg "], + [" kolonel ", " kol "], + [" cong ty ", " cty "], + [" fairway ", " fawy "], + [" highway ", " hwy "], + [" lookout ", " lkt "], + [" meander ", " mr "], + [" carrera ", " cra "], + [" station ", " stn "], + [" kapitan ", " kap "], + [" medical ", " med "], + [" broeder ", " br "], + [" poblado ", " pbdo "], + [" impasse ", " imp "], + [" gardens ", " gdn "], + [" nha tho ", " nt "], + [" nha hat ", " nh "], + [" freeway ", " fwy "], + [" trasera ", " tras "], + [" portico ", " prtco "], + [" terrace ", " ter "], + [" heights ", " hts "], + [" camping ", " campg "], + [" callizo ", " cllzo "], + [" footway ", " ftwy "], + [" calzada ", " czada "], + [" dominee ", " ds "], + [" meadows ", " mdws "], + [" sendero ", " send "], + [" osiedle ", " os "], + [" estrada ", " estda "], + [" avenida ", " av "], + [" zgornji ", " zg "], + [" zgornje ", " zg "], + [" zgornja ", " zg "], + [" arrabal ", " arral "], + [" espalda ", " eslda "], + [" entrada ", " entd "], + [" kleiner ", " kl "], + [" kleines ", " kl "], + [" viaduct ", " via "], + [" roadway ", " rdwy "], + [" strasse ", " st "], + [" spodnje ", " sp "], + [" spodnji ", " sp "], + [" spodnja ", " sp "], + [" fabrica ", " fca "], + [" muntele ", " mt "], + [" maantee ", " mt "], + [" srednje ", " sr "], + [" unterer ", " u "], + [" unteres ", " u "], + [" plateau ", " plat "], + [" srednji ", " sr "], + [" empresa ", " empr "], + [" angosta ", " angta "], + [" costera ", " coste "], + [" tinh lo ", " tl "], + [" quoc lo ", " ql "], + [" auf der ", " a d "], + [" bulvari ", " bl "], + [" ddhi lo ", " dl "], + [" namesti ", " nam "], + [" passeig ", " pg "], + [" carrero ", " cro "], + [" cortijo ", " crtjo "], + [" san bay ", " sb "], + [" riviera ", " rvra "], + [" caddesi ", " cd "], + [" andador ", " andad "], + [" walkway ", " wkwy "], + [" granden ", " gr "], + [" grosser ", " gr "], + [" grosses ", " gr "], + [" reserve ", " res "], + [" alameda ", " alam "], + [" retreat ", " rtt "], + [" acequia ", " aceq "], + [" platsen ", " pl "], + [" bahnhof ", " bf "], + [" autovia ", " autov "], + [" srednja ", " sr "], + [" galeria ", " gale "], + [" circuit ", " cct "], + [" svingen ", " sv "], + [" plassen ", " pl "], + [" mirador ", " mrdor "], + [" laneway ", " lnwy "], + [" kolonia ", " kol "], + [" outlook ", " otlk "], + [" caravan ", " cvn "], + [" osiedlu ", " os "], + [" palacio ", " palac "], + [" pantano ", " pant "], + [" partida ", " ptda "], + [" calleja ", " cllja "], + [" mevrouw ", " mevr "], + [" meester ", " mr "], + [" pastoor ", " past "], + [" prinses ", " pr "], + [" bulevar ", " bd "], + [" tollway ", " tlwy "], + ["steenweg ", " stwg "], + [" caserio ", " csrio "], + [" mercado ", " merc "], + [" alejach ", " al "], + [" kvartal ", " kv "], + [" parkway ", " pwy "], + [" passage ", " ps "], + [" pathway ", " pway "], + [" splaiul ", " sp "], + [" soseaua ", " sos "], + [" colonia ", " col "], + [" wielkie ", " wlk "], + [" trzecie ", " 3 "], + [" llanura ", " llnra "], + [" malecon ", " malec "], + [" trzecia ", " 3 "], + [" trailer ", " trlr "], + [" cuadra ", " cuadr "], + [" cty cp ", " ctcp "], + [" paraje ", " praje "], + [" parque ", " pque "], + [" piazza ", " p za "], + [" puerta ", " pta "], + [" little ", " lt "], + [" pueblo ", " pblo "], + [" puente ", " pnte "], + [" jardin ", " jdin "], + [" granja ", " granj "], + [" market ", " mkt "], + [" pasaje ", " psaje "], + [" rotary ", " rty "], + [" corral ", " crral "], + [" siding ", " sdng "], + [" nucleo ", " ncleo "], + [" muelle ", " muell "], + [" carril ", " crril "], + [" portal ", " prtal "], + [" ramble ", " rmbl "], + [" pocket ", " pkt "], + [" chalet ", " chlet "], + [" canton ", " cant "], + [" ladera ", " ldera "], + [" parade ", " pde "], + [" dehesa ", " dhsa "], + [" museum ", " mus "], + [" middle ", " mid "], + [" cuesta ", " custa "], + [" gracht ", " gr "], + [" virful ", " vf "], + [" m tele ", " mt "], + [" varful ", " vf "], + [" str la ", " sdla "], + [" arcade ", " arc "], + [" strada ", " st "], + [" access ", " accs "], + [" bajada ", " bjada "], + [" veliki ", " v "], + ["strasse ", " st "], + [" velike ", " v "], + [" untere ", " u "], + [" velika ", " v "], + [" artery ", " arty "], + [" avenue ", " av "], + [" miasto ", " m "], + [" bypass ", " byp "], + [" placem ", " pl "], + [" barrio ", " bo "], + [" center ", " ctr "], + [" bldngs ", " bldgs "], + [" puerto ", " pto "], + [" wielka ", " wlk "], + [" tunnel ", " tun "], + [" wielki ", " wlk "], + [" bridge ", " bri "], + [" trzeci ", " 3 "], + [" veliko ", " v "], + [" quelle ", " qu "], + [" acceso ", " acces "], + [" bulvar ", " bl "], + [" sokagi ", " sk "], + ["platsen ", " pl "], + [" stigen ", " st "], + [" brucke ", " br "], + [" an der ", " a d "], + [" thi xa ", " tx "], + [" nordre ", " ndr "], + [" rambla ", " rbla "], + [" sondre ", " sdr "], + ["quoc lo ", " ql "], + [" phuong ", " p "], + [" vastra ", " v "], + [" carrer ", " c "], + [" oberes ", " o "], + [" raitti ", " r "], + [" puisto ", " ps "], + [" arroyo ", " arry "], + [" penger ", " pgr "], + [" oberer ", " o "], + [" kleine ", " kl "], + [" grosse ", " gr "], + ["granden ", " gr "], + [" villas ", " vlls "], + [" taival ", " tvl "], + [" in der ", " i d "], + [" centre ", " ctr "], + [" drugie ", " 2 "], + [" dokter ", " dr "], + [" grange ", " gra "], + [" doctor ", " dr "], + [" vicolo ", " v lo "], + [" kort e ", " k "], + [" koning ", " kon "], + [" straat ", " st "], + [" svieti ", " sv "], + [" callej ", " cjon "], + [" ground ", " grnd "], + [" vereda ", " vreda "], + [" chemin ", " ch "], + [" street ", " st "], + [" strand ", " st "], + [" sainte ", " ste "], + [" camino ", " cno "], + [" garden ", " gdn "], + [" follow ", " folw "], + [" estate ", " est "], + [" doktor ", " d r "], + [" subway ", " sbwy "], + [" ulitsa ", " ul "], + [" square ", " sq "], + [" towers ", " twrs "], + ["plassen ", " pl "], + [" county ", " co "], + [" brazal ", " brzal "], + [" circus ", " crcs "], + ["svingen ", " sv "], + [" rampla ", " rampa "], + [" bloque ", " blque "], + [" circle ", " cir "], + [" island ", " is "], + [" common ", " comm "], + [" ribera ", " rbra "], + [" sector ", " sect "], + [" rincon ", " rcon "], + [" van de ", " vd "], + [" corner ", " cnr "], + [" subida ", " sbida "], + [" banda ", " b "], + [" bulev ", " bd "], + [" barro ", " bo "], + [" cllon ", " cjon "], + [" p zza ", " p za "], + [" drugi ", " 2 "], + [" druga ", " 2 "], + [" placu ", " pl "], + [" aleji ", " al "], + [" aleja ", " al "], + [" aleje ", " al "], + [" stary ", " st "], + [" stara ", " st "], + [" dolny ", " dln "], + [" dolna ", " dln "], + [" gorne ", " gn "], + [" gorna ", " gn "], + [" stare ", " st "], + [" gorny ", " gn "], + [" ulicy ", " ul "], + [" ulica ", " ul "], + [" o l v ", " olv "], + [" plein ", " pln "], + [" markt ", " mkt "], + [" lange ", " l "], + [" viale ", " v le "], + ["gracht ", " gr "], + [" prins ", " pr "], + ["straat ", " st "], + [" plass ", " pl "], + [" sving ", " sv "], + [" gaten ", " g "], + [" veien ", " v "], + [" vliet ", " vlt "], + [" dolne ", " dln "], + [" b dul ", " bd "], + [" sodra ", " s "], + [" norra ", " n "], + [" gamla ", " gla "], + [" grand ", " gr "], + [" vagen ", " v "], + [" gatan ", " g "], + [" ostra ", " o "], + ["vastra ", " v "], + [" cadde ", " cd "], + [" duong ", " d "], + [" sokak ", " sk "], + [" plats ", " pl "], + ["stigen ", " st "], + [" vayla ", " vla "], + ["taival ", " tvl "], + [" sveti ", " sv "], + [" aukio ", " auk "], + [" sveta ", " sv "], + [" cesta ", " c "], + [" piata ", " pta "], + [" aleea ", " al "], + [" kaari ", " kri "], + ["penger ", " pgr "], + [" ranta ", " rt "], + [" rinne ", " rn "], + ["raitti ", " r "], + ["puisto ", " ps "], + [" polku ", " p "], + [" porta ", " pta "], + [" ponte ", " p te "], + [" paseo ", " po "], + [" fbrca ", " fca "], + [" allee ", " al "], + [" cours ", " crs "], + ["sainte ", " ste "], + ["square ", " sq "], + [" largo ", " l go "], + [" wharf ", " whrf "], + [" corte ", " c te "], + [" corso ", " c so "], + [" campo ", " c po "], + [" santa ", " sta "], + [" calle ", " c "], + [" strip ", " strp "], + [" alley ", " al "], + [" north ", " n "], + [" block ", " blk "], + [" gully ", " gly "], + [" sielo ", " s "], + [" brace ", " br "], + [" ronde ", " rnde "], + [" grove ", " gr "], + [" break ", " brk "], + [" roads ", " rds "], + [" track ", " trk "], + [" house ", " ho "], + [" trail ", " trl "], + [" mount ", " mt "], + [" cross ", " crss "], + [" beach ", " bch "], + [" point ", " pt "], + [" basin ", " basn "], + [" green ", " gn "], + [" plaza ", " pl "], + [" lille ", " ll "], + [" slope ", " slpe "], + [" placa ", " pl "], + [" place ", " pl "], + [" shunt ", " shun "], + [" saint ", " st "], + [" ulice ", " ul "], + [" amble ", " ambl "], + [" route ", " rt "], + [" sound ", " snd "], + [" store ", " st "], + [" front ", " frnt "], + [" elbow ", " elb "], + [" glade ", " gl "], + [" south ", " s "], + [" round ", " rnd "], + [" drive ", " dr "], + [" croft ", " cft "], + [" platz ", " pl "], + [" ferry ", " fy "], + [" ridge ", " rdge "], + [" tanav ", " tn "], + [" banan ", " ba "], + [" quays ", " qys "], + [" sankt ", " st "], + [" vkhod ", " vkh "], + [" chase ", " ch "], + [" vista ", " vsta "], + [" rhein ", " rh "], + [" court ", " ct "], + ["brucke ", " br "], + [" upper ", " up "], + [" river ", " r "], + [" range ", " rnge "], + [" lower ", " lr "], + [" kalea ", " k "], + [" crest ", " crst "], + [" obere ", " o "], + [" manor ", " mnr "], + [" byway ", " bywy "], + [" reach ", " rch "], + [" copse ", " cps "], + ["quelle ", " qu "], + [" creek ", " cr "], + [" close ", " c "], + [" fort ", " ft "], + [" apch ", " app "], + [" mont ", " mt "], + [" bdul ", " bd "], + ["saint ", " st "], + [" back ", " bk "], + [" c le ", " c "], + ["place ", " pl "], + [" frwy ", " fwy "], + [" quai ", " qu "], + [" ally ", " al "], + [" m te ", " mt "], + [" lane ", " ln "], + ["aukio ", " auk "], + [" loop ", " lp "], + [" line ", " ln "], + [" alue ", " al "], + [" link ", " lk "], + [" glde ", " gl "], + [" alea ", " al "], + [" gate ", " g "], + [" intr ", " int "], + [" gdns ", " gdn "], + [" hird ", " hrd "], + [" varf ", " vf "], + [" virf ", " vf "], + [" hgts ", " hts "], + [" expy ", " exp "], + ["markt ", " mkt "], + [" bypa ", " byp "], + ["o l v ", " olv "], + [" cres ", " cr "], + [" bdwy ", " bway "], + [" csac ", " cds "], + [" nowy ", " n "], + [" laan ", " ln "], + [" crsg ", " xing "], + ["vliet ", " vlt "], + [" city ", " cty "], + ["sving ", " sv "], + ["plass ", " pl "], + ["gaten ", " g "], + ["veien ", " v "], + [" gata ", " g "], + [" sint ", " st "], + [" caus ", " cway "], + [" cove ", " cv "], + ["plein ", " pln "], + [" cswy ", " cway "], + [" plac ", " pl "], + [" nowa ", " n "], + [" kolo ", " k "], + [" katu ", " k "], + [" duze ", " dz "], + [" blvd ", " bd "], + [" p ta ", " pta "], + [" maly ", " ml "], + [" mala ", " ml "], + [" bdge ", " bri "], + [" nowe ", " n "], + [" brdg ", " bri "], + [" male ", " ml "], + [" drwy ", " dvwy "], + [" duza ", " dz "], + [" utca ", " u "], + [" east ", " e "], + [" duzy ", " dz "], + ["kaari ", " kri "], + [" quan ", " q "], + [" svwy ", " swy "], + [" shwy ", " sh "], + [" road ", " rd "], + ["sankt ", " st "], + [" quay ", " qy "], + ["plats ", " pl "], + [" rise ", " ri "], + [" berg ", " bg "], + [" tcty ", " tct "], + [" viad ", " via "], + [" view ", " vw "], + [" vdct ", " via "], + [" vale ", " v "], + [" avda ", " av "], + [" grad ", " ghr "], + [" walk ", " wlk "], + [" west ", " w "], + [" yard ", " yd "], + [" blok ", " bl "], + [" terr ", " ter "], + [" cmno ", " cno "], + [" stra ", " st "], + [" thfr ", " thor "], + [" turn ", " tn "], + [" tpke ", " tpk "], + [" burg ", " bg "], + ["vayla ", " vla "], + ["vagen ", " v "], + [" tori ", " tr "], + ["gatan ", " g "], + ["grand ", " gr "], + [" pass ", " ps "], + [" pkwy ", " pwy "], + [" park ", " pk "], + ["rinne ", " rn "], + [" mtwy ", " mwy "], + [" mndr ", " mr "], + [" kyla ", " kl "], + [" kuja ", " kj "], + ["platz ", " pl "], + ["ranta ", " rt "], + [" mile ", " mi "], + [" pfad ", " p "], + [" mews ", " m "], + ["polku ", " p "], + [" psge ", " ps "], + [" plza ", " pl "], + ["ostra ", " o "], + ["gamla ", " gla "], + [" stig ", " st "], + ["norra ", " n "], + ["sodra ", " s "], + [" pike ", " pk "], + [" dorf ", " df "], + [" piaz ", " p za "], + [" phwy ", " pway "], + ["pfad ", " p "], + [" mnt ", " mt "], + ["gata ", " g "], + [" bhf ", " bf "], + [" bad ", " b "], + ["gate ", " g "], + [" zum ", " z "], + ["stig ", " st "], + [" blv ", " bd "], + ["kuja ", " kj "], + [" bul ", " bd "], + [" str ", " st "], + ["alue ", " al "], + [" cen ", " ctr "], + [" ave ", " av "], + ["kyla ", " kl "], + [" ale ", " al "], + [" spl ", " sp "], + [" all ", " al "], + [" k s ", " ks "], + [" aly ", " al "], + ["dorf ", " df "], + [" bvd ", " bd "], + [" vag ", " v "], + [" iii ", " 3 "], + [" tie ", " t "], + [" sok ", " sk "], + ["burg ", " bg "], + ["katu ", " k "], + ["berg ", " bg "], + ["tori ", " tr "], + [" kte ", " k "], + [" gro ", " gr "], + [" grn ", " gn "], + [" gld ", " gl "], + [" san ", " s "], + [" hse ", " ho "], + [" gte ", " g "], + [" rte ", " rt "], + [" rue ", " r "], + [" che ", " ch "], + [" pas ", " ps "], + [" plz ", " pl "], + [" pnt ", " pt "], + [" pky ", " pwy "], + [" pza ", " pl "], + [" rvr ", " r "], + [" riv ", " r "], + [" lit ", " lt "], + [" p k ", " pk "], + [" lwr ", " lr "], + [" low ", " lr "], + [" sth ", " s "], + [" crk ", " cr "], + ["pres ", " pres "], + ["laan ", " ln "], + [" bda ", " b "], + [" vei ", " v "], + [" via ", " v "], + [" way ", " wy "], + [" upr ", " up "], + [" avd ", " av "], + [" crt ", " ct "], + ["stwg ", " stwg "], + ["sint ", " st "], + [" v d ", " vd "], + [" van ", " v "], + [" drv ", " dr "], + [" tce ", " ter "], + [" va ", " v "], + [" oa ", " o "], + [" sa ", " s "], + [" na ", " n "], + ["bgm ", " bgm "], + [" nw ", " n "], + ["vag ", " v "], + [" im ", " 1 "], + ["vla ", " vla "], + ["gla ", " gla "], + [" am ", " a "], + [" ph ", " p "], + ["rue ", " r "], + [" ga ", " g "], + ["ste ", " ste "], + ["str ", " st "], + [" cl ", " c "], + [" vn ", " v "], + [" gt ", " g "], + ["vei ", " v "], + ["vlt ", " vlt "], + [" ce ", " cv "], + [" ii ", " 2 "], + ["pln ", " pln "], + ["olv ", " olv "], + ["mkt ", " mkt "], + ["tvl ", " tvl "], + [" ob ", " o "], + ["pgr ", " pgr "], + [" in ", " 1 "], + [" mw ", " m "], + ["kri ", " kri "], + ["pko ", " pko "], + ["auk ", " auk "], + ["tie ", " t "], + [" i ", " 1 "] + ] +}