"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import functools
import io
import itertools
import json
import logging
import re
from textwrap import dedent
from pathlib import Path

from icu import Transliterator
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None
        self.transliteration = None
        self.abbreviations = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'

        rules = json.loads(cfgfile.read_text())
        self.transliteration = ';'.join(rules['normalization']) + ';'
        self.abbreviations = rules["abbreviations"]
        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)
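
    # The configuration file read above is expected to look roughly like
    # this (illustrative sketch only; the actual rules are whatever ships
    # with the project):
    #
    #   {
    #       "normalization": [":: lower ()", ":: Latin ()"],
    #       "abbreviations": [[" street ", " st "], [" road ", " rd "]]
    #   }
    #
    # 'normalization' is a list of ICU transliteration rules that are
    # joined with ';' into one rule set; 'abbreviations' is a list of
    # (full term, abbreviated term) pairs applied by make_standard_word().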


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.normalization is None\
           or self.transliteration is None\
           or self.abbreviations is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.add_country_names(...)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        norm = Transliterator.createFromRules("normalizer", self.normalization)
        trans = Transliterator.createFromRules("trans", self.transliteration)

        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)


    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        abbr_inverse = list(zip(*self.abbreviations))
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
            @define('CONST_Transliteration', "{0.transliteration}");
            @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, config,
                       "','".join(abbr_inverse[0]),
                       "','".join(abbr_inverse[1]))))


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            set_property(conn, DBCFG_NORMALIZATION, self.normalization)
            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
            set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            with self.name_analyzer() as analyzer:
                with conn.cursor(name="words") as cur:
                    cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                    for name, cnt in cur:
                        term = analyzer.make_standard_word(name)
                        if term:
                            for word in term.split():
                                words[word] += cnt

            # copy them back into the word table
            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))

            with conn.cursor() as cur:
                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null""")

            conn.commit()
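
    # Each line handed to COPY above is a TSV row of word_token and
    # search_name_count, e.g. "station\t421" (illustrative count).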


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer, transliterator, abbreviations):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        self.transliterator = transliterator
        self.abbreviations = abbreviations
        #psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)


    @functools.lru_cache(maxsize=1024)
    def make_standard_word(self, name):
        """ Create the normalised version of the name.
        """
        norm = ' ' + self.transliterator.transliterate(name) + ' '
        for full, abbr in self.abbreviations:
            if full in norm:
                norm = norm.replace(full, abbr)

        return norm.strip()
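
    # Illustrative example (assuming the configured rules lower-case and
    # ASCII-fold, and the abbreviation list contains (' street ', ' st ')):
    #
    #   analyzer.make_standard_word('Baker Street')  # -> 'baker st'
    #
    # The name is padded with spaces before replacement so that the
    # space-delimited abbreviations only match complete words.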


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        if hnr.isdigit():
            return hnr

        return self.transliterator.transliterate(hnr)


    def add_postcodes_from_db(self):
        """ Add postcodes from the location_postcode table to the word table.
        """
        copystr = io.StringIO()
        with self.conn.cursor() as cur:
            cur.execute("SELECT distinct(postcode) FROM location_postcode")

            for (postcode, ) in cur:
                copystr.write(postcode)
                copystr.write('\t ')
                copystr.write(self.transliterator.transliterate(postcode))
                copystr.write('\tplace\tpostcode\t0\n')

            copystr.seek(0)
            cur.copy_from(copystr, 'word',
                          columns=['word', 'word_token', 'class', 'type',
                                   'search_name_count'])
            # Don't really need an ID for postcodes....
            # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
            #                WHERE word_id is null and type = 'postcode'""")
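
    # A row in the COPY payload above looks roughly like (tabs written as
    # \t, postcode made up): "SW1A 1AA\t sw1a 1aa\tplace\tpostcode\t0".
    # The leading space on the word_token marks it as a full word.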


    def update_special_phrases(self, phrases):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                copystr = io.StringIO()
                for word, cls, typ, oper in to_add:
                    term = self.make_standard_word(word)
                    if term:
                        copystr.write(word)
                        copystr.write('\t ')
                        copystr.write(term)
                        copystr.write('\t')
                        copystr.write(cls)
                        copystr.write('\t')
                        copystr.write(typ)
                        copystr.write('\t')
                        copystr.write(oper if oper in ('in', 'near') else '\\N')
                        copystr.write('\t0\n')

                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'operator', 'search_name_count'])

            if to_delete:
                psycopg2.extras.execute_values(
                    cur,
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))
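
    # 'phrases' is expected to be an iterable of 4-tuples of
    # (label, class, type, operator), for example
    # ('swimming pool', 'leisure', 'swimming_pool', 'near') - an
    # illustrative entry, not part of any fixed list.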


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        full_names = set((self.make_standard_word(n) for n in names))
        full_names.discard('')
        self._add_normalised_country_names(country_code, full_names)


    def _add_normalised_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            new_names = names.difference((t[0] for t in cur))

            if new_names:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(new_names),))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            full_names = set((self.make_standard_word(name) for name in names.values()))
            full_names.discard('')

            token_info.add_names(self.conn, full_names)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self._add_normalised_country_names(country_feature.lower(),
                                                   full_names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(self.conn, self.make_standard_word(value))
                elif key == 'place':
                    token_info.add_place(self.conn, self.make_standard_word(value))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, self.make_standard_word(value)))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(self.conn, addr_terms)

        return token_info.data
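
    # The returned structure is assembled by _TokenInfo below. For a place
    # with names, a housenumber and an addr:city entry it looks roughly
    # like this (token ids are illustrative):
    #
    #   {'names': '{1,2,3}', 'hnr_tokens': '{4}', 'hnr': '12a',
    #    'addr': {'city': ['{5,6}', '{7}']}}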


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None and postcode not in self._cache.postcodes:
            term = self.make_standard_word(postcode)
            if not term:
                return

            with self.conn.cursor() as cur:
                # no word_id needed for postcodes
                cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                 search_name_count)
                               (SELECT pc, %s, 'place', 'postcode', 0
                                FROM (VALUES (%s)) as v(pc)
                                WHERE NOT EXISTS
                                 (SELECT * FROM word
                                  WHERE word = pc and class='place' and type='postcode'))
                            """, (' ' + term, postcode))
            self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
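
    # Illustrative behaviour: ['2;4', '6'] becomes ['2', '4', '6'] (order
    # not guaranteed, since duplicates are removed via a set).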
441 """ Collect token information to be sent back to the database.
443 def __init__(self, cache):
448 def _mk_array(tokens):
449 return '{%s}' % ','.join((str(s) for s in tokens))
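
    # Example (illustrative): _mk_array([32, 8]) returns '{32,8}', the
    # textual form of a PostgreSQL integer array.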


    def add_names(self, conn, names):
        """ Adds token information for the normalised names.
        """
        # Start with all partial names
        terms = set((part for ns in names for part in ns.split()))
        # Add partials for the full terms (TO BE REMOVED)
        terms.update((n for n in names))
        # Add the full names
        terms.update((' ' + n for n in names))

        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        if not street:
            return

        term = ' ' + street

        tid = self.cache.names.get(term)

        if tid is None:
            with conn.cursor() as cur:
                cur.execute("""SELECT word_id FROM word
                                WHERE word_token = %s
                                      and class is null and type is null""",
                            (term, ))
                if cur.rowcount > 0:
                    tid = cur.fetchone()[0]
                    self.cache.names[term] = tid

        if tid is not None:
            self.data['street'] = '{%d}' % tid


    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        if not place:
            return

        partial_ids = self.cache.get_term_tokens(conn, place.split())
        tid = self.cache.get_term_tokens(conn, [' ' + place])

        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
        self.data['place_match'] = '{%s}' % tid[0]


    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, value in terms:
            if not value:
                continue

            partial_ids = self.cache.get_term_tokens(conn, value.split())
            term = ' ' + value
            tid = self.cache.names.get(term)

            if tid is None:
                with conn.cursor() as cur:
                    cur.execute("""SELECT word_id FROM word
                                    WHERE word_token = %s
                                          and class is null and type is null""",
                                (term, ))
                    if cur.rowcount > 0:
                        tid = cur.fetchone()[0]
                        self.cache.names[term] = tid

            tokens[key] = [self._mk_array(partial_ids),
                           '{%s}' % ('' if tid is None else str(tid))]

        if tokens:
            self.data['addr'] = tokens
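
    # Illustrative result (token ids made up): terms of [('city', 'berlin')]
    # yields self.data['addr'] == {'city': ['{12}', '{34}']} - partial-word
    # tokens first, then the full term's word_id (or '{}' if the full term
    # is not in the word table).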
540 """ Cache for token information to avoid repeated database queries.
542 This cache is not thread-safe and needs to be instantiated per
547 self.postcodes = set()
548 self.housenumbers = {}
551 def get_term_tokens(self, conn, terms):
552 """ Get token ids for a list of terms, looking them up in the database
559 token = self.names.get(term)
566 with conn.cursor() as cur:
567 cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
569 for term, tid in cur:
570 self.names[term] = tid


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens
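
# Illustrative use of the cache (sketch; 'conn' must be an open psycopg2
# connection to a database that provides the getorcreate_term_id() SQL
# function used above):
#
#   cache = _TokenCache()
#   ids = cache.get_term_tokens(conn, ['baker', ' baker street'])
#   # A second call with the same terms is answered from cache.names
#   # without hitting the database.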