"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
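
# Keys under which the tokenizer configuration is stored as database properties.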
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, phpdir)))  # pylint: disable=missing-format-attribute


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
            with conn.cursor(name="words") as cur:
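                # svals() expands the hstore 'name' column into its individual
                # values, so every tagged name of a place is counted separately.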
                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                for name, cnt in cur:
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        for term in word.split():
                            words[term] += cnt

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for args in words.items():
                    copystr.add(*args)

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['word_token', 'search_name_count'])
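                    # word ids are assigned in bulk from seq_word only after
                    # all tokens have been copied in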
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null""")

            conn.commit()


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        tokens = {}
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
                tokens[word] = self.name_processor.get_search_normalized(word)
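
        # Full-name tokens are stored with a leading space in the word table,
        # partial-word tokens without one; the '#' prefix selects the former.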
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(
                            postcode,
                            ' ' + self.name_processor.get_search_normalized(postcode),
                            'place', 'postcode', 0)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE class ='place' and type = 'postcode'
                                         and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word', 'word_token', 'class', 'type',
                                          'search_name_count'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(word, term, cls, typ,
                                oper if oper in ('in', 'near') else None, 0)
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word', 'word_token', 'class', 'type',
                                      'operator', 'search_name_count'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            psycopg2.extras.execute_values(
                cursor,
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE word = name and class = in_class and type = in_type
                          and ((op = '-' and operator is null) or op = operator)""",
                to_delete)

        return len(to_delete)
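

    # Country name tokens are stored with only their country_code set and no
    # class/type, keeping them separate from regular name tokens.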
    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            if name:
                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            if word_tokens:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(word_tokens),))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(*self._compute_name_tokens({'name': value}))
                elif key == 'place':
                    token_info.add_place(*self._compute_name_tokens({'name': value}))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(addr_terms)

        return token_info.data
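

    # Each full name yields one token for the entire name plus partial-word
    # tokens for the individual words; results are cached per normalized name.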
    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
            full_names.add(name.strip())

            brace_idx = name.find('(')
            if brace_idx >= 0:
                full_names.add(name[:brace_idx].strip())

        return full_names


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
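        # Only process postcodes that do not contain list separators (':', ',', ';').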
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                     search_name_count)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE word = pc and class='place' and type='postcode'))
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)
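

    # Search arrays include partial-word tokens for looser matching, while
    # match arrays are restricted to full-name tokens.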
    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens