"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
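
# Keys under which the tokenizer configuration is stored as database properties.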
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, phpdir)))  # pylint: disable=missing-format-attribute


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
            with conn.cursor(name="words") as cur:
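                # svals() expands the hstore 'name' column into its individual
                # values, so every tagged name of a place is counted separately.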
                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                for name, cnt in cur:
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        for term in word.split():
                            words[term] += cnt

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for args in words.items():
                    copystr.add(*args)

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['word_token', 'search_name_count'])
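                    # word ids are assigned in bulk from seq_word only after
                    # all tokens have been copied in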
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null""")

            conn.commit()


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        tokens = {}
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
                tokens[word] = self.name_processor.get_search_normalized(word)
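
        # Full-name tokens are stored with a leading space in the word table,
        # partial-word tokens without one; the '#' prefix selects the former.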
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(
                            postcode,
                            ' ' + self.name_processor.get_search_normalized(postcode),
                            'place', 'postcode', 0)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE class ='place' and type = 'postcode'
                                         and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word', 'word_token', 'class', 'type',
                                          'search_name_count'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(word, term, cls, typ,
                                oper if oper in ('in', 'near') else None, 0)
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word', 'word_token', 'class', 'type',
                                      'operator', 'search_name_count'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            psycopg2.extras.execute_values(
                cursor,
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE word = name and class = in_class and type = in_type
                          and ((op = '-' and operator is null) or op = operator)""",
                to_delete)

        return len(to_delete)
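

    # Country name tokens are stored with only their country_code set and no
    # class/type, keeping them separate from regular name tokens.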
    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            if name:
                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            if word_tokens:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(word_tokens),))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(*self._compute_name_tokens({'name': value}))
                elif key == 'place':
                    token_info.add_place(*self._compute_name_tokens({'name': value}))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(addr_terms)

        return token_info.data
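

    # Each full name yields one token for the entire name plus partial-word
    # tokens for the individual words; results are cached per normalized name.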
    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
            full_names.add(name.strip())

            brace_idx = name.find('(')
            if brace_idx >= 0:
                full_names.add(name[:brace_idx].strip())

        return full_names


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
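        # Only process postcodes that do not contain list separators (':', ',', ';').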
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                     search_name_count)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE word = pc and class='place' and type='postcode'))
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)
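

    # Search arrays include partial-word tokens for looser matching, while
    # match arrays are restricted to full-name tokens.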
    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens