nominatim/tokenizer/legacy_icu_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4 but using
   3 libICU instead of the PostgreSQL module.
   4 """
   5 from collections import Counter
   6 import functools
   7 import io
   8 import itertools
   9 import json
  10 import logging
  11 import re
  12 from textwrap import dedent
  13 from pathlib import Path
  14
  15 from icu import Transliterator
  16 import psycopg2.extras
  17
  18 from nominatim.db.connection import connect
  19 from nominatim.db.properties import set_property, get_property
  20 from nominatim.db.sql_preprocessor import SQLPreprocessor
  21
# Names of the database properties under which the tokenizer configuration
# is stored. They are written by LegacyICUTokenizer._save_config() and read
# back by init_from_project() and update_sql_functions().
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"

# Note: getLogger() without a name returns the root logger.
LOG = logging.getLogger()
  28
  29 def create(dsn, data_dir):
  30     """ Create a new instance of the tokenizer provided by this module.
  31     """
  32     return LegacyICUTokenizer(dsn, data_dir)
  33
  34
  35 class LegacyICUTokenizer:
  36     """ This tokenizer uses libICU to covert names and queries to ASCII.
  37         Otherwise it uses the same algorithms and data structures as the
  38         normalization routines in Nominatim 3.
  39     """
  40
  41     def __init__(self, dsn, data_dir):
  42         self.dsn = dsn
  43         self.data_dir = data_dir
  44         self.normalization = None
  45         self.transliteration = None
  46         self.abbreviations = None
  47
  48
  49     def init_new_db(self, config, init_db=True):
  50         """ Set up a new tokenizer for the database.
  51
  52             This copies all necessary data in the project directory to make
  53             sure the tokenizer remains stable even over updates.
  54         """
  55         if config.TOKENIZER_CONFIG:
  56             cfgfile = Path(config.TOKENIZER_CONFIG)
  57         else:
  58             cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
  59
  60         rules = json.loads(cfgfile.read_text())
  61         self._load_transliteration(rules['normalization'], cfgfile.parent)
  62         self.abbreviations = rules["abbreviations"]
  63         self.normalization = config.TERM_NORMALIZATION
  64
  65         self._install_php(config)
  66         self._save_config(config)
  67
  68         if init_db:
  69             self.update_sql_functions(config)
  70             self._init_db_tables(config)
  71
  72
  73     def _load_transliteration(self, rules, cfg_path):
  74         if isinstance(rules, str):
  75             self.transliteration = (cfg_path / rules).read_text().replace('\n', ' ')
  76         else:
  77             self.transliteration = ';'.join(rules) + ';'
  78
  79     def init_from_project(self):
  80         """ Initialise the tokenizer from the project directory.
  81         """
  82         with connect(self.dsn) as conn:
  83             self.normalization = get_property(conn, DBCFG_NORMALIZATION)
  84             self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
  85             self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
  86
  87
  88     def finalize_import(self, config):
  89         """ Do any required postprocessing to make the tokenizer data ready
  90             for use.
  91         """
  92         with connect(self.dsn) as conn:
  93             sqlp = SQLPreprocessor(conn, config)
  94             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  95
  96
  97     def update_sql_functions(self, config):
  98         """ Reimport the SQL functions for this tokenizer.
  99         """
 100         with connect(self.dsn) as conn:
 101             max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
 102             sqlp = SQLPreprocessor(conn, config)
 103             sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
 104                               max_word_freq=max_word_freq)
 105
 106
 107     def check_database(self):
 108         """ Check that the tokenizer is set up correctly.
 109         """
 110         self.init_from_project()
 111
 112         if self.normalization is None\
 113            or self.transliteration is None\
 114            or self.abbreviations is None:
 115             return "Configuration for tokenizer 'legacy_icu' are missing."
 116
 117         return None
 118
 119
 120     def name_analyzer(self):
 121         """ Create a new analyzer for tokenizing names and queries
 122             using this tokinzer. Analyzers are context managers and should
 123             be used accordingly:
 124
 125             ```
 126             with tokenizer.name_analyzer() as analyzer:
 127                 analyser.tokenize()
 128             ```
 129
 130             When used outside the with construct, the caller must ensure to
 131             call the close() function before destructing the analyzer.
 132
 133             Analyzers are not thread-safe. You need to instantiate one per thread.
 134         """
 135         norm = Transliterator.createFromRules("normalizer", self.normalization)
 136         trans = Transliterator.createFromRules("trans", self.transliteration)
 137         return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
 138
 139
 140     def _install_php(self, config):
 141         """ Install the php script for the tokenizer.
 142         """
 143         abbr_inverse = list(zip(*self.abbreviations))
 144         php_file = self.data_dir / "tokenizer.php"
 145         php_file.write_text(dedent("""\
 146             <?php
 147             @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
 148             @define('CONST_Term_Normalization_Rules', "{0.normalization}");
 149             @define('CONST_Transliteration', "{0.transliteration}");
 150             @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
 151             require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
 152             """.format(self, config,
 153                        "','".join(abbr_inverse[0]),
 154                        "','".join(abbr_inverse[1]))))
 155
 156
 157     def _save_config(self, config):
 158         """ Save the configuration that needs to remain stable for the given
 159             database as database properties.
 160         """
 161         with connect(self.dsn) as conn:
 162             set_property(conn, DBCFG_NORMALIZATION, self.normalization)
 163             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 164             set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
 165             set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
 166
 167
 168     def _init_db_tables(self, config):
 169         """ Set up the word table and fill it with pre-computed word
 170             frequencies.
 171         """
 172         with connect(self.dsn) as conn:
 173             sqlp = SQLPreprocessor(conn, config)
 174             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 175             conn.commit()
 176
 177             LOG.warning("Precomputing word tokens")
 178
 179             # get partial words and their frequencies
 180             words = Counter()
 181             with self.name_analyzer() as analyzer:
 182                 with conn.cursor(name="words") as cur:
 183                     cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
 184
 185                     for name, cnt in cur:
 186                         term = analyzer.make_standard_word(name)
 187                         if term:
 188                             for word in term.split():
 189                                 words[word] += cnt
 190
 191             # copy them back into the word table
 192             copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
 193
 194
 195             with conn.cursor() as cur:
 196                 copystr.seek(0)
 197                 cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
 198                 cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 199                                WHERE word_id is null""")
 200
 201             conn.commit()
 202
 203
 204 class LegacyICUNameAnalyzer:
 205     """ The legacy analyzer uses the ICU library for splitting names.
 206
 207         Each instance opens a connection to the database to request the
 208         normalization.
 209     """
 210
 211     def __init__(self, dsn, normalizer, transliterator, abbreviations):
 212         self.conn = connect(dsn).connection
 213         self.conn.autocommit = True
 214         self.normalizer = normalizer
 215         self.transliterator = transliterator
 216         self.abbreviations = abbreviations
 217
 218         self._cache = _TokenCache()
 219
 220
 221     def __enter__(self):
 222         return self
 223
 224
 225     def __exit__(self, exc_type, exc_value, traceback):
 226         self.close()
 227
 228
 229     def close(self):
 230         """ Free all resources used by the analyzer.
 231         """
 232         if self.conn:
 233             self.conn.close()
 234             self.conn = None
 235
 236
 237     def get_word_token_info(self, conn, words):
 238         """ Return token information for the given list of words.
 239             If a word starts with # it is assumed to be a full name
 240             otherwise is a partial name.
 241
 242             The function returns a list of tuples with
 243             (original word, word token, word id).
 244
 245             The function is used for testing and debugging only
 246             and not necessarily efficient.
 247         """
 248         tokens = {}
 249         for word in words:
 250             if word.startswith('#'):
 251                 tokens[word] = ' ' + self.make_standard_word(word[1:])
 252             else:
 253                 tokens[word] = self.make_standard_word(word)
 254
 255         with conn.cursor() as cur:
 256             cur.execute("""SELECT word_token, word_id
 257                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
 258                            WHERE word_token = t.term
 259                                  and class is null and country_code is null""",
 260                         (list(tokens.values()), ))
 261             ids = {r[0]: r[1] for r in cur}
 262
 263         return [(k, v, ids[v]) for k, v in tokens.items()]
 264
 265
 266     def normalize(self, phrase):
 267         """ Normalize the given phrase, i.e. remove all properties that
 268             are irrelevant for search.
 269         """
 270         return self.normalizer.transliterate(phrase)
 271
 272     @staticmethod
 273     def normalize_postcode(postcode):
 274         """ Convert the postcode to a standardized form.
 275
 276             This function must yield exactly the same result as the SQL function
 277             'token_normalized_postcode()'.
 278         """
 279         return postcode.strip().upper()
 280
 281
 282     @functools.lru_cache(maxsize=1024)
 283     def make_standard_word(self, name):
 284         """ Create the normalised version of the input.
 285         """
 286         norm = ' ' + self.transliterator.transliterate(name) + ' '
 287         for full, abbr in self.abbreviations:
 288             if full in norm:
 289                 norm = norm.replace(full, abbr)
 290
 291         return norm.strip()
 292
 293
 294     def _make_standard_hnr(self, hnr):
 295         """ Create a normalised version of a housenumber.
 296
 297             This function takes minor shortcuts on transliteration.
 298         """
 299         if hnr.isdigit():
 300             return hnr
 301
 302         return self.transliterator.transliterate(hnr)
 303
 304     def update_postcodes_from_db(self):
 305         """ Update postcode tokens in the word table from the location_postcode
 306             table.
 307         """
 308         to_delete = []
 309         copystr = io.StringIO()
 310         with self.conn.cursor() as cur:
 311             # This finds us the rows in location_postcode and word that are
 312             # missing in the other table.
 313             cur.execute("""SELECT * FROM
 314                             (SELECT pc, word FROM
 315                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 316                               FULL JOIN
 317                               (SELECT word FROM word
 318                                 WHERE class ='place' and type = 'postcode') w
 319                               ON pc = word) x
 320                            WHERE pc is null or word is null""")
 321
 322             for postcode, word in cur:
 323                 if postcode is None:
 324                     to_delete.append(word)
 325                 else:
 326                     copystr.write(postcode)
 327                     copystr.write('\t ')
 328                     copystr.write(self.transliterator.transliterate(postcode))
 329                     copystr.write('\tplace\tpostcode\t0\n')
 330
 331             if to_delete:
 332                 cur.execute("""DELETE FROM WORD
 333                                WHERE class ='place' and type = 'postcode'
 334                                      and word = any(%s)
 335                             """, (to_delete, ))
 336
 337             if copystr.getvalue():
 338                 copystr.seek(0)
 339                 cur.copy_from(copystr, 'word',
 340                               columns=['word', 'word_token', 'class', 'type',
 341                                        'search_name_count'])
 342
 343
 344     def update_special_phrases(self, phrases, should_replace):
 345         """ Replace the search index for special phrases with the new phrases.
 346         """
 347         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
 348                             for p in phrases))
 349
 350         with self.conn.cursor() as cur:
 351             # Get the old phrases.
 352             existing_phrases = set()
 353             cur.execute("""SELECT word, class, type, operator FROM word
 354                            WHERE class != 'place'
 355                                  OR (type != 'house' AND type != 'postcode')""")
 356             for label, cls, typ, oper in cur:
 357                 existing_phrases.add((label, cls, typ, oper or '-'))
 358
 359             to_add = norm_phrases - existing_phrases
 360             to_delete = existing_phrases - norm_phrases
 361
 362             if to_add:
 363                 copystr = io.StringIO()
 364                 for word, cls, typ, oper in to_add:
 365                     term = self.make_standard_word(word)
 366                     if term:
 367                         copystr.write(word)
 368                         copystr.write('\t ')
 369                         copystr.write(term)
 370                         copystr.write('\t')
 371                         copystr.write(cls)
 372                         copystr.write('\t')
 373                         copystr.write(typ)
 374                         copystr.write('\t')
 375                         copystr.write(oper if oper in ('in', 'near')  else '\\N')
 376                         copystr.write('\t0\n')
 377
 378                 copystr.seek(0)
 379                 cur.copy_from(copystr, 'word',
 380                               columns=['word', 'word_token', 'class', 'type',
 381                                        'operator', 'search_name_count'])
 382
 383             if to_delete and should_replace:
 384                 psycopg2.extras.execute_values(
 385                     cur,
 386                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 387                         WHERE word = name and class = in_class and type = in_type
 388                               and ((op = '-' and operator is null) or op = operator)""",
 389                     to_delete)
 390
 391         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 392                  len(norm_phrases), len(to_add), len(to_delete))
 393
 394
 395     def add_country_names(self, country_code, names):
 396         """ Add names for the given country to the search index.
 397         """
 398         full_names = set((self.make_standard_word(n) for n in names))
 399         full_names.discard('')
 400         self._add_normalized_country_names(country_code, full_names)
 401
 402
 403     def _add_normalized_country_names(self, country_code, names):
 404         """ Add names for the given country to the search index.
 405         """
 406         word_tokens = set((' ' + name for name in names))
 407         with self.conn.cursor() as cur:
 408             # Get existing names
 409             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
 410                         (country_code, ))
 411             word_tokens.difference_update((t[0] for t in cur))
 412
 413             if word_tokens:
 414                 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
 415                                                  search_name_count)
 416                                (SELECT nextval('seq_word'), token, '{}', 0
 417                                 FROM unnest(%s) as token)
 418                             """.format(country_code), (list(word_tokens),))
 419
 420
 421     def process_place(self, place):
 422         """ Determine tokenizer information about the given place.
 423
 424             Returns a JSON-serialisable structure that will be handed into
 425             the database via the token_info field.
 426         """
 427         token_info = _TokenInfo(self._cache)
 428
 429         names = place.get('name')
 430
 431         if names:
 432             full_names = self._compute_full_names(names)
 433
 434             token_info.add_names(self.conn, full_names)
 435
 436             country_feature = place.get('country_feature')
 437             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 438                 self._add_normalized_country_names(country_feature.lower(),
 439                                                    full_names)
 440
 441         address = place.get('address')
 442
 443         if address:
 444             hnrs = []
 445             addr_terms = []
 446             for key, value in address.items():
 447                 if key == 'postcode':
 448                     self._add_postcode(value)
 449                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 450                     hnrs.append(value)
 451                 elif key == 'street':
 452                     token_info.add_street(self.conn, self.make_standard_word(value))
 453                 elif key == 'place':
 454                     token_info.add_place(self.conn, self.make_standard_word(value))
 455                 elif not key.startswith('_') and \
 456                      key not in ('country', 'full'):
 457                     addr_terms.append((key, self.make_standard_word(value)))
 458
 459             if hnrs:
 460                 hnrs = self._split_housenumbers(hnrs)
 461                 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 462
 463             if addr_terms:
 464                 token_info.add_address_terms(self.conn, addr_terms)
 465
 466         return token_info.data
 467
 468
 469     def _compute_full_names(self, names):
 470         """ Return the set of all full name word ids to be used with the
 471             given dictionary of names.
 472         """
 473         full_names = set()
 474         for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
 475             word = self.make_standard_word(name)
 476             if word:
 477                 full_names.add(word)
 478
 479                 brace_split = name.split('(', 2)
 480                 if len(brace_split) > 1:
 481                     word = self.make_standard_word(brace_split[0])
 482                     if word:
 483                         full_names.add(word)
 484
 485         return full_names
 486
 487
 488     def _add_postcode(self, postcode):
 489         """ Make sure the normalized postcode is present in the word table.
 490         """
 491         if re.search(r'[:,;]', postcode) is None:
 492             postcode = self.normalize_postcode(postcode)
 493
 494             if postcode not in self._cache.postcodes:
 495                 term = self.make_standard_word(postcode)
 496                 if not term:
 497                     return
 498
 499                 with self.conn.cursor() as cur:
 500                     # no word_id needed for postcodes
 501                     cur.execute("""INSERT INTO word (word, word_token, class, type,
 502                                                      search_name_count)
 503                                    (SELECT pc, %s, 'place', 'postcode', 0
 504                                     FROM (VALUES (%s)) as v(pc)
 505                                     WHERE NOT EXISTS
 506                                      (SELECT * FROM word
 507                                       WHERE word = pc and class='place' and type='postcode'))
 508                                 """, (' ' + term, postcode))
 509                 self._cache.postcodes.add(postcode)
 510
 511     @staticmethod
 512     def _split_housenumbers(hnrs):
 513         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
 514             # split numbers if necessary
 515             simple_list = []
 516             for hnr in hnrs:
 517                 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 518
 519             if len(simple_list) > 1:
 520                 hnrs = list(set(simple_list))
 521             else:
 522                 hnrs = simple_list
 523
 524         return hnrs
 525
 526
 527
 528
 529 class _TokenInfo:
 530     """ Collect token information to be sent back to the database.
 531     """
 532     def __init__(self, cache):
 533         self.cache = cache
 534         self.data = {}
 535
 536     @staticmethod
 537     def _mk_array(tokens):
 538         return '{%s}' % ','.join((str(s) for s in tokens))
 539
 540
 541     def add_names(self, conn, names):
 542         """ Adds token information for the normalised names.
 543         """
 544         # Start with all partial names
 545         terms = set((part for ns in names for part in ns.split()))
 546         # Add the full names
 547         terms.update((' ' + n for n in names))
 548
 549         self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
 550
 551
 552     def add_housenumbers(self, conn, hnrs):
 553         """ Extract housenumber information from a list of normalised
 554             housenumbers.
 555         """
 556         self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
 557         self.data['hnr'] = ';'.join(hnrs)
 558
 559
 560     def add_street(self, conn, street):
 561         """ Add addr:street match terms.
 562         """
 563         if not street:
 564             return
 565
 566         term = ' ' + street
 567
 568         tid = self.cache.names.get(term)
 569
 570         if tid is None:
 571             with conn.cursor() as cur:
 572                 cur.execute("""SELECT word_id FROM word
 573                                 WHERE word_token = %s
 574                                       and class is null and type is null""",
 575                             (term, ))
 576                 if cur.rowcount > 0:
 577                     tid = cur.fetchone()[0]
 578                     self.cache.names[term] = tid
 579
 580         if tid is not None:
 581             self.data['street'] = '{%d}' % tid
 582
 583
 584     def add_place(self, conn, place):
 585         """ Add addr:place search and match terms.
 586         """
 587         if not place:
 588             return
 589
 590         partial_ids = self.cache.get_term_tokens(conn, place.split())
 591         tid = self.cache.get_term_tokens(conn, [' ' + place])
 592
 593         self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
 594         self.data['place_match'] = '{%s}' % tid[0]
 595
 596
 597     def add_address_terms(self, conn, terms):
 598         """ Add additional address terms.
 599         """
 600         tokens = {}
 601
 602         for key, value in terms:
 603             if not value:
 604                 continue
 605             partial_ids = self.cache.get_term_tokens(conn, value.split())
 606             term = ' ' + value
 607             tid = self.cache.names.get(term)
 608
 609             if tid is None:
 610                 with conn.cursor() as cur:
 611                     cur.execute("""SELECT word_id FROM word
 612                                     WHERE word_token = %s
 613                                           and class is null and type is null""",
 614                                 (term, ))
 615                     if cur.rowcount > 0:
 616                         tid = cur.fetchone()[0]
 617                         self.cache.names[term] = tid
 618
 619             tokens[key] = [self._mk_array(partial_ids),
 620                            '{%s}' % ('' if tid is None else str(tid))]
 621
 622         if tokens:
 623             self.data['addr'] = tokens
 624
 625
 626 class _TokenCache:
 627     """ Cache for token information to avoid repeated database queries.
 628
 629         This cache is not thread-safe and needs to be instantiated per
 630         analyzer.
 631     """
 632     def __init__(self):
 633         self.names = {}
 634         self.postcodes = set()
 635         self.housenumbers = {}
 636
 637
 638     def get_term_tokens(self, conn, terms):
 639         """ Get token ids for a list of terms, looking them up in the database
 640             if necessary.
 641         """
 642         tokens = []
 643         askdb = []
 644
 645         for term in terms:
 646             token = self.names.get(term)
 647             if token is None:
 648                 askdb.append(term)
 649             elif token != 0:
 650                 tokens.append(token)
 651
 652         if askdb:
 653             with conn.cursor() as cur:
 654                 cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
 655                             (askdb, ))
 656                 for term, tid in cur:
 657                     self.names[term] = tid
 658                     if tid != 0:
 659                         tokens.append(tid)
 660
 661         return tokens
 662
 663
 664     def get_hnr_tokens(self, conn, terms):
 665         """ Get token ids for a list of housenumbers, looking them up in the
 666             database if necessary.
 667         """
 668         tokens = []
 669         askdb = []
 670
 671         for term in terms:
 672             token = self.housenumbers.get(term)
 673             if token is None:
 674                 askdb.append(term)
 675             else:
 676                 tokens.append(token)
 677
 678         if askdb:
 679             with conn.cursor() as cur:
 680                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 681                             (askdb, ))
 682                 for term, tid in cur:
 683                     self.housenumbers[term] = tid
 684                     tokens.append(tid)
 685
 686         return tokens