1 # SPDX-License-Identifier: GPL-2.0-only
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
10 """
11 import itertools
12 import json
13 import logging
14 import re
15 from textwrap import dedent
16
17 from nominatim.db.connection import connect
18 from nominatim.db.utils import CopyBuffer
19 from nominatim.db.sql_preprocessor import SQLPreprocessor
20 from nominatim.indexer.place_info import PlaceInfo
21 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
22 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
23
24 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
25
26 LOG = logging.getLogger()
27
28 def create(dsn, data_dir):
29     """ Create a new instance of the tokenizer provided by this module.
30     """
31     return LegacyICUTokenizer(dsn, data_dir)
32
33
34 class LegacyICUTokenizer(AbstractTokenizer):
35     """ This tokenizer uses libICU to covert names and queries to ASCII.
36         Otherwise it uses the same algorithms and data structures as the
37         normalization routines in Nominatim 3.
38     """
39
40     def __init__(self, dsn, data_dir):
41         self.dsn = dsn
42         self.data_dir = data_dir
43         self.loader = None
44
45
46     def init_new_db(self, config, init_db=True):
47         """ Set up a new tokenizer for the database.
48
49             This copies all necessary data into the project directory to make
50             sure the tokenizer remains stable even over updates.
51         """
52         self.loader = ICURuleLoader(config)
53
54         self._install_php(config.lib_dir.php)
55         self._save_config()
56
57         if init_db:
58             self.update_sql_functions(config)
59             self._init_db_tables(config)
60
61
62     def init_from_project(self, config):
63         """ Initialise the tokenizer from the project directory.
64         """
65         self.loader = ICURuleLoader(config)
66
67         with connect(self.dsn) as conn:
68             self.loader.load_config_from_db(conn)
69
70
71     def finalize_import(self, config):
72         """ Do any required postprocessing to make the tokenizer data ready
73             for use.
74         """
75         with connect(self.dsn) as conn:
76             sqlp = SQLPreprocessor(conn, config)
77             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
78
79
80     def update_sql_functions(self, config):
81         """ Reimport the SQL functions for this tokenizer.
82         """
83         with connect(self.dsn) as conn:
84             sqlp = SQLPreprocessor(conn, config)
85             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
86
87
88     def check_database(self, config):
89         """ Check that the tokenizer is set up correctly.
90         """
91         # Will throw an error if there is an issue.
92         self.init_from_project(config)
93
94
95     def update_statistics(self):
96         """ Recompute frequencies for all name words.
97         """
98         with connect(self.dsn) as conn:
99             if conn.table_exists('search_name'):
100                 with conn.cursor() as cur:
101                     cur.drop_table("word_frequencies")
102                     LOG.info("Computing word frequencies")
103                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
104                                      SELECT unnest(name_vector) as id, count(*)
105                                      FROM search_name GROUP BY id""")
106                     cur.execute("CREATE INDEX ON word_frequencies(id)")
107                     LOG.info("Update word table with recomputed frequencies")
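                    # Merge the recomputed count into each word's info column
                    # (the jsonb || operator overwrites an existing 'count' key).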
108                     cur.execute("""UPDATE word
109                                    SET info = info || jsonb_build_object('count', count)
110                                    FROM word_frequencies WHERE word_id = id""")
111                     cur.drop_table("word_frequencies")
112             conn.commit()
113
114
115     def _cleanup_housenumbers(self):
116         """ Remove unused house numbers.
117         """
118         with connect(self.dsn) as conn:
119             if not conn.table_exists('search_name'):
120                 return
121             with conn.cursor(name="hnr_counter") as cur:
122                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token) FROM word
123                                WHERE type = 'H'
124                                  AND NOT EXISTS(SELECT * FROM search_name
125                                                 WHERE ARRAY[word.word_id] && name_vector)
126                                  AND (char_length(coalesce(word, word_token)) > 6
127                                       OR coalesce(word, word_token) not similar to '\\d+')
128                             """)
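                # Map canonical housenumber -> word_id for all tokens that are
                # no longer referenced from search_name; plain numbers of up to
                # six digits are never considered for deletion.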
129                 candidates = {token: wid for wid, token in cur}
130             with conn.cursor(name="hnr_counter") as cur:
131                 cur.execute("""SELECT housenumber FROM placex
132                                WHERE housenumber is not null
133                                      AND (char_length(housenumber) > 6
134                                           OR housenumber not similar to '\\d+')
135                             """)
136                 for row in cur:
137                     for hnr in row[0].split(';'):
138                         candidates.pop(hnr, None)
139             LOG.info("There are %s outdated housenumbers.", len(candidates))
140             LOG.debug("Outdated housenumbers: %s", candidates.keys())
141             if candidates:
142                 with conn.cursor() as cur:
143                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
144                                 (list(candidates.values()), ))
145                 conn.commit()
146
147
148
149     def update_word_tokens(self):
150         """ Remove unused tokens.
151         """
152         LOG.warning("Cleaning up housenumber tokens.")
153         self._cleanup_housenumbers()
154         LOG.warning("Tokenizer house-keeping done.")
155
156
157     def name_analyzer(self):
158         """ Create a new analyzer for tokenizing names and queries
159             using this tokenizer. Analyzers are context managers and should
160             be used accordingly:
161
162             ```
163             with tokenizer.name_analyzer() as analyzer:
164                 analyzer.tokenize()
165             ```
166
167             When used outside the with construct, the caller must make sure
168             to call close() before the analyzer is destroyed.
169
170             Analyzers are not thread-safe. You need to instantiate one per thread.
171         """
172         return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
173                                      self.loader.make_token_analysis())
174
175
176     def _install_php(self, phpdir):
177         """ Install the php script for the tokenizer.
178         """
179         php_file = self.data_dir / "tokenizer.php"
180         php_file.write_text(dedent(f"""\
181             <?php
182             @define('CONST_Max_Word_Frequency', 10000000);
183             @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
184             @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
185             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
186
187
188     def _save_config(self):
189         """ Save the configuration that needs to remain stable for the given
190             database as database properties.
191         """
192         with connect(self.dsn) as conn:
193             self.loader.save_config_to_db(conn)
194
195
196     def _init_db_tables(self, config):
197         """ Set up the word table and fill it with pre-computed word
198             frequencies.
199         """
200         with connect(self.dsn) as conn:
201             sqlp = SQLPreprocessor(conn, config)
202             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
203             conn.commit()
204
205
206 class LegacyICUNameAnalyzer(AbstractAnalyzer):
207     """ The legacy analyzer uses the ICU library for splitting names.
208
209         Each instance opens a connection to the database to request the
210         normalization.
211     """
212
213     def __init__(self, dsn, sanitizer, token_analysis):
214         self.conn = connect(dsn).connection
215         self.conn.autocommit = True
216         self.sanitizer = sanitizer
217         self.token_analysis = token_analysis
218
219         self._cache = _TokenCache()
220
221
222     def close(self):
223         """ Free all resources used by the analyzer.
224         """
225         if self.conn:
226             self.conn.close()
227             self.conn = None
228
229
230     def _search_normalized(self, name):
231         """ Return the search token transliteration of the given name.
232         """
233         return self.token_analysis.search.transliterate(name).strip()
234
235
236     def _normalized(self, name):
237         """ Return the normalized version of the given name with all
238             non-relevant information removed.
239         """
240         return self.token_analysis.normalizer.transliterate(name).strip()
241
242
243     def get_word_token_info(self, words):
244         """ Return token information for the given list of words.
245             If a word starts with #, it is assumed to be a full name,
246             otherwise a partial name.
247
248             The function returns a list of tuples with
249             (original word, word token, word id).
250
251             The function is used for testing and debugging only
252             and is not necessarily efficient.
253         """
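        # Illustrative: for ['#Main Street', 'main'] the first entry is looked
        # up as a full name (type 'W'), the second as a partial term (type 'w').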
254         full_tokens = {}
255         partial_tokens = {}
256         for word in words:
257             if word.startswith('#'):
258                 full_tokens[word] = self._search_normalized(word[1:])
259             else:
260                 partial_tokens[word] = self._search_normalized(word)
261
262         with self.conn.cursor() as cur:
263             cur.execute("""SELECT word_token, word_id
264                             FROM word WHERE word_token = ANY(%s) and type = 'W'
265                         """, (list(full_tokens.values()),))
266             full_ids = {r[0]: r[1] for r in cur}
267             cur.execute("""SELECT word_token, word_id
268                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
269                         (list(partial_tokens.values()),))
270             part_ids = {r[0]: r[1] for r in cur}
271
272         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
273                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
274
275
276     @staticmethod
277     def normalize_postcode(postcode):
278         """ Convert the postcode to a standardized form.
279
280             This function must yield exactly the same result as the SQL function
281             'token_normalized_postcode()'.
282         """
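        # Illustrative: normalize_postcode(' se1 9ph ') -> 'SE1 9PH'; only
        # surrounding whitespace is stripped and letters are upper-cased.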
283         return postcode.strip().upper()
284
285
286     def update_postcodes_from_db(self):
287         """ Update postcode tokens in the word table from the location_postcode
288             table.
289         """
290         to_delete = []
291         with self.conn.cursor() as cur:
292             # Find entries that exist in only one of location_postcode and
293             # the word table, i.e. are missing from the other.
294             cur.execute("""SELECT * FROM
295                             (SELECT pc, word FROM
296                               (SELECT distinct(postcode) as pc FROM location_postcode) p
297                               FULL JOIN
298                               (SELECT word FROM word WHERE type = 'P') w
299                               ON pc = word) x
300                            WHERE pc is null or word is null""")
301
302             with CopyBuffer() as copystr:
303                 for postcode, word in cur:
304                     if postcode is None:
305                         to_delete.append(word)
306                     else:
307                         copystr.add(self._search_normalized(postcode),
308                                     'P', postcode)
309
310                 if to_delete:
311                     cur.execute("""DELETE FROM WORD
312                                    WHERE type ='P' and word = any(%s)
313                                 """, (to_delete, ))
314
315                 copystr.copy_out(cur, 'word',
316                                  columns=['word_token', 'type', 'word'])
317
318
319     def update_special_phrases(self, phrases, should_replace):
320         """ Replace the search index for special phrases with the new phrases.
321             If `should_replace` is True, then the previous set of phrases will be
322             completely replaced. Otherwise the phrases are added to the
323             already existing ones.
324         """
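        # Each phrase is expected as a (label, class, type, operator) tuple,
        # see the p[0]..p[3] accesses below.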
325         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
326                             for p in phrases))
327
328         with self.conn.cursor() as cur:
329             # Get the old phrases.
330             existing_phrases = set()
331             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
332             for word, info in cur:
333                 existing_phrases.add((word, info['class'], info['type'],
334                                       info.get('op') or '-'))
335
336             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
337             if should_replace:
338                 deleted = self._remove_special_phrases(cur, norm_phrases,
339                                                        existing_phrases)
340             else:
341                 deleted = 0
342
343         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
344                  len(norm_phrases), added, deleted)
345
346
347     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
348         """ Add all phrases to the database that are not yet there.
349         """
350         to_add = new_phrases - existing_phrases
351
352         added = 0
353         with CopyBuffer() as copystr:
354             for word, cls, typ, oper in to_add:
355                 term = self._search_normalized(word)
356                 if term:
357                     copystr.add(term, 'S', word,
358                                 json.dumps({'class': cls, 'type': typ,
359                                             'op': oper if oper in ('in', 'near') else None}))
360                     added += 1
361
362             copystr.copy_out(cursor, 'word',
363                              columns=['word_token', 'type', 'word', 'info'])
364
365         return added
366
367
368     @staticmethod
369     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
370         """ Remove all phrases from the databse that are no longer in the
371             new phrase list.
372         """
373         to_delete = existing_phrases - new_phrases
374
375         if to_delete:
376             cursor.execute_values(
377                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
378                     WHERE type = 'S' and word = name
379                           and info->>'class' = in_class and info->>'type' = in_type
380                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
381                 """, to_delete)
382
383         return len(to_delete)
384
385
386     def add_country_names(self, country_code, names):
387         """ Add default names for the given country to the search index.
388         """
389         # Make sure any name preprocessing for country names applies.
390         info = PlaceInfo({'name': names, 'country_code': country_code,
391                           'rank_address': 4, 'class': 'boundary',
392                           'type': 'administrative'})
393         self._add_country_full_names(country_code,
394                                      self.sanitizer.process_names(info)[0],
395                                      internal=True)
396
397
398     def _add_country_full_names(self, country_code, names, internal=False):
399         """ Add names for the given country from an already sanitized
400             name list.
401         """
402         word_tokens = set()
403         for name in names:
404             norm_name = self._search_normalized(name.name)
405             if norm_name:
406                 word_tokens.add(norm_name)
407
408         with self.conn.cursor() as cur:
409             # Get existing names
410             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
411                              FROM word
412                              WHERE type = 'C' and word = %s""",
413                         (country_code, ))
414             existing_tokens = {True: set(), False: set()} # internal/external names
415             for word in cur:
416                 existing_tokens[word[1]].add(word[0])
417
418             # Delete names that no longer exist.
419             gone_tokens = existing_tokens[internal] - word_tokens
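            # For internal names also drop plain external entries with the same
            # token; they are re-inserted below with the internal flag set.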
420             if internal:
421                 gone_tokens.update(existing_tokens[False] & word_tokens)
422             if gone_tokens:
423                 cur.execute("""DELETE FROM word
424                                USING unnest(%s) as token
425                                WHERE type = 'C' and word = %s
426                                      and word_token = token""",
427                             (list(gone_tokens), country_code))
428
429             # Only add those names that are not yet in the list.
430             new_tokens = word_tokens - existing_tokens[True]
431             if not internal:
432                 new_tokens -= existing_tokens[False]
433             if new_tokens:
434                 if internal:
435                     sql = """INSERT INTO word (word_token, type, word, info)
436                                (SELECT token, 'C', %s, '{"internal": "yes"}'
437                                   FROM unnest(%s) as token)
438                            """
439                 else:
440                     sql = """INSERT INTO word (word_token, type, word)
441                                    (SELECT token, 'C', %s
442                                     FROM unnest(%s) as token)
443                           """
444                 cur.execute(sql, (country_code, list(new_tokens)))
445
446
447     def process_place(self, place):
448         """ Determine tokenizer information about the given place.
449
450             Returns a JSON-serializable structure that will be handed into
451             the database via the token_info field.
452         """
453         token_info = _TokenInfo()
454
455         names, address = self.sanitizer.process_names(place)
456
457         if names:
458             token_info.set_names(*self._compute_name_tokens(names))
459
460             if place.is_country():
461                 self._add_country_full_names(place.country_code, names)
462
463         if address:
464             self._process_place_address(token_info, address)
465
466         return token_info.to_dict()
467
468
469     def _process_place_address(self, token_info, address):
470         for item in address:
471             if item.kind == 'postcode':
472                 self._add_postcode(item.name)
473             elif item.kind == 'housenumber':
474                 token_info.add_housenumber(*self._compute_housenumber_token(item))
475             elif item.kind == 'street':
476                 token_info.add_street(self._retrieve_full_tokens(item.name))
477             elif item.kind == 'place':
478                 if not item.suffix:
479                     token_info.add_place(self._compute_partial_tokens(item.name))
480             elif not item.kind.startswith('_') and not item.suffix and \
481                  item.kind not in ('country', 'full'):
482                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
483
484
485     def _compute_housenumber_token(self, hnr):
486         """ Normalize the housenumber and return the word token and the
487             canonical form.
488         """
489         analyzer = self.token_analysis.analysis.get('@housenumber')
490         result = None, None
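        # result is a (word_id, canonical housenumber) pair; it stays
        # (None, None) when the housenumber cannot be normalised.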
491
492         if analyzer is None:
493             # When no custom analyzer is set, simply normalize and transliterate
494             norm_name = self._search_normalized(hnr.name)
495             if norm_name:
496                 result = self._cache.housenumbers.get(norm_name, result)
497                 if result[0] is None:
498                     with self.conn.cursor() as cur:
499                         cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
500                         result = cur.fetchone()[0], norm_name
501                         self._cache.housenumbers[norm_name] = result
502         else:
503             # Otherwise use the analyzer to determine the canonical name.
504             # Per convention we use the first variant as the 'lookup name', the
505             # name that gets saved in the housenumber field of the place.
506             norm_name = analyzer.normalize(hnr.name)
507             if norm_name:
508                 result = self._cache.housenumbers.get(norm_name, result)
509                 if result[0] is None:
510                     variants = analyzer.get_variants_ascii(norm_name)
511                     if variants:
512                         with self.conn.cursor() as cur:
513                             cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
514                                         (norm_name, list(variants)))
515                             result = cur.fetchone()[0], variants[0]
516                             self._cache.housenumbers[norm_name] = result
517
518         return result
519
520
521     def _compute_partial_tokens(self, name):
522         """ Normalize the given term, split it into partial words and return
523             the token list for them.
524         """
525         norm_name = self._search_normalized(name)
526
527         tokens = []
528         need_lookup = []
529         for partial in norm_name.split():
530             token = self._cache.partials.get(partial)
531             if token:
532                 tokens.append(token)
533             else:
534                 need_lookup.append(partial)
535
536         if need_lookup:
537             with self.conn.cursor() as cur:
538                 cur.execute("""SELECT word, getorcreate_partial_word(word)
539                                FROM unnest(%s) word""",
540                             (need_lookup, ))
541
542                 for partial, token in cur:
543                     tokens.append(token)
544                     self._cache.partials[partial] = token
545
546         return tokens
547
548
549     def _retrieve_full_tokens(self, name):
550         """ Get the full name token for the given name, if it exists.
551             The name is only retrieved for the standard analyzer.
552         """
553         norm_name = self._search_normalized(name)
554
555         # return cached if possible
556         if norm_name in self._cache.fulls:
557             return self._cache.fulls[norm_name]
558
559         with self.conn.cursor() as cur:
560             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
561                         (norm_name, ))
562             full = [row[0] for row in cur]
563
564         self._cache.fulls[norm_name] = full
565
566         return full
567
568
569     def _compute_name_tokens(self, names):
570         """ Computes the full name and partial name tokens for the given
571             dictionary of names.
572         """
573         full_tokens = set()
574         partial_tokens = set()
575
576         for name in names:
577             analyzer_id = name.get_attr('analyzer')
578             analyzer = self.token_analysis.get_analyzer(analyzer_id)
579             norm_name = analyzer.normalize(name.name)
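            # Names handled by a non-default analyzer get the analyzer id
            # appended so that cache entries and word table rows stay separate.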
580             if analyzer_id is None:
581                 token_id = norm_name
582             else:
583                 token_id = f'{norm_name}@{analyzer_id}'
584
585             full, part = self._cache.names.get(token_id, (None, None))
586             if full is None:
587                 variants = analyzer.get_variants_ascii(norm_name)
588                 if not variants:
589                     continue
590
591                 with self.conn.cursor() as cur:
592                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
593                                 (token_id, variants))
594                     full, part = cur.fetchone()
595
596                 self._cache.names[token_id] = (full, part)
597
598             full_tokens.add(full)
599             partial_tokens.update(part)
600
601         return full_tokens, partial_tokens
602
603
604     def _add_postcode(self, postcode):
605         """ Make sure the normalized postcode is present in the word table.
606         """
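        # Postcodes containing ',', ';' or ':' are most likely lists of values
        # or malformed and are skipped.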
607         if re.search(r'[:,;]', postcode) is None:
608             postcode = self.normalize_postcode(postcode)
609
610             if postcode not in self._cache.postcodes:
611                 term = self._search_normalized(postcode)
612                 if not term:
613                     return
614
615                 with self.conn.cursor() as cur:
616                     # no word_id needed for postcodes
617                     cur.execute("""INSERT INTO word (word_token, type, word)
618                                    (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
619                                     WHERE NOT EXISTS
620                                      (SELECT * FROM word
621                                       WHERE type = 'P' and word = pc))
622                                 """, (term, postcode))
623                 self._cache.postcodes.add(postcode)
624
625
626 class _TokenInfo:
627     """ Collect token information to be sent back to the database.
628     """
629     def __init__(self):
630         self.names = None
631         self.housenumbers = set()
632         self.housenumber_tokens = set()
633         self.street_tokens = set()
634         self.place_tokens = set()
635         self.address_tokens = {}
636
637
638     @staticmethod
639     def _mk_array(tokens):
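        # Illustrative: _mk_array([23, 42]) -> '{23,42}', a PostgreSQL array
        # literal suitable for the token_info import.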
640         return f"{{{','.join((str(s) for s in tokens))}}}"
641
642
643     def to_dict(self):
644         """ Return the token information in database importable format.
645         """
646         out = {}
647
648         if self.names:
649             out['names'] = self.names
650
651         if self.housenumbers:
652             out['hnr'] = ';'.join(self.housenumbers)
653             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
654
655         if self.street_tokens:
656             out['street'] = self._mk_array(self.street_tokens)
657
658         if self.place_tokens:
659             out['place'] = self._mk_array(self.place_tokens)
660
661         if self.address_tokens:
662             out['addr'] = self.address_tokens
663
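        # Illustrative result: {'names': '{1,2,3}', 'hnr': '12;12a',
        #                       'hnr_tokens': '{456}', 'street': '{78}',
        #                       'place': '{90}', 'addr': {'city': '{11}'}}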
664         return out
665
666
667     def set_names(self, fulls, partials):
668         """ Adds token information for the normalised names.
669         """
670         self.names = self._mk_array(itertools.chain(fulls, partials))
671
672
673     def add_housenumber(self, token, hnr):
674         """ Extract housenumber information from a list of normalised
675             housenumbers.
676         """
677         if token:
678             self.housenumbers.add(hnr)
679             self.housenumber_tokens.add(token)
680
681
682     def add_street(self, tokens):
683         """ Add addr:street match terms.
684         """
685         self.street_tokens.update(tokens)
686
687
688     def add_place(self, tokens):
689         """ Add addr:place search and match terms.
690         """
691         self.place_tokens.update(tokens)
692
693
694     def add_address_term(self, key, partials):
695         """ Add additional address terms.
696         """
697         if partials:
698             self.address_tokens[key] = self._mk_array(partials)
699
700
701 class _TokenCache:
702     """ Cache for token information to avoid repeated database queries.
703
704         This cache is not thread-safe and needs to be instantiated per
705         analyzer.
706     """
707     def __init__(self):
708         self.names = {}
709         self.partials = {}
710         self.fulls = {}
711         self.postcodes = set()
712         self.housenumbers = {}