src/nominatim_db/tokenizer/icu_tokenizer.py

   1 # SPDX-License-Identifier: GPL-3.0-or-later
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2025 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4 but using
   9 libICU instead of the PostgreSQL module.
  10 """
  11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
  12                    Dict, Set, Iterable
  13 import itertools
  14 import logging
  15
  16 from psycopg.types.json import Jsonb
  17 from psycopg import sql as pysql
  18
  19 from ..db.connection import connect, Connection, Cursor, \
  20                             drop_tables, table_exists, execute_scalar
  21 from ..config import Configuration
  22 from ..db.sql_preprocessor import SQLPreprocessor
  23 from ..data.place_info import PlaceInfo
  24 from ..data.place_name import PlaceName
  25 from .icu_rule_loader import ICURuleLoader
  26 from .place_sanitizer import PlaceSanitizer
  27 from .icu_token_analysis import ICUTokenAnalysis
  28 from .base import AbstractAnalyzer, AbstractTokenizer
  29
# NOTE(review): not referenced in the visible part of this module;
# presumably the name of a database property - TODO confirm.
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# getLogger() is called without a name and therefore returns the root logger.
LOG = logging.getLogger()

# Pairs of (index name suffix, value of the 'type' column of the word table).
# For each pair a partial index named idx_<table>_<suffix> is created over
# the 'word' column of the rows with that type.
WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))
  38
  39
  40 def create(dsn: str) -> 'ICUTokenizer':
  41     """ Create a new instance of the tokenizer provided by this module.
  42     """
  43     return ICUTokenizer(dsn)
  44
  45
  46 class ICUTokenizer(AbstractTokenizer):
  47     """ This tokenizer uses libICU to convert names and queries to ASCII.
  48         Otherwise it uses the same algorithms and data structures as the
  49         normalization routines in Nominatim 3.
  50     """
  51
  52     def __init__(self, dsn: str) -> None:
  53         self.dsn = dsn
  54         self.loader: Optional[ICURuleLoader] = None
  55
  56     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
  57         """ Set up a new tokenizer for the database.
  58
  59             This copies all necessary data in the project directory to make
  60             sure the tokenizer remains stable even over updates.
  61         """
  62         self.loader = ICURuleLoader(config)
  63
  64         self._save_config()
  65
  66         if init_db:
  67             self.update_sql_functions(config)
  68             self._setup_db_tables(config)
  69             self._create_base_indices(config, 'word')
  70
  71     def init_from_project(self, config: Configuration) -> None:
  72         """ Initialise the tokenizer from the project directory.
  73         """
  74         self.loader = ICURuleLoader(config)
  75
  76         with connect(self.dsn) as conn:
  77             self.loader.load_config_from_db(conn)
  78
  79     def finalize_import(self, config: Configuration) -> None:
  80         """ Do any required postprocessing to make the tokenizer data ready
  81             for use.
  82         """
  83         self._create_lookup_indices(config, 'word')
  84
  85     def update_sql_functions(self, config: Configuration) -> None:
  86         """ Reimport the SQL functions for this tokenizer.
  87         """
  88         with connect(self.dsn) as conn:
  89             sqlp = SQLPreprocessor(conn, config)
  90             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  91
  92     def check_database(self, config: Configuration) -> None:
  93         """ Check that the tokenizer is set up correctly.
  94         """
  95         # Will throw an error if there is an issue.
  96         self.init_from_project(config)
  97
    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.

            The counts are collected from the `search_name` table and written
            into the `count` and `addr_count` fields of the `info` column.
            This happens on a copy of the word table (`tmp_word`) which
            replaces the `word` table at the end. Nothing is done when the
            `search_name` table does not exist.

            When `threads` is larger than 1, `max_parallel_workers_per_gather`
            is set to `threads` (capped at 6) while the frequencies are
            computed.
        """
        with connect(self.dsn) as conn:
            # Without search_name there is nothing to count.
            if not table_exists(conn, 'search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                if threads > 1:
                    cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
                                     .format(pysql.Literal(min(threads, 6),)))

                LOG.info('Computing word frequencies')
                drop_tables(conn, 'word_frequencies')
                # Count how often each id appears in name_vector and in
                # nameaddress_vector. Counts of 1 or less (and missing counts)
                # are left out of the resulting info object.
                cur.execute("""
                  CREATE TEMP TABLE word_frequencies AS
                  WITH word_freq AS MATERIALIZED (
                           SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id),
                       addr_freq AS MATERIALIZED (
                           SELECT unnest(nameaddress_vector) as id, count(*)
                                 FROM search_name GROUP BY id)
                  SELECT coalesce(a.id, w.id) as id,
                         (CASE WHEN w.count is null or w.count <= 1 THEN '{}'::JSONB
                              ELSE jsonb_build_object('count', w.count) END
                          ||
                          CASE WHEN a.count is null or a.count <= 1 THEN '{}'::JSONB
                              ELSE jsonb_build_object('addr_count', a.count) END) as info
                  FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                  """)
                cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                cur.execute('ANALYSE word_frequencies')
                LOG.info('Update word table with recomputed frequencies')
                drop_tables(conn, 'tmp_word')
                # Copy the word table. Old 'count'/'addr_count' entries are
                # removed from info and the newly computed ones merged in.
                cur.execute("""CREATE TABLE tmp_word AS
                                SELECT word_id, word_token, type, word,
                                       coalesce(word.info, '{}'::jsonb)
                                       - 'count' - 'addr_count' ||
                                       coalesce(wf.info, '{}'::jsonb)
                                       as info
                                FROM word LEFT JOIN word_frequencies wf
                                     ON word.word_id = wf.id
                            """)
                drop_tables(conn, 'word_frequencies')

            # Switch parallel workers per gather to 0 once the frequencies
            # are computed. This is done regardless of the value of `threads`.
            with conn.cursor() as cur:
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()
        # Build all indexes on the copy first, then swap it in for the
        # word table.
        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')
 154
 155     def _cleanup_housenumbers(self) -> None:
 156         """ Remove unused house numbers.
 157         """
 158         with connect(self.dsn) as conn:
 159             if not table_exists(conn, 'search_name'):
 160                 return
 161             with conn.cursor(name="hnr_counter") as cur:
 162                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
 163                                FROM word
 164                                WHERE type = 'H'
 165                                  AND NOT EXISTS(SELECT * FROM search_name
 166                                                 WHERE ARRAY[word.word_id] && name_vector)
 167                                  AND (char_length(coalesce(word, word_token)) > 6
 168                                       OR coalesce(word, word_token) not similar to '\\d+')
 169                             """)
 170                 candidates = {token: wid for wid, token in cur}
 171             with conn.cursor(name="hnr_counter") as cur:
 172                 cur.execute("""SELECT housenumber FROM placex
 173                                WHERE housenumber is not null
 174                                      AND (char_length(housenumber) > 6
 175                                           OR housenumber not similar to '\\d+')
 176                             """)
 177                 for row in cur:
 178                     for hnr in row[0].split(';'):
 179                         candidates.pop(hnr, None)
 180             LOG.info("There are %s outdated housenumbers.", len(candidates))
 181             LOG.debug("Outdated housenumbers: %s", candidates.keys())
 182             if candidates:
 183                 with conn.cursor() as cur:
 184                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
 185                                 (list(candidates.values()), ))
 186                 conn.commit()
 187
 188     def update_word_tokens(self) -> None:
 189         """ Remove unused tokens.
 190         """
 191         LOG.warning("Cleaning up housenumber tokens.")
 192         self._cleanup_housenumbers()
 193         LOG.warning("Tokenizer house-keeping done.")
 194
 195     def name_analyzer(self) -> 'ICUNameAnalyzer':
 196         """ Create a new analyzer for tokenizing names and queries
 197             using this tokinzer. Analyzers are context managers and should
 198             be used accordingly:
 199
 200             ```
 201             with tokenizer.name_analyzer() as analyzer:
 202                 analyser.tokenize()
 203             ```
 204
 205             When used outside the with construct, the caller must ensure to
 206             call the close() function before destructing the analyzer.
 207
 208             Analyzers are not thread-safe. You need to instantiate one per thread.
 209         """
 210         assert self.loader is not None
 211         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 212                                self.loader.make_token_analysis())
 213
 214     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
 215         """ Return a list of the `num` most frequent full words
 216             in the database.
 217         """
 218         with conn.cursor() as cur:
 219             cur.execute("""SELECT word, sum((info->>'count')::int) as count
 220                              FROM word WHERE type = 'W'
 221                              GROUP BY word
 222                              ORDER BY count DESC LIMIT %s""", (num,))
 223             return list(s[0].split('@')[0] for s in cur)
 224
 225     def _save_config(self) -> None:
 226         """ Save the configuration that needs to remain stable for the given
 227             database as database properties.
 228         """
 229         assert self.loader is not None
 230         with connect(self.dsn) as conn:
 231             self.loader.save_config_to_db(conn)
 232
 233     def _setup_db_tables(self, config: Configuration) -> None:
 234         """ Set up the word table and fill it with pre-computed word
 235             frequencies.
 236         """
 237         with connect(self.dsn) as conn:
 238             drop_tables(conn, 'word')
 239             sqlp = SQLPreprocessor(conn, config)
 240             sqlp.run_string(conn, """
 241                 CREATE TABLE word (
 242                       word_id INTEGER,
 243                       word_token text NOT NULL,
 244                       type text NOT NULL,
 245                       word text,
 246                       info jsonb
 247                     ) {{db.tablespace.search_data}};
 248                 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
 249
 250                 DROP SEQUENCE IF EXISTS seq_word;
 251                 CREATE SEQUENCE seq_word start 1;
 252                 GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
 253             """)
 254             conn.commit()
 255
 256     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
 257         """ Set up the word table and fill it with pre-computed word
 258             frequencies.
 259         """
 260         with connect(self.dsn) as conn:
 261             sqlp = SQLPreprocessor(conn, config)
 262             sqlp.run_string(conn,
 263                             """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
 264                                USING BTREE (word_token) {{db.tablespace.search_index}}""",
 265                             table_name=table_name)
 266             for name, ctype in WORD_TYPES:
 267                 sqlp.run_string(conn,
 268                                 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
 269                                    USING BTREE (word) {{db.tablespace.address_index}}
 270                                    WHERE type = '{{column_type}}'
 271                                 """,
 272                                 table_name=table_name, idx_name=name,
 273                                 column_type=ctype)
 274             conn.commit()
 275
 276     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
 277         """ Create additional indexes used when running the API.
 278         """
 279         with connect(self.dsn) as conn:
 280             sqlp = SQLPreprocessor(conn, config)
 281             # Index required for details lookup.
 282             sqlp.run_string(
 283                 conn,
 284                 """
 285                 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
 286                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
 287                 """,
 288                 table_name=table_name)
 289             conn.commit()
 290
 291     def _move_temporary_word_table(self, old: str) -> None:
 292         """ Rename all tables and indexes used by the tokenizer.
 293         """
 294         with connect(self.dsn) as conn:
 295             drop_tables(conn, 'word')
 296             with conn.cursor() as cur:
 297                 cur.execute(f"ALTER TABLE {old} RENAME TO word")
 298                 for idx in ('word_token', 'word_id'):
 299                     cur.execute(f"""ALTER INDEX idx_{old}_{idx}
 300                                       RENAME TO idx_word_{idx}""")
 301                 for name, _ in WORD_TYPES:
 302                     cur.execute(f"""ALTER INDEX idx_{old}_{name}
 303                                     RENAME TO idx_word_{name}""")
 304             conn.commit()
 305
 306
 307 class ICUNameAnalyzer(AbstractAnalyzer):
 308     """ The ICU analyzer uses the ICU library for splitting names.
 309
 310         Each instance opens a connection to the database to request the
 311         normalization.
 312     """
 313
 314     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
 315                  token_analysis: ICUTokenAnalysis) -> None:
 316         self.conn: Optional[Connection] = connect(dsn)
 317         self.conn.autocommit = True
 318         self.sanitizer = sanitizer
 319         self.token_analysis = token_analysis
 320
 321         self._cache = _TokenCache()
 322
 323     def close(self) -> None:
 324         """ Free all resources used by the analyzer.
 325         """
 326         if self.conn:
 327             self.conn.close()
 328             self.conn = None
 329
 330     def _search_normalized(self, name: str) -> str:
 331         """ Return the search token transliteration of the given name.
 332         """
 333         return cast(str, self.token_analysis.search.transliterate(name)).strip()
 334
 335     def _normalized(self, name: str) -> str:
 336         """ Return the normalized version of the given name with all
 337             non-relevant information removed.
 338         """
 339         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
 340
 341     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, Optional[int]]]:
 342         """ Return token information for the given list of words.
 343             If a word starts with # it is assumed to be a full name
 344             otherwise is a partial name.
 345
 346             The function returns a list of tuples with
 347             (original word, word token, word id).
 348
 349             The function is used for testing and debugging only
 350             and not necessarily efficient.
 351         """
 352         assert self.conn is not None
 353         full_tokens = {}
 354         partial_tokens = {}
 355         for word in words:
 356             if word.startswith('#'):
 357                 full_tokens[word] = self._search_normalized(word[1:])
 358             else:
 359                 partial_tokens[word] = self._search_normalized(word)
 360
 361         with self.conn.cursor() as cur:
 362             cur.execute("""SELECT word_token, word_id
 363                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 364                         """, (list(full_tokens.values()),))
 365             full_ids = {r[0]: cast(int, r[1]) for r in cur}
 366             cur.execute("""SELECT word_token, word_id
 367                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 368                         (list(partial_tokens.values()),))
 369             part_ids = {r[0]: cast(int, r[1]) for r in cur}
 370
 371         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 372             + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 373
 374     def normalize_postcode(self, postcode: str) -> str:
 375         """ Convert the postcode to a standardized form.
 376
 377             This function must yield exactly the same result as the SQL function
 378             'token_normalized_postcode()'.
 379         """
 380         return postcode.strip().upper()
 381
 382     def update_postcodes_from_db(self) -> None:
 383         """ Postcode update.
 384
 385             Removes all postcodes from the word table because they are not
 386             needed. Postcodes are recognised by pattern.
 387         """
 388         assert self.conn is not None
 389
 390         with self.conn.cursor() as cur:
 391             cur.execute("DELETE FROM word WHERE type = 'P'")
 392
 393     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
 394                                should_replace: bool) -> None:
 395         """ Replace the search index for special phrases with the new phrases.
 396             If `should_replace` is True, then the previous set of will be
 397             completely replaced. Otherwise the phrases are added to the
 398             already existing ones.
 399         """
 400         assert self.conn is not None
 401         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 402                             for p in phrases))
 403
 404         with self.conn.cursor() as cur:
 405             # Get the old phrases.
 406             existing_phrases = set()
 407             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 408             for word, info in cur:
 409                 existing_phrases.add((word, info['class'], info['type'],
 410                                       info.get('op') or '-'))
 411
 412             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 413             if should_replace:
 414                 deleted = self._remove_special_phrases(cur, norm_phrases,
 415                                                        existing_phrases)
 416             else:
 417                 deleted = 0
 418
 419         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 420                  len(norm_phrases), added, deleted)
 421
 422     def _add_special_phrases(self, cursor: Cursor,
 423                              new_phrases: Set[Tuple[str, str, str, str]],
 424                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
 425         """ Add all phrases to the database that are not yet there.
 426         """
 427         to_add = new_phrases - existing_phrases
 428
 429         added = 0
 430         with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
 431             for word, cls, typ, oper in to_add:
 432                 term = self._search_normalized(word)
 433                 if term:
 434                     copy.write_row((term, 'S', word,
 435                                     Jsonb({'class': cls, 'type': typ,
 436                                            'op': oper if oper in ('in', 'near') else None})))
 437                     added += 1
 438
 439         return added
 440
 441     def _remove_special_phrases(self, cursor: Cursor,
 442                                 new_phrases: Set[Tuple[str, str, str, str]],
 443                                 existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
 444         """ Remove all phrases from the database that are no longer in the
 445             new phrase list.
 446         """
 447         to_delete = existing_phrases - new_phrases
 448
 449         if to_delete:
 450             cursor.executemany(
 451                 """ DELETE FROM word
 452                       WHERE type = 'S' and word = %s
 453                             and info->>'class' = %s and info->>'type' = %s
 454                             and %s = coalesce(info->>'op', '-')
 455                 """, to_delete)
 456
 457         return len(to_delete)
 458
 459     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
 460         """ Add default names for the given country to the search index.
 461         """
 462         # Make sure any name preprocessing for country names applies.
 463         info = PlaceInfo({'name': names, 'country_code': country_code,
 464                           'rank_address': 4, 'class': 'boundary',
 465                           'type': 'administrative'})
 466         self._add_country_full_names(country_code,
 467                                      self.sanitizer.process_names(info)[0],
 468                                      internal=True)
 469
    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.

            The country entries (type 'C') of the word table are synchronised
            with `names`. Entries are distinguished by the presence of the
            'internal' key in their info field. The `internal` parameter
            states which kind of names is being handled: only entries of
            that kind are deleted when they are no longer in `names`.
            With `internal` set, entries without the 'internal' key that
            match one of the names are deleted as well and inserted again
            with the key set.
        """
        assert self.conn is not None
        # Set of (search token, normalized name), skipping names where
        # either of the two comes out empty.
        word_tokens = set()
        for name in names:
            norm_name = self._normalized(name.name)
            token_name = self._search_normalized(name.name)
            if norm_name and token_name:
                word_tokens.add((token_name, norm_name))

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("""SELECT word_token,
                                  word as lookup,
                                  coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and info->>'cc' = %s""",
                        (country_code, ))
            # internal/external names
            # (keyed by the is_internal flag of the row)
            existing_tokens: Dict[bool, Set[Tuple[str, str]]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[2]].add((word[0], word[1]))

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                # Entries without the 'internal' key that match a name are
                # removed here; they are part of new_tokens below.
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING jsonb_array_elements(%s) as data
                               WHERE type = 'C' and info->>'cc' = %s
                                     and word_token = data->>0 and word = data->>1""",
                            (Jsonb(list(gone_tokens)), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                # The two statements differ only in the 'internal' key of
                # the info object.
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT data->>0, 'C', data->>1,
                                       jsonb_build_object('internal', 'yes', 'cc', %s::text)
                                  FROM jsonb_array_elements(%s) as data)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word, info)
                                   (SELECT data->>0, 'C', data->>1,
                                           jsonb_build_object('cc', %s::text)
                                    FROM  jsonb_array_elements(%s) as data)
                          """
                cur.execute(sql, (country_code, Jsonb(list(new_tokens))))
 525
 526     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
 527         """ Determine tokenizer information about the given place.
 528
 529             Returns a JSON-serializable structure that will be handed into
 530             the database via the token_info field.
 531         """
 532         token_info = _TokenInfo()
 533
 534         names, address = self.sanitizer.process_names(place)
 535
 536         if names:
 537             token_info.set_names(*self._compute_name_tokens(names))
 538
 539             if place.is_country():
 540                 assert place.country_code is not None
 541                 self._add_country_full_names(place.country_code, names)
 542
 543         if address:
 544             self._process_place_address(token_info, address)
 545
 546         return token_info.to_dict()
 547
 548     def _process_place_address(self, token_info: '_TokenInfo',
 549                                address: Sequence[PlaceName]) -> None:
 550         for item in address:
 551             if item.kind == 'postcode':
 552                 token_info.set_postcode(self._add_postcode(item))
 553             elif item.kind == 'housenumber':
 554                 token_info.add_housenumber(*self._compute_housenumber_token(item))
 555             elif item.kind == 'street':
 556                 token_info.add_street(self._retrieve_full_tokens(item.name))
 557             elif item.kind == 'place':
 558                 if not item.suffix:
 559                     token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
 560             elif (not item.kind.startswith('_') and not item.suffix and
 561                   item.kind not in ('country', 'full', 'inclusion')):
 562                 token_info.add_address_term(item.kind,
 563                                             itertools.chain(*self._compute_name_tokens([item])))
 564
 565     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
 566         """ Normalize the housenumber and return the word token and the
 567             canonical form.
 568         """
 569         assert self.conn is not None
 570         analyzer = self.token_analysis.analysis.get('@housenumber')
 571         result: Tuple[Optional[int], Optional[str]] = (None, None)
 572
 573         if analyzer is None:
 574             # When no custom analyzer is set, simply normalize and transliterate
 575             norm_name = self._search_normalized(hnr.name)
 576             if norm_name:
 577                 result = self._cache.housenumbers.get(norm_name, result)
 578                 if result[0] is None:
 579                     hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))
 580
 581                     result = hid, norm_name
 582                     self._cache.housenumbers[norm_name] = result
 583         else:
 584             # Otherwise use the analyzer to determine the canonical name.
 585             # Per convention we use the first variant as the 'lookup name', the
 586             # name that gets saved in the housenumber field of the place.
 587             word_id = analyzer.get_canonical_id(hnr)
 588             if word_id:
 589                 result = self._cache.housenumbers.get(word_id, result)
 590                 if result[0] is None:
 591                     varout = analyzer.compute_variants(word_id)
 592                     if isinstance(varout, tuple):
 593                         variants = varout[0]
 594                     else:
 595                         variants = varout
 596                     if variants:
 597                         hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
 598                                              (word_id, variants))
 599                         result = hid, variants[0]
 600                         self._cache.housenumbers[word_id] = result
 601
 602         return result
 603
 604     def _retrieve_full_tokens(self, name: str) -> List[int]:
 605         """ Get the full name token for the given name, if it exists.
 606             The name is only retrieved for the standard analyser.
 607         """
 608         assert self.conn is not None
 609         norm_name = self._search_normalized(name)
 610
 611         # return cached if possible
 612         if norm_name in self._cache.fulls:
 613             return self._cache.fulls[norm_name]
 614
 615         with self.conn.cursor() as cur:
 616             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
 617                         (norm_name, ))
 618             full = [row[0] for row in cur]
 619
 620         self._cache.fulls[norm_name] = full
 621
 622         return full
 623
 624     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
 625         """ Computes the full name and partial name tokens for the given
 626             dictionary of names.
 627         """
 628         assert self.conn is not None
 629         full_tokens: Set[int] = set()
 630         partial_tokens: Set[int] = set()
 631
 632         for name in names:
 633             analyzer_id = name.get_attr('analyzer')
 634             analyzer = self.token_analysis.get_analyzer(analyzer_id)
 635             word_id = analyzer.get_canonical_id(name)
 636             if analyzer_id is None:
 637                 token_id = word_id
 638             else:
 639                 token_id = f'{word_id}@{analyzer_id}'
 640
 641             full, part = self._cache.names.get(token_id, (None, None))
 642             if full is None:
 643                 varset = analyzer.compute_variants(word_id)
 644                 if isinstance(varset, tuple):
 645                     variants, lookups = varset
 646                 else:
 647                     variants, lookups = varset, None
 648                 if not variants:
 649                     continue
 650
 651                 with self.conn.cursor() as cur:
 652                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s, %s)",
 653                                 (token_id, variants, lookups))
 654                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
 655
 656                 self._cache.names[token_id] = (full, part)
 657
 658             assert part is not None
 659
 660             full_tokens.add(full)
 661             partial_tokens.update(part)
 662
 663         return full_tokens, partial_tokens
 664
 665     def _add_postcode(self, item: PlaceName) -> Optional[str]:
 666         """ Make sure the normalized postcode is present in the word table.
 667         """
 668         assert self.conn is not None
 669         analyzer = self.token_analysis.analysis.get('@postcode')
 670
 671         if analyzer is None:
 672             return item.name.strip().upper()
 673         else:
 674             return analyzer.get_canonical_id(item)
 675
 676
 677 class _TokenInfo:
 678     """ Collect token information to be sent back to the database.
 679     """
 680     def __init__(self) -> None:
 681         self.names: Optional[str] = None
 682         self.housenumbers: Set[str] = set()
 683         self.housenumber_tokens: Set[int] = set()
 684         self.street_tokens: Optional[Set[int]] = None
 685         self.place_tokens: Set[int] = set()
 686         self.address_tokens: Dict[str, str] = {}
 687         self.postcode: Optional[str] = None
 688
 689     def _mk_array(self, tokens: Iterable[Any]) -> str:
 690         return f"{{{','.join((str(s) for s in tokens))}}}"
 691
 692     def to_dict(self) -> Dict[str, Any]:
 693         """ Return the token information in database importable format.
 694         """
 695         out: Dict[str, Any] = {}
 696
 697         if self.names:
 698             out['names'] = self.names
 699
 700         if self.housenumbers:
 701             out['hnr'] = ';'.join(self.housenumbers)
 702             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 703
 704         if self.street_tokens is not None:
 705             out['street'] = self._mk_array(self.street_tokens)
 706
 707         if self.place_tokens:
 708             out['place'] = self._mk_array(self.place_tokens)
 709
 710         if self.address_tokens:
 711             out['addr'] = self.address_tokens
 712
 713         if self.postcode:
 714             out['postcode'] = self.postcode
 715
 716         return out
 717
 718     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
 719         """ Adds token information for the normalised names.
 720         """
 721         self.names = self._mk_array(itertools.chain(fulls, partials))
 722
 723     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
 724         """ Extract housenumber information from a list of normalised
 725             housenumbers.
 726         """
 727         if token:
 728             assert hnr is not None
 729             self.housenumbers.add(hnr)
 730             self.housenumber_tokens.add(token)
 731
 732     def add_street(self, tokens: Iterable[int]) -> None:
 733         """ Add addr:street match terms.
 734         """
 735         if self.street_tokens is None:
 736             self.street_tokens = set()
 737         self.street_tokens.update(tokens)
 738
 739     def add_place(self, tokens: Iterable[int]) -> None:
 740         """ Add addr:place search and match terms.
 741         """
 742         self.place_tokens.update(tokens)
 743
 744     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
 745         """ Add additional address terms.
 746         """
 747         array = self._mk_array(partials)
 748         if len(array) > 2:
 749             self.address_tokens[key] = array
 750
 751     def set_postcode(self, postcode: Optional[str]) -> None:
 752         """ Set the postcode to the given one.
 753         """
 754         self.postcode = postcode
 755
 756
 757 class _TokenCache:
 758     """ Cache for token information to avoid repeated database queries.
 759
 760         This cache is not thread-safe and needs to be instantiated per
 761         analyzer.
 762     """
 763     def __init__(self) -> None:
 764         self.names: Dict[str, Tuple[int, List[int]]] = {}
 765         self.partials: Dict[str, int] = {}
 766         self.fulls: Dict[str, List[int]] = {}
 767         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}