nominatim/tokenizer/icu_tokenizer.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4 but using
   9 libICU instead of the PostgreSQL module.
  10 """
  11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
  12                    Dict, Set, Iterable
  13 import itertools
  14 import json
  15 import logging
  16 from pathlib import Path
  17 from textwrap import dedent
  18
  19 from nominatim.db.connection import connect, Connection, Cursor
  20 from nominatim.config import Configuration
  21 from nominatim.db.utils import CopyBuffer
  22 from nominatim.db.sql_preprocessor import SQLPreprocessor
  23 from nominatim.data.place_info import PlaceInfo
  24 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  25 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
  26 from nominatim.data.place_name import PlaceName
  27 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
  28 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  29
  30 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  31
  32 LOG = logging.getLogger()
  33
  34 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
  35     """ Create a new instance of the tokenizer provided by this module.
  36     """
  37     return ICUTokenizer(dsn, data_dir)
  38
  39
  40 class ICUTokenizer(AbstractTokenizer):
  41     """ This tokenizer uses libICU to convert names and queries to ASCII.
  42         Otherwise it uses the same algorithms and data structures as the
  43         normalization routines in Nominatim 3.
  44     """
  45
  46     def __init__(self, dsn: str, data_dir: Path) -> None:
  47         self.dsn = dsn
  48         self.data_dir = data_dir
  49         self.loader: Optional[ICURuleLoader] = None
  50
  51
  52     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
  53         """ Set up a new tokenizer for the database.
  54
  55             This copies all necessary data in the project directory to make
  56             sure the tokenizer remains stable even over updates.
  57         """
  58         self.loader = ICURuleLoader(config)
  59
  60         self._install_php(config.lib_dir.php, overwrite=True)
  61         self._save_config()
  62
  63         if init_db:
  64             self.update_sql_functions(config)
  65             self._init_db_tables(config)
  66
  67
  68     def init_from_project(self, config: Configuration) -> None:
  69         """ Initialise the tokenizer from the project directory.
  70         """
  71         self.loader = ICURuleLoader(config)
  72
  73         with connect(self.dsn) as conn:
  74             self.loader.load_config_from_db(conn)
  75
  76         self._install_php(config.lib_dir.php, overwrite=False)
  77
  78
  79     def finalize_import(self, config: Configuration) -> None:
  80         """ Do any required postprocessing to make the tokenizer data ready
  81             for use.
  82         """
  83         with connect(self.dsn) as conn:
  84             sqlp = SQLPreprocessor(conn, config)
  85             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  86
  87
  88     def update_sql_functions(self, config: Configuration) -> None:
  89         """ Reimport the SQL functions for this tokenizer.
  90         """
  91         with connect(self.dsn) as conn:
  92             sqlp = SQLPreprocessor(conn, config)
  93             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  94
  95
    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.

            The check consists of a complete initialisation from the
            project directory via init_from_project(). Nothing is returned;
            problems surface as whatever error that initialisation raises.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)
 101
 102
 103     def update_statistics(self) -> None:
 104         """ Recompute frequencies for all name words.
 105         """
 106         with connect(self.dsn) as conn:
 107             if conn.table_exists('search_name'):
 108                 with conn.cursor() as cur:
 109                     cur.drop_table("word_frequencies")
 110                     LOG.info("Computing word frequencies")
 111                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
 112                                      SELECT unnest(name_vector) as id, count(*)
 113                                      FROM search_name GROUP BY id""")
 114                     cur.execute("CREATE INDEX ON word_frequencies(id)")
 115                     LOG.info("Update word table with recomputed frequencies")
 116                     cur.execute("""UPDATE word
 117                                    SET info = info || jsonb_build_object('count', count)
 118                                    FROM word_frequencies WHERE word_id = id""")
 119                     cur.drop_table("word_frequencies")
 120             conn.commit()
 121
 122
 123     def _cleanup_housenumbers(self) -> None:
 124         """ Remove unused house numbers.
 125         """
 126         with connect(self.dsn) as conn:
 127             if not conn.table_exists('search_name'):
 128                 return
 129             with conn.cursor(name="hnr_counter") as cur:
 130                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
 131                                FROM word
 132                                WHERE type = 'H'
 133                                  AND NOT EXISTS(SELECT * FROM search_name
 134                                                 WHERE ARRAY[word.word_id] && name_vector)
 135                                  AND (char_length(coalesce(word, word_token)) > 6
 136                                       OR coalesce(word, word_token) not similar to '\\d+')
 137                             """)
 138                 candidates = {token: wid for wid, token in cur}
 139             with conn.cursor(name="hnr_counter") as cur:
 140                 cur.execute("""SELECT housenumber FROM placex
 141                                WHERE housenumber is not null
 142                                      AND (char_length(housenumber) > 6
 143                                           OR housenumber not similar to '\\d+')
 144                             """)
 145                 for row in cur:
 146                     for hnr in row[0].split(';'):
 147                         candidates.pop(hnr, None)
 148             LOG.info("There are %s outdated housenumbers.", len(candidates))
 149             LOG.debug("Outdated housenumbers: %s", candidates.keys())
 150             if candidates:
 151                 with conn.cursor() as cur:
 152                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
 153                                 (list(candidates.values()), ))
 154                 conn.commit()
 155
 156
 157
    def update_word_tokens(self) -> None:
        """ Remove unused tokens.

            Currently this only cleans up housenumber tokens
            (see _cleanup_housenumbers()).
        """
        # NOTE(review): progress messages are logged at warning level,
        # presumably so they show with the default log settings; confirm.
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")
 164
 165
 166     def name_analyzer(self) -> 'ICUNameAnalyzer':
 167         """ Create a new analyzer for tokenizing names and queries
 168             using this tokinzer. Analyzers are context managers and should
 169             be used accordingly:
 170
 171             ```
 172             with tokenizer.name_analyzer() as analyzer:
 173                 analyser.tokenize()
 174             ```
 175
 176             When used outside the with construct, the caller must ensure to
 177             call the close() function before destructing the analyzer.
 178
 179             Analyzers are not thread-safe. You need to instantiate one per thread.
 180         """
 181         assert self.loader is not None
 182         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 183                                self.loader.make_token_analysis())
 184
 185
 186     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
 187         """ Return a list of the `num` most frequent full words
 188             in the database.
 189         """
 190         with conn.cursor() as cur:
 191             cur.execute("""SELECT word, sum((info->>'count')::int) as count
 192                              FROM word WHERE type = 'W'
 193                              GROUP BY word
 194                              ORDER BY count DESC LIMIT %s""", (num,))
 195             return list(s[0].split('@')[0] for s in cur)
 196
 197
 198     def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
 199         """ Install the php script for the tokenizer.
 200         """
 201         assert self.loader is not None
 202         php_file = self.data_dir / "tokenizer.php"
 203
 204         if not php_file.exists() or overwrite:
 205             php_file.write_text(dedent(f"""\
 206                 <?php
 207                 @define('CONST_Max_Word_Frequency', 10000000);
 208                 @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
 209                 @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 210                 require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
 211
 212
 213     def _save_config(self) -> None:
 214         """ Save the configuration that needs to remain stable for the given
 215             database as database properties.
 216         """
 217         assert self.loader is not None
 218         with connect(self.dsn) as conn:
 219             self.loader.save_config_to_db(conn)
 220
 221
 222     def _init_db_tables(self, config: Configuration) -> None:
 223         """ Set up the word table and fill it with pre-computed word
 224             frequencies.
 225         """
 226         with connect(self.dsn) as conn:
 227             sqlp = SQLPreprocessor(conn, config)
 228             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
 229             conn.commit()
 230
 231
 232 class ICUNameAnalyzer(AbstractAnalyzer):
 233     """ The ICU analyzer uses the ICU library for splitting names.
 234
 235         Each instance opens a connection to the database to request the
 236         normalization.
 237     """
 238
 239     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
 240                  token_analysis: ICUTokenAnalysis) -> None:
 241         self.conn: Optional[Connection] = connect(dsn).connection
 242         self.conn.autocommit = True
 243         self.sanitizer = sanitizer
 244         self.token_analysis = token_analysis
 245
 246         self._cache = _TokenCache()
 247
 248
 249     def close(self) -> None:
 250         """ Free all resources used by the analyzer.
 251         """
 252         if self.conn:
 253             self.conn.close()
 254             self.conn = None
 255
 256
 257     def _search_normalized(self, name: str) -> str:
 258         """ Return the search token transliteration of the given name.
 259         """
 260         return cast(str, self.token_analysis.search.transliterate(name)).strip()
 261
 262
 263     def _normalized(self, name: str) -> str:
 264         """ Return the normalized version of the given name with all
 265             non-relevant information removed.
 266         """
 267         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
 268
 269
 270     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
 271         """ Return token information for the given list of words.
 272             If a word starts with # it is assumed to be a full name
 273             otherwise is a partial name.
 274
 275             The function returns a list of tuples with
 276             (original word, word token, word id).
 277
 278             The function is used for testing and debugging only
 279             and not necessarily efficient.
 280         """
 281         assert self.conn is not None
 282         full_tokens = {}
 283         partial_tokens = {}
 284         for word in words:
 285             if word.startswith('#'):
 286                 full_tokens[word] = self._search_normalized(word[1:])
 287             else:
 288                 partial_tokens[word] = self._search_normalized(word)
 289
 290         with self.conn.cursor() as cur:
 291             cur.execute("""SELECT word_token, word_id
 292                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 293                         """, (list(full_tokens.values()),))
 294             full_ids = {r[0]: r[1] for r in cur}
 295             cur.execute("""SELECT word_token, word_id
 296                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 297                         (list(partial_tokens.values()),))
 298             part_ids = {r[0]: r[1] for r in cur}
 299
 300         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 301                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 302
 303
 304     def normalize_postcode(self, postcode: str) -> str:
 305         """ Convert the postcode to a standardized form.
 306
 307             This function must yield exactly the same result as the SQL function
 308             'token_normalized_postcode()'.
 309         """
 310         return postcode.strip().upper()
 311
 312
 313     def update_postcodes_from_db(self) -> None:
 314         """ Update postcode tokens in the word table from the location_postcode
 315             table.
 316         """
 317         assert self.conn is not None
 318         analyzer = self.token_analysis.analysis.get('@postcode')
 319
 320         with self.conn.cursor() as cur:
 321             # First get all postcode names currently in the word table.
 322             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
 323             word_entries = set((entry[0] for entry in cur))
 324
 325             # Then compute the required postcode names from the postcode table.
 326             needed_entries = set()
 327             cur.execute("SELECT country_code, postcode FROM location_postcode")
 328             for cc, postcode in cur:
 329                 info = PlaceInfo({'country_code': cc,
 330                                   'class': 'place', 'type': 'postcode',
 331                                   'address': {'postcode': postcode}})
 332                 address = self.sanitizer.process_names(info)[1]
 333                 for place in address:
 334                     if place.kind == 'postcode':
 335                         if analyzer is None:
 336                             postcode_name = place.name.strip().upper()
 337                             variant_base = None
 338                         else:
 339                             postcode_name = analyzer.get_canonical_id(place)
 340                             variant_base = place.get_attr("variant")
 341
 342                         if variant_base:
 343                             needed_entries.add(f'{postcode_name}@{variant_base}')
 344                         else:
 345                             needed_entries.add(postcode_name)
 346                         break
 347
 348         # Now update the word table.
 349         self._delete_unused_postcode_words(word_entries - needed_entries)
 350         self._add_missing_postcode_words(needed_entries - word_entries)
 351
 352     def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
 353         assert self.conn is not None
 354         if tokens:
 355             with self.conn.cursor() as cur:
 356                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
 357                             (list(tokens), ))
 358
 359     def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
 360         assert self.conn is not None
 361         if not tokens:
 362             return
 363
 364         analyzer = self.token_analysis.analysis.get('@postcode')
 365         terms = []
 366
 367         for postcode_name in tokens:
 368             if '@' in postcode_name:
 369                 term, variant = postcode_name.split('@', 2)
 370                 term = self._search_normalized(term)
 371                 if analyzer is None:
 372                     variants = [term]
 373                 else:
 374                     variants = analyzer.compute_variants(variant)
 375                     if term not in variants:
 376                         variants.append(term)
 377             else:
 378                 variants = [self._search_normalized(postcode_name)]
 379             terms.append((postcode_name, variants))
 380
 381         if terms:
 382             with self.conn.cursor() as cur:
 383                 cur.execute_values("""SELECT create_postcode_word(pc, var)
 384                                       FROM (VALUES %s) AS v(pc, var)""",
 385                                    terms)
 386
 387
 388
 389
 390     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
 391                                should_replace: bool) -> None:
 392         """ Replace the search index for special phrases with the new phrases.
 393             If `should_replace` is True, then the previous set of will be
 394             completely replaced. Otherwise the phrases are added to the
 395             already existing ones.
 396         """
 397         assert self.conn is not None
 398         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 399                             for p in phrases))
 400
 401         with self.conn.cursor() as cur:
 402             # Get the old phrases.
 403             existing_phrases = set()
 404             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 405             for word, info in cur:
 406                 existing_phrases.add((word, info['class'], info['type'],
 407                                       info.get('op') or '-'))
 408
 409             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 410             if should_replace:
 411                 deleted = self._remove_special_phrases(cur, norm_phrases,
 412                                                        existing_phrases)
 413             else:
 414                 deleted = 0
 415
 416         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 417                  len(norm_phrases), added, deleted)
 418
 419
 420     def _add_special_phrases(self, cursor: Cursor,
 421                              new_phrases: Set[Tuple[str, str, str, str]],
 422                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
 423         """ Add all phrases to the database that are not yet there.
 424         """
 425         to_add = new_phrases - existing_phrases
 426
 427         added = 0
 428         with CopyBuffer() as copystr:
 429             for word, cls, typ, oper in to_add:
 430                 term = self._search_normalized(word)
 431                 if term:
 432                     copystr.add(term, 'S', word,
 433                                 json.dumps({'class': cls, 'type': typ,
 434                                             'op': oper if oper in ('in', 'near') else None}))
 435                     added += 1
 436
 437             copystr.copy_out(cursor, 'word',
 438                              columns=['word_token', 'type', 'word', 'info'])
 439
 440         return added
 441
 442
 443     def _remove_special_phrases(self, cursor: Cursor,
 444                              new_phrases: Set[Tuple[str, str, str, str]],
 445                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
 446         """ Remove all phrases from the database that are no longer in the
 447             new phrase list.
 448         """
 449         to_delete = existing_phrases - new_phrases
 450
 451         if to_delete:
 452             cursor.execute_values(
 453                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 454                     WHERE type = 'S' and word = name
 455                           and info->>'class' = in_class and info->>'type' = in_type
 456                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 457                 """, to_delete)
 458
 459         return len(to_delete)
 460
 461
 462     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
 463         """ Add default names for the given country to the search index.
 464         """
 465         # Make sure any name preprocessing for country names applies.
 466         info = PlaceInfo({'name': names, 'country_code': country_code,
 467                           'rank_address': 4, 'class': 'boundary',
 468                           'type': 'administrative'})
 469         self._add_country_full_names(country_code,
 470                                      self.sanitizer.process_names(info)[0],
 471                                      internal=True)
 472
 473
 474     def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
 475                                 internal: bool = False) -> None:
 476         """ Add names for the given country from an already sanitized
 477             name list.
 478         """
 479         assert self.conn is not None
 480         word_tokens = set()
 481         for name in names:
 482             norm_name = self._search_normalized(name.name)
 483             if norm_name:
 484                 word_tokens.add(norm_name)
 485
 486         with self.conn.cursor() as cur:
 487             # Get existing names
 488             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
 489                              FROM word
 490                              WHERE type = 'C' and word = %s""",
 491                         (country_code, ))
 492             # internal/external names
 493             existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
 494             for word in cur:
 495                 existing_tokens[word[1]].add(word[0])
 496
 497             # Delete names that no longer exist.
 498             gone_tokens = existing_tokens[internal] - word_tokens
 499             if internal:
 500                 gone_tokens.update(existing_tokens[False] & word_tokens)
 501             if gone_tokens:
 502                 cur.execute("""DELETE FROM word
 503                                USING unnest(%s) as token
 504                                WHERE type = 'C' and word = %s
 505                                      and word_token = token""",
 506                             (list(gone_tokens), country_code))
 507
 508             # Only add those names that are not yet in the list.
 509             new_tokens = word_tokens - existing_tokens[True]
 510             if not internal:
 511                 new_tokens -= existing_tokens[False]
 512             if new_tokens:
 513                 if internal:
 514                     sql = """INSERT INTO word (word_token, type, word, info)
 515                                (SELECT token, 'C', %s, '{"internal": "yes"}'
 516                                   FROM unnest(%s) as token)
 517                            """
 518                 else:
 519                     sql = """INSERT INTO word (word_token, type, word)
 520                                    (SELECT token, 'C', %s
 521                                     FROM unnest(%s) as token)
 522                           """
 523                 cur.execute(sql, (country_code, list(new_tokens)))
 524
 525
 526     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
 527         """ Determine tokenizer information about the given place.
 528
 529             Returns a JSON-serializable structure that will be handed into
 530             the database via the token_info field.
 531         """
 532         token_info = _TokenInfo()
 533
 534         names, address = self.sanitizer.process_names(place)
 535
 536         if names:
 537             token_info.set_names(*self._compute_name_tokens(names))
 538
 539             if place.is_country():
 540                 assert place.country_code is not None
 541                 self._add_country_full_names(place.country_code, names)
 542
 543         if address:
 544             self._process_place_address(token_info, address)
 545
 546         return token_info.to_dict()
 547
 548
 549     def _process_place_address(self, token_info: '_TokenInfo',
 550                                address: Sequence[PlaceName]) -> None:
 551         for item in address:
 552             if item.kind == 'postcode':
 553                 token_info.set_postcode(self._add_postcode(item))
 554             elif item.kind == 'housenumber':
 555                 token_info.add_housenumber(*self._compute_housenumber_token(item))
 556             elif item.kind == 'street':
 557                 token_info.add_street(self._retrieve_full_tokens(item.name))
 558             elif item.kind == 'place':
 559                 if not item.suffix:
 560                     token_info.add_place(self._compute_partial_tokens(item.name))
 561             elif not item.kind.startswith('_') and not item.suffix and \
 562                  item.kind not in ('country', 'full', 'inclusion'):
 563                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
 564
 565
 566     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
 567         """ Normalize the housenumber and return the word token and the
 568             canonical form.
 569         """
 570         assert self.conn is not None
 571         analyzer = self.token_analysis.analysis.get('@housenumber')
 572         result: Tuple[Optional[int], Optional[str]] = (None, None)
 573
 574         if analyzer is None:
 575             # When no custom analyzer is set, simply normalize and transliterate
 576             norm_name = self._search_normalized(hnr.name)
 577             if norm_name:
 578                 result = self._cache.housenumbers.get(norm_name, result)
 579                 if result[0] is None:
 580                     with self.conn.cursor() as cur:
 581                         hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
 582
 583                         result = hid, norm_name
 584                         self._cache.housenumbers[norm_name] = result
 585         else:
 586             # Otherwise use the analyzer to determine the canonical name.
 587             # Per convention we use the first variant as the 'lookup name', the
 588             # name that gets saved in the housenumber field of the place.
 589             word_id = analyzer.get_canonical_id(hnr)
 590             if word_id:
 591                 result = self._cache.housenumbers.get(word_id, result)
 592                 if result[0] is None:
 593                     variants = analyzer.compute_variants(word_id)
 594                     if variants:
 595                         with self.conn.cursor() as cur:
 596                             hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
 597                                              (word_id, list(variants)))
 598                             result = hid, variants[0]
 599                             self._cache.housenumbers[word_id] = result
 600
 601         return result
 602
 603
 604     def _compute_partial_tokens(self, name: str) -> List[int]:
 605         """ Normalize the given term, split it into partial words and return
 606             then token list for them.
 607         """
 608         assert self.conn is not None
 609         norm_name = self._search_normalized(name)
 610
 611         tokens = []
 612         need_lookup = []
 613         for partial in norm_name.split():
 614             token = self._cache.partials.get(partial)
 615             if token:
 616                 tokens.append(token)
 617             else:
 618                 need_lookup.append(partial)
 619
 620         if need_lookup:
 621             with self.conn.cursor() as cur:
 622                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 623                                FROM unnest(%s) word""",
 624                             (need_lookup, ))
 625
 626                 for partial, token in cur:
 627                     assert token is not None
 628                     tokens.append(token)
 629                     self._cache.partials[partial] = token
 630
 631         return tokens
 632
 633
 634     def _retrieve_full_tokens(self, name: str) -> List[int]:
 635         """ Get the full name token for the given name, if it exists.
 636             The name is only retrieved for the standard analyser.
 637         """
 638         assert self.conn is not None
 639         norm_name = self._search_normalized(name)
 640
 641         # return cached if possible
 642         if norm_name in self._cache.fulls:
 643             return self._cache.fulls[norm_name]
 644
 645         with self.conn.cursor() as cur:
 646             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
 647                         (norm_name, ))
 648             full = [row[0] for row in cur]
 649
 650         self._cache.fulls[norm_name] = full
 651
 652         return full
 653
 654
 655     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
 656         """ Computes the full name and partial name tokens for the given
 657             dictionary of names.
 658         """
 659         assert self.conn is not None
 660         full_tokens: Set[int] = set()
 661         partial_tokens: Set[int] = set()
 662
 663         for name in names:
 664             analyzer_id = name.get_attr('analyzer')
 665             analyzer = self.token_analysis.get_analyzer(analyzer_id)
 666             word_id = analyzer.get_canonical_id(name)
 667             if analyzer_id is None:
 668                 token_id = word_id
 669             else:
 670                 token_id = f'{word_id}@{analyzer_id}'
 671
 672             full, part = self._cache.names.get(token_id, (None, None))
 673             if full is None:
 674                 variants = analyzer.compute_variants(word_id)
 675                 if not variants:
 676                     continue
 677
 678                 with self.conn.cursor() as cur:
 679                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
 680                                 (token_id, variants))
 681                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
 682
 683                 self._cache.names[token_id] = (full, part)
 684
 685             assert part is not None
 686
 687             full_tokens.add(full)
 688             partial_tokens.update(part)
 689
 690         return full_tokens, partial_tokens
 691
 692
 693     def _add_postcode(self, item: PlaceName) -> Optional[str]:
 694         """ Make sure the normalized postcode is present in the word table.
 695         """
 696         assert self.conn is not None
 697         analyzer = self.token_analysis.analysis.get('@postcode')
 698
 699         if analyzer is None:
 700             postcode_name = item.name.strip().upper()
 701             variant_base = None
 702         else:
 703             postcode_name = analyzer.get_canonical_id(item)
 704             variant_base = item.get_attr("variant")
 705
 706         if variant_base:
 707             postcode = f'{postcode_name}@{variant_base}'
 708         else:
 709             postcode = postcode_name
 710
 711         if postcode not in self._cache.postcodes:
 712             term = self._search_normalized(postcode_name)
 713             if not term:
 714                 return None
 715
 716             variants = {term}
 717             if analyzer is not None and variant_base:
 718                 variants.update(analyzer.compute_variants(variant_base))
 719
 720             with self.conn.cursor() as cur:
 721                 cur.execute("SELECT create_postcode_word(%s, %s)",
 722                             (postcode, list(variants)))
 723             self._cache.postcodes.add(postcode)
 724
 725         return postcode_name
 726
 727
 728 class _TokenInfo:
 729     """ Collect token information to be sent back to the database.
 730     """
 731     def __init__(self) -> None:
 732         self.names: Optional[str] = None
 733         self.housenumbers: Set[str] = set()
 734         self.housenumber_tokens: Set[int] = set()
 735         self.street_tokens: Optional[Set[int]] = None
 736         self.place_tokens: Set[int] = set()
 737         self.address_tokens: Dict[str, str] = {}
 738         self.postcode: Optional[str] = None
 739
 740
 741     def _mk_array(self, tokens: Iterable[Any]) -> str:
 742         return f"{{{','.join((str(s) for s in tokens))}}}"
 743
 744
 745     def to_dict(self) -> Dict[str, Any]:
 746         """ Return the token information in database importable format.
 747         """
 748         out: Dict[str, Any] = {}
 749
 750         if self.names:
 751             out['names'] = self.names
 752
 753         if self.housenumbers:
 754             out['hnr'] = ';'.join(self.housenumbers)
 755             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 756
 757         if self.street_tokens is not None:
 758             out['street'] = self._mk_array(self.street_tokens)
 759
 760         if self.place_tokens:
 761             out['place'] = self._mk_array(self.place_tokens)
 762
 763         if self.address_tokens:
 764             out['addr'] = self.address_tokens
 765
 766         if self.postcode:
 767             out['postcode'] = self.postcode
 768
 769         return out
 770
 771
 772     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
 773         """ Adds token information for the normalised names.
 774         """
 775         self.names = self._mk_array(itertools.chain(fulls, partials))
 776
 777
 778     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
 779         """ Extract housenumber information from a list of normalised
 780             housenumbers.
 781         """
 782         if token:
 783             assert hnr is not None
 784             self.housenumbers.add(hnr)
 785             self.housenumber_tokens.add(token)
 786
 787
 788     def add_street(self, tokens: Iterable[int]) -> None:
 789         """ Add addr:street match terms.
 790         """
 791         if self.street_tokens is None:
 792             self.street_tokens = set()
 793         self.street_tokens.update(tokens)
 794
 795
 796     def add_place(self, tokens: Iterable[int]) -> None:
 797         """ Add addr:place search and match terms.
 798         """
 799         self.place_tokens.update(tokens)
 800
 801
 802     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
 803         """ Add additional address terms.
 804         """
 805         if partials:
 806             self.address_tokens[key] = self._mk_array(partials)
 807
 808     def set_postcode(self, postcode: Optional[str]) -> None:
 809         """ Set the postcode to the given one.
 810         """
 811         self.postcode = postcode
 812
 813
 814 class _TokenCache:
 815     """ Cache for token information to avoid repeated database queries.
 816
 817         This cache is not thread-safe and needs to be instantiated per
 818         analyzer.
 819     """
 820     def __init__(self) -> None:
 821         self.names: Dict[str, Tuple[int, List[int]]] = {}
 822         self.partials: Dict[str, int] = {}
 823         self.fulls: Dict[str, List[int]] = {}
 824         self.postcodes: Set[str] = set()
 825         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}