nominatim/tokenizer/icu_tokenizer.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4 but using
   9 libICU instead of the PostgreSQL module.
  10 """
  11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
  12                    Dict, Set, Iterable
  13 import itertools
  14 import json
  15 import logging
  16 from pathlib import Path
  17 from textwrap import dedent
  18
  19 from nominatim.db.connection import connect, Connection, Cursor
  20 from nominatim.config import Configuration
  21 from nominatim.db.utils import CopyBuffer
  22 from nominatim.db.sql_preprocessor import SQLPreprocessor
  23 from nominatim.data.place_info import PlaceInfo
  24 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  25 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
  26 from nominatim.data.place_name import PlaceName
  27 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
  28 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  29
# Name of a tokenizer setting. It is not referenced in the visible part of
# this file (presumably the key of a database property - TODO confirm).
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# getLogger() without a name returns the root logger.
LOG = logging.getLogger()

# Pairs of (index name suffix, value of the 'type' column in the word table).
# For each pair a partial index over the 'word' column is created.
WORD_TYPES =(('country_names', 'C'),
             ('postcodes', 'P'),
             ('full_word', 'W'),
             ('housenumbers', 'H'))
  38
  39 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
  40     """ Create a new instance of the tokenizer provided by this module.
  41     """
  42     return ICUTokenizer(dsn, data_dir)
  43
  44
  45 class ICUTokenizer(AbstractTokenizer):
  46     """ This tokenizer uses libICU to convert names and queries to ASCII.
  47         Otherwise it uses the same algorithms and data structures as the
  48         normalization routines in Nominatim 3.
  49     """
  50
  51     def __init__(self, dsn: str, data_dir: Path) -> None:
  52         self.dsn = dsn
  53         self.data_dir = data_dir
  54         self.loader: Optional[ICURuleLoader] = None
  55
  56
  57     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
  58         """ Set up a new tokenizer for the database.
  59
  60             This copies all necessary data in the project directory to make
  61             sure the tokenizer remains stable even over updates.
  62         """
  63         self.loader = ICURuleLoader(config)
  64
  65         self._install_php(config.lib_dir.php, overwrite=True)
  66         self._save_config()
  67
  68         if init_db:
  69             self.update_sql_functions(config)
  70             self._setup_db_tables(config)
  71             self._create_base_indices(config, 'word')
  72
  73
  74     def init_from_project(self, config: Configuration) -> None:
  75         """ Initialise the tokenizer from the project directory.
  76         """
  77         self.loader = ICURuleLoader(config)
  78
  79         with connect(self.dsn) as conn:
  80             self.loader.load_config_from_db(conn)
  81
  82         self._install_php(config.lib_dir.php, overwrite=False)
  83
  84
  85     def finalize_import(self, config: Configuration) -> None:
  86         """ Do any required postprocessing to make the tokenizer data ready
  87             for use.
  88         """
  89         self._create_lookup_indices(config, 'word')
  90
  91
  92     def update_sql_functions(self, config: Configuration) -> None:
  93         """ Reimport the SQL functions for this tokenizer.
  94         """
  95         with connect(self.dsn) as conn:
  96             sqlp = SQLPreprocessor(conn, config)
  97             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  98
  99
 100     def check_database(self, config: Configuration) -> None:
 101         """ Check that the tokenizer is set up correctly.
 102         """
 103         # Will throw an error if there is an issue.
 104         self.init_from_project(config)
 105
 106
    def update_statistics(self, config: Configuration) -> None:
        """ Recompute frequencies for all name words.

            The counts are taken from the name vectors in the search_name
            table. A copy of the word table with updated counts is built
            as 'tmp_word', indexed and finally moved into the place of the
            word table. Nothing is done when search_name does not exist.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return

            with conn.cursor() as cur:
                LOG.info('Computing word frequencies')
                cur.drop_table('word_frequencies')
                # One row per word id with the number of search_name rows
                # that have the id in their name vector.
                cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                 SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id""")
                cur.execute('CREATE INDEX ON word_frequencies(id)')
                LOG.info('Update word table with recomputed frequencies')
                cur.drop_table('tmp_word')
                # Words without a frequency entry keep their info unchanged,
                # for all others the 'count' field is set to the new value.
                cur.execute("""CREATE TABLE tmp_word AS
                                SELECT word_id, word_token, type, word,
                                       (CASE WHEN wf.count is null THEN info
                                          ELSE info || jsonb_build_object('count', wf.count)
                                        END) as info
                                FROM word LEFT JOIN word_frequencies wf
                                  ON word.word_id = wf.id""")
                cur.drop_table('word_frequencies')

            # The new table needs the same read permission as the word table.
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()
        # Index the new table first, then swap it in for the word table.
        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')
 139
 140
 141
    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.

            Candidates are housenumber words (type 'H') that are not
            referenced by any name vector in search_name and that are
            either longer than six characters or do not consist of digits
            only. Candidates that still appear as a housenumber in placex
            are kept, the remaining ones are deleted from the word table.
            Nothing is done when the search_name table does not exist.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                # Maps the lookup name (or the word token, if there is no
                # lookup name) to the word id.
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    # A housenumber field may hold several numbers
                    # separated by semicolons. Each one is still in use.
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()
 174
 175
 176
 177     def update_word_tokens(self) -> None:
 178         """ Remove unused tokens.
 179         """
 180         LOG.warning("Cleaning up housenumber tokens.")
 181         self._cleanup_housenumbers()
 182         LOG.warning("Tokenizer house-keeping done.")
 183
 184
 185     def name_analyzer(self) -> 'ICUNameAnalyzer':
 186         """ Create a new analyzer for tokenizing names and queries
 187             using this tokinzer. Analyzers are context managers and should
 188             be used accordingly:
 189
 190             ```
 191             with tokenizer.name_analyzer() as analyzer:
 192                 analyser.tokenize()
 193             ```
 194
 195             When used outside the with construct, the caller must ensure to
 196             call the close() function before destructing the analyzer.
 197
 198             Analyzers are not thread-safe. You need to instantiate one per thread.
 199         """
 200         assert self.loader is not None
 201         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 202                                self.loader.make_token_analysis())
 203
 204
 205     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
 206         """ Return a list of the `num` most frequent full words
 207             in the database.
 208         """
 209         with conn.cursor() as cur:
 210             cur.execute("""SELECT word, sum((info->>'count')::int) as count
 211                              FROM word WHERE type = 'W'
 212                              GROUP BY word
 213                              ORDER BY count DESC LIMIT %s""", (num,))
 214             return list(s[0].split('@')[0] for s in cur)
 215
 216
 217     def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
 218         """ Install the php script for the tokenizer.
 219         """
 220         if phpdir is not None:
 221             assert self.loader is not None
 222             php_file = self.data_dir / "tokenizer.php"
 223
 224             if not php_file.exists() or overwrite:
 225                 php_file.write_text(dedent(f"""\
 226                     <?php
 227                     @define('CONST_Max_Word_Frequency', 10000000);
 228                     @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
 229                     @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 230                     require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
 231
 232
 233     def _save_config(self) -> None:
 234         """ Save the configuration that needs to remain stable for the given
 235             database as database properties.
 236         """
 237         assert self.loader is not None
 238         with connect(self.dsn) as conn:
 239             self.loader.save_config_to_db(conn)
 240
 241
 242     def _setup_db_tables(self, config: Configuration) -> None:
 243         """ Set up the word table and fill it with pre-computed word
 244             frequencies.
 245         """
 246         with connect(self.dsn) as conn:
 247             with conn.cursor() as cur:
 248                 cur.drop_table('word')
 249             sqlp = SQLPreprocessor(conn, config)
 250             sqlp.run_string(conn, """
 251                 CREATE TABLE word (
 252                       word_id INTEGER,
 253                       word_token text NOT NULL,
 254                       type text NOT NULL,
 255                       word text,
 256                       info jsonb
 257                     ) {{db.tablespace.search_data}};
 258                 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
 259
 260                 DROP SEQUENCE IF EXISTS seq_word;
 261                 CREATE SEQUENCE seq_word start 1;
 262                 GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
 263             """)
 264             conn.commit()
 265
 266
 267     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
 268         """ Set up the word table and fill it with pre-computed word
 269             frequencies.
 270         """
 271         with connect(self.dsn) as conn:
 272             sqlp = SQLPreprocessor(conn, config)
 273             sqlp.run_string(conn,
 274                             """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
 275                                USING BTREE (word_token) {{db.tablespace.search_index}}""",
 276                             table_name=table_name)
 277             for name, ctype in WORD_TYPES:
 278                 sqlp.run_string(conn,
 279                                 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
 280                                    USING BTREE (word) {{db.tablespace.address_index}}
 281                                    WHERE type = '{{column_type}}'
 282                                 """,
 283                                 table_name=table_name, idx_name=name,
 284                                 column_type=ctype)
 285             conn.commit()
 286
 287
 288     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
 289         """ Create addtional indexes used when running the API.
 290         """
 291         with connect(self.dsn) as conn:
 292             sqlp = SQLPreprocessor(conn, config)
 293             # Index required for details lookup.
 294             sqlp.run_string(conn, """
 295                 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
 296                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
 297             """,
 298             table_name=table_name)
 299             conn.commit()
 300
 301
 302     def _move_temporary_word_table(self, old: str) -> None:
 303         """ Rename all tables and indexes used by the tokenizer.
 304         """
 305         with connect(self.dsn) as conn:
 306             with conn.cursor() as cur:
 307                 cur.drop_table('word')
 308                 cur.execute(f"ALTER TABLE {old} RENAME TO word")
 309                 for idx in ('word_token', 'word_id'):
 310                     cur.execute(f"""ALTER INDEX idx_{old}_{idx}
 311                                       RENAME TO idx_word_{idx}""")
 312                 for name, _ in WORD_TYPES:
 313                     cur.execute(f"""ALTER INDEX idx_{old}_{name}
 314                                     RENAME TO idx_word_{name}""")
 315             conn.commit()
 316
 317
 318
 319
 320 class ICUNameAnalyzer(AbstractAnalyzer):
 321     """ The ICU analyzer uses the ICU library for splitting names.
 322
 323         Each instance opens a connection to the database to request the
 324         normalization.
 325     """
 326
 327     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
 328                  token_analysis: ICUTokenAnalysis) -> None:
 329         self.conn: Optional[Connection] = connect(dsn).connection
 330         self.conn.autocommit = True
 331         self.sanitizer = sanitizer
 332         self.token_analysis = token_analysis
 333
 334         self._cache = _TokenCache()
 335
 336
 337     def close(self) -> None:
 338         """ Free all resources used by the analyzer.
 339         """
 340         if self.conn:
 341             self.conn.close()
 342             self.conn = None
 343
 344
 345     def _search_normalized(self, name: str) -> str:
 346         """ Return the search token transliteration of the given name.
 347         """
 348         return cast(str, self.token_analysis.search.transliterate(name)).strip()
 349
 350
 351     def _normalized(self, name: str) -> str:
 352         """ Return the normalized version of the given name with all
 353             non-relevant information removed.
 354         """
 355         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
 356
 357
 358     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
 359         """ Return token information for the given list of words.
 360             If a word starts with # it is assumed to be a full name
 361             otherwise is a partial name.
 362
 363             The function returns a list of tuples with
 364             (original word, word token, word id).
 365
 366             The function is used for testing and debugging only
 367             and not necessarily efficient.
 368         """
 369         assert self.conn is not None
 370         full_tokens = {}
 371         partial_tokens = {}
 372         for word in words:
 373             if word.startswith('#'):
 374                 full_tokens[word] = self._search_normalized(word[1:])
 375             else:
 376                 partial_tokens[word] = self._search_normalized(word)
 377
 378         with self.conn.cursor() as cur:
 379             cur.execute("""SELECT word_token, word_id
 380                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 381                         """, (list(full_tokens.values()),))
 382             full_ids = {r[0]: r[1] for r in cur}
 383             cur.execute("""SELECT word_token, word_id
 384                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 385                         (list(partial_tokens.values()),))
 386             part_ids = {r[0]: r[1] for r in cur}
 387
 388         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 389                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 390
 391
 392     def normalize_postcode(self, postcode: str) -> str:
 393         """ Convert the postcode to a standardized form.
 394
 395             This function must yield exactly the same result as the SQL function
 396             'token_normalized_postcode()'.
 397         """
 398         return postcode.strip().upper()
 399
 400
 401     def update_postcodes_from_db(self) -> None:
 402         """ Update postcode tokens in the word table from the location_postcode
 403             table.
 404         """
 405         assert self.conn is not None
 406         analyzer = self.token_analysis.analysis.get('@postcode')
 407
 408         with self.conn.cursor() as cur:
 409             # First get all postcode names currently in the word table.
 410             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
 411             word_entries = set((entry[0] for entry in cur))
 412
 413             # Then compute the required postcode names from the postcode table.
 414             needed_entries = set()
 415             cur.execute("SELECT country_code, postcode FROM location_postcode")
 416             for cc, postcode in cur:
 417                 info = PlaceInfo({'country_code': cc,
 418                                   'class': 'place', 'type': 'postcode',
 419                                   'address': {'postcode': postcode}})
 420                 address = self.sanitizer.process_names(info)[1]
 421                 for place in address:
 422                     if place.kind == 'postcode':
 423                         if analyzer is None:
 424                             postcode_name = place.name.strip().upper()
 425                             variant_base = None
 426                         else:
 427                             postcode_name = analyzer.get_canonical_id(place)
 428                             variant_base = place.get_attr("variant")
 429
 430                         if variant_base:
 431                             needed_entries.add(f'{postcode_name}@{variant_base}')
 432                         else:
 433                             needed_entries.add(postcode_name)
 434                         break
 435
 436         # Now update the word table.
 437         self._delete_unused_postcode_words(word_entries - needed_entries)
 438         self._add_missing_postcode_words(needed_entries - word_entries)
 439
 440     def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
 441         assert self.conn is not None
 442         if tokens:
 443             with self.conn.cursor() as cur:
 444                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
 445                             (list(tokens), ))
 446
 447     def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
 448         assert self.conn is not None
 449         if not tokens:
 450             return
 451
 452         analyzer = self.token_analysis.analysis.get('@postcode')
 453         terms = []
 454
 455         for postcode_name in tokens:
 456             if '@' in postcode_name:
 457                 term, variant = postcode_name.split('@', 2)
 458                 term = self._search_normalized(term)
 459                 if analyzer is None:
 460                     variants = [term]
 461                 else:
 462                     variants = analyzer.compute_variants(variant)
 463                     if term not in variants:
 464                         variants.append(term)
 465             else:
 466                 variants = [self._search_normalized(postcode_name)]
 467             terms.append((postcode_name, variants))
 468
 469         if terms:
 470             with self.conn.cursor() as cur:
 471                 cur.execute_values("""SELECT create_postcode_word(pc, var)
 472                                       FROM (VALUES %s) AS v(pc, var)""",
 473                                    terms)
 474
 475
 476
 477
 478     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
 479                                should_replace: bool) -> None:
 480         """ Replace the search index for special phrases with the new phrases.
 481             If `should_replace` is True, then the previous set of will be
 482             completely replaced. Otherwise the phrases are added to the
 483             already existing ones.
 484         """
 485         assert self.conn is not None
 486         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 487                             for p in phrases))
 488
 489         with self.conn.cursor() as cur:
 490             # Get the old phrases.
 491             existing_phrases = set()
 492             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 493             for word, info in cur:
 494                 existing_phrases.add((word, info['class'], info['type'],
 495                                       info.get('op') or '-'))
 496
 497             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 498             if should_replace:
 499                 deleted = self._remove_special_phrases(cur, norm_phrases,
 500                                                        existing_phrases)
 501             else:
 502                 deleted = 0
 503
 504         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 505                  len(norm_phrases), added, deleted)
 506
 507
 508     def _add_special_phrases(self, cursor: Cursor,
 509                              new_phrases: Set[Tuple[str, str, str, str]],
 510                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
 511         """ Add all phrases to the database that are not yet there.
 512         """
 513         to_add = new_phrases - existing_phrases
 514
 515         added = 0
 516         with CopyBuffer() as copystr:
 517             for word, cls, typ, oper in to_add:
 518                 term = self._search_normalized(word)
 519                 if term:
 520                     copystr.add(term, 'S', word,
 521                                 json.dumps({'class': cls, 'type': typ,
 522                                             'op': oper if oper in ('in', 'near') else None}))
 523                     added += 1
 524
 525             copystr.copy_out(cursor, 'word',
 526                              columns=['word_token', 'type', 'word', 'info'])
 527
 528         return added
 529
 530
 531     def _remove_special_phrases(self, cursor: Cursor,
 532                              new_phrases: Set[Tuple[str, str, str, str]],
 533                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
 534         """ Remove all phrases from the database that are no longer in the
 535             new phrase list.
 536         """
 537         to_delete = existing_phrases - new_phrases
 538
 539         if to_delete:
 540             cursor.execute_values(
 541                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 542                     WHERE type = 'S' and word = name
 543                           and info->>'class' = in_class and info->>'type' = in_type
 544                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 545                 """, to_delete)
 546
 547         return len(to_delete)
 548
 549
 550     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
 551         """ Add default names for the given country to the search index.
 552         """
 553         # Make sure any name preprocessing for country names applies.
 554         info = PlaceInfo({'name': names, 'country_code': country_code,
 555                           'rank_address': 4, 'class': 'boundary',
 556                           'type': 'administrative'})
 557         self._add_country_full_names(country_code,
 558                                      self.sanitizer.process_names(info)[0],
 559                                      internal=True)
 560
 561
    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.

            Country words (type 'C') are either internal (their info has
            an 'internal' key) or external. `internal` states which kind
            the given names are. Words of that kind that no longer match
            one of the names are deleted, missing ones are inserted.
        """
        assert self.conn is not None
        # Non-empty search tokens of the given names.
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                # External words for one of the new internal names are
                # removed as well; they are inserted as internal below.
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                # An external name is skipped when the token exists
                # already, no matter if internal or external.
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                                   (SELECT token, 'C', %s
                                    FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))
 612
 613
 614     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
 615         """ Determine tokenizer information about the given place.
 616
 617             Returns a JSON-serializable structure that will be handed into
 618             the database via the token_info field.
 619         """
 620         token_info = _TokenInfo()
 621
 622         names, address = self.sanitizer.process_names(place)
 623
 624         if names:
 625             token_info.set_names(*self._compute_name_tokens(names))
 626
 627             if place.is_country():
 628                 assert place.country_code is not None
 629                 self._add_country_full_names(place.country_code, names)
 630
 631         if address:
 632             self._process_place_address(token_info, address)
 633
 634         return token_info.to_dict()
 635
 636
 637     def _process_place_address(self, token_info: '_TokenInfo',
 638                                address: Sequence[PlaceName]) -> None:
 639         for item in address:
 640             if item.kind == 'postcode':
 641                 token_info.set_postcode(self._add_postcode(item))
 642             elif item.kind == 'housenumber':
 643                 token_info.add_housenumber(*self._compute_housenumber_token(item))
 644             elif item.kind == 'street':
 645                 token_info.add_street(self._retrieve_full_tokens(item.name))
 646             elif item.kind == 'place':
 647                 if not item.suffix:
 648                     token_info.add_place(self._compute_partial_tokens(item.name))
 649             elif not item.kind.startswith('_') and not item.suffix and \
 650                  item.kind not in ('country', 'full', 'inclusion'):
 651                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
 652
 653
    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.

            Returns (None, None) when normalization leaves nothing of the
            housenumber or, with a custom analyzer, when the analyzer
            yields no canonical id or no variants. Results are kept in
            the housenumber cache of the analyzer.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                # A cache miss leaves the (None, None) default in place.
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                        result = hid, norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                # A cache miss leaves the (None, None) default in place.
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        with self.conn.cursor() as cur:
                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                            result = hid, variants[0]
                            self._cache.housenumbers[word_id] = result

        return result
 690
 691
 692     def _compute_partial_tokens(self, name: str) -> List[int]:
 693         """ Normalize the given term, split it into partial words and return
 694             then token list for them.
 695         """
 696         assert self.conn is not None
 697         norm_name = self._search_normalized(name)
 698
 699         tokens = []
 700         need_lookup = []
 701         for partial in norm_name.split():
 702             token = self._cache.partials.get(partial)
 703             if token:
 704                 tokens.append(token)
 705             else:
 706                 need_lookup.append(partial)
 707
 708         if need_lookup:
 709             with self.conn.cursor() as cur:
 710                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 711                                FROM unnest(%s) word""",
 712                             (need_lookup, ))
 713
 714                 for partial, token in cur:
 715                     assert token is not None
 716                     tokens.append(token)
 717                     self._cache.partials[partial] = token
 718
 719         return tokens
 720
 721
 722     def _retrieve_full_tokens(self, name: str) -> List[int]:
 723         """ Get the full name token for the given name, if it exists.
 724             The name is only retrieved for the standard analyser.
 725         """
 726         assert self.conn is not None
 727         norm_name = self._search_normalized(name)
 728
 729         # return cached if possible
 730         if norm_name in self._cache.fulls:
 731             return self._cache.fulls[norm_name]
 732
 733         with self.conn.cursor() as cur:
 734             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
 735                         (norm_name, ))
 736             full = [row[0] for row in cur]
 737
 738         self._cache.fulls[norm_name] = full
 739
 740         return full
 741
 742
 743     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
 744         """ Computes the full name and partial name tokens for the given
 745             dictionary of names.
 746         """
 747         assert self.conn is not None
 748         full_tokens: Set[int] = set()
 749         partial_tokens: Set[int] = set()
 750
 751         for name in names:
 752             analyzer_id = name.get_attr('analyzer')
 753             analyzer = self.token_analysis.get_analyzer(analyzer_id)
 754             word_id = analyzer.get_canonical_id(name)
 755             if analyzer_id is None:
 756                 token_id = word_id
 757             else:
 758                 token_id = f'{word_id}@{analyzer_id}'
 759
 760             full, part = self._cache.names.get(token_id, (None, None))
 761             if full is None:
 762                 variants = analyzer.compute_variants(word_id)
 763                 if not variants:
 764                     continue
 765
 766                 with self.conn.cursor() as cur:
 767                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
 768                                 (token_id, variants))
 769                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
 770
 771                 self._cache.names[token_id] = (full, part)
 772
 773             assert part is not None
 774
 775             full_tokens.add(full)
 776             partial_tokens.update(part)
 777
 778         return full_tokens, partial_tokens
 779
 780
 781     def _add_postcode(self, item: PlaceName) -> Optional[str]:
 782         """ Make sure the normalized postcode is present in the word table.
 783         """
 784         assert self.conn is not None
 785         analyzer = self.token_analysis.analysis.get('@postcode')
 786
 787         if analyzer is None:
 788             postcode_name = item.name.strip().upper()
 789             variant_base = None
 790         else:
 791             postcode_name = analyzer.get_canonical_id(item)
 792             variant_base = item.get_attr("variant")
 793
 794         if variant_base:
 795             postcode = f'{postcode_name}@{variant_base}'
 796         else:
 797             postcode = postcode_name
 798
 799         if postcode not in self._cache.postcodes:
 800             term = self._search_normalized(postcode_name)
 801             if not term:
 802                 return None
 803
 804             variants = {term}
 805             if analyzer is not None and variant_base:
 806                 variants.update(analyzer.compute_variants(variant_base))
 807
 808             with self.conn.cursor() as cur:
 809                 cur.execute("SELECT create_postcode_word(%s, %s)",
 810                             (postcode, list(variants)))
 811             self._cache.postcodes.add(postcode)
 812
 813         return postcode_name
 814
 815
 816 class _TokenInfo:
 817     """ Collect token information to be sent back to the database.
 818     """
 819     def __init__(self) -> None:
 820         self.names: Optional[str] = None
 821         self.housenumbers: Set[str] = set()
 822         self.housenumber_tokens: Set[int] = set()
 823         self.street_tokens: Optional[Set[int]] = None
 824         self.place_tokens: Set[int] = set()
 825         self.address_tokens: Dict[str, str] = {}
 826         self.postcode: Optional[str] = None
 827
 828
 829     def _mk_array(self, tokens: Iterable[Any]) -> str:
 830         return f"{{{','.join((str(s) for s in tokens))}}}"
 831
 832
 833     def to_dict(self) -> Dict[str, Any]:
 834         """ Return the token information in database importable format.
 835         """
 836         out: Dict[str, Any] = {}
 837
 838         if self.names:
 839             out['names'] = self.names
 840
 841         if self.housenumbers:
 842             out['hnr'] = ';'.join(self.housenumbers)
 843             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 844
 845         if self.street_tokens is not None:
 846             out['street'] = self._mk_array(self.street_tokens)
 847
 848         if self.place_tokens:
 849             out['place'] = self._mk_array(self.place_tokens)
 850
 851         if self.address_tokens:
 852             out['addr'] = self.address_tokens
 853
 854         if self.postcode:
 855             out['postcode'] = self.postcode
 856
 857         return out
 858
 859
 860     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
 861         """ Adds token information for the normalised names.
 862         """
 863         self.names = self._mk_array(itertools.chain(fulls, partials))
 864
 865
 866     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
 867         """ Extract housenumber information from a list of normalised
 868             housenumbers.
 869         """
 870         if token:
 871             assert hnr is not None
 872             self.housenumbers.add(hnr)
 873             self.housenumber_tokens.add(token)
 874
 875
 876     def add_street(self, tokens: Iterable[int]) -> None:
 877         """ Add addr:street match terms.
 878         """
 879         if self.street_tokens is None:
 880             self.street_tokens = set()
 881         self.street_tokens.update(tokens)
 882
 883
 884     def add_place(self, tokens: Iterable[int]) -> None:
 885         """ Add addr:place search and match terms.
 886         """
 887         self.place_tokens.update(tokens)
 888
 889
 890     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
 891         """ Add additional address terms.
 892         """
 893         if partials:
 894             self.address_tokens[key] = self._mk_array(partials)
 895
 896     def set_postcode(self, postcode: Optional[str]) -> None:
 897         """ Set the postcode to the given one.
 898         """
 899         self.postcode = postcode
 900
 901
 902 class _TokenCache:
 903     """ Cache for token information to avoid repeated database queries.
 904
 905         This cache is not thread-safe and needs to be instantiated per
 906         analyzer.
 907     """
 908     def __init__(self) -> None:
 909         self.names: Dict[str, Tuple[int, List[int]]] = {}
 910         self.partials: Dict[str, int] = {}
 911         self.fulls: Dict[str, List[int]] = {}
 912         self.postcodes: Set[str] = set()
 913         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}