1 # SPDX-License-Identifier: GPL-2.0-only
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
10 """
11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
12                    Dict, Set, Iterable
13 import itertools
14 import json
15 import logging
16 from pathlib import Path
17 from textwrap import dedent
18
19 from nominatim.db.connection import connect, Connection, Cursor
20 from nominatim.config import Configuration
21 from nominatim.db.utils import CopyBuffer
22 from nominatim.db.sql_preprocessor import SQLPreprocessor
23 from nominatim.data.place_info import PlaceInfo
24 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
25 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
26 from nominatim.data.place_name import PlaceName
27 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
28 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
29
30 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
31
32 LOG = logging.getLogger()
33
34 WORD_TYPES =(('country_names', 'C'),
35              ('postcodes', 'P'),
36              ('full_word', 'W'),
37              ('housenumbers', 'H'))
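# The single-letter codes are values of the `type` column in the word table:
# 'C' country names, 'P' postcodes, 'W' full words, 'H' housenumbers. Rows of
# type 'w' (partial words) and 'S' (special phrases) also occur in the table
# but get no per-type index here.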
38
39 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
40     """ Create a new instance of the tokenizer provided by this module.
41     """
42     return ICUTokenizer(dsn, data_dir)
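# Typical driver-side usage, as a rough sketch only (the DSN and project
# directory below are made-up examples; `config` and `place` must come from
# the surrounding Nominatim setup):
#
#   tokenizer = create('dbname=nominatim', Path('/srv/nominatim-project'))
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)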
43
44
45 class ICUTokenizer(AbstractTokenizer):
46     """ This tokenizer uses libICU to convert names and queries to ASCII.
47         Otherwise it uses the same algorithms and data structures as the
48         normalization routines in Nominatim 3.
49     """
50
51     def __init__(self, dsn: str, data_dir: Path) -> None:
52         self.dsn = dsn
53         self.data_dir = data_dir
54         self.loader: Optional[ICURuleLoader] = None
55
56
57     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
58         """ Set up a new tokenizer for the database.
59
60             This copies all necessary data into the project directory to make
61             sure the tokenizer remains stable even across updates.
62         """
63         self.loader = ICURuleLoader(config)
64
65         self._install_php(config.lib_dir.php, overwrite=True)
66         self._save_config()
67
68         if init_db:
69             self.update_sql_functions(config)
70             self._setup_db_tables(config)
71             self._create_base_indices(config, 'word')
72
73
74     def init_from_project(self, config: Configuration) -> None:
75         """ Initialise the tokenizer from the project directory.
76         """
77         self.loader = ICURuleLoader(config)
78
79         with connect(self.dsn) as conn:
80             self.loader.load_config_from_db(conn)
81
82         self._install_php(config.lib_dir.php, overwrite=False)
83
84
85     def finalize_import(self, config: Configuration) -> None:
86         """ Do any required postprocessing to make the tokenizer data ready
87             for use.
88         """
89         self._create_lookup_indices(config, 'word')
90
91
92     def update_sql_functions(self, config: Configuration) -> None:
93         """ Reimport the SQL functions for this tokenizer.
94         """
95         with connect(self.dsn) as conn:
96             sqlp = SQLPreprocessor(conn, config)
97             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
98
99
100     def check_database(self, config: Configuration) -> None:
101         """ Check that the tokenizer is set up correctly.
102         """
103         # Will throw an error if there is an issue.
104         self.init_from_project(config)
105
106
107     def update_statistics(self, config: Configuration, threads: int = 2) -> None:
108         """ Recompute frequencies for all name words.
109         """
110         with connect(self.dsn) as conn:
111             if not conn.table_exists('search_name'):
112                 return
113
114             with conn.cursor() as cur:
115                 cur.execute('ANALYSE search_name')
116                 if threads > 1:
117                     cur.execute('SET max_parallel_workers_per_gather TO %s',
118                                 (min(threads, 6),))
119
120                 if conn.server_version_tuple() < (12, 0):
121                     LOG.info('Computing word frequencies')
122                     cur.drop_table('word_frequencies')
123                     cur.drop_table('addressword_frequencies')
124                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
125                                      SELECT unnest(name_vector) as id, count(*)
126                                      FROM search_name GROUP BY id""")
127                     cur.execute('CREATE INDEX ON word_frequencies(id)')
128                     cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
129                                      SELECT unnest(nameaddress_vector) as id, count(*)
130                                      FROM search_name GROUP BY id""")
131                     cur.execute('CREATE INDEX ON addressword_frequencies(id)')
132                     cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
133                                                                                INOUT info JSONB)
134                                    AS $$
135                                    DECLARE rec RECORD;
136                                    BEGIN
137                                    IF info is null THEN
138                                      info = '{}'::jsonb;
139                                    END IF;
140                                    FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
141                                    LOOP
142                                      info = info || jsonb_build_object('count', rec.count);
143                                    END LOOP;
144                                    FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
145                                    LOOP
146                                      info = info || jsonb_build_object('addr_count', rec.count);
147                                    END LOOP;
148                                    IF info = '{}'::jsonb THEN
149                                      info = null;
150                                    END IF;
151                                    END;
152                                    $$ LANGUAGE plpgsql IMMUTABLE;
153                                 """)
154                     LOG.info('Update word table with recomputed frequencies')
155                     cur.drop_table('tmp_word')
156                     cur.execute("""CREATE TABLE tmp_word AS
157                                     SELECT word_id, word_token, type, word,
158                                            word_freq_update(word_id, info) as info
159                                     FROM word
160                                 """)
161                     cur.drop_table('word_frequencies')
162                     cur.drop_table('addressword_frequencies')
163                 else:
164                     LOG.info('Computing word frequencies')
165                     cur.drop_table('word_frequencies')
166                     cur.execute("""
167                       CREATE TEMP TABLE word_frequencies AS
168                       WITH word_freq AS MATERIALIZED (
169                                SELECT unnest(name_vector) as id, count(*)
170                                      FROM search_name GROUP BY id),
171                            addr_freq AS MATERIALIZED (
172                                SELECT unnest(nameaddress_vector) as id, count(*)
173                                      FROM search_name GROUP BY id)
174                       SELECT coalesce(a.id, w.id) as id,
175                              (CASE WHEN w.count is null THEN '{}'::JSONB
176                                   ELSE jsonb_build_object('count', w.count) END
177                               ||
178                               CASE WHEN a.count is null THEN '{}'::JSONB
179                                   ELSE jsonb_build_object('addr_count', a.count) END) as info
180                       FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
181                       """)
182                     cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
183                     cur.execute('ANALYSE word_frequencies')
184                     LOG.info('Update word table with recomputed frequencies')
185                     cur.drop_table('tmp_word')
186                     cur.execute("""CREATE TABLE tmp_word AS
187                                     SELECT word_id, word_token, type, word,
188                                            (CASE WHEN wf.info is null THEN word.info
189                                             ELSE coalesce(word.info, '{}'::jsonb) || wf.info
190                                             END) as info
191                                     FROM word LEFT JOIN word_frequencies wf
192                                          ON word.word_id = wf.id
193                                 """)
194                     cur.drop_table('word_frequencies')
195
196             with conn.cursor() as cur:
197                 cur.execute('SET max_parallel_workers_per_gather TO 0')
198
199             sqlp = SQLPreprocessor(conn, config)
200             sqlp.run_string(conn,
201                             'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
202             conn.commit()
203         self._create_base_indices(config, 'tmp_word')
204         self._create_lookup_indices(config, 'tmp_word')
205         self._move_temporary_word_table('tmp_word')
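        # Whichever branch ran above, tokens that occur in search_name now carry
        # their recomputed frequencies in the info column of the rebuilt word
        # table, e.g. {"count": 42, "addr_count": 7} (illustrative values; a key
        # is absent when the token never appears in the corresponding vector).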
206
207
208
209     def _cleanup_housenumbers(self) -> None:
210         """ Remove unused house numbers.
211         """
212         with connect(self.dsn) as conn:
213             if not conn.table_exists('search_name'):
214                 return
215             with conn.cursor(name="hnr_counter") as cur:
216                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
217                                FROM word
218                                WHERE type = 'H'
219                                  AND NOT EXISTS(SELECT * FROM search_name
220                                                 WHERE ARRAY[word.word_id] && name_vector)
221                                  AND (char_length(coalesce(word, word_token)) > 6
222                                       OR coalesce(word, word_token) not similar to '\\d+')
223                             """)
224                 candidates = {token: wid for wid, token in cur}
225             with conn.cursor(name="hnr_counter") as cur:
226                 cur.execute("""SELECT housenumber FROM placex
227                                WHERE housenumber is not null
228                                      AND (char_length(housenumber) > 6
229                                           OR housenumber not similar to '\\d+')
230                             """)
231                 for row in cur:
232                     for hnr in row[0].split(';'):
233                         candidates.pop(hnr, None)
234             LOG.info("There are %s outdated housenumbers.", len(candidates))
235             LOG.debug("Outdated housenumbers: %s", candidates.keys())
236             if candidates:
237                 with conn.cursor() as cur:
238                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
239                                 (list(candidates.values()), ))
240                 conn.commit()
241
242
243
244     def update_word_tokens(self) -> None:
245         """ Remove unused tokens.
246         """
247         LOG.warning("Cleaning up housenumber tokens.")
248         self._cleanup_housenumbers()
249         LOG.warning("Tokenizer house-keeping done.")
250
251
252     def name_analyzer(self) -> 'ICUNameAnalyzer':
253         """ Create a new analyzer for tokenizing names and queries
254             using this tokenizer. Analyzers are context managers and should
255             be used accordingly:
256
257             ```
258             with tokenizer.name_analyzer() as analyzer:
259                 analyzer.tokenize()
260             ```
261
262             When used outside a with block, the caller must make sure to
263             call close() before the analyzer is destroyed.
264
265             Analyzers are not thread-safe. You need to instantiate one per thread.
266         """
267         assert self.loader is not None
268         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
269                                self.loader.make_token_analysis())
270
271
272     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
273         """ Return a list of the `num` most frequent full words
274             in the database.
275         """
276         with conn.cursor() as cur:
277             cur.execute("""SELECT word, sum((info->>'count')::int) as count
278                              FROM word WHERE type = 'W'
279                              GROUP BY word
280                              ORDER BY count DESC LIMIT %s""", (num,))
281             return list(s[0].split('@')[0] for s in cur)
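        # Note: full words created through a non-default analysis module appear
        # to be stored as '<word>@<analyzer>' (see _compute_name_tokens below),
        # which is why everything after the first '@' is stripped here.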
282
283
284     def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
285         """ Install the php script for the tokenizer.
286         """
287         if phpdir is not None:
288             assert self.loader is not None
289             php_file = self.data_dir / "tokenizer.php"
290
291             if not php_file.exists() or overwrite:
292                 php_file.write_text(dedent(f"""\
293                     <?php
294                     @define('CONST_Max_Word_Frequency', 10000000);
295                     @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
296                     @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
297                     require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
298
299
300     def _save_config(self) -> None:
301         """ Save the configuration that needs to remain stable for the given
302             database as database properties.
303         """
304         assert self.loader is not None
305         with connect(self.dsn) as conn:
306             self.loader.save_config_to_db(conn)
307
308
309     def _setup_db_tables(self, config: Configuration) -> None:
310         """ Set up an empty word table and the word_id sequence
311             (seq_word) and grant read access to the web user.
312         """
313         with connect(self.dsn) as conn:
314             with conn.cursor() as cur:
315                 cur.drop_table('word')
316             sqlp = SQLPreprocessor(conn, config)
317             sqlp.run_string(conn, """
318                 CREATE TABLE word (
319                       word_id INTEGER,
320                       word_token text NOT NULL,
321                       type text NOT NULL,
322                       word text,
323                       info jsonb
324                     ) {{db.tablespace.search_data}};
325                 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
326
327                 DROP SEQUENCE IF EXISTS seq_word;
328                 CREATE SEQUENCE seq_word start 1;
329                 GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
330             """)
331             conn.commit()
332
333
334     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
335         """ Create the basic word-token index and one partial index per
336             word type on the given word table.
337         """
338         with connect(self.dsn) as conn:
339             sqlp = SQLPreprocessor(conn, config)
340             sqlp.run_string(conn,
341                             """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
342                                USING BTREE (word_token) {{db.tablespace.search_index}}""",
343                             table_name=table_name)
344             for name, ctype in WORD_TYPES:
345                 sqlp.run_string(conn,
346                                 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
347                                    USING BTREE (word) {{db.tablespace.address_index}}
348                                    WHERE type = '{{column_type}}'
349                                 """,
350                                 table_name=table_name, idx_name=name,
351                                 column_type=ctype)
352             conn.commit()
353
354
355     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
356         """ Create additional indexes used when running the API.
357         """
358         with connect(self.dsn) as conn:
359             sqlp = SQLPreprocessor(conn, config)
360             # Index required for details lookup.
361             sqlp.run_string(conn, """
362                 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
363                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
364             """,
365             table_name=table_name)
366             conn.commit()
367
368
369     def _move_temporary_word_table(self, old: str) -> None:
370         """ Replace the word table with the given temporary table and rename its indexes.
371         """
372         with connect(self.dsn) as conn:
373             with conn.cursor() as cur:
374                 cur.drop_table('word')
375                 cur.execute(f"ALTER TABLE {old} RENAME TO word")
376                 for idx in ('word_token', 'word_id'):
377                     cur.execute(f"""ALTER INDEX idx_{old}_{idx}
378                                       RENAME TO idx_word_{idx}""")
379                 for name, _ in WORD_TYPES:
380                     cur.execute(f"""ALTER INDEX idx_{old}_{name}
381                                     RENAME TO idx_word_{name}""")
382             conn.commit()
383
384
385
386
387 class ICUNameAnalyzer(AbstractAnalyzer):
388     """ The ICU analyzer uses the ICU library for splitting names.
389
390         Each instance opens a connection to the database to request the
391         normalization.
392     """
393
394     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
395                  token_analysis: ICUTokenAnalysis) -> None:
396         self.conn: Optional[Connection] = connect(dsn).connection
397         self.conn.autocommit = True
398         self.sanitizer = sanitizer
399         self.token_analysis = token_analysis
400
401         self._cache = _TokenCache()
402
403
404     def close(self) -> None:
405         """ Free all resources used by the analyzer.
406         """
407         if self.conn:
408             self.conn.close()
409             self.conn = None
410
411
412     def _search_normalized(self, name: str) -> str:
413         """ Return the search token transliteration of the given name.
414         """
415         return cast(str, self.token_analysis.search.transliterate(name)).strip()
416
417
418     def _normalized(self, name: str) -> str:
419         """ Return the normalized version of the given name with all
420             non-relevant information removed.
421         """
422         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
423
424
425     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
426         """ Return token information for the given list of words.
427             If a word starts with '#', it is assumed to be a full name,
428             otherwise a partial name.
429
430             The function returns a list of tuples with
431             (original word, word token, word id).
432
433             The function is meant for testing and debugging only
434             and is not necessarily efficient.
435         """
436         assert self.conn is not None
437         full_tokens = {}
438         partial_tokens = {}
439         for word in words:
440             if word.startswith('#'):
441                 full_tokens[word] = self._search_normalized(word[1:])
442             else:
443                 partial_tokens[word] = self._search_normalized(word)
444
445         with self.conn.cursor() as cur:
446             cur.execute("""SELECT word_token, word_id
447                             FROM word WHERE word_token = ANY(%s) and type = 'W'
448                         """, (list(full_tokens.values()),))
449             full_ids = {r[0]: r[1] for r in cur}
450             cur.execute("""SELECT word_token, word_id
451                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
452                         (list(partial_tokens.values()),))
453             part_ids = {r[0]: r[1] for r in cur}
454
455         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
456                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
457
458
459     def normalize_postcode(self, postcode: str) -> str:
460         """ Convert the postcode to a standardized form.
461
462             This function must yield exactly the same result as the SQL function
463             'token_normalized_postcode()'.
464         """
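        # Example: ' se1 9gb ' -> 'SE1 9GB'; only surrounding whitespace is
        # removed and letters are uppercased, inner formatting stays untouched.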
465         return postcode.strip().upper()
466
467
468     def update_postcodes_from_db(self) -> None:
469         """ Update postcode tokens in the word table from the location_postcode
470             table.
471         """
472         assert self.conn is not None
473         analyzer = self.token_analysis.analysis.get('@postcode')
474
475         with self.conn.cursor() as cur:
476             # First get all postcode names currently in the word table.
477             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
478             word_entries = set((entry[0] for entry in cur))
479
480             # Then compute the required postcode names from the postcode table.
481             needed_entries = set()
482             cur.execute("SELECT country_code, postcode FROM location_postcode")
483             for cc, postcode in cur:
484                 info = PlaceInfo({'country_code': cc,
485                                   'class': 'place', 'type': 'postcode',
486                                   'address': {'postcode': postcode}})
487                 address = self.sanitizer.process_names(info)[1]
488                 for place in address:
489                     if place.kind == 'postcode':
490                         if analyzer is None:
491                             postcode_name = place.name.strip().upper()
492                             variant_base = None
493                         else:
494                             postcode_name = analyzer.get_canonical_id(place)
495                             variant_base = place.get_attr("variant")
496
497                         if variant_base:
498                             needed_entries.add(f'{postcode_name}@{variant_base}')
499                         else:
500                             needed_entries.add(postcode_name)
501                         break
502
503         # Now update the word table.
504         self._delete_unused_postcode_words(word_entries - needed_entries)
505         self._add_missing_postcode_words(needed_entries - word_entries)
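        # Entries are either the canonical postcode alone (e.g. '12345') or,
        # when the sanitizer supplies a variant, '<canonical>@<variant>';
        # _add_missing_postcode_words() splits on '@' to recover both parts.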
506
507     def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
508         assert self.conn is not None
509         if tokens:
510             with self.conn.cursor() as cur:
511                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
512                             (list(tokens), ))
513
514     def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
515         assert self.conn is not None
516         if not tokens:
517             return
518
519         analyzer = self.token_analysis.analysis.get('@postcode')
520         terms = []
521
522         for postcode_name in tokens:
523             if '@' in postcode_name:
524                 term, variant = postcode_name.split('@', 1)
525                 term = self._search_normalized(term)
526                 if analyzer is None:
527                     variants = [term]
528                 else:
529                     variants = analyzer.compute_variants(variant)
530                     if term not in variants:
531                         variants.append(term)
532             else:
533                 variants = [self._search_normalized(postcode_name)]
534             terms.append((postcode_name, variants))
535
536         if terms:
537             with self.conn.cursor() as cur:
538                 cur.execute_values("""SELECT create_postcode_word(pc, var)
539                                       FROM (VALUES %s) AS v(pc, var)""",
540                                    terms)
541
542
543
544
545     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
546                                should_replace: bool) -> None:
547         """ Replace the search index for special phrases with the new phrases.
548             If `should_replace` is True, then the previous set of phrases
549             is completely replaced. Otherwise the phrases are added to the
550             already existing ones.
551         """
552         assert self.conn is not None
553         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
554                             for p in phrases))
555
556         with self.conn.cursor() as cur:
557             # Get the old phrases.
558             existing_phrases = set()
559             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
560             for word, info in cur:
561                 existing_phrases.add((word, info['class'], info['type'],
562                                       info.get('op') or '-'))
563
564             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
565             if should_replace:
566                 deleted = self._remove_special_phrases(cur, norm_phrases,
567                                                        existing_phrases)
568             else:
569                 deleted = 0
570
571         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
572                  len(norm_phrases), added, deleted)
573
574
575     def _add_special_phrases(self, cursor: Cursor,
576                              new_phrases: Set[Tuple[str, str, str, str]],
577                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
578         """ Add all phrases to the database that are not yet there.
579         """
580         to_add = new_phrases - existing_phrases
581
582         added = 0
583         with CopyBuffer() as copystr:
584             for word, cls, typ, oper in to_add:
585                 term = self._search_normalized(word)
586                 if term:
587                     copystr.add(term, 'S', word,
588                                 json.dumps({'class': cls, 'type': typ,
589                                             'op': oper if oper in ('in', 'near') else None}))
590                     added += 1
591
592             copystr.copy_out(cursor, 'word',
593                              columns=['word_token', 'type', 'word', 'info'])
594
595         return added
596
597
598     def _remove_special_phrases(self, cursor: Cursor,
599                              new_phrases: Set[Tuple[str, str, str, str]],
600                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
601         """ Remove all phrases from the database that are no longer in the
602             new phrase list.
603         """
604         to_delete = existing_phrases - new_phrases
605
606         if to_delete:
607             cursor.execute_values(
608                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
609                     WHERE type = 'S' and word = name
610                           and info->>'class' = in_class and info->>'type' = in_type
611                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
612                 """, to_delete)
613
614         return len(to_delete)
615
616
617     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
618         """ Add default names for the given country to the search index.
619         """
620         # Make sure any name preprocessing for country names applies.
621         info = PlaceInfo({'name': names, 'country_code': country_code,
622                           'rank_address': 4, 'class': 'boundary',
623                           'type': 'administrative'})
624         self._add_country_full_names(country_code,
625                                      self.sanitizer.process_names(info)[0],
626                                      internal=True)
627
628
629     def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
630                                 internal: bool = False) -> None:
631         """ Add names for the given country from an already sanitized
632             name list.
633         """
634         assert self.conn is not None
635         word_tokens = set()
636         for name in names:
637             norm_name = self._search_normalized(name.name)
638             if norm_name:
639                 word_tokens.add(norm_name)
640
641         with self.conn.cursor() as cur:
642             # Get existing names
643             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
644                              FROM word
645                              WHERE type = 'C' and word = %s""",
646                         (country_code, ))
647             # internal/external names
648             existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
649             for word in cur:
650                 existing_tokens[word[1]].add(word[0])
651
652             # Delete names that no longer exist.
653             gone_tokens = existing_tokens[internal] - word_tokens
654             if internal:
655                 gone_tokens.update(existing_tokens[False] & word_tokens)
656             if gone_tokens:
657                 cur.execute("""DELETE FROM word
658                                USING unnest(%s) as token
659                                WHERE type = 'C' and word = %s
660                                      and word_token = token""",
661                             (list(gone_tokens), country_code))
662
663             # Only add those names that are not yet in the list.
664             new_tokens = word_tokens - existing_tokens[True]
665             if not internal:
666                 new_tokens -= existing_tokens[False]
667             if new_tokens:
668                 if internal:
669                     sql = """INSERT INTO word (word_token, type, word, info)
670                                (SELECT token, 'C', %s, '{"internal": "yes"}'
671                                   FROM unnest(%s) as token)
672                            """
673                 else:
674                     sql = """INSERT INTO word (word_token, type, word)
675                                    (SELECT token, 'C', %s
676                                     FROM unnest(%s) as token)
677                           """
678                 cur.execute(sql, (country_code, list(new_tokens)))
679
680
681     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
682         """ Determine tokenizer information about the given place.
683
684             Returns a JSON-serializable structure that will be handed into
685             the database via the token_info field.
686         """
687         token_info = _TokenInfo()
688
689         names, address = self.sanitizer.process_names(place)
690
691         if names:
692             token_info.set_names(*self._compute_name_tokens(names))
693
694             if place.is_country():
695                 assert place.country_code is not None
696                 self._add_country_full_names(place.country_code, names)
697
698         if address:
699             self._process_place_address(token_info, address)
700
701         return token_info.to_dict()
702
703
704     def _process_place_address(self, token_info: '_TokenInfo',
705                                address: Sequence[PlaceName]) -> None:
706         for item in address:
707             if item.kind == 'postcode':
708                 token_info.set_postcode(self._add_postcode(item))
709             elif item.kind == 'housenumber':
710                 token_info.add_housenumber(*self._compute_housenumber_token(item))
711             elif item.kind == 'street':
712                 token_info.add_street(self._retrieve_full_tokens(item.name))
713             elif item.kind == 'place':
714                 if not item.suffix:
715                     token_info.add_place(self._compute_partial_tokens(item.name))
716             elif not item.kind.startswith('_') and not item.suffix and \
717                  item.kind not in ('country', 'full', 'inclusion'):
718                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
719
720
721     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
722         """ Normalize the housenumber and return the word token and the
723             canonical form.
724         """
725         assert self.conn is not None
726         analyzer = self.token_analysis.analysis.get('@housenumber')
727         result: Tuple[Optional[int], Optional[str]] = (None, None)
728
729         if analyzer is None:
730             # When no custom analyzer is set, simply normalize and transliterate
731             norm_name = self._search_normalized(hnr.name)
732             if norm_name:
733                 result = self._cache.housenumbers.get(norm_name, result)
734                 if result[0] is None:
735                     with self.conn.cursor() as cur:
736                         hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
737
738                         result = hid, norm_name
739                         self._cache.housenumbers[norm_name] = result
740         else:
741             # Otherwise use the analyzer to determine the canonical name.
742             # Per convention we use the first variant as the 'lookup name', the
743             # name that gets saved in the housenumber field of the place.
744             word_id = analyzer.get_canonical_id(hnr)
745             if word_id:
746                 result = self._cache.housenumbers.get(word_id, result)
747                 if result[0] is None:
748                     variants = analyzer.compute_variants(word_id)
749                     if variants:
750                         with self.conn.cursor() as cur:
751                             hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
752                                              (word_id, list(variants)))
753                             result = hid, variants[0]
754                             self._cache.housenumbers[word_id] = result
755
756         return result
757
758
759     def _compute_partial_tokens(self, name: str) -> List[int]:
760         """ Normalize the given term, split it into partial words and return
761             the token list for them.
762         """
763         assert self.conn is not None
764         norm_name = self._search_normalized(name)
765
766         tokens = []
767         need_lookup = []
768         for partial in norm_name.split():
769             token = self._cache.partials.get(partial)
770             if token:
771                 tokens.append(token)
772             else:
773                 need_lookup.append(partial)
774
775         if need_lookup:
776             with self.conn.cursor() as cur:
777                 cur.execute("""SELECT word, getorcreate_partial_word(word)
778                                FROM unnest(%s) word""",
779                             (need_lookup, ))
780
781                 for partial, token in cur:
782                     assert token is not None
783                     tokens.append(token)
784                     self._cache.partials[partial] = token
785
786         return tokens
787
788
789     def _retrieve_full_tokens(self, name: str) -> List[int]:
790         """ Get the full name token for the given name, if it exists.
791             The name is only retrieved for the standard analyzer.
792         """
793         assert self.conn is not None
794         norm_name = self._search_normalized(name)
795
796         # return cached if possible
797         if norm_name in self._cache.fulls:
798             return self._cache.fulls[norm_name]
799
800         with self.conn.cursor() as cur:
801             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
802                         (norm_name, ))
803             full = [row[0] for row in cur]
804
805         self._cache.fulls[norm_name] = full
806
807         return full
808
809
810     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
811         """ Compute the full name and partial name tokens for the given
812             list of names.
813         """
814         assert self.conn is not None
815         full_tokens: Set[int] = set()
816         partial_tokens: Set[int] = set()
817
818         for name in names:
819             analyzer_id = name.get_attr('analyzer')
820             analyzer = self.token_analysis.get_analyzer(analyzer_id)
821             word_id = analyzer.get_canonical_id(name)
822             if analyzer_id is None:
823                 token_id = word_id
824             else:
825                 token_id = f'{word_id}@{analyzer_id}'
826
827             full, part = self._cache.names.get(token_id, (None, None))
828             if full is None:
829                 variants = analyzer.compute_variants(word_id)
830                 if not variants:
831                     continue
832
833                 with self.conn.cursor() as cur:
834                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
835                                 (token_id, variants))
836                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
837
838                 self._cache.names[token_id] = (full, part)
839
840             assert part is not None
841
842             full_tokens.add(full)
843             partial_tokens.update(part)
844
845         return full_tokens, partial_tokens
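        # Names handled by a non-default analysis module are cached (and passed
        # to getorcreate_full_word) under '<canonical>@<analyzer-id>'; names
        # from the default analyzer use the canonical form alone.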
846
847
848     def _add_postcode(self, item: PlaceName) -> Optional[str]:
849         """ Make sure the normalized postcode is present in the word table.
850         """
851         assert self.conn is not None
852         analyzer = self.token_analysis.analysis.get('@postcode')
853
854         if analyzer is None:
855             postcode_name = item.name.strip().upper()
856             variant_base = None
857         else:
858             postcode_name = analyzer.get_canonical_id(item)
859             variant_base = item.get_attr("variant")
860
861         if variant_base:
862             postcode = f'{postcode_name}@{variant_base}'
863         else:
864             postcode = postcode_name
865
866         if postcode not in self._cache.postcodes:
867             term = self._search_normalized(postcode_name)
868             if not term:
869                 return None
870
871             variants = {term}
872             if analyzer is not None and variant_base:
873                 variants.update(analyzer.compute_variants(variant_base))
874
875             with self.conn.cursor() as cur:
876                 cur.execute("SELECT create_postcode_word(%s, %s)",
877                             (postcode, list(variants)))
878             self._cache.postcodes.add(postcode)
879
880         return postcode_name
881
882
883 class _TokenInfo:
884     """ Collect token information to be sent back to the database.
885     """
886     def __init__(self) -> None:
887         self.names: Optional[str] = None
888         self.housenumbers: Set[str] = set()
889         self.housenumber_tokens: Set[int] = set()
890         self.street_tokens: Optional[Set[int]] = None
891         self.place_tokens: Set[int] = set()
892         self.address_tokens: Dict[str, str] = {}
893         self.postcode: Optional[str] = None
894
895
896     def _mk_array(self, tokens: Iterable[Any]) -> str:
897         return f"{{{','.join((str(s) for s in tokens))}}}"
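        # Renders the tokens as a PostgreSQL array literal,
        # e.g. [1, 2, 3] becomes '{1,2,3}'.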
898
899
900     def to_dict(self) -> Dict[str, Any]:
901         """ Return the token information in database importable format.
902         """
903         out: Dict[str, Any] = {}
904
905         if self.names:
906             out['names'] = self.names
907
908         if self.housenumbers:
909             out['hnr'] = ';'.join(self.housenumbers)
910             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
911
912         if self.street_tokens is not None:
913             out['street'] = self._mk_array(self.street_tokens)
914
915         if self.place_tokens:
916             out['place'] = self._mk_array(self.place_tokens)
917
918         if self.address_tokens:
919             out['addr'] = self.address_tokens
920
921         if self.postcode:
922             out['postcode'] = self.postcode
923
924         return out
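        # A fully populated structure looks roughly like this (values are
        # illustrative only):
        #   {'names': '{1,2,3}', 'hnr': '4;4a', 'hnr_tokens': '{5,6}',
        #    'street': '{7}', 'place': '{8}', 'addr': {'city': '{9}'},
        #    'postcode': '12345'}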
925
926
927     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
928         """ Adds token information for the normalised names.
929         """
930         self.names = self._mk_array(itertools.chain(fulls, partials))
931
932
933     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
934         """ Add the token and the canonical form of a single normalised
935             housenumber.
936         """
937         if token:
938             assert hnr is not None
939             self.housenumbers.add(hnr)
940             self.housenumber_tokens.add(token)
941
942
943     def add_street(self, tokens: Iterable[int]) -> None:
944         """ Add addr:street match terms.
945         """
946         if self.street_tokens is None:
947             self.street_tokens = set()
948         self.street_tokens.update(tokens)
949
950
951     def add_place(self, tokens: Iterable[int]) -> None:
952         """ Add addr:place search and match terms.
953         """
954         self.place_tokens.update(tokens)
955
956
957     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
958         """ Add additional address terms.
959         """
960         if partials:
961             self.address_tokens[key] = self._mk_array(partials)
962
963     def set_postcode(self, postcode: Optional[str]) -> None:
964         """ Set the postcode to the given one.
965         """
966         self.postcode = postcode
967
968
969 class _TokenCache:
970     """ Cache for token information to avoid repeated database queries.
971
972         This cache is not thread-safe and needs to be instantiated per
973         analyzer.
974     """
975     def __init__(self) -> None:
976         self.names: Dict[str, Tuple[int, List[int]]] = {}
977         self.partials: Dict[str, int] = {}
978         self.fulls: Dict[str, List[int]] = {}
979         self.postcodes: Set[str] = set()
980         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}