"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import io
import itertools
import json
import logging
import re
from textwrap import dedent
from pathlib import Path

from icu import Transliterator
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"

LOG = logging.getLogger()

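# Hypothetical usage sketch (illustration only; in a real setup the factory
# below is called through Nominatim's tokenizer factory and the database
# already carries the saved configuration):
#
#   tokenizer = create('dbname=nominatim', Path('project-dir'))
#   tokenizer.init_from_project()
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place({'name': {'name': 'Main Street'}})
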
def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None
        self.transliteration = None
        self.abbreviations = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'

        rules = json.loads(cfgfile.read_text())
        self.transliteration = ';'.join(rules['normalization']) + ';'
        self.abbreviations = rules["abbreviations"]
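        # A minimal sketch of the rule file read here, inferred from the
        # lookups above (values are purely illustrative; real files ship with
        # the Nominatim configuration):
        #   {
        #     "normalization": ["<ICU transliteration rule>; ..."],
        #     "abbreviations": [[" street ", " st "], [" road ", " rd "]]
        #   }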
        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)

    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.normalization is None \
           or self.transliteration is None \
           or self.abbreviations is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
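        # The first transliterator applies the term normalization rules from
        # the configuration (typically lower-casing and similar clean-up), the
        # second the ICU rules that transliterate names to ASCII.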
        norm = Transliterator.createFromRules("normalizer", self.normalization)
        trans = Transliterator.createFromRules("normalizer", self.transliteration)
        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)

    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
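        # The generated file is picked up by the PHP frontend; in the template
        # below {0} refers to this tokenizer instance and {1} to the Nominatim
        # configuration object.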
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
            @define('CONST_Transliteration', "{0.transliteration}");
            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, config)))

    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            set_property(conn, DBCFG_NORMALIZATION, self.normalization)
            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
            set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            with self.name_analyzer() as analyzer:
                with conn.cursor(name="words") as cur:
                    cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
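                    # svals() expands the hstore 'name' column into one row per
                    # name value, so each spelling is counted across all places.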

                    for name, cnt in cur:
                        term = analyzer.make_standard_word(name)
                        if term:
                            for word in term.split():
                                words[word] += cnt

            # copy them back into the word table
            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args)
                                           for args in words.items())))

            with conn.cursor() as cur:
                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null""")

            conn.commit()


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer, transliterator, abbreviations):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        self.transliterator = transliterator
        self.abbreviations = abbreviations
        #psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)

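    # Note the difference to make_standard_word() below: normalize() only
    # applies the normalization rules (e.g. case folding), while
    # make_standard_word() additionally transliterates to ASCII and applies
    # the abbreviation replacements. Exact output depends entirely on the
    # configured rules.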
    def make_standard_word(self, name):
        """ Create the normalised version of the name.
        """
        norm = ' ' + self.transliterator.transliterate(name) + ' '
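        # The name is padded with spaces so that the abbreviation pairs, which
        # are expected to carry their surrounding spaces, only match complete
        # words (a hypothetical rule (' street ', ' st ') must not touch
        # 'streetcar').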
        for full, abbr in self.abbreviations:
            norm = norm.replace(full, abbr)

        return norm.strip()

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.transliterator.transliterate(hnr)

    def add_postcodes_from_db(self):
        """ Add postcodes from the location_postcode table to the word table.
        """
        copystr = io.StringIO()
        with self.conn.cursor() as cur:
            cur.execute("SELECT distinct(postcode) FROM location_postcode")
            for (postcode, ) in cur:
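                # Each iteration emits one tab-separated row for the COPY
                # below: raw postcode, transliterated token (with the leading
                # space that marks a full term), class, type and a zero
                # search count.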
                copystr.write(postcode)
                copystr.write('\t ')
                copystr.write(self.transliterator.transliterate(postcode))
                copystr.write('\tplace\tpostcode\t0\n')

            copystr.seek(0)
            cur.copy_from(copystr, 'word',
                          columns=['word', 'word_token', 'class', 'type',
                                   'search_name_count'])
            # Don't really need an ID for postcodes....
            # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
            #                WHERE word_id is null and type = 'postcode'""")

    def update_special_phrases(self, phrases):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases
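            # Only the difference between the new phrase set and the existing
            # entries needs to be written or deleted.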

            if to_add:
                copystr = io.StringIO()
                for word, cls, typ, oper in to_add:
                    term = self.make_standard_word(word)
                    if term:
                        copystr.write('\t'.join((word, ' ' + term, cls, typ,
                                                 oper if oper in ('in', 'near') else '\\N',
                                                 '0')))
                        copystr.write('\n')

                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'operator', 'search_name_count'])

            if to_delete:
                psycopg2.extras.execute_values(
                    cur,
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        full_names = set((self.make_standard_word(n) for n in names))
        full_names.discard('')
        self._add_normalised_country_names(country_code, full_names)

    def _add_normalised_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            new_names = names.difference((t[0] for t in cur))
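            # The country code is interpolated directly into the INSERT below;
            # callers are expected to pass a validated two-letter code
            # (cf. the check in process_place).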
            if new_names:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(new_names),))

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
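        # Illustrative sketch of the returned structure; keys are only present
        # when the corresponding tags exist and values use PostgreSQL array
        # syntax (see _TokenInfo below). The word ids shown are made up:
        #   {'names': '{1,2,3}', 'hnr_tokens': '{4}', 'hnr': '12;14',
        #    'street': '{5}', 'place_search': '{6,7}', 'place_match': '{7}',
        #    'addr': {'city': ['{8}', '{9}']}}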
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            full_names = set((self.make_standard_word(name) for name in names.values()))
            full_names.discard('')

            token_info.add_names(self.conn, full_names)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self._add_normalised_country_names(country_feature.lower(),
                                                   full_names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(self.conn, self.make_standard_word(value))
                elif key == 'place':
                    token_info.add_place(self.conn, self.make_standard_word(value))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, self.make_standard_word(value)))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(self.conn, addr_terms)

        return token_info.data

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None and postcode not in self._cache.postcodes:
            term = self.make_standard_word(postcode)
            if not term:
                return

            with self.conn.cursor() as cur:
                # no word_id needed for postcodes
                cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                 search_name_count)
                               (SELECT pc, %s, 'place', 'postcode', 0
                                FROM (VALUES (%s)) as v(pc)
                                WHERE NOT EXISTS
                                 (SELECT * FROM word
                                  WHERE word = pc and class='place' and type='postcode'))
                            """, (' ' + term, postcode))
            self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs


436 """ Collect token information to be sent back to the database.
438 def __init__(self, cache):
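    # Token ids are rendered as PostgreSQL array literals (e.g. '{1,2,3}');
    # _mk_array below produces that format.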
    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))

    def add_names(self, conn, names):
        """ Adds token information for the normalised names.
        """
        # Start with all partial names
        terms = set((part for ns in names for part in ns.split()))
        # Add partials for the full terms (TO BE REMOVED)
        terms.update((n for n in names))
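        # Add the full names; the leading space marks a term as a full-name
        # token in the word table, as opposed to a partial word.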
        terms.update((' ' + n for n in names))

        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        if not street:
            return

        term = ' ' + street

        tid = self.cache.names.get(term)

        if tid is None:
            with conn.cursor() as cur:
                cur.execute("""SELECT word_id FROM word
                                WHERE word_token = %s
                                      and class is null and type is null""",
                            (term, ))
                if cur.rowcount > 0:
                    tid = cur.fetchone()[0]
                    self.cache.names[term] = tid

        if tid is not None:
            self.data['street'] = '{%d}' % tid

    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        if not place:
            return

        partial_ids = self.cache.get_term_tokens(conn, place.split())
        tid = self.cache.get_term_tokens(conn, [' ' + place])

        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
        self.data['place_match'] = '{%s}' % tid[0]

    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, value in terms:
            if not value:
                continue

            partial_ids = self.cache.get_term_tokens(conn, value.split())
            term = ' ' + value
            tid = self.cache.names.get(term)

            if tid is None:
                with conn.cursor() as cur:
                    cur.execute("""SELECT word_id FROM word
                                    WHERE word_token = %s
                                          and class is null and type is null""",
                                (term, ))
                    if cur.rowcount > 0:
                        tid = cur.fetchone()[0]
                        self.cache.names[term] = tid
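
            # Each entry pairs the partial-word tokens (used for searching)
            # with the full-term token (used for matching), mirroring
            # place_search/place_match above.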
            tokens[key] = [self._mk_array(partial_ids),
                           '{%s}' % ('' if tid is None else str(tid))]

        if tokens:
            self.data['addr'] = tokens


535 """ Cache for token information to avoid repeated database queries.
537 This cache is not thread-safe and needs to be instantiated per
542 self.postcodes = set()
543 self.housenumbers = {}
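    # 'names' maps term strings (full names carry a leading space) to word ids,
    # 'postcodes' remembers postcodes already written to the word table and
    # 'housenumbers' caches housenumber token ids.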
    def get_term_tokens(self, conn, terms):
        """ Get token ids for a list of terms, looking them up in the database
            if necessary.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.names.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
                            (askdb, ))
                for term, tid in cur:
                    self.names[term] = tid
                    tokens.append(tid)

        return tokens

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens