1 """
2 Tokenizer implementing normalisation as used before Nominatim 4 but using
3 libICU instead of the PostgreSQL module.
4 """
5 from collections import Counter
6 import io
7 import itertools
8 import logging
9 import re
10 from textwrap import dedent
11 from pathlib import Path
12
13 import psycopg2.extras
14
15 from nominatim.db.connection import connect
16 from nominatim.db.properties import set_property, get_property
17 from nominatim.db.sql_preprocessor import SQLPreprocessor
18 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
19 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
20
21 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
22 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
23
24 LOG = logging.getLogger()
25
26 def create(dsn, data_dir):
27     """ Create a new instance of the tokenizer provided by this module.
28     """
29     return LegacyICUTokenizer(dsn, data_dir)
30
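# Minimal usage sketch (illustrative only; dsn, project_dir, config and place
# are placeholders for values normally supplied by the Nominatim tooling):
#
#   tokenizer = create(dsn, project_dir)
#   tokenizer.init_new_db(config)       # or init_from_project() for an existing DB
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)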

class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must make sure
            to call close() before the analyzer is discarded.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, phpdir)))


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
            with conn.cursor(name="words") as cur:
                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                for name, cnt in cur:
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        for term in word.split():
                            words[term] += cnt

            # copy them back into the word table
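            # Illustrative format only: each line of the COPY payload is
            # "<token>\t<count>", e.g. "foo\t23", matching the
            # (word_token, search_name_count) columns used below.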
            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))

            with conn.cursor() as cur:
                copystr.seek(0)
                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null""")

            conn.commit()


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is taken as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not guaranteed to be efficient.
        """
        tokens = {}
        for word in words:
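            # Full-name tokens are stored with a leading blank in the word table,
            # so hash-prefixed words are looked up with that blank prepended.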
            if word.startswith('#'):
                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
                tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
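        # For example, " ab1 2cd " becomes "AB1 2CD".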
        return postcode.strip().upper()


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        copystr = io.StringIO()
        with self.conn.cursor() as cur:
            # Find postcodes that exist in only one of location_postcode and
            # word, i.e. rows that are missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

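            # Each COPY line below is "<postcode>\t <token>\tplace\tpostcode\t0";
            # the blank after the first tab gives the token the leading space
            # used for full-name tokens elsewhere in this tokenizer.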
            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    copystr.write(postcode)
                    copystr.write('\t ')
                    copystr.write(self.name_processor.get_search_normalized(postcode))
                    copystr.write('\tplace\tpostcode\t0\n')

            if to_delete:
                cur.execute("""DELETE FROM word
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))

            if copystr.getvalue():
                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'search_name_count'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
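        # Each phrase is a (label, class, type, operator) tuple; labels are
        # normalised before being compared with what is already in the word table.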
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        copystr = io.StringIO()
        added = 0
        for word, cls, typ, oper in to_add:
            term = self.name_processor.get_search_normalized(word)
            if term:
                copystr.write(word)
                copystr.write('\t ')
                copystr.write(term)
                copystr.write('\t')
                copystr.write(cls)
                copystr.write('\t')
                copystr.write(typ)
                copystr.write('\t')
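                # Operators other than 'in' and 'near' are stored as NULL,
                # written as \N in the COPY text format.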
                copystr.write(oper if oper in ('in', 'near') else '\\N')
                copystr.write('\t0\n')
                added += 1

        if copystr.tell() > 0:
            copystr.seek(0)
            cursor.copy_from(copystr, 'word',
                             columns=['word', 'word_token', 'class', 'type',
                                      'operator', 'search_name_count'])

        return added


    def _remove_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            psycopg2.extras.execute_values(
                cursor,
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE word = name and class = in_class and type = in_type
                          and ((op = '-' and operator is null) or op = operator)""",
                to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            if name:
                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            if word_tokens:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(word_tokens),))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
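        # Illustrative example only; the exact keys depend on the place data:
        #   {'names': '{1,2,3}', 'hnr_tokens': '{4}', 'hnr': '12;14',
        #    'street': '{5}', 'addr': {'city': ['{6,7}', '{6}']}}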
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(*self._compute_name_tokens({'name': value}))
                elif key == 'place':
                    token_info.add_place(*self._compute_name_tokens({'name': value}))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(addr_terms)

        return token_info.data


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
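                # getorcreate_full_word() is expected to return the id of the
                # full-name token together with the ids of all partial tokens.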
                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
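        # For example, {'name': 'Hauptstraße (Altstadt);Main Street'} yields
        # {'Hauptstraße (Altstadt)', 'Hauptstraße', 'Main Street'}.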
        full_names = set()
        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
            full_names.add(name.strip())

            brace_idx = name.find('(')
            if brace_idx >= 0:
                full_names.add(name[:brace_idx].strip())

        return full_names


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                     search_name_count)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE word = pc and class='place' and type='postcode'))
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
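        # For example, ['1;2b', '3'] is split into ['1', '2b', '3']
        # (order not guaranteed because duplicates are removed via a set).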
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
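        # Render token ids as a PostgreSQL array literal, e.g. [1, 2, 3] -> '{1,2,3}'.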
        return '{%s}' % ','.join((str(s) for s in tokens))


    def add_names(self, fulls, partials):
        """ Add token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)


    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens