Helper class to create ICU rules from a configuration file.
"""
import io
-import yaml
import logging
from collections import defaultdict
import itertools
+import yaml
from icu import Transliterator
from nominatim.errors import UsageError
def __init__(self, configfile):
self.configfile = configfile
+ self.compound_suffixes = set()
+ self.abbreviations = defaultdict(list)
if configfile.suffix == '.yaml':
self._load_from_yaml()
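# Illustrative sketch (not part of the patch): a minimal _load_from_yaml()
# could fill the two new attributes from the config file. The key names
# 'compound_suffixes' and 'abbreviations' are assumptions for this example.
from collections import defaultdict
import yaml

def load_rules_sketch(configfile):
    rules = yaml.safe_load(configfile.read_text(encoding='utf-8'))
    compound_suffixes = set(rules.get('compound_suffixes', []))
    abbreviations = defaultdict(list)
    for full, abbrs in rules.get('abbreviations', {}).items():
        abbreviations[full].extend(abbrs)
    return compound_suffixes, abbreviations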
suffixes.add(suffix)
suffixes.update(self.abbreviations.get(suffix, []))
- for suffix in sorted(suffixes, key=lambda x:len(x), reverse=True):
+ for suffix in sorted(suffixes, key=len, reverse=True):
rules.write("'{0} ' > ' {0} ';".format(suffix))
# Finally add transliteration.
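# Illustrative sketch (not part of the patch): why the suffixes are sorted
# longest-first. ICU tries rules in the order they appear in the rule string,
# so a shorter suffix would otherwise split a longer one too early. The
# suffix list below is made up for this demonstration.
from icu import Transliterator

demo_rules = ''.join("'{0} ' > ' {0} ';".format(s)
                     for s in sorted(['weg', 'steinweg'], key=len, reverse=True))
split = Transliterator.createFromRules("compound_split", demo_rules)
# 'steinweg' wins because its rule comes first in the rule string:
assert split.transliterate("mühlensteinweg ") == "mühlen steinweg "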
synonyms[abbr + ' '].add(' ' + abbr + ' ')
# sort the resulting list by descending length (longer matches are preferred).
- sorted_keys = sorted(synonyms.keys(), key=lambda x: len(x), reverse=True)
+ sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
return [(k, list(synonyms[k])) for k in sorted_keys]
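# Illustrative sketch (not part of the patch): the shape of the returned
# replacement list. Keys keep their trailing space, and longer keys come
# first so downstream consumers match greedily. The entries are made up.
from collections import defaultdict

synonyms = defaultdict(set)
synonyms['avenue '].update((' avenue ', ' ave '))
synonyms['ave '].update((' ave ', ' avenue '))
sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
assert sorted_keys == ['avenue ', 'ave ']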
libICU instead of the PostgreSQL module.
"""
from collections import Counter
-import functools
import io
import itertools
-import json
import logging
import re
from textwrap import dedent
from pathlib import Path
-from icu import Transliterator
import psycopg2.extras
from nominatim.db.connection import connect
"""
self.init_from_project()
- if self.normalization is None\
- or self.transliteration is None\
- or self.abbreviations is None:
+ if self.naming_rules is None:
return "Configuration for tokenizer 'legacy_icu' are missing."
return None
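# Illustrative sketch (not part of the patch): check_database() follows the
# "error string or None" convention, so a hypothetical caller looks like:
from nominatim.errors import UsageError

def check_or_fail(tokenizer):
    error_msg = tokenizer.check_database()
    if error_msg is not None:
        raise UsageError(error_msg)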
for label, cls, typ, oper in cur:
existing_phrases.add((label, cls, typ, oper or '-'))
- to_add = norm_phrases - existing_phrases
- to_delete = existing_phrases - norm_phrases
-
- if to_add:
- copystr = io.StringIO()
- for word, cls, typ, oper in to_add:
- term = self.name_processor.get_search_normalized(word)
- if term:
- copystr.write(word)
- copystr.write('\t ')
- copystr.write(term)
- copystr.write('\t')
- copystr.write(cls)
- copystr.write('\t')
- copystr.write(typ)
- copystr.write('\t')
- copystr.write(oper if oper in ('in', 'near') else '\\N')
- copystr.write('\t0\n')
+ added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
+ if should_replace:
+ deleted = self._remove_special_phrases(cur, norm_phrases,
+ existing_phrases)
+ else:
+ deleted = 0
- copystr.seek(0)
- cur.copy_from(copystr, 'word',
- columns=['word', 'word_token', 'class', 'type',
- 'operator', 'search_name_count'])
+ LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
+ len(norm_phrases), added, deleted)
- if to_delete and should_replace:
- psycopg2.extras.execute_values(
- cur,
- """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
- WHERE word = name and class = in_class and type = in_type
- and ((op = '-' and operator is null) or op = operator)""",
- to_delete)
- LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
- len(norm_phrases), len(to_add), len(to_delete))
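# Illustrative sketch (not part of the patch): the phrase tuples passed to
# update_special_phrases() are (label, class, type, operator), where '-'
# means "no operator". The analyzer setup is assumed for this example.
with tokenizer.name_analyzer() as analyzer:
    analyzer.update_special_phrases(
        [("Pharmacy", "amenity", "pharmacy", "in"),
         ("Pharmacies", "amenity", "pharmacy", "-")],
        should_replace=True)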
+ def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+ """ Add all phrases to the database that are not yet there.
+ """
+ to_add = new_phrases - existing_phrases
+
+ copystr = io.StringIO()
+ added = 0
+ for word, cls, typ, oper in to_add:
+ term = self.name_processor.get_search_normalized(word)
+ if term:
+ copystr.write(word)
+ copystr.write('\t ')
+ copystr.write(term)
+ copystr.write('\t')
+ copystr.write(cls)
+ copystr.write('\t')
+ copystr.write(typ)
+ copystr.write('\t')
+ copystr.write(oper if oper in ('in', 'near') else '\\N')
+ copystr.write('\t0\n')
+ added += 1
+
+ if copystr.tell() > 0:
+ copystr.seek(0)
+ cursor.copy_from(copystr, 'word',
+ columns=['word', 'word_token', 'class', 'type',
+ 'operator', 'search_name_count'])
+
+ return added
+
+
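# Illustrative sketch (not part of the patch): _add_special_phrases() relies
# on psycopg2's copy_from() defaults, where fields are tab-separated and
# '\N' marks SQL NULL. The same pattern in isolation, assuming the values
# contain no tabs or backslashes:
import io

def copy_rows(cursor, table, columns, rows):
    buf = io.StringIO()
    for row in rows:
        # None must be serialized as \N so COPY stores a NULL.
        buf.write('\t'.join('\\N' if col is None else str(col)
                            for col in row))
        buf.write('\n')
    buf.seek(0)
    cursor.copy_from(buf, table, columns=columns)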
+ def _remove_special_phrases(self, cursor, new_phrases, existing_phrases):
+ """ Remove all phrases from the databse that are no longer in the
+ new phrase list.
+ """
+ to_delete = existing_phrases - new_phrases
+
+ if to_delete:
+ psycopg2.extras.execute_values(
+ cursor,
+ """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+ WHERE word = name and class = in_class and type = in_type
+ and ((op = '-' and operator is null) or op = operator)""",
+ to_delete)
+
+ return len(to_delete)
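# Illustrative sketch (not part of the patch): psycopg2.extras.execute_values
# expands the single %s placeholder into a multi-row VALUES list, so the
# DELETE above needs only one round-trip. Minimal demonstration (table and
# data are made up):
import psycopg2.extras

def demo_bulk_delete(cursor):
    psycopg2.extras.execute_values(
        cursor,
        "DELETE FROM demo USING (VALUES %s) AS v(k) WHERE demo.key = v.k",
        [('a',), ('b',), ('c',)])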
def add_country_names(self, country_code, names):
return full_tokens, partial_tokens
- def _compute_full_names(self, names):
+ @staticmethod
+ def _compute_full_names(names):
""" Return the set of all full name word ids to be used with the
given dictionary of names.
"""
self.data['hnr'] = ';'.join(hnrs)
- def add_street(self, fulls, partials):
+ def add_street(self, fulls, _):
""" Add addr:street match terms.
"""
if fulls:
def test_add_country_names(analyzer, word_table, make_standard_name):
- analyzer.add_country_names('de', ['Germany', 'Deutschland', 'germany'])
+ analyzer.add_country_names('de', {'name': 'Germany',
+ 'name:de': 'Deutschland',
+ 'short_name': 'germany'})
assert word_table.get_country() \
== {('de', ' #germany#'),
word_table.add_country('it', ' #italy#')
word_table.add_country('it', ' #itala#')
- analyzer.add_country_names('it', ['Italy', 'IT'])
+ analyzer.add_country_names('it', {'name': 'Italy', 'ref': 'IT'})
assert word_table.get_country() \
== {('fr', ' #france#'),
def test_update_special_phrase_empty_table(analyzer, word_table):
with analyzer() as anl:
anl.update_special_phrases([
- ("König bei", "amenity", "royal", "near"),
- ("Könige", "amenity", "royal", "-"),
+ ("König bei", "amenity", "royal", "near"),
+ ("Könige ", "amenity", "royal", "-"),
("street", "highway", "primary", "in")
], True)
assert word_table.get_special() \
- == {(' KÖNIG BEI', 'könig bei', 'amenity', 'royal', 'near'),
- (' KÖNIGE', 'könige', 'amenity', 'royal', None),
+ == {(' KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'),
+ (' KÖNIGE', 'Könige', 'amenity', 'royal', None),
(' STREET', 'street', 'highway', 'primary', 'in')}