git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tools/special_phrases.py
Code cleaning, tests simplification and use of python3-icu package
[nominatim.git] / nominatim / tools / special_phrases.py
index a70d304770e049bedff9acc72cfca5ce06ea61cc..c0f472d65feca9419a34389a47738e3e114975d6 100644 (file)
@@ -2,29 +2,40 @@
     Functions to import special phrases into the database.
 """
 import logging
+import os
 import re
-import sys
+import subprocess
+import json
+from os.path import isfile
+from icu import Transliterator
 from psycopg2.sql import Identifier, Literal, SQL
-from settings.phrase_settings import BLACK_LIST, WHITE_LIST
 from nominatim.tools.exec_utils import get_url
 
 LOG = logging.getLogger()
 
-def import_from_wiki(config, db_connection, languages=None):
+def import_from_wiki(args, db_connection, languages=None):
+    # pylint: disable-msg=too-many-locals
     """
         Iterate through all specified languages and
         extract corresponding special phrases from the wiki.
     """
+    black_list, white_list = _load_white_and_black_lists(args)
+
     #Compile the match regex to increase performance for the following loop.
     occurence_pattern = re.compile(
         r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])'
     )
     sanity_check_pattern = re.compile(r'^\w+$')
 
-    languages = _get_languages(config) if not languages else languages
+    #Get all languages to process.
+    languages = _get_languages(args.config) if not languages else languages
 
     #array for pairs of class/type
-    pairs = dict()
+    class_type_pairs = set()
+
+    transliterator = Transliterator.createFromRules("special-phrases normalizer",
+                                                    args.config.TERM_NORMALIZATION)
+
     for lang in languages:
         LOG.warning('Import phrases for lang: %s', lang)
         wiki_page_xml_content = _get_wiki_content(lang)
@@ -33,6 +44,7 @@ def import_from_wiki(config, db_connection, languages=None):
 
         for match in matches:
             phrase_label = match[0].strip()
+            normalized_label = transliterator.transliterate(phrase_label)
             phrase_class = match[1].strip()
             phrase_type = match[2].strip()
             phrase_operator = match[3].strip()
@@ -43,23 +55,37 @@ def import_from_wiki(config, db_connection, languages=None):
             _check_sanity(lang, phrase_class, phrase_type, sanity_check_pattern)
 
             #blacklisting: disallow certain class/type combinations
-            if phrase_class in BLACK_LIST.keys() and phrase_type in BLACK_LIST[phrase_class]:
+            if phrase_class in black_list.keys() and phrase_type in black_list[phrase_class]:
                 continue
             #whitelisting: if class is in whitelist, allow only tags in the list
-            if phrase_class in WHITE_LIST.keys() and phrase_type not in WHITE_LIST[phrase_class]:
+            if phrase_class in white_list.keys() and phrase_type not in white_list[phrase_class]:
                 continue
 
             #add class/type to the pairs dict
-            pairs[f'{phrase_class}|{phrase_type}'] = (phrase_class, phrase_type)
+            class_type_pairs.add((phrase_class, phrase_type))
 
             _process_amenity(
-                db_connection, phrase_label, phrase_class, phrase_type, phrase_operator
+                db_connection, phrase_label, normalized_label,
+                phrase_class, phrase_type, phrase_operator
             )
 
-    _create_place_classtype_table_and_indexes(db_connection, config, pairs)
+    _create_place_classtype_table_and_indexes(db_connection, args.config, class_type_pairs)
     db_connection.commit()
     LOG.warning('Import done.')
 
+def _load_white_and_black_lists(args):
+    """
+        Load white and black lists from phrase-settings.json.
+    """
+    config = args.config
+    settings_path = str(config.config_dir)+'/phrase-settings.json'
+
+    if config.PHRASE_CONFIG:
+        settings_path = _convert_php_settings_if_needed(args, config.PHRASE_CONFIG)
+
+    with open(settings_path, "r") as json_settings:
+        settings = json.load(json_settings)
+    return settings['blackList'], settings['whiteList']
 
 def _get_languages(config):
     """
@@ -91,34 +117,32 @@ def _check_sanity(lang, phrase_class, phrase_type, pattern):
         Check sanity of given inputs in case somebody added garbage in the wiki.
         If a bad class/type is detected the system will exit with an error.
     """
-    try:
-        if len(pattern.findall(phrase_class)) < 1 or len(pattern.findall(phrase_type)) < 1:
-            sys.exit()
-    except SystemExit:
+    if len(pattern.findall(phrase_class)) < 1 or len(pattern.findall(phrase_type)) < 1:
         LOG.error("Bad class/type for language %s: %s=%s", lang, phrase_class, phrase_type)
-        raise
 
 
-def _process_amenity(db_connection, phrase_label, phrase_class, phrase_type, phrase_operator):
+def _process_amenity(db_connection, phrase_label, normalized_label,
+                     phrase_class, phrase_type, phrase_operator):
+    # pylint: disable-msg=too-many-arguments
     """
         Add phrase lookup and corresponding class and type to the word table based on the operator.
     """
     with db_connection.cursor() as db_cursor:
         if phrase_operator == 'near':
             db_cursor.execute("""SELECT getorcreate_amenityoperator(
-                              make_standard_name(%s), %s, %s, 'near')""",
-                              (phrase_label, phrase_class, phrase_type))
+                              make_standard_name(%s), %s, %s, %s, 'near')""",
+                              (phrase_label, normalized_label, phrase_class, phrase_type))
         elif phrase_operator == 'in':
             db_cursor.execute("""SELECT getorcreate_amenityoperator(
-                              make_standard_name(%s), %s, %s, 'in')""",
-                              (phrase_label, phrase_class, phrase_type))
+                              make_standard_name(%s), %s, %s, %s, 'in')""",
+                              (phrase_label, normalized_label, phrase_class, phrase_type))
         else:
             db_cursor.execute("""SELECT getorcreate_amenity(
-                              make_standard_name(%s), %s, %s)""",
-                              (phrase_label, phrase_class, phrase_type))
+                              make_standard_name(%s), %s, %s, %s)""",
+                              (phrase_label, normalized_label, phrase_class, phrase_type))
 
 
-def _create_place_classtype_table_and_indexes(db_connection, config, pairs):
+def _create_place_classtype_table_and_indexes(db_connection, config, class_type_pairs):
     """
         Create table place_classtype for each given pair.
         Also create indexes on place_id and centroid.
@@ -132,7 +156,7 @@ def _create_place_classtype_table_and_indexes(db_connection, config, pairs):
     with db_connection.cursor() as db_cursor:
         db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
 
-    for _, pair in pairs.items():
+    for pair in class_type_pairs.items():
         phrase_class = pair[0]
         phrase_type = pair[1]
 
@@ -159,43 +183,63 @@ def _create_place_classtype_table(db_connection, sql_tablespace, phrase_class, p
     """
         Create table place_classtype of the given phrase_class/phrase_type if doesn't exit.
     """
+    table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
     with db_connection.cursor() as db_cursor:
-        db_cursor.execute(SQL(f"""
-                CREATE TABLE IF NOT EXISTS {{}} {sql_tablespace
+        db_cursor.execute(SQL("""
+                CREATE TABLE IF NOT EXISTS {{}} {} 
                 AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex 
-                WHERE class = {{}} AND type = {{}}""")
-                          .format(Identifier(f'place_classtype_{phrase_class}_{phrase_type}'),
-                                  Literal(phrase_class), Literal(phrase_type)))
+                WHERE class = {{}} AND type = {{}}""".format(sql_tablespace))
+                          .format(Identifier(table_name), Literal(phrase_class),
+                                  Literal(phrase_type)))
 
 
 def _create_place_classtype_indexes(db_connection, sql_tablespace, phrase_class, phrase_type):
     """
         Create indexes on centroid and place_id for the place_classtype table.
     """
+    index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
+    base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
     #Index on centroid
-    if not db_connection.index_exists(f'idx_place_classtype_{phrase_class}_{phrase_type}_centroid'):
+    if not db_connection.index_exists(index_prefix + 'centroid'):
         with db_connection.cursor() as db_cursor:
-            db_cursor.execute(SQL(f"""
-                    CREATE INDEX {{}} ON {{}} USING GIST (centroid) {sql_tablespace}""")
-                              .format(Identifier(
-                                  f"""idx_place_classtype_{phrase_class}_{phrase_type}_centroid"""),
-                                      Identifier(f'place_classtype_{phrase_class}_{phrase_type}')))
+            db_cursor.execute(SQL("""
+                CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace))
+                              .format(Identifier(index_prefix + 'centroid'),
+                                      Identifier(base_table)), sql_tablespace)
 
     #Index on place_id
-    if not db_connection.index_exists(f'idx_place_classtype_{phrase_class}_{phrase_type}_place_id'):
+    if not db_connection.index_exists(index_prefix + 'place_id'):
         with db_connection.cursor() as db_cursor:
-            db_cursor.execute(SQL(f"""
-            CREATE INDEX {{}} ON {{}} USING btree(place_id) {sql_tablespace}""")
-                              .format(Identifier(
-                                  f"""idx_place_classtype_{phrase_class}_{phrase_type}_place_id"""),
-                                      Identifier(f'place_classtype_{phrase_class}_{phrase_type}')))
+            db_cursor.execute(SQL(
+                """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace))
+                              .format(Identifier(index_prefix + 'place_id'),
+                                      Identifier(base_table)))
 
 
 def _grant_access_to_webuser(db_connection, config, phrase_class, phrase_type):
     """
         Grant access on read to the table place_classtype for the webuser.
     """
+    table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
     with db_connection.cursor() as db_cursor:
         db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
-                          .format(Identifier(f'place_classtype_{phrase_class}_{phrase_type}'),
-                                  Identifier(config.DATABASE_WEBUSER)))
+                          .format(Identifier(table_name), Identifier(config.DATABASE_WEBUSER)))
+
+def _convert_php_settings_if_needed(args, file_path):
+    """
+        Convert php settings file of special phrases to json file if it is still in php format.
+    """
+    file, extension = os.path.splitext(file_path)
+    json_file_path = file + '.json'
+    if extension == '.php' and not isfile(json_file_path):
+        try:
+            subprocess.run(['/usr/bin/env', 'php', '-Cq',
+                            args.phplib_dir / 'migration/phraseSettingsToJson.php',
+                            file_path], check=True)
+            LOG.warning('special_phrase configuration file has been converted to json.')
+            return json_file_path
+        except subprocess.CalledProcessError:
+            LOG.error('Error while converting %s to json.', file_path)
+            raise
+    else:
+        return json_file_path