]> git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
authorSarah Hoffmann <lonvia@denofr.de>
Mon, 29 Mar 2021 10:10:25 +0000 (12:10 +0200)
committerSarah Hoffmann <lonvia@denofr.de>
Mon, 29 Mar 2021 10:10:25 +0000 (12:10 +0200)
36 files changed:
.github/actions/build-nominatim/action.yml
.github/workflows/ci-tests.yml
.gitignore
.pylintrc
CMakeLists.txt
docs/admin/Import.md
docs/admin/Installation.md
lib-php/SearchDescription.php
lib-php/admin/specialphrases.php
lib-php/migration/PhraseSettingsToJson.php [new file with mode: 0644]
lib-sql/indices.sql
lib-sql/tables.sql
nominatim/cli.py
nominatim/clicmd/__init__.py
nominatim/clicmd/api.py
nominatim/clicmd/special_phrases.py [new file with mode: 0644]
nominatim/indexer/progress.py
nominatim/tools/special_phrases.py [new file with mode: 0644]
settings/env.defaults
settings/phrase-settings.json [new file with mode: 0644]
test/bdd/steps/nominatim_environment.py
test/bdd/steps/steps_db_ops.py
test/bdd/steps/steps_osm_data.py
test/python/conftest.py
test/python/sample.tar.gz [new file with mode: 0644]
test/python/test_cli.py
test/python/test_tools_exec_utils.py
test/python/test_tools_import_special_phrases.py [new file with mode: 0644]
test/testdata/special_phrases_test_content.txt [new file with mode: 0644]
test/testfiles/phrase-settings.json [new file with mode: 0644]
test/testfiles/phrase_settings.php [moved from settings/phrase_settings.php with 100% similarity]
test/testfiles/random_file.html [new file with mode: 0644]
vagrant/Install-on-Centos-7.sh
vagrant/Install-on-Centos-8.sh
vagrant/Install-on-Ubuntu-18.sh
vagrant/Install-on-Ubuntu-20.sh

index 414783d96f26fe19e2bc2a70a67f5502bb953343..d0a89774637eb9238de77f767daa4451d047e34b 100644 (file)
@@ -6,7 +6,7 @@ runs:
     steps:
         - name: Install prerequisites
           run: |
-            sudo apt-get install -y -qq libboost-system-dev libboost-filesystem-dev libexpat1-dev zlib1g-dev libbz2-dev libpq-dev libproj-dev python3-psycopg2 python3-pyosmium python3-dotenv python3-psutil python3-jinja2
+            sudo apt-get install -y -qq libboost-system-dev libboost-filesystem-dev libexpat1-dev zlib1g-dev libbz2-dev libpq-dev libproj-dev libicu-dev python3-psycopg2 python3-pyosmium python3-dotenv python3-psutil python3-jinja2 python3-icu
           shell: bash
 
         - name: Download dependencies
index e0e68a9c42303af52b52bf0a0a76695ef9ad552c..2f920a660f95c3f19fd7b8ad4ca1fcd542c243ca 100644 (file)
@@ -120,7 +120,7 @@ jobs:
               working-directory: data-env
 
             - name: Import special phrases
-              run: nominatim special-phrases --from-wiki | psql -d nominatim
+              run: nominatim special-phrases --import-from-wiki
               working-directory: data-env
 
             - name: Check import
index 23fb34a61a66a953fd06af91a94ffdf67f26a7ee..44b8eb32203dfeb897a8f4d3a1243c7a536b6ee1 100644 (file)
@@ -9,3 +9,4 @@ data/wiki_specialphrases.sql
 data/osmosischange.osc
 
 .vagrant
+data/country_osm_grid.sql.gz
index da6dbe0376cd5285a6cc206f1c0f5beaadad0a79..eab041818058526209a36951c902e615095d7d23 100644 (file)
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,6 +1,7 @@
 [MASTER]
 
 extension-pkg-whitelist=osmium
+ignored-modules=icu
 
 [MESSAGES CONTROL]
 
index fdc67ffe86c6c393aa77c74abed0b0c3f319f24b..40140056a1d99e9355ae1e9656bf70ee1523e782 100644 (file)
@@ -114,7 +114,6 @@ if (BUILD_IMPORTER)
        export.php
        query.php
        setup.php
-       specialphrases.php
        update.php
        warm.php
       )
@@ -280,7 +279,7 @@ endif()
 
 install(FILES settings/env.defaults
               settings/address-levels.json
-              settings/phrase_settings.php
+              settings/phrase-settings.json
               settings/import-admin.style
               settings/import-street.style
               settings/import-address.style
index ef0da0be5e88135403c0007725e1f49ff7456927..e3a32481819d6a6b0cc93528f8c77b91eea309ab 100644 (file)
@@ -268,10 +268,9 @@ running this function.
 
 If you want to be able to search for places by their type through
 [special key phrases](https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases)
-you also need to enable these key phrases like this:
+you also need to import these key phrases like this:
 
-    nominatim special-phrases --from-wiki > specialphrases.sql
-    psql -d nominatim -f specialphrases.sql
+    nominatim special-phrases --import-from-wiki
 
 Note that this command downloads the phrases from the wiki link above. You
 need internet access for the step.
index eadaaff1fc635704f06fa20c45528ce7fe85eaba..6237a9d4a37b9026c4d1dd3938a6f3cb76e945b1 100644 (file)
@@ -30,6 +30,7 @@ For compiling:
   * [proj](https://proj.org/)
   * [bzip2](http://www.bzip.org/)
   * [zlib](https://www.zlib.net/)
+  * [ICU](http://site.icu-project.org/)
   * [Boost libraries](https://www.boost.org/), including system and filesystem
   * PostgreSQL client libraries
   * a recent C++ compiler (gcc 5+ or Clang 3.8+)
@@ -43,6 +44,7 @@ For running Nominatim:
   * [Python Dotenv](https://github.com/theskumar/python-dotenv)
   * [psutil](https://github.com/giampaolo/psutil)
   * [Jinja2](https://palletsprojects.com/p/jinja/)
+  * [PyICU](https://pypi.org/project/PyICU/)
   * [PHP](https://php.net) (7.0 or later)
   * PHP-pgsql
   * PHP-intl (bundled with PHP)
index 228b099917d6377f102a2987bf9e6f76af68058b..2b39443f6e79f1b86b1571520ac0c1d4369cf6bc 100644 (file)
@@ -614,7 +614,7 @@ class SearchDescription
         // too many results are expected for the street, i.e. if the result
         // will be narrowed down by an address. Remeber that with ordering
         // every single result has to be checked.
-        if ($this->sHouseNumber && (!empty($this->aAddress) || $this->sPostcode)) {
+        if ($this->sHouseNumber && ($this->bRareName || !empty($this->aAddress) || $this->sPostcode)) {
             $sHouseNumberRegex = '\\\\m'.$this->sHouseNumber.'\\\\M';
             $aOrder[] = ' (';
             $aOrder[0] .= 'EXISTS(';
index 8d2d91296e8231ec35dd7e558952b6c3864c1db6..84bcfb5c949ad51da1f019372a99546ec618310e 100644 (file)
+
 <?php
 @define('CONST_LibDir', dirname(dirname(__FILE__)));
 
 require_once(CONST_LibDir.'/init-cmd.php');
-ini_set('memory_limit', '800M');
-ini_set('display_errors', 'stderr');
-
-$aCMDOptions
-= array(
-   'Import and export special phrases',
-   array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
-   array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
-   array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
-   array('wiki-import', '', 0, 1, 0, 0, 'bool', 'Create import script for search phrases '),
-   array('project-dir', '', 0, 1, 1, 1, 'realpath', 'Base directory of the Nominatim installation (default: .)'),
-  );
-getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
-
-loadSettings($aCMDResult['project-dir'] ?? getcwd());
-setupHTTPProxy();
-
-include(getSettingConfig('PHRASE_CONFIG', 'phrase_settings.php'));
-
-if ($aCMDResult['wiki-import']) {
-    $oNormalizer = Transliterator::createFromRules(getSetting('TERM_NORMALIZATION'));
-    $aPairs = array();
-
-    $sLanguageIn = getSetting(
-        'LANGUAGES',
-        'af,ar,br,ca,cs,de,en,es,et,eu,fa,fi,fr,gl,hr,hu,'.
-        'ia,is,it,ja,mk,nl,no,pl,ps,pt,ru,sk,sl,sv,uk,vi'
-    );
-
-    foreach (explode(',', $sLanguageIn) as $sLanguage) {
-        $sURL = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/'.strtoupper($sLanguage);
-        $sWikiPageXML = file_get_contents($sURL);
-
-        if (!preg_match_all(
-            '#\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([\\-YN])#',
-            $sWikiPageXML,
-            $aMatches,
-            PREG_SET_ORDER
-        )) {
-            continue;
-        }
-
-        foreach ($aMatches as $aMatch) {
-            $sLabel = trim($aMatch[1]);
-            if ($oNormalizer !== null) {
-                $sTrans = pg_escape_string($oNormalizer->transliterate($sLabel));
-            } else {
-                $sTrans = null;
-            }
-            $sClass = trim($aMatch[2]);
-            $sType = trim($aMatch[3]);
-            // hack around a bug where building=yes was imported with
-            // quotes into the wiki
-            $sType = preg_replace('/(&quot;|")/', '', $sType);
-            // sanity check, in case somebody added garbage in the wiki
-            if (preg_match('/^\\w+$/', $sClass) < 1
-                || preg_match('/^\\w+$/', $sType) < 1
-            ) {
-                trigger_error("Bad class/type for language $sLanguage: $sClass=$sType");
-                exit;
-            }
-            // blacklisting: disallow certain class/type combinations
-            if (isset($aTagsBlacklist[$sClass]) && in_array($sType, $aTagsBlacklist[$sClass])) {
-                // fwrite(STDERR, "Blacklisted: ".$sClass."/".$sType."\n");
-                continue;
-            }
-            // whitelisting: if class is in whitelist, allow only tags in the list
-            if (isset($aTagsWhitelist[$sClass]) && !in_array($sType, $aTagsWhitelist[$sClass])) {
-                // fwrite(STDERR, "Non-Whitelisted: ".$sClass."/".$sType."\n");
-                continue;
-            }
-            $aPairs[$sClass.'|'.$sType] = array($sClass, $sType);
-
-            switch (trim($aMatch[4])) {
-                case 'near':
-                    printf(
-                        "SELECT getorcreate_amenityoperator(make_standard_name('%s'), '%s', '%s', '%s', 'near');\n",
-                        pg_escape_string($sLabel),
-                        $sTrans,
-                        $sClass,
-                        $sType
-                    );
-                    break;
-                case 'in':
-                    printf(
-                        "SELECT getorcreate_amenityoperator(make_standard_name('%s'), '%s', '%s', '%s', 'in');\n",
-                        pg_escape_string($sLabel),
-                        $sTrans,
-                        $sClass,
-                        $sType
-                    );
-                    break;
-                default:
-                    printf(
-                        "SELECT getorcreate_amenity(make_standard_name('%s'), '%s', '%s', '%s');\n",
-                        pg_escape_string($sLabel),
-                        $sTrans,
-                        $sClass,
-                        $sType
-                    );
-                    break;
-            }
-        }
-    }
-
-    echo 'CREATE INDEX idx_placex_classtype ON placex (class, type);';
-
-    foreach ($aPairs as $aPair) {
-        $sql_tablespace = getSetting('TABLESPACE_AUX_DATA');
-        if ($sql_tablespace) {
-            $sql_tablespace = ' TABLESPACE '.$sql_tablespace;
-        }
-
-        printf(
-            'CREATE TABLE place_classtype_%s_%s'
-            . $sql_tablespace
-            . ' AS'
-            . ' SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex'
-            . " WHERE class = '%s' AND type = '%s'"
-            . ";\n",
-            pg_escape_string($aPair[0]),
-            pg_escape_string($aPair[1]),
-            pg_escape_string($aPair[0]),
-            pg_escape_string($aPair[1])
-        );
-
-        printf(
-            'CREATE INDEX idx_place_classtype_%s_%s_centroid'
-            . ' ON place_classtype_%s_%s USING GIST (centroid)'
-            . $sql_tablespace
-            . ";\n",
-            pg_escape_string($aPair[0]),
-            pg_escape_string($aPair[1]),
-            pg_escape_string($aPair[0]),
-            pg_escape_string($aPair[1])
-        );
-
-        printf(
-            'CREATE INDEX idx_place_classtype_%s_%s_place_id'
-            . ' ON place_classtype_%s_%s USING btree(place_id)'
-            . $sql_tablespace
-            . ";\n",
-            pg_escape_string($aPair[0]),
-            pg_escape_string($aPair[1]),
-            pg_escape_string($aPair[0]),
-            pg_escape_string($aPair[1])
-        );
 
-        printf(
-            'GRANT SELECT ON place_classtype_%s_%s TO "%s"'
-            . ";\n",
-            pg_escape_string($aPair[0]),
-            pg_escape_string($aPair[1]),
-            getSetting('DATABASE_WEBUSER')
-        );
-    }
+loadSettings(getcwd());
 
-    echo 'DROP INDEX idx_placex_classtype;';
-}
+(new \Nominatim\Shell(getSetting('NOMINATIM_TOOL')))
+    ->addParams('special-phrases', '--import-from-wiki')
+    ->run();
diff --git a/lib-php/migration/PhraseSettingsToJson.php b/lib-php/migration/PhraseSettingsToJson.php
new file mode 100644 (file)
index 0000000..15c49f0
--- /dev/null
@@ -0,0 +1,19 @@
+<?php
+// Migration helper: converts a legacy PHP phrase settings file into JSON.
+//
+// Usage: php PhraseSettingsToJson.php <path/to/phrase_settings.php>
+//
+// The JSON file is written next to the PHP file, using the same base name
+// with a '.json' extension. An already existing JSON file is never
+// overwritten. Exits with a non-zero status when no file name is given or
+// when the JSON file cannot be written.
+
+if (!isset($argv[1])) {
+    fwrite(STDERR, "Usage: PhraseSettingsToJson.php <phrase settings file>\n");
+    exit(1);
+}
+
+$phpPhraseSettingsFile = $argv[1];
+$jsonPhraseSettingsFile = dirname($phpPhraseSettingsFile).'/'.basename($phpPhraseSettingsFile, '.php').'.json';
+
+if (file_exists($phpPhraseSettingsFile) && !file_exists($jsonPhraseSettingsFile)) {
+    // The settings file is expected to define $aTagsBlacklist and/or
+    // $aTagsWhitelist in the current scope.
+    include $phpPhraseSettingsFile;
+
+    // Always emit both keys, and always as JSON objects: an unset or empty
+    // PHP array would otherwise be dropped or encoded as '[]'.
+    $data = array(
+             'blackList' => (object) (isset($aTagsBlacklist) ? $aTagsBlacklist : array()),
+             'whiteList' => (object) (isset($aTagsWhitelist) ? $aTagsWhitelist : array()),
+            );
+
+    $json = json_encode($data);
+    if ($json === false || file_put_contents($jsonPhraseSettingsFile, $json) === false) {
+        fwrite(STDERR, "Cannot write JSON settings to ".$jsonPhraseSettingsFile."\n");
+        exit(1);
+    }
+}
index cb77e02b60b8ca0c9525e0af3ae6327888955543..f8c9d2ce86511ef993231507883aa64998ef2eee 100644 (file)
@@ -35,9 +35,6 @@ CREATE INDEX {{sql.if_index_not_exists}} idx_osmline_parent_place_id
 CREATE INDEX {{sql.if_index_not_exists}} idx_osmline_parent_osm_id
   ON location_property_osmline USING BTREE (osm_id) {{db.tablespace.search_index}};
 
-CREATE UNIQUE INDEX {{sql.if_index_not_exists}} idx_postcode_id
-  ON location_postcode USING BTREE (place_id) {{db.tablespace.search_index}};
-
 CREATE INDEX {{sql.if_index_not_exists}} idx_postcode_postcode
   ON location_postcode USING BTREE (postcode) {{db.tablespace.search_index}};
 
index 0895c6dd3b83b812fb0575384c2ea85ff5f47f4e..329eb7a1ab9491f8fb063b92747dd0abfcf9c2d2 100644 (file)
@@ -209,6 +209,7 @@ CREATE TABLE location_postcode (
   postcode TEXT,
   geometry GEOMETRY(Geometry, 4326)
   );
+CREATE UNIQUE INDEX idx_postcode_id ON location_postcode USING BTREE (place_id) {{db.tablespace.search_index}};
 CREATE INDEX idx_postcode_geometry ON location_postcode USING GIST (geometry) {{db.tablespace.address_index}};
 GRANT SELECT ON location_postcode TO "{{config.DATABASE_WEBUSER}}" ;
 
index b3d9eee6d735742ef61941379a00ca5477930d68..0876d9a44df960e9589ca8577521e58d9d56cc50 100644 (file)
@@ -112,30 +112,6 @@ class CommandlineParser:
 # pylint: disable=C0111
 # Using non-top-level imports to make pyosmium optional for replication only.
 # pylint: disable=E0012,C0415
-
-
-class SetupSpecialPhrases:
-    """\
-    Maintain special phrases.
-    """
-
-    @staticmethod
-    def add_args(parser):
-        group = parser.add_argument_group('Input arguments')
-        group.add_argument('--from-wiki', action='store_true',
-                           help='Pull special phrases from the OSM wiki.')
-        group = parser.add_argument_group('Output arguments')
-        group.add_argument('-o', '--output', default='-',
-                           help="""File to write the preprocessed phrases to.
-                                   If omitted, it will be written to stdout.""")
-
-    @staticmethod
-    def run(args):
-        if args.output != '-':
-            raise NotImplementedError('Only output to stdout is currently implemented.')
-        return run_legacy_script('specialphrases.php', '--wiki-import', nominatim_env=args)
-
-
 class UpdateAddData:
     """\
     Add additional data from a file or an online source.
@@ -278,7 +254,7 @@ def nominatim(**kwargs):
     parser.add_subcommand('freeze', clicmd.SetupFreeze)
     parser.add_subcommand('replication', clicmd.UpdateReplication)
 
-    parser.add_subcommand('special-phrases', SetupSpecialPhrases)
+    parser.add_subcommand('special-phrases', clicmd.ImportSpecialPhrases)
 
     parser.add_subcommand('add-data', UpdateAddData)
     parser.add_subcommand('index', clicmd.UpdateIndex)
index 9101e0c08973cc7877849d1b9248e2788cd8457b..ca64f3635031a449f5f9112a0906ccb70de882ce 100644 (file)
@@ -10,3 +10,4 @@ from .refresh import UpdateRefresh
 from .admin import AdminFuncs
 from .freeze import SetupFreeze
 from .transition import AdminTransition
+from .special_phrases import ImportSpecialPhrases
index e50c00dc7f6b836d32cb8e1e0cbf15daa31c488e..7185d97c14bf659f406e56d6c1b10e916467a278 100644 (file)
@@ -154,7 +154,7 @@ class APIReverse:
 
 class APILookup:
     """\
-    Execute API reverse query.
+    Execute API lookup query.
     """
 
     @staticmethod
@@ -189,7 +189,7 @@ class APILookup:
 
 class APIDetails:
     """\
-    Execute API lookup query.
+    Execute API details query.
     """
 
     @staticmethod
diff --git a/nominatim/clicmd/special_phrases.py b/nominatim/clicmd/special_phrases.py
new file mode 100644 (file)
index 0000000..99e8259
--- /dev/null
@@ -0,0 +1,31 @@
+"""
+    Implementation of the 'special-phrases' command.
+"""
+import logging
+from nominatim.tools.special_phrases import SpecialPhrasesImporter
+from nominatim.db.connection import connect
+
+LOG = logging.getLogger()
+
+# Do not repeat documentation of subcommand classes.
+# pylint: disable=C0111
+
+class ImportSpecialPhrases:
+    """\
+    Import special phrases.
+    """
+    @staticmethod
+    def add_args(parser):
+        input_group = parser.add_argument_group('Input arguments')
+        input_group.add_argument(
+            '--import-from-wiki', action='store_true',
+            help='Import special phrases from the OSM wiki to the database.')
+
+    @staticmethod
+    def run(args):
+        # Without the flag there is nothing to import.
+        if not args.import_from_wiki:
+            return 0
+
+        LOG.warning('Special phrases importation starting')
+        dsn = args.config.get_libpq_dsn()
+        with connect(dsn) as db_connection:
+            importer = SpecialPhrasesImporter(args.config, args.phplib_dir, db_connection)
+            importer.import_from_wiki()
+
+        return 0
index c9d8816be989fb99675341a512c6806efcf06465..177e67b812aef0ea05116928c214ed5434f5a622 100644 (file)
@@ -57,8 +57,14 @@ class ProgressLogger:
         """ Print final statistics about the progress.
         """
         rank_end_time = datetime.now()
-        diff_seconds = (rank_end_time-self.rank_start_time).total_seconds()
+
+        if rank_end_time == self.rank_start_time:
+            diff_seconds = 0
+            places_per_sec = self.done_places
+        else:
+            diff_seconds = (rank_end_time - self.rank_start_time).total_seconds()
+            places_per_sec = self.done_places/diff_seconds
 
         LOG.warning("Done %d/%d in %d @ %.3f per second - FINISHED %s\n",
                     self.done_places, self.total_places, int(diff_seconds),
-                    self.done_places/diff_seconds, self.name)
+                    places_per_sec, self.name)
diff --git a/nominatim/tools/special_phrases.py b/nominatim/tools/special_phrases.py
new file mode 100644 (file)
index 0000000..fd46a18
--- /dev/null
@@ -0,0 +1,278 @@
+"""
+    Functions to import special phrases into the database.
+"""
+import logging
+import os
+from pathlib import Path
+import re
+import subprocess
+import json
+from os.path import isfile
+from icu import Transliterator
+from psycopg2.sql import Identifier, Literal, SQL
+from nominatim.tools.exec_utils import get_url
+from nominatim.errors import UsageError
+
+LOG = logging.getLogger()
+class SpecialPhrasesImporter():
+    # pylint: disable-msg=too-many-instance-attributes
+    # pylint: disable-msg=too-few-public-methods
+    """
+        Class handling the process of special phrases importations.
+    """
+    def __init__(self, config, phplib_dir, db_connection) -> None:
+        """
+            Set up the importer.
+
+            config is the configuration object the settings are read from,
+            phplib_dir the directory containing the PHP helper scripts and
+            db_connection the open database connection used for all
+            statements. The black and white lists are loaded immediately,
+            so a missing or invalid settings file already fails here.
+        """
+        self.db_connection = db_connection
+        self.config = config
+        self.phplib_dir = phplib_dir
+        self.black_list, self.white_list = self._load_white_and_black_lists()
+        #Compile the regex here to increase performances.
+        #One wiki table row: | label || class || type || operator || plural flag
+        self.occurence_pattern = re.compile(
+            r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])'
+        )
+        #Classes and types must consist of word characters only.
+        self.sanity_check_pattern = re.compile(r'^\w+$')
+        self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
+                                                             self.config.TERM_NORMALIZATION)
+
+    def import_from_wiki(self, languages=None):
+        """
+            Iterate through all specified languages and
+            extract corresponding special phrases from the wiki.
+
+            languages must be a list of language codes or None. When it is
+            None or empty, the languages returned by _load_languages() are
+            used. Raises TypeError for any other argument type.
+            After all pages have been processed, the place_classtype tables
+            are created and the transaction is committed.
+        """
+        if languages is not None and not isinstance(languages, list):
+            raise TypeError('The \'languages\' argument should be of type list.')
+
+        #Get all languages to process.
+        languages = self._load_languages() if not languages else languages
+
+        #Store pairs of class/type for further processing
+        class_type_pairs = set()
+
+        for lang in languages:
+            LOG.warning('Import phrases for lang: %s', lang)
+            wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
+            class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
+
+        self._create_place_classtype_table_and_indexes(class_type_pairs)
+        #Nothing is committed before this point.
+        self.db_connection.commit()
+        LOG.warning('Import done.')
+
+    def _load_white_and_black_lists(self):
+        """
+            Load white and black lists from phrase-settings.json.
+
+            The file in the configuration directory is used unless
+            PHRASE_CONFIG is set. In that case the custom file is used
+            and converted from PHP to JSON first when necessary.
+
+            Returns a tuple (black_list, white_list). Each is a dict mapping
+            a class to a list of types. A list that is missing from the
+            settings file is returned as an empty dict.
+        """
+        settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()
+
+        if self.config.PHRASE_CONFIG:
+            settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)
+
+        with open(settings_path, "r", encoding='utf-8') as json_settings:
+            settings = json.load(json_settings)
+
+        #A file converted from PHP may lack one of the lists or hold an
+        #empty JSON array where an object is expected.
+        if not isinstance(settings, dict):
+            settings = {}
+        return settings.get('blackList') or {}, settings.get('whiteList') or {}
+
+    def _load_languages(self):
+        """
+            Get list of all languages from env config file
+            or default if there is no languages configured.
+            The system will extract special phrases only from all specified languages.
+
+            The configured value is a comma-separated string of language
+            codes. It is split into a list here so that callers iterate
+            over codes and not over single characters.
+        """
+        default_languages = [
+            'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
+            'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
+            'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
+            'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
+
+        configured = self.config.LANGUAGES
+        if not configured:
+            return default_languages
+
+        #Keep accepting an already split sequence of codes.
+        if isinstance(configured, str):
+            configured = configured.split(',')
+
+        languages = [lang.strip() for lang in configured if lang.strip()]
+        return languages or default_languages
+
+    @staticmethod
+    def _get_wiki_content(lang):
+        """
+            Download the special phrases wiki page of the given language
+            and return its content.
+            The language code is upper-cased to build the page name, e.g.:
+                https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
+        """
+        base_url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' # pylint: disable=line-too-long
+        page_name = lang.upper()
+        return get_url(base_url + page_name)
+
+    def _check_sanity(self, lang, phrase_class, phrase_type):
+        """
+            Make sure class and type only consist of word characters,
+            in case somebody added garbage in the wiki.
+            Raises a UsageError when either of them does not match.
+        """
+        class_is_sane = self.sanity_check_pattern.search(phrase_class) is not None
+        type_is_sane = self.sanity_check_pattern.search(phrase_type) is not None
+
+        if class_is_sane and type_is_sane:
+            return
+
+        raise UsageError("Bad class/type for language {}: {}={}".format(
+            lang, phrase_class, phrase_type))
+
+    def _process_xml_content(self, xml_content, lang):
+        """
+            Process given xml content by extracting matching patterns.
+            Matching patterns are processed there and returned in a
+            set of class/type pairs.
+
+            Every phrase that passes the black and white lists is added
+            to the database through _process_amenity(). A class or type
+            that fails the sanity check aborts processing with the error
+            raised by _check_sanity().
+        """
+        #One match will be of format [label, class, type, operator, plural]
+        matches = self.occurence_pattern.findall(xml_content)
+        #Store pairs of class/type for further processing
+        class_type_pairs = set()
+
+        for match in matches:
+            phrase_label = match[0].strip()
+            normalized_label = self.transliterator.transliterate(phrase_label)
+            phrase_class = match[1].strip()
+            phrase_type = match[2].strip()
+            phrase_operator = match[3].strip()
+            #hack around a bug where building=yes was imported with quotes into the wiki
+            phrase_type = re.sub(r'\"|&quot;', '', phrase_type)
+
+            #sanity check, in case somebody added garbage in the wiki
+            self._check_sanity(lang, phrase_class, phrase_type)
+
+            #blacklisting: disallow certain class/type combinations
+            if (
+                    phrase_class in self.black_list.keys() and
+                    phrase_type in self.black_list[phrase_class]
+            ):
+                continue
+            #whitelisting: if class is in whitelist, allow only tags in the list
+            if (
+                    phrase_class in self.white_list.keys() and
+                    phrase_type not in self.white_list[phrase_class]
+            ):
+                continue
+
+            #add class/type to the set of pairs
+            class_type_pairs.add((phrase_class, phrase_type))
+
+            self._process_amenity(
+                phrase_label, normalized_label, phrase_class,
+                phrase_type, phrase_operator
+            )
+
+        return class_type_pairs
+
+    def _process_amenity(self, phrase_label, normalized_label,
+                         phrase_class, phrase_type, phrase_operator):
+        # pylint: disable-msg=too-many-arguments
+        """
+            Add the lookup for a phrase together with its class and type
+            to the word table. The operators 'near' and 'in' are stored
+            with the phrase, any other operator is ignored.
+        """
+        #Pick the statement first, all variants take the same parameters.
+        if phrase_operator == 'near':
+            statement = """SELECT getorcreate_amenityoperator(
+                                  make_standard_name(%s), %s, %s, %s, 'near')"""
+        elif phrase_operator == 'in':
+            statement = """SELECT getorcreate_amenityoperator(
+                                  make_standard_name(%s), %s, %s, %s, 'in')"""
+        else:
+            statement = """SELECT getorcreate_amenity(
+                                  make_standard_name(%s), %s, %s, %s)"""
+
+        parameters = (phrase_label, normalized_label, phrase_class, phrase_type)
+
+        with self.db_connection.cursor() as db_cursor:
+            db_cursor.execute(statement, parameters)
+
+
+    def _create_place_classtype_table_and_indexes(self, class_type_pairs):
+        """
+            Create table place_classtype for each given pair.
+            Also create indexes on place_id and centroid.
+
+            class_type_pairs is an iterable of (class, type) tuples.
+            Each created table is made readable for the web user.
+        """
+        LOG.warning('Create tables and indexes...')
+
+        #Turn the configured tablespace name into an SQL clause.
+        #The value is left untouched when no tablespace is configured.
+        sql_tablespace = self.config.TABLESPACE_AUX_DATA
+        if sql_tablespace:
+            sql_tablespace = ' TABLESPACE '+sql_tablespace
+
+        #Temporary index on placex, dropped again at the end. Presumably
+        #it speeds up the class/type selects below.
+        with self.db_connection.cursor() as db_cursor:
+            db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
+
+        for pair in class_type_pairs:
+            phrase_class = pair[0]
+            phrase_type = pair[1]
+
+            #Table creation
+            self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
+
+            #Indexes creation
+            self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
+
+            #Grant access on read to the web user.
+            self._grant_access_to_webuser(phrase_class, phrase_type)
+
+        with self.db_connection.cursor() as db_cursor:
+            db_cursor.execute("DROP INDEX idx_placex_classtype")
+
+
+    def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
+        """
+            Create the table place_classtype_<class>_<type> for the given
+            phrase_class/phrase_type if it doesn't exist yet.
+
+            The table holds the place_id and the centroid of all placex
+            rows with that class and type. sql_tablespace is inserted into
+            the statement as is.
+        """
+        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
+        with self.db_connection.cursor() as db_cursor:
+            #The tablespace is filled in by str.format() first. The doubled
+            #braces survive that step and are then filled by SQL.format().
+            db_cursor.execute(SQL("""
+                    CREATE TABLE IF NOT EXISTS {{}} {} 
+                    AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex 
+                    WHERE class = {{}} AND type = {{}}""".format(sql_tablespace))
+                              .format(Identifier(table_name), Literal(phrase_class),
+                                      Literal(phrase_type)))
+
+
+    def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
+        """
+            Create indexes on centroid and place_id for the place_classtype table.
+
+            An index that already exists is left alone. sql_tablespace is
+            the SQL tablespace clause to append to the statements. An
+            empty value means that no clause is added.
+        """
+        index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
+        base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
+        tablespace_clause = sql_tablespace or ''
+
+        #Column suffix of the index name and the matching access method.
+        index_definitions = (
+            ('centroid', 'USING GIST (centroid)'),
+            ('place_id', 'USING btree(place_id)'),
+        )
+
+        for suffix, access_method in index_definitions:
+            index_name = index_prefix + suffix
+            if self.db_connection.index_exists(index_name):
+                continue
+
+            #The statement has no query parameters, so execute() is called
+            #with the composed SQL only.
+            statement = 'CREATE INDEX {} ON {} ' + access_method + ' ' + tablespace_clause
+            with self.db_connection.cursor() as db_cursor:
+                db_cursor.execute(SQL(statement).format(Identifier(index_name),
+                                                        Identifier(base_table)))
+
+
+    def _grant_access_to_webuser(self, phrase_class, phrase_type):
+        """
+            Grant access on read to the table place_classtype for the webuser.
+
+            The user name is taken from the DATABASE_WEBUSER setting.
+        """
+        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
+        with self.db_connection.cursor() as db_cursor:
+            #Table and role name are both quoted as SQL identifiers.
+            db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
+                              .format(Identifier(table_name),
+                                      Identifier(self.config.DATABASE_WEBUSER)))
+
+    def _convert_php_settings_if_needed(self, file_path):
+        """
+            Convert php settings file of special phrases to json file if it is still in php format.
+
+            Returns the path of the json file: the same location and base
+            name as file_path with a '.json' extension. An existing json
+            file is reused without running the conversion again.
+            Raises UsageError when file_path is not a file or has an
+            extension other than '.php' or '.json'. A failing conversion
+            script raises subprocess.CalledProcessError.
+        """
+        if not isfile(file_path):
+            raise UsageError(str(file_path) + ' is not a valid file.')
+
+        file, extension = os.path.splitext(file_path)
+        json_file_path = Path(file + '.json').resolve()
+
+        if extension not in('.php', '.json'):
+            raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')
+
+        if extension == '.php' and not isfile(json_file_path):
+            try:
+                #The PHP script writes the json file next to the php file.
+                subprocess.run(['/usr/bin/env', 'php', '-Cq',
+                                (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
+                                file_path], check=True)
+                LOG.warning('special_phrase configuration file has been converted to json.')
+                return json_file_path
+            except subprocess.CalledProcessError:
+                LOG.error('Error while converting %s to json.', file_path)
+                raise
+        else:
+            return json_file_path
index 53efb3f79f016ac835bad7944249b3383c80eeaa..4069270eb73161073fba7ed0af593a8c3730e69c 100644 (file)
@@ -77,7 +77,7 @@ NOMINATIM_TIGER_DATA_PATH=
 NOMINATIM_WIKIPEDIA_DATA_PATH=
 
 # Configuration file for special phrase import.
-# When unset, the internal default settings from 'settings/phrase_settings.php'
+# When unset, the internal default settings from 'settings/phrase-settings.json'
 # are used.
 NOMINATIM_PHRASE_CONFIG=
 
diff --git a/settings/phrase-settings.json b/settings/phrase-settings.json
new file mode 100644 (file)
index 0000000..a097dca
--- /dev/null
@@ -0,0 +1,25 @@
+{
+    "Comments": [
+        "The black list corresponds to class/type combinations to exclude.",
+        "If a class is in the white list then all types will",
+        "be ignored except the ones given in the list.",
+        "Also use this list to exclude an entire class from special phrases."
+    ],
+    "blackList": {
+        "boundary": [
+            "administrative"
+        ],
+        "place": [
+            "house",
+            "houses"
+        ]
+    },
+    "whiteList": {
+        "highway": [
+            "bus_stop",
+            "rest_area",
+            "raceway"
+        ],
+        "building": []
+    }
+}
index 168334b1659d381b1ea98875d0ce90a14f8e412e..170dd6392ba1d8b86569c59bea277013348b79c4 100644 (file)
@@ -109,6 +109,22 @@ class NominatimEnvironment:
         cfg = Configuration(None, self.src_dir / 'settings', environ=self.test_env)
         refresh.setup_website(Path(self.website_dir.name) / 'website', self.src_dir / 'lib-php', cfg)
 
+    def get_libpq_dsn(self):
+        """ Return the database DSN of the test environment in libpq
+            keyword/value format.
+
+            The DSN is taken from the 'NOMINATIM_DATABASE_DSN' entry of
+            self.test_env. A value starting with 'pgsql:' is treated as the
+            old PHP format ('key=value' pairs separated by ';') and is
+            converted. Any other value is returned unchanged.
+        """
+        dsn = self.test_env['NOMINATIM_DATABASE_DSN']
+
+        def quote_param(param):
+            # Split on the first '=' only; the value may contain '=' itself.
+            key, val = param.split('=', 1)
+            val = val.replace('\\', '\\\\').replace("'", "\\'")
+            # libpq requires quotes around empty values and values with spaces.
+            if not val or ' ' in val:
+                val = "'" + val + "'"
+            return key + '=' + val
+
+        if dsn.startswith('pgsql:'):
+            # Old PHP DSN format. Convert before returning.
+            # Empty segments (e.g. from a trailing ';') are skipped.
+            return ' '.join([quote_param(p) for p in dsn[6:].split(';') if p])
+
+        return dsn
+
 
     def db_drop_database(self, name):
         """ Drop the database with the given name.
@@ -132,34 +148,16 @@ class NominatimEnvironment:
         if self._reuse_or_drop_db(self.template_db):
             return
 
-        try:
-            # call the first part of database setup
-            self.write_nominatim_config(self.template_db)
-            self.run_setup_script('create-db', 'setup-db')
-            # remove external data to speed up indexing for tests
-            conn = self.connect_database(self.template_db)
-            cur = conn.cursor()
-            cur.execute("""select tablename from pg_tables
-                           where tablename in ('gb_postcode', 'us_postcode')""")
-            for t in cur:
-                conn.cursor().execute('TRUNCATE TABLE {}'.format(t[0]))
-            conn.commit()
-            conn.close()
+        self.write_nominatim_config(self.template_db)
 
-            # execute osm2pgsql import on an empty file to get the right tables
+        try:
+            # execute nominatim import on an empty file to get the right tables
             with tempfile.NamedTemporaryFile(dir='/tmp', suffix='.xml') as fd:
                 fd.write(b'<osm version="0.6"></osm>')
                 fd.flush()
-                self.run_setup_script('import-data',
-                                      'ignore-errors',
-                                      'create-functions',
-                                      'create-tables',
-                                      'create-partition-tables',
-                                      'create-partition-functions',
-                                      'load-data',
-                                      'create-search-indices',
-                                      osm_file=fd.name,
-                                      osm2pgsql_cache='200')
+                self.run_nominatim('import', '--osm-file', fd.name,
+                                             '--osm2pgsql-cache', '1',
+                                             '--ignore-errors')
         except:
             self.db_drop_database(self.template_db)
             raise
@@ -179,12 +177,11 @@ class NominatimEnvironment:
             return
 
         testdata = Path('__file__') / '..' / '..' / 'testdb'
-        self.test_env['NOMINATIM_TIGER_DATA_PATH'] = str((testdata / 'tiger').resolve())
         self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
 
         try:
             self.run_nominatim('import', '--osm-file', str(self.api_test_file))
-            self.run_setup_script('import-tiger-data')
+            self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
             self.run_nominatim('freeze')
 
             phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
@@ -259,6 +256,9 @@ class NominatimEnvironment:
     def run_nominatim(self, *cmdline):
         """ Run the nominatim command-line tool via the library.
         """
+        if self.website_dir is not None:
+            cmdline = list(cmdline) + ['--project-dir', self.website_dir.name]
+
         cli.nominatim(module_dir='',
                       osm2pgsql_path=str(self.build_dir / 'osm2pgsql' / 'osm2pgsql'),
                       phplib_dir=str(self.src_dir / 'lib-php'),
@@ -269,31 +269,6 @@ class NominatimEnvironment:
                       phpcgi_path='',
                       environ=self.test_env)
 
-    def run_setup_script(self, *args, **kwargs):
-        """ Run the Nominatim setup script with the given arguments.
-        """
-        self.run_nominatim_script('setup', *args, **kwargs)
-
-    def run_update_script(self, *args, **kwargs):
-        """ Run the Nominatim update script with the given arguments.
-        """
-        self.run_nominatim_script('update', *args, **kwargs)
-
-    def run_nominatim_script(self, script, *args, **kwargs):
-        """ Run one of the Nominatim utility scripts with the given arguments.
-        """
-        cmd = ['/usr/bin/env', 'php', '-Cq']
-        cmd.append((Path(self.src_dir) / 'lib-php' / 'admin' / '{}.php'.format(script)).resolve())
-        cmd.extend(['--' + x for x in args])
-        for k, v in kwargs.items():
-            cmd.extend(('--' + k.replace('_', '-'), str(v)))
-
-        if self.website_dir is not None:
-            cwd = self.website_dir.name
-        else:
-            cwd = None
-
-        run_script(cmd, cwd=cwd, env=self.test_env)
 
     def copy_from_place(self, db):
         """ Copy data from place to the placex and location_property_osmline
index 9d443b434da263654ed0f40c4f5e843bf2a0d433..72a610eb123733db313ee74d510b633afdac5fb3 100644 (file)
@@ -1,3 +1,4 @@
+import logging
 from itertools import chain
 
 import psycopg2.extras
@@ -5,7 +6,7 @@ import psycopg2.extras
 from place_inserter import PlaceColumn
 from table_compare import NominatimID, DBRow
 
-from nominatim.indexer.indexer import Indexer
+from nominatim.indexer import indexer
 
 def check_database_integrity(context):
     """ Check some generic constraints on the tables.
@@ -86,11 +87,25 @@ def import_and_index_data_from_place_table(context):
     """ Import data previously set up in the place table.
     """
     context.nominatim.copy_from_place(context.db)
-    context.nominatim.run_setup_script('calculate-postcodes')
+
+    # XXX use tool function as soon as it is ported
+    with context.db.cursor() as cur:
+        with (context.nominatim.src_dir / 'lib-sql' / 'postcode_tables.sql').open('r') as fd:
+            cur.execute(fd.read())
+        cur.execute("""
+            INSERT INTO location_postcode
+             (place_id, indexed_status, country_code, postcode, geometry)
+            SELECT nextval('seq_place'), 1, country_code,
+                   upper(trim (both ' ' from address->'postcode')) as pc,
+                   ST_Centroid(ST_Collect(ST_Centroid(geometry)))
+              FROM placex
+             WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
+                   AND geometry IS NOT null
+             GROUP BY country_code, pc""")
 
     # Call directly as the refresh function does not include postcodes.
-    indexer = Indexer(context.nominatim.test_env['NOMINATIM_DATABASE_DSN'][6:], 1)
-    indexer.index_full(analyse=False)
+    indexer.LOG.setLevel(logging.ERROR)
+    indexer.Indexer(context.nominatim.get_libpq_dsn(), 1).index_full(analyse=False)
 
     check_database_integrity(context)
 
index 844fb27484f984a51873e3268315f9d94756174b..fb4591bfb53504d45ba8805f592b28a629ad4110 100644 (file)
@@ -1,6 +1,23 @@
 import tempfile
 import random
 import os
+from pathlib import Path
+
+from nominatim.tools.exec_utils import run_osm2pgsql
+
+def get_osm2pgsql_options(nominatim_env, fname, append):
+    """ Assemble the options dictionary passed to run_osm2pgsql() for
+        the OSM file `fname`.
+
+        The osm2pgsql binary is looked up below the build directory and
+        the import style below the source directory of `nominatim_env`.
+        `append` is passed through unchanged.
+    """
+    no_tablespaces = {name: '' for name in ('slim_data', 'slim_index',
+                                            'main_data', 'main_index')}
+
+    return {
+        'import_file': fname,
+        'osm2pgsql': str(nominatim_env.build_dir / 'osm2pgsql' / 'osm2pgsql'),
+        'osm2pgsql_cache': 50,
+        'osm2pgsql_style': str(nominatim_env.src_dir / 'settings' / 'import-extratags.style'),
+        'threads': 1,
+        'dsn': nominatim_env.get_libpq_dsn(),
+        'flatnode_file': '',
+        'tablespaces': no_tablespaces,
+        'append': append,
+    }
+
 
 def write_opl_file(opl, grid):
     """ Create a temporary OSM file from OPL and return the file name. It is
@@ -52,9 +69,10 @@ def load_osm_file(context):
     """
     # create an OSM file and import it
     fname = write_opl_file(context.text, context.osm)
-    context.nominatim.run_setup_script('import-data', osm_file=fname,
-                                       osm2pgsql_cache=300)
-    os.remove(fname)
+    try:
+        run_osm2pgsql(get_osm2pgsql_options(context.nominatim, fname, append=False))
+    finally:
+        os.remove(fname)
 
     ### reintroduce the triggers/indexes we've lost by having osm2pgsql set up place again
     cur = context.db.cursor()
@@ -80,5 +98,7 @@ def update_from_osm_file(context):
 
     # create an OSM file and import it
     fname = write_opl_file(context.text, context.osm)
-    context.nominatim.run_update_script(import_diff=fname)
-    os.remove(fname)
+    try:
+        run_osm2pgsql(get_osm2pgsql_options(context.nominatim, fname, append=True))
+    finally:
+        os.remove(fname)
index 4b7cccc39f09992b73c48d348ebd150a4942d8b9..871365d90214f0515365179644d6ffe69e8a0100 100644 (file)
@@ -5,6 +5,7 @@ from pathlib import Path
 import psycopg2
 import psycopg2.extras
 import pytest
+import tempfile
 
 SRC_DIR = Path(__file__) / '..' / '..' / '..'
 
@@ -133,6 +134,13 @@ def def_config():
 def src_dir():
     return SRC_DIR.resolve()
 
+@pytest.fixture
+def tmp_phplib_dir():
+    """ Yield the path of a temporary directory that contains an empty
+        'admin' subdirectory. The directory is removed again when the
+        fixture is torn down.
+    """
+    with tempfile.TemporaryDirectory() as phpdir:
+        phplib_path = Path(phpdir)
+        (phplib_path / 'admin').mkdir()
+
+        yield phplib_path
+
 @pytest.fixture
 def status_table(temp_db_conn):
     """ Create an empty version of the status table and
diff --git a/test/python/sample.tar.gz b/test/python/sample.tar.gz
new file mode 100644 (file)
index 0000000..65bff09
Binary files /dev/null and b/test/python/sample.tar.gz differ
index 918d84993d603270638209aadd37eadd920d8f32..eb0ee58487b5917112824ad4b0ce2940d5d21f3a 100644 (file)
@@ -64,7 +64,6 @@ def test_cli_help(capsys):
 
 
 @pytest.mark.parametrize("command,script", [
-                         (('special-phrases',), 'specialphrases'),
                          (('add-data', '--file', 'foo.osm'), 'update'),
                          (('export',), 'export')
                          ])
@@ -172,6 +171,12 @@ def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ra
     assert bnd_mock.called == do_bnds
     assert rank_mock.called == do_ranks
 
+def test_special_phrases_command(temp_db, mock_func_factory):
+    """ The 'special-phrases --import-from-wiki' command must call
+        SpecialPhrasesImporter.import_from_wiki() exactly once.
+    """
+    importer_class = nominatim.clicmd.special_phrases.SpecialPhrasesImporter
+    func = mock_func_factory(importer_class, 'import_from_wiki')
+
+    call_nominatim('special-phrases', '--import-from-wiki')
+
+    assert func.called == 1
 
 @pytest.mark.parametrize("command,func", [
                          ('postcodes', 'update_postcodes'),
index 8f60ac7404e0c05e1d3ebf51e080477f9fb269da..3abe98181232cfbb1a45e98e20e0d57175defb56 100644 (file)
@@ -9,13 +9,6 @@ import pytest
 
 import nominatim.tools.exec_utils as exec_utils
 
-@pytest.fixture
-def tmp_phplib_dir():
-    with tempfile.TemporaryDirectory() as phpdir:
-        (Path(phpdir) / 'admin').mkdir()
-
-        yield Path(phpdir)
-
 @pytest.fixture
 def nominatim_env(tmp_phplib_dir, def_config):
     class _NominatimEnv:
diff --git a/test/python/test_tools_import_special_phrases.py b/test/python/test_tools_import_special_phrases.py
new file mode 100644 (file)
index 0000000..7a8b832
--- /dev/null
@@ -0,0 +1,346 @@
+"""
+    Tests for import special phrases methods
+    of the class SpecialPhrasesImporter.
+"""
+from nominatim.errors import UsageError
+from pathlib import Path
+import tempfile
+from shutil import copyfile
+import pytest
+from nominatim.tools.special_phrases import SpecialPhrasesImporter
+
+TEST_BASE_DIR = Path(__file__) / '..' / '..'
+
+def test_check_sanity_class(special_phrases_importer):
+    """
+        Check for _check_sanity() method.
+        If an empty class or an empty type is given, a UsageError must be
+        raised. If a good class and type are given, nothing special happens.
+    """
+    # pytest.raises() already fails the test when no UsageError is raised,
+    # so no additional assertion on the captured exception is needed.
+    for bad_class, bad_type in (('', 'type'), ('class', '')):
+        with pytest.raises(UsageError):
+            special_phrases_importer._check_sanity('en', bad_class, bad_type)
+
+    special_phrases_importer._check_sanity('en', 'class', 'type')
+
+def test_load_white_and_black_lists(special_phrases_importer):
+    """
+        _load_white_and_black_lists() must return the black list and
+        the white list, both as dictionaries.
+    """
+    black_list, white_list = special_phrases_importer._load_white_and_black_lists()
+
+    assert isinstance(black_list, dict)
+    assert isinstance(white_list, dict)
+
+def test_convert_php_settings(special_phrases_importer):
+    """
+        _convert_php_settings_if_needed() must produce a json file next
+        to the php settings file it is given.
+    """
+    original_php = (TEST_BASE_DIR / 'testfiles' / 'phrase_settings.php').resolve()
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        work_dir = Path(temp_dir)
+        # Work on a copy so that the json file ends up in the temp directory.
+        php_copy = (work_dir / 'phrase_settings.php').resolve()
+        copyfile(original_php, php_copy)
+
+        special_phrases_importer._convert_php_settings_if_needed(php_copy)
+
+        assert (work_dir / 'phrase_settings.json').is_file()
+
+def test_convert_settings_wrong_file(special_phrases_importer):
+    """
+        _convert_php_settings_if_needed() must raise a UsageError with
+        an explanatory message when the given path is not a file.
+    """
+    missing_file = 'random_file'
+
+    with pytest.raises(UsageError) as raised:
+        special_phrases_importer._convert_php_settings_if_needed(missing_file)
+
+    assert str(raised.value) == 'random_file is not a valid file.'
+
+def test_convert_settings_json_already_exist(special_phrases_importer):
+    """
+        When '_convert_php_settings_if_needed' is given a php file path,
+        the path of the corresponding json file must be returned.
+    """
+    testfiles_dir = TEST_BASE_DIR / 'testfiles'
+    expected_json = (testfiles_dir / 'phrase_settings.json').resolve()
+
+    returned = special_phrases_importer._convert_php_settings_if_needed(
+        (testfiles_dir / 'phrase_settings.php').resolve())
+
+    assert returned == expected_json
+
+def test_convert_settings_giving_json(special_phrases_importer):
+    """
+        When '_convert_php_settings_if_needed' is given the path of a
+        json file, that same path must be returned.
+    """
+    given_path = (TEST_BASE_DIR / 'testfiles' / 'phrase-settings.json').resolve()
+
+    assert special_phrases_importer._convert_php_settings_if_needed(given_path) == given_path
+
+def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs,
+                                       word_table, temp_db_conn):
+    """
+        _process_amenity() must call the getorcreate_amenityoperator()
+        SQL function for both supported operators, 'near' and 'in'.
+    """
+    for operator in ('near', 'in'):
+        special_phrases_importer._process_amenity('', '', '', '', operator)
+
+    with temp_db_conn.cursor() as temp_db_cursor:
+        temp_db_cursor.execute("SELECT * FROM temp_with_operator WHERE op='near' OR op='in'")
+        recorded_calls = temp_db_cursor.fetchall()
+
+    # One row is recorded per call of the stubbed SQL function.
+    assert len(recorded_calls) == 2
+
+def test_process_amenity_without_operator(special_phrases_importer, getorcreate_amenity_funcs,
+                                          temp_db_conn):
+    """
+        _process_amenity() must call the getorcreate_amenity() SQL
+        function when no operator is given.
+    """
+    special_phrases_importer._process_amenity('', '', '', '', '')
+
+    with temp_db_conn.cursor() as temp_db_cursor:
+        temp_db_cursor.execute("SELECT * FROM temp_without_operator WHERE op='no_operator'")
+        recorded_call = temp_db_cursor.fetchone()
+
+    assert recorded_call
+
+def test_create_place_classtype_indexes(temp_db_conn, special_phrases_importer):
+    """
+        _create_place_classtype_indexes() must create the place_id index
+        and the centroid index on the matching place_classtype table.
+    """
+    phrase_class, phrase_type = 'class', 'type'
+    table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
+
+    # The table to be indexed has to exist before the method is called.
+    setup_statements = (
+        "CREATE EXTENSION postgis;",
+        'CREATE TABLE {}(place_id BIGINT, centroid GEOMETRY)'.format(table_name),
+    )
+    with temp_db_conn.cursor() as temp_db_cursor:
+        for statement in setup_statements:
+            temp_db_cursor.execute(statement)
+
+    special_phrases_importer._create_place_classtype_indexes('', phrase_class, phrase_type)
+
+    assert check_placeid_and_centroid_indexes(temp_db_conn, phrase_class, phrase_type)
+
+def test_create_place_classtype_table(temp_db_conn, placex_table, special_phrases_importer):
+    """
+        _create_place_classtype_table() must create the place_classtype
+        table for the given class and type.
+    """
+    phrase_class, phrase_type = 'class', 'type'
+
+    special_phrases_importer._create_place_classtype_table('', phrase_class, phrase_type)
+
+    assert check_table_exist(temp_db_conn, phrase_class, phrase_type)
+
+def test_grant_access_to_web_user(temp_db_conn, def_config, special_phrases_importer):
+    """
+        _grant_access_to_webuser() must give the web user read access
+        to the place_classtype table of the given class and type.
+    """
+    phrase_class, phrase_type = 'class', 'type'
+    create_table = 'CREATE TABLE {}()'.format(
+        'place_classtype_{}_{}'.format(phrase_class, phrase_type))
+
+    with temp_db_conn.cursor() as temp_db_cursor:
+        temp_db_cursor.execute(create_table)
+
+    special_phrases_importer._grant_access_to_webuser(phrase_class, phrase_type)
+
+    assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, phrase_class, phrase_type)
+
+def test_create_place_classtype_table_and_indexes(
+        temp_db_conn, def_config, placex_table, getorcreate_amenity_funcs,
+        getorcreate_amenityoperator_funcs, special_phrases_importer):
+    """
+        _create_place_classtype_table_and_indexes() must, for every
+        class/type pair of the given set, create the place_classtype
+        table with its place_id and centroid indexes and grant access
+        to the web user.
+    """
+    pairs = {('class1', 'type1'), ('class2', 'type2')}
+
+    special_phrases_importer._create_place_classtype_table_and_indexes(pairs)
+
+    for phrase_class, phrase_type in pairs:
+        assert check_table_exist(temp_db_conn, phrase_class, phrase_type)
+        assert check_placeid_and_centroid_indexes(temp_db_conn, phrase_class, phrase_type)
+        assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER,
+                                  phrase_class, phrase_type)
+
+def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer, 
+                             getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs):
+    """
+        _process_xml_content() must call the SQL functions for amenities
+        while processing the xml content and must return the
+        class/type pairs found in it.
+    """
+    class_test = 'aerialway'
+    type_test = 'zip_line'
+
+    xml_content = get_test_xml_wiki_content()
+    returned_pairs = special_phrases_importer._process_xml_content(xml_content, 'en')
+    # Turn the returned pairs into a class -> type mapping for the asserts.
+    results = dict(returned_pairs)
+
+    assert check_amenities_with_op(temp_db_conn)
+    assert check_amenities_without_op(temp_db_conn)
+    assert results[class_test]
+    assert type_test in results.values()
+
+def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table, 
+                          getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs):
+    """
+        Check the main import_from_wiki() method end to end.
+        It must create the place_classtype table with its place_id and
+        centroid indexes, grant access to the web user and execute the
+        SQL functions for amenities.
+    """
+    class_test = 'aerialway'
+    type_test = 'zip_line'
+
+    # Serve the static test file instead of fetching the wiki page.
+    monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content)
+
+    special_phrases_importer.import_from_wiki(['en'])
+
+    assert check_table_exist(temp_db_conn, class_test, type_test)
+    assert check_placeid_and_centroid_indexes(temp_db_conn, class_test, type_test)
+    assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test)
+    assert check_amenities_with_op(temp_db_conn)
+    assert check_amenities_without_op(temp_db_conn)
+
+def mock_get_wiki_content(lang):
+    """
+        Replacement for the _get_wiki_content() method which ignores
+        `lang` and returns the content of the static xml test file.
+    """
+    static_content = get_test_xml_wiki_content()
+    return static_content
+
+def get_test_xml_wiki_content():
+    """
+        Read the static xml test file
+        testdata/special_phrases_test_content.txt and return its content.
+    """
+    testdata_dir = TEST_BASE_DIR / 'testdata'
+    return (testdata_dir / 'special_phrases_test_content.txt').resolve().read_text()
+
+def check_table_exist(temp_db_conn, phrase_class, phrase_type):
+    """
+        Look up the place_classtype table of the given phrase_class and
+        phrase_type in information_schema.tables. Returns the matching
+        row or None when there is no such table.
+    """
+    lookup_query = """
+            SELECT *
+            FROM information_schema.tables
+            WHERE table_type='BASE TABLE'
+            AND table_name='{}'""".format(
+                'place_classtype_{}_{}'.format(phrase_class, phrase_type))
+
+    with temp_db_conn.cursor() as temp_db_cursor:
+        temp_db_cursor.execute(lookup_query)
+        return temp_db_cursor.fetchone()
+
+def check_grant_access(temp_db_conn, user, phrase_class, phrase_type):
+    """
+        Look up a SELECT grant for `user` on the place_classtype table
+        of the given phrase_class and phrase_type. Returns the matching
+        row of information_schema.role_table_grants or None.
+    """
+    grant_query = """
+                SELECT * FROM information_schema.role_table_grants
+                WHERE table_name='{}'
+                AND grantee='{}'
+                AND privilege_type='SELECT'""".format(
+                    'place_classtype_{}_{}'.format(phrase_class, phrase_type), user)
+
+    with temp_db_conn.cursor() as temp_db_cursor:
+        temp_db_cursor.execute(grant_query)
+        return temp_db_cursor.fetchone()
+
+def check_placeid_and_centroid_indexes(temp_db_conn, phrase_class, phrase_type):
+    """
+        Tell whether both the centroid index and the place_id index
+        exist for the place_classtype table of the given phrase_class
+        and phrase_type.
+    """
+    index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
+    centroid_index = index_prefix + 'centroid'
+    place_id_index = index_prefix + 'place_id'
+
+    return (temp_db_conn.index_exists(centroid_index)
+            and temp_db_conn.index_exists(place_id_index))
+
+def check_amenities_with_op(temp_db_conn):
+    """
+        Tell whether the temp_with_operator table, filled by the test
+        version of getorcreate_amenityoperator(), holds more than one
+        row, i.e. whether the SQL function was called more than once.
+    """
+    with temp_db_conn.cursor() as temp_db_cursor:
+        temp_db_cursor.execute("SELECT * FROM temp_with_operator")
+        recorded_calls = temp_db_cursor.fetchall()
+        return len(recorded_calls) > 1
+
+def check_amenities_without_op(temp_db_conn):
+    """
+        Tell whether the temp_without_operator table, filled by the test
+        version of getorcreate_amenity(), holds more than one row,
+        i.e. whether the SQL function was called more than once.
+    """
+    with temp_db_conn.cursor() as temp_db_cursor:
+        temp_db_cursor.execute("SELECT * FROM temp_without_operator")
+        recorded_calls = temp_db_cursor.fetchall()
+        return len(recorded_calls) > 1
+
+@pytest.fixture
+def special_phrases_importer(temp_db_conn, def_config, temp_phplib_dir_with_migration):
+    """
+        Provide a SpecialPhrasesImporter built from the default
+        configuration, the temporary php library directory and the
+        connection to the test database.
+    """
+    importer = SpecialPhrasesImporter(def_config, temp_phplib_dir_with_migration, temp_db_conn)
+    return importer
+
+@pytest.fixture
+def temp_phplib_dir_with_migration():
+    """
+        Yield a temporary php library directory which contains a
+        'migration' subdirectory with a copy of the
+        PhraseSettingsToJson.php script.
+    """
+    script_name = 'PhraseSettingsToJson.php'
+    migration_file = (TEST_BASE_DIR / '..' / 'lib-php' / 'migration' / script_name).resolve()
+
+    with tempfile.TemporaryDirectory() as phpdir:
+        migration_dir = Path(phpdir) / 'migration'
+        migration_dir.mkdir()
+        copyfile(migration_file, (migration_dir / script_name).resolve())
+
+        yield Path(phpdir)
+
+@pytest.fixture
+def make_strandard_name_func(temp_db_cursor):
+    """
+        Create a test version of the SQL function make_standard_name()
+        which simply returns the trimmed input name.
+    """
+    temp_db_cursor.execute("""
+        CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT AS $$
+        BEGIN
+        RETURN trim(name); --Basically return only the trimed name for the tests
+        END;
+        $$ LANGUAGE plpgsql IMMUTABLE;""")
+        
+@pytest.fixture
+def getorcreate_amenity_funcs(temp_db_cursor, make_strandard_name_func):
+    """
+        Create the table temp_without_operator and a test version of
+        the SQL function getorcreate_amenity() which inserts the value
+        'no_operator' into that table on every call.
+    """
+    temp_db_cursor.execute("""
+        CREATE TABLE temp_without_operator(op TEXT);
+    
+        CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
+                                                    lookup_class text, lookup_type text)
+        RETURNS void as $$
+        BEGIN
+            INSERT INTO temp_without_operator VALUES('no_operator');
+        END;
+        $$ LANGUAGE plpgsql""")
+
+@pytest.fixture
+def getorcreate_amenityoperator_funcs(temp_db_cursor, make_strandard_name_func):
+    """
+        Create the table temp_with_operator and a test version of the
+        SQL function getorcreate_amenityoperator() which inserts the
+        operator it is called with into that table.
+    """
+    temp_db_cursor.execute("""
+        CREATE TABLE temp_with_operator(op TEXT);
+
+        CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, normalized_word TEXT,
+                                                    lookup_class text, lookup_type text, op text)
+        RETURNS void as $$
+        BEGIN 
+            INSERT INTO temp_with_operator VALUES(op);
+        END;
+        $$ LANGUAGE plpgsql""")
\ No newline at end of file
diff --git a/test/testdata/special_phrases_test_content.txt b/test/testdata/special_phrases_test_content.txt
new file mode 100644 (file)
index 0000000..bc8c65d
--- /dev/null
@@ -0,0 +1,78 @@
+<mediawiki xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.11/ http://www.mediawiki.org/xml/export-0.11.xsd" version="0.11" xml:lang="en">
+<siteinfo>
+<sitename>OpenStreetMap Wiki</sitename>
+<dbname>wiki</dbname>
+<base>https://wiki.openstreetmap.org/wiki/Main_Page</base>
+<generator>MediaWiki 1.35.1</generator>
+<case>first-letter</case>
+<namespaces>
+<namespace key="-2" case="first-letter">Media</namespace>
+<namespace key="-1" case="first-letter">Special</namespace>
+<namespace key="0" case="first-letter"/>
+<namespace key="1" case="first-letter">Talk</namespace>
+<namespace key="2" case="first-letter">User</namespace>
+<namespace key="3" case="first-letter">User talk</namespace>
+<namespace key="4" case="first-letter">Wiki</namespace>
+<namespace key="5" case="first-letter">Wiki talk</namespace>
+<namespace key="6" case="first-letter">File</namespace>
+<namespace key="7" case="first-letter">File talk</namespace>
+<namespace key="8" case="first-letter">MediaWiki</namespace>
+<namespace key="9" case="first-letter">MediaWiki talk</namespace>
+<namespace key="10" case="first-letter">Template</namespace>
+<namespace key="11" case="first-letter">Template talk</namespace>
+<namespace key="12" case="first-letter">Help</namespace>
+<namespace key="13" case="first-letter">Help talk</namespace>
+<namespace key="14" case="first-letter">Category</namespace>
+<namespace key="15" case="first-letter">Category talk</namespace>
+<namespace key="120" case="first-letter">Item</namespace>
+<namespace key="121" case="first-letter">Item talk</namespace>
+<namespace key="122" case="first-letter">Property</namespace>
+<namespace key="123" case="first-letter">Property talk</namespace>
+<namespace key="200" case="first-letter">DE</namespace>
+<namespace key="201" case="first-letter">DE talk</namespace>
+<namespace key="202" case="first-letter">FR</namespace>
+<namespace key="203" case="first-letter">FR talk</namespace>
+<namespace key="204" case="first-letter">ES</namespace>
+<namespace key="205" case="first-letter">ES talk</namespace>
+<namespace key="206" case="first-letter">IT</namespace>
+<namespace key="207" case="first-letter">IT talk</namespace>
+<namespace key="208" case="first-letter">NL</namespace>
+<namespace key="209" case="first-letter">NL talk</namespace>
+<namespace key="210" case="first-letter">RU</namespace>
+<namespace key="211" case="first-letter">RU talk</namespace>
+<namespace key="212" case="first-letter">JA</namespace>
+<namespace key="213" case="first-letter">JA talk</namespace>
+<namespace key="710" case="first-letter">TimedText</namespace>
+<namespace key="711" case="first-letter">TimedText talk</namespace>
+<namespace key="828" case="first-letter">Module</namespace>
+<namespace key="829" case="first-letter">Module talk</namespace>
+<namespace key="2300" case="first-letter">Gadget</namespace>
+<namespace key="2301" case="first-letter">Gadget talk</namespace>
+<namespace key="2302" case="case-sensitive">Gadget definition</namespace>
+<namespace key="2303" case="case-sensitive">Gadget definition talk</namespace>
+</namespaces>
+</siteinfo>
+<page>
+<title>Nominatim/Special Phrases/EN</title>
+<ns>0</ns>
+<id>67365</id>
+<revision>
+<id>2100424</id>
+<parentid>2100422</parentid>
+<timestamp>2021-01-27T20:29:53Z</timestamp>
+<contributor>
+<username>Violaine Do</username>
+<id>88152</id>
+</contributor>
+<minor/>
+<comment>/* en */ add coworking amenity</comment>
+<origin>2100424</origin>
+<model>wikitext</model>
+<format>text/x-wiki</format>
+<text bytes="158218" sha1="cst5x7tt58izti1pxzgljf27tx8qjcj" xml:space="preserve">
+== en == {| class="wikitable sortable" |- ! Word / Phrase !! Key !! Value !! Operator !! Plural |- | Zip Line || aerialway || zip_line || - || N |- | Zip Lines || aerialway || zip_line || - || Y |- | Zip Line in || aerialway || zip_line || in || N |- | Zip Lines in || aerialway || zip_line || in || Y |- | Zip Line near || aerialway || zip_line || near || N |- | Zip Lines near || aerialway || zip_line || near || Y |- | Zip Wire || aerialway || zip_line || - || N |- | Zip Wires || aerialway || zip_line || - || Y |- | Zip Wire in || aerialway || zip_line || in || N |- | Zip Wires in || aerialway || zip_line || in || Y |- | Zip Wire near || aerialway || zip_line || near || N |} [[Category:Word list]]
+</text>
+<sha1>cst5x7tt58izti1pxzgljf27tx8qjcj</sha1>
+</revision>
+</page>
+</mediawiki>
\ No newline at end of file
diff --git a/test/testfiles/phrase-settings.json b/test/testfiles/phrase-settings.json
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/test/testfiles/random_file.html b/test/testfiles/random_file.html
new file mode 100644 (file)
index 0000000..e69de29
index 24d88926e2ee6fc9368ec2845935e820129f03f5..610bbb04fc2c2c86e5a262cca3a5074e27776206 100755 (executable)
@@ -40,9 +40,9 @@
                         php-pgsql php php-intl libpqxx-devel \
                         proj-epsg bzip2-devel proj-devel boost-devel \
                         python3-pip python3-setuptools python3-devel \
-                        expat-devel zlib-devel
+                        expat-devel zlib-devel libicu-dev
 
-    pip3 install --user psycopg2 python-dotenv psutil Jinja2
+    pip3 install --user psycopg2 python-dotenv psutil Jinja2 PyICU
 
 
 #
index 859c48b9517f6cf7714c978a6f2eaaf59f3cc86f..17d07aec70e5729c07e10b30b935e35b695c6659 100755 (executable)
@@ -33,9 +33,9 @@
                         php-pgsql php php-intl php-json libpq-devel \
                         bzip2-devel proj-devel boost-devel \
                         python3-pip python3-setuptools python3-devel \
-                        expat-devel zlib-devel
+                        expat-devel zlib-devel libicu-dev
 
-    pip3 install --user psycopg2 python-dotenv psutil Jinja2
+    pip3 install --user psycopg2 python-dotenv psutil Jinja2 PyICU
 
 
 #
index 5cbbd5837558963c7fdde7954725b0f27db993cd..b90da87195de0509845926688baa4b69c37e92ba 100755 (executable)
@@ -29,8 +29,8 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
                         libbz2-dev libpq-dev libproj-dev \
                         postgresql-server-dev-10 postgresql-10-postgis-2.4 \
                         postgresql-contrib-10 postgresql-10-postgis-scripts \
-                        php php-pgsql php-intl python3-pip \
-                        python3-psycopg2 python3-psutil python3-jinja2 git
+                        php php-pgsql php-intl libicu-dev python3-pip \
+                        python3-psycopg2 python3-psutil python3-jinja2 python3-icu git
 
 # The python-dotenv package that comes with Ubuntu 18.04 is too old, so
 # install the latest version from pip:
index 0649c9a667372ca6030f4c3912525b9700e91315..d04f5796180a19efef59ae84a75b2b20cb23b321 100755 (executable)
@@ -32,8 +32,8 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
                         libbz2-dev libpq-dev libproj-dev \
                         postgresql-server-dev-12 postgresql-12-postgis-3 \
                         postgresql-contrib-12 postgresql-12-postgis-3-scripts \
-                        php php-pgsql php-intl python3-dotenv \
-                        python3-psycopg2 python3-psutil python3-jinja2 git
+                        php php-pgsql php-intl libicu-dev python3-dotenv \
+                        python3-psycopg2 python3-psutil python3-jinja2 python3-icu git
 
 #
 # System Configuration