From: Sarah Hoffmann Date: Mon, 29 Mar 2021 07:49:35 +0000 (+0200) Subject: Merge pull request #2228 from AntoJvlt/import-special-phrases-porting-python X-Git-Tag: v3.7.0~10 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/09b2510219e97203ad17aa2250ea8351dff06b23?hp=28b4fb12b684a5c76ca24002ba1ae19ce2e22c87 Merge pull request #2228 from AntoJvlt/import-special-phrases-porting-python Import special phrases porting python --- diff --git a/.github/actions/build-nominatim/action.yml b/.github/actions/build-nominatim/action.yml index 414783d9..d0a89774 100644 --- a/.github/actions/build-nominatim/action.yml +++ b/.github/actions/build-nominatim/action.yml @@ -6,7 +6,7 @@ runs: steps: - name: Install prerequisites run: | - sudo apt-get install -y -qq libboost-system-dev libboost-filesystem-dev libexpat1-dev zlib1g-dev libbz2-dev libpq-dev libproj-dev python3-psycopg2 python3-pyosmium python3-dotenv python3-psutil python3-jinja2 + sudo apt-get install -y -qq libboost-system-dev libboost-filesystem-dev libexpat1-dev zlib1g-dev libbz2-dev libpq-dev libproj-dev libicu-dev python3-psycopg2 python3-pyosmium python3-dotenv python3-psutil python3-jinja2 python3-icu shell: bash - name: Download dependencies diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index e0e68a9c..2f920a66 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -120,7 +120,7 @@ jobs: working-directory: data-env - name: Import special phrases - run: nominatim special-phrases --from-wiki | psql -d nominatim + run: nominatim special-phrases --import-from-wiki working-directory: data-env - name: Check import diff --git a/.gitignore b/.gitignore index 23fb34a6..44b8eb32 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ data/wiki_specialphrases.sql data/osmosischange.osc .vagrant +data/country_osm_grid.sql.gz diff --git a/.pylintrc b/.pylintrc index da6dbe03..eab04181 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,6 +1,7 @@ [MASTER] extension-pkg-whitelist=osmium +ignored-modules=icu [MESSAGES CONTROL] diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b4c2976..1c6336a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,7 +114,6 @@ if (BUILD_IMPORTER) export.php query.php setup.php - specialphrases.php update.php warm.php ) @@ -259,7 +258,7 @@ endif() install(FILES settings/env.defaults settings/address-levels.json - settings/phrase_settings.php + settings/phrase-settings.json settings/import-admin.style settings/import-street.style settings/import-address.style diff --git a/docs/admin/Import.md b/docs/admin/Import.md index ef0da0be..e3a32481 100644 --- a/docs/admin/Import.md +++ b/docs/admin/Import.md @@ -268,10 +268,9 @@ running this function. If you want to be able to search for places by their type through [special key phrases](https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases) -you also need to enable these key phrases like this: +you also need to import these key phrases like this: - nominatim special-phrases --from-wiki > specialphrases.sql - psql -d nominatim -f specialphrases.sql + nominatim special-phrases --import-from-wiki Note that this command downloads the phrases from the wiki link above. You need internet access for the step. 
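The documentation change above replaces the old two-step flow (dump SQL from the PHP script, then pipe it through psql) with a single nominatim special-phrases --import-from-wiki call that drives the Python importer introduced later in this diff. As a rough sketch of the wiring the patch implies — the connect() helper and the dsn/config/phplib_dir names are assumptions for illustration; only the class name, constructor order and import_from_wiki() signature are taken from the tests further below:

    # Hedged sketch only: roughly what the new CLI flag triggers internally.
    from nominatim.db.connection import connect              # assumed helper, not part of this patch
    from nominatim.tools.special_phrases import SpecialPhrasesImporter

    dsn = 'dbname=nominatim'                                  # placeholder connection string
    with connect(dsn) as db_connection:
        # config and phplib_dir normally come from the CLI context
        SpecialPhrasesImporter(config, phplib_dir, db_connection).import_from_wiki(['en'])

The importer fetches one wiki page per language, registers each phrase through the getorcreate_amenity*() SQL functions and creates a place_classtype_<class>_<type> table per extracted pair.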
diff --git a/docs/admin/Installation.md b/docs/admin/Installation.md index eadaaff1..6237a9d4 100644 --- a/docs/admin/Installation.md +++ b/docs/admin/Installation.md @@ -30,6 +30,7 @@ For compiling: * [proj](https://proj.org/) * [bzip2](http://www.bzip.org/) * [zlib](https://www.zlib.net/) + * [ICU](http://site.icu-project.org/) * [Boost libraries](https://www.boost.org/), including system and filesystem * PostgreSQL client libraries * a recent C++ compiler (gcc 5+ or Clang 3.8+) @@ -43,6 +44,7 @@ For running Nominatim: * [Python Dotenv](https://github.com/theskumar/python-dotenv) * [psutil](https://github.com/giampaolo/psutil) * [Jinja2](https://palletsprojects.com/p/jinja/) + * [PyICU](https://pypi.org/project/PyICU/) * [PHP](https://php.net) (7.0 or later) * PHP-pgsql * PHP-intl (bundled with PHP) diff --git a/lib-php/admin/specialphrases.php b/lib-php/admin/specialphrases.php index 8d2d9129..84bcfb5c 100644 --- a/lib-php/admin/specialphrases.php +++ b/lib-php/admin/specialphrases.php @@ -1,163 +1,11 @@ + transliterate($sLabel)); - } else { - $sTrans = null; - } - $sClass = trim($aMatch[2]); - $sType = trim($aMatch[3]); - // hack around a bug where building=yes was imported with - // quotes into the wiki - $sType = preg_replace('/("|")/', '', $sType); - // sanity check, in case somebody added garbage in the wiki - if (preg_match('/^\\w+$/', $sClass) < 1 - || preg_match('/^\\w+$/', $sType) < 1 - ) { - trigger_error("Bad class/type for language $sLanguage: $sClass=$sType"); - exit; - } - // blacklisting: disallow certain class/type combinations - if (isset($aTagsBlacklist[$sClass]) && in_array($sType, $aTagsBlacklist[$sClass])) { - // fwrite(STDERR, "Blacklisted: ".$sClass."/".$sType."\n"); - continue; - } - // whitelisting: if class is in whitelist, allow only tags in the list - if (isset($aTagsWhitelist[$sClass]) && !in_array($sType, $aTagsWhitelist[$sClass])) { - // fwrite(STDERR, "Non-Whitelisted: ".$sClass."/".$sType."\n"); - continue; - } - $aPairs[$sClass.'|'.$sType] = array($sClass, $sType); - - switch (trim($aMatch[4])) { - case 'near': - printf( - "SELECT getorcreate_amenityoperator(make_standard_name('%s'), '%s', '%s', '%s', 'near');\n", - pg_escape_string($sLabel), - $sTrans, - $sClass, - $sType - ); - break; - case 'in': - printf( - "SELECT getorcreate_amenityoperator(make_standard_name('%s'), '%s', '%s', '%s', 'in');\n", - pg_escape_string($sLabel), - $sTrans, - $sClass, - $sType - ); - break; - default: - printf( - "SELECT getorcreate_amenity(make_standard_name('%s'), '%s', '%s', '%s');\n", - pg_escape_string($sLabel), - $sTrans, - $sClass, - $sType - ); - break; - } - } - } - - echo 'CREATE INDEX idx_placex_classtype ON placex (class, type);'; - - foreach ($aPairs as $aPair) { - $sql_tablespace = getSetting('TABLESPACE_AUX_DATA'); - if ($sql_tablespace) { - $sql_tablespace = ' TABLESPACE '.$sql_tablespace; - } - - printf( - 'CREATE TABLE place_classtype_%s_%s' - . $sql_tablespace - . ' AS' - . ' SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex' - . " WHERE class = '%s' AND type = '%s'" - . ";\n", - pg_escape_string($aPair[0]), - pg_escape_string($aPair[1]), - pg_escape_string($aPair[0]), - pg_escape_string($aPair[1]) - ); - - printf( - 'CREATE INDEX idx_place_classtype_%s_%s_centroid' - . ' ON place_classtype_%s_%s USING GIST (centroid)' - . $sql_tablespace - . 
";\n", - pg_escape_string($aPair[0]), - pg_escape_string($aPair[1]), - pg_escape_string($aPair[0]), - pg_escape_string($aPair[1]) - ); - - printf( - 'CREATE INDEX idx_place_classtype_%s_%s_place_id' - . ' ON place_classtype_%s_%s USING btree(place_id)' - . $sql_tablespace - . ";\n", - pg_escape_string($aPair[0]), - pg_escape_string($aPair[1]), - pg_escape_string($aPair[0]), - pg_escape_string($aPair[1]) - ); - printf( - 'GRANT SELECT ON place_classtype_%s_%s TO "%s"' - . ";\n", - pg_escape_string($aPair[0]), - pg_escape_string($aPair[1]), - getSetting('DATABASE_WEBUSER') - ); - } +loadSettings(getcwd()); - echo 'DROP INDEX idx_placex_classtype;'; -} +(new \Nominatim\Shell(getSetting('NOMINATIM_TOOL'))) + ->addParams('special-phrases', '--import-from-wiki') + ->run(); diff --git a/lib-php/migration/PhraseSettingsToJson.php b/lib-php/migration/PhraseSettingsToJson.php new file mode 100644 index 00000000..15c49f0a --- /dev/null +++ b/lib-php/migration/PhraseSettingsToJson.php @@ -0,0 +1,19 @@ + None: + self.db_connection = db_connection + self.config = config + self.phplib_dir = phplib_dir + self.black_list, self.white_list = self._load_white_and_black_lists() + #Compile the regex here to increase performances. + self.occurence_pattern = re.compile( + r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])' + ) + self.sanity_check_pattern = re.compile(r'^\w+$') + self.transliterator = Transliterator.createFromRules("special-phrases normalizer", + self.config.TERM_NORMALIZATION) + + def import_from_wiki(self, languages=None): + """ + Iterate through all specified languages and + extract corresponding special phrases from the wiki. + """ + if languages is not None and not isinstance(languages, list): + raise TypeError('The \'languages\' argument should be of type list.') + + #Get all languages to process. + languages = self._load_languages() if not languages else languages + + #Store pairs of class/type for further processing + class_type_pairs = set() + + for lang in languages: + LOG.warning('Import phrases for lang: %s', lang) + wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang) + class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang)) + + self._create_place_classtype_table_and_indexes(class_type_pairs) + self.db_connection.commit() + LOG.warning('Import done.') + + def _load_white_and_black_lists(self): + """ + Load white and black lists from phrases-settings.json. + """ + settings_path = (self.config.config_dir / 'phrase-settings.json').resolve() + + if self.config.PHRASE_CONFIG: + settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG) + + with open(settings_path, "r") as json_settings: + settings = json.load(json_settings) + return settings['blackList'], settings['whiteList'] + + def _load_languages(self): + """ + Get list of all languages from env config file + or default if there is no languages configured. + The system will extract special phrases only from all specified languages. + """ + default_languages = [ + 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es', + 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu', + 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl', + 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi'] + return self.config.LANGUAGES or default_languages + + @staticmethod + def _get_wiki_content(lang): + """ + Request and return the wiki page's content + corresponding to special phrases for a given lang. 
+ Requested URL Example : + https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN + """ + url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long + return get_url(url) + + def _check_sanity(self, lang, phrase_class, phrase_type): + """ + Check sanity of given inputs in case somebody added garbage in the wiki. + If a bad class/type is detected the system will exit with an error. + """ + type_matchs = self.sanity_check_pattern.findall(phrase_type) + class_matchs = self.sanity_check_pattern.findall(phrase_class) + + if len(class_matchs) < 1 or len(type_matchs) < 1: + raise UsageError("Bad class/type for language {}: {}={}".format( + lang, phrase_class, phrase_type)) + + def _process_xml_content(self, xml_content, lang): + """ + Process given xml content by extracting matching patterns. + Matching patterns are processed there and returned in a + set of class/type pairs. + """ + #One match will be of format [label, class, type, operator, plural] + matches = self.occurence_pattern.findall(xml_content) + #Store pairs of class/type for further processing + class_type_pairs = set() + + for match in matches: + phrase_label = match[0].strip() + normalized_label = self.transliterator.transliterate(phrase_label) + phrase_class = match[1].strip() + phrase_type = match[2].strip() + phrase_operator = match[3].strip() + #hack around a bug where building=yes was imported with quotes into the wiki + phrase_type = re.sub(r'\"|"', '', phrase_type) + + #sanity check, in case somebody added garbage in the wiki + self._check_sanity(lang, phrase_class, phrase_type) + + #blacklisting: disallow certain class/type combinations + if ( + phrase_class in self.black_list.keys() and + phrase_type in self.black_list[phrase_class] + ): + continue + #whitelisting: if class is in whitelist, allow only tags in the list + if ( + phrase_class in self.white_list.keys() and + phrase_type not in self.white_list[phrase_class] + ): + continue + + #add class/type to the pairs dict + class_type_pairs.add((phrase_class, phrase_type)) + + self._process_amenity( + phrase_label, normalized_label, phrase_class, + phrase_type, phrase_operator + ) + + return class_type_pairs + + def _process_amenity(self, phrase_label, normalized_label, + phrase_class, phrase_type, phrase_operator): + # pylint: disable-msg=too-many-arguments + """ + Add phrase lookup and corresponding class and + type to the word table based on the operator. + """ + with self.db_connection.cursor() as db_cursor: + if phrase_operator == 'near': + db_cursor.execute("""SELECT getorcreate_amenityoperator( + make_standard_name(%s), %s, %s, %s, 'near')""", + (phrase_label, normalized_label, phrase_class, phrase_type)) + elif phrase_operator == 'in': + db_cursor.execute("""SELECT getorcreate_amenityoperator( + make_standard_name(%s), %s, %s, %s, 'in')""", + (phrase_label, normalized_label, phrase_class, phrase_type)) + else: + db_cursor.execute("""SELECT getorcreate_amenity( + make_standard_name(%s), %s, %s, %s)""", + (phrase_label, normalized_label, phrase_class, phrase_type)) + + + def _create_place_classtype_table_and_indexes(self, class_type_pairs): + """ + Create table place_classtype for each given pair. + Also create indexes on place_id and centroid. 
+ """ + LOG.warning('Create tables and indexes...') + + sql_tablespace = self.config.TABLESPACE_AUX_DATA + if sql_tablespace: + sql_tablespace = ' TABLESPACE '+sql_tablespace + + with self.db_connection.cursor() as db_cursor: + db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)") + + for pair in class_type_pairs: + phrase_class = pair[0] + phrase_type = pair[1] + + #Table creation + self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type) + + #Indexes creation + self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type) + + #Grant access on read to the web user. + self._grant_access_to_webuser(phrase_class, phrase_type) + + with self.db_connection.cursor() as db_cursor: + db_cursor.execute("DROP INDEX idx_placex_classtype") + + + def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type): + """ + Create table place_classtype of the given phrase_class/phrase_type if doesn't exit. + """ + table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type) + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL(""" + CREATE TABLE IF NOT EXISTS {{}} {} + AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex + WHERE class = {{}} AND type = {{}}""".format(sql_tablespace)) + .format(Identifier(table_name), Literal(phrase_class), + Literal(phrase_type))) + + + def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type): + """ + Create indexes on centroid and place_id for the place_classtype table. + """ + index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type) + base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type) + #Index on centroid + if not self.db_connection.index_exists(index_prefix + 'centroid'): + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL(""" + CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace)) + .format(Identifier(index_prefix + 'centroid'), + Identifier(base_table)), sql_tablespace) + + #Index on place_id + if not self.db_connection.index_exists(index_prefix + 'place_id'): + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL( + """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace)) + .format(Identifier(index_prefix + 'place_id'), + Identifier(base_table))) + + + def _grant_access_to_webuser(self, phrase_class, phrase_type): + """ + Grant access on read to the table place_classtype for the webuser. + """ + table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type) + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""") + .format(Identifier(table_name), + Identifier(self.config.DATABASE_WEBUSER))) + + def _convert_php_settings_if_needed(self, file_path): + """ + Convert php settings file of special phrases to json file if it is still in php format. 
+ """ + if not isfile(file_path): + raise UsageError(str(file_path) + ' is not a valid file.') + + file, extension = os.path.splitext(file_path) + json_file_path = Path(file + '.json').resolve() + + if extension not in('.php', '.json'): + raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.') + + if extension == '.php' and not isfile(json_file_path): + try: + subprocess.run(['/usr/bin/env', 'php', '-Cq', + (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(), + file_path], check=True) + LOG.warning('special_phrase configuration file has been converted to json.') + return json_file_path + except subprocess.CalledProcessError: + LOG.error('Error while converting %s to json.', file_path) + raise + else: + return json_file_path
diff --git a/settings/env.defaults b/settings/env.defaults index 53efb3f7..4069270e 100644 --- a/settings/env.defaults +++ b/settings/env.defaults @@ -77,7 +77,7 @@ NOMINATIM_TIGER_DATA_PATH= NOMINATIM_WIKIPEDIA_DATA_PATH= # Configuration file for special phrase import. -# When unset, the internal default settings from 'settings/phrase_settings.php' +# When unset, the internal default settings from 'settings/phrase-settings.json' # are used. NOMINATIM_PHRASE_CONFIG=
diff --git a/settings/phrase-settings.json b/settings/phrase-settings.json new file mode 100644 index 00000000..a097dca4 --- /dev/null +++ b/settings/phrase-settings.json @@ -0,0 +1,25 @@ +{ + "Comments": [ + "Black list corresponds to class/type combinations to exclude", + "If a class is in the white list then all types will", + "be ignored except the ones given in the list.", + "Also use this list to exclude an entire class from special phrases." + ], + "blackList": { + "boundary": [ + "administrative" + ], + "place": [ + "house", + "houses" + ] + }, + "whiteList": { + "highway": [ + "bus_stop", + "rest_area", + "raceway" + ], + "building": [] + } +}
diff --git a/test/python/conftest.py b/test/python/conftest.py index 4b7cccc3..871365d9 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -5,6 +5,7 @@ from pathlib import Path import psycopg2 import psycopg2.extras import pytest +import tempfile SRC_DIR = Path(__file__) / '..' / '..' / '..'
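To recap the phrase-settings.json file introduced just above (it replaces settings/phrase_settings.php): blackList drops specific class/type combinations, while whiteList restricts a listed class to the given types, so an empty list such as "building": [] excludes the whole class. A minimal Python sketch of that rule as applied per wiki row in _process_xml_content(); the helper name is invented for illustration:

    # Sketch of the black-/white-list filter; black_list/white_list are the parsed JSON dicts.
    def is_allowed(black_list, white_list, phrase_class, phrase_type):
        if phrase_type in black_list.get(phrase_class, []):
            return False          # blacklisted class/type combination
        if phrase_class in white_list and phrase_type not in white_list[phrase_class]:
            return False          # class is whitelisted, but this type is not listed
        return True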
@@ -133,6 +134,13 @@ def def_config(): def src_dir(): return SRC_DIR.resolve() +@pytest.fixture +def tmp_phplib_dir(): + with tempfile.TemporaryDirectory() as phpdir: + (Path(phpdir) / 'admin').mkdir() + + yield Path(phpdir) + @pytest.fixture def status_table(temp_db_conn): """ Create an empty version of the status table and diff --git a/test/python/sample.tar.gz b/test/python/sample.tar.gz new file mode 100644 index 00000000..65bff096 Binary files /dev/null and b/test/python/sample.tar.gz differ diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 918d8499..eb0ee584 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -64,7 +64,6 @@ def test_cli_help(capsys): @pytest.mark.parametrize("command,script", [ - (('special-phrases',), 'specialphrases'), (('add-data', '--file', 'foo.osm'), 'update'), (('export',), 'export') ]) @@ -172,6 +171,12 @@ def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ra assert bnd_mock.called == do_bnds assert rank_mock.called == do_ranks +def test_special_phrases_command(temp_db, mock_func_factory): + func = mock_func_factory(nominatim.clicmd.special_phrases.SpecialPhrasesImporter, 'import_from_wiki') + + call_nominatim('special-phrases', '--import-from-wiki') + + assert func.called == 1 @pytest.mark.parametrize("command,func", [ ('postcodes', 'update_postcodes'), diff --git a/test/python/test_tools_exec_utils.py b/test/python/test_tools_exec_utils.py index 8f60ac74..3abe9818 100644 --- a/test/python/test_tools_exec_utils.py +++ b/test/python/test_tools_exec_utils.py @@ -9,13 +9,6 @@ import pytest import nominatim.tools.exec_utils as exec_utils -@pytest.fixture -def tmp_phplib_dir(): - with tempfile.TemporaryDirectory() as phpdir: - (Path(phpdir) / 'admin').mkdir() - - yield Path(phpdir) - @pytest.fixture def nominatim_env(tmp_phplib_dir, def_config): class _NominatimEnv: diff --git a/test/python/test_tools_import_special_phrases.py b/test/python/test_tools_import_special_phrases.py new file mode 100644 index 00000000..7a8b832d --- /dev/null +++ b/test/python/test_tools_import_special_phrases.py @@ -0,0 +1,346 @@ +""" + Tests for import special phrases methods + of the class SpecialPhrasesImporter. +""" +from nominatim.errors import UsageError +from pathlib import Path +import tempfile +from shutil import copyfile +import pytest +from nominatim.tools.special_phrases import SpecialPhrasesImporter + +TEST_BASE_DIR = Path(__file__) / '..' / '..' + +def test_check_sanity_class(special_phrases_importer): + """ + Check for _check_sanity() method. + If a wrong class or type is given, an UsageError should raise. + If a good class and type are given, nothing special happens. + """ + with pytest.raises(UsageError) as wrong_class: + special_phrases_importer._check_sanity('en', '', 'type') + + with pytest.raises(UsageError) as wrong_type: + special_phrases_importer._check_sanity('en', 'class', '') + + special_phrases_importer._check_sanity('en', 'class', 'type') + + assert wrong_class and wrong_type + +def test_load_white_and_black_lists(special_phrases_importer): + """ + Test that _load_white_and_black_lists() well return + black list and white list and that they are of dict type. + """ + black_list, white_list = special_phrases_importer._load_white_and_black_lists() + + assert isinstance(black_list, dict) and isinstance(white_list, dict) + +def test_convert_php_settings(special_phrases_importer): + """ + Test that _convert_php_settings_if_needed() convert the given + php file to a json file. 
+ """ + php_file = (TEST_BASE_DIR / 'testfiles' / 'phrase_settings.php').resolve() + + with tempfile.TemporaryDirectory() as temp_dir: + temp_settings = (Path(temp_dir) / 'phrase_settings.php').resolve() + copyfile(php_file, temp_settings) + special_phrases_importer._convert_php_settings_if_needed(temp_settings) + + assert (Path(temp_dir) / 'phrase_settings.json').is_file() + +def test_convert_settings_wrong_file(special_phrases_importer): + """ + Test that _convert_php_settings_if_needed() raise an exception + if the given file is not a valid file. + """ + + with pytest.raises(UsageError) as exceptioninfos: + special_phrases_importer._convert_php_settings_if_needed('random_file') + + assert str(exceptioninfos.value) == 'random_file is not a valid file.' + +def test_convert_settings_json_already_exist(special_phrases_importer): + """ + Test that if we give to '_convert_php_settings_if_needed' a php file path + and that a the corresponding json file already exists, it is returned. + """ + php_file = (TEST_BASE_DIR / 'testfiles' / 'phrase_settings.php').resolve() + json_file = (TEST_BASE_DIR / 'testfiles' / 'phrase_settings.json').resolve() + + returned = special_phrases_importer._convert_php_settings_if_needed(php_file) + + assert returned == json_file + +def test_convert_settings_giving_json(special_phrases_importer): + """ + Test that if we give to '_convert_php_settings_if_needed' a json file path + the same path is directly returned + """ + json_file = (TEST_BASE_DIR / 'testfiles' / 'phrase-settings.json').resolve() + + returned = special_phrases_importer._convert_php_settings_if_needed(json_file) + + assert returned == json_file + +def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs, + word_table, temp_db_conn): + """ + Test that _process_amenity() execute well the + getorcreate_amenityoperator() SQL function and that + the 2 differents operators are well handled. + """ + special_phrases_importer._process_amenity('', '', '', '', 'near') + special_phrases_importer._process_amenity('', '', '', '', 'in') + + with temp_db_conn.cursor() as temp_db_cursor: + temp_db_cursor.execute("SELECT * FROM temp_with_operator WHERE op='near' OR op='in'") + results = temp_db_cursor.fetchall() + + assert len(results) == 2 + +def test_process_amenity_without_operator(special_phrases_importer, getorcreate_amenity_funcs, + temp_db_conn): + """ + Test that _process_amenity() execute well the + getorcreate_amenity() SQL function. + """ + special_phrases_importer._process_amenity('', '', '', '', '') + + with temp_db_conn.cursor() as temp_db_cursor: + temp_db_cursor.execute("SELECT * FROM temp_without_operator WHERE op='no_operator'") + result = temp_db_cursor.fetchone() + + assert result + +def test_create_place_classtype_indexes(temp_db_conn, special_phrases_importer): + """ + Test that _create_place_classtype_indexes() create the + place_id index and centroid index on the right place_class_type table. 
+ """ + phrase_class = 'class' + phrase_type = 'type' + table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type) + + with temp_db_conn.cursor() as temp_db_cursor: + temp_db_cursor.execute("CREATE EXTENSION postgis;") + temp_db_cursor.execute('CREATE TABLE {}(place_id BIGINT, centroid GEOMETRY)'.format(table_name)) + + special_phrases_importer._create_place_classtype_indexes('', phrase_class, phrase_type) + + assert check_placeid_and_centroid_indexes(temp_db_conn, phrase_class, phrase_type) + +def test_create_place_classtype_table(temp_db_conn, placex_table, special_phrases_importer): + """ + Test that _create_place_classtype_table() create + the right place_classtype table. + """ + phrase_class = 'class' + phrase_type = 'type' + special_phrases_importer._create_place_classtype_table('', phrase_class, phrase_type) + + assert check_table_exist(temp_db_conn, phrase_class, phrase_type) + +def test_grant_access_to_web_user(temp_db_conn, def_config, special_phrases_importer): + """ + Test that _grant_access_to_webuser() give + right access to the web user. + """ + phrase_class = 'class' + phrase_type = 'type' + table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type) + + with temp_db_conn.cursor() as temp_db_cursor: + temp_db_cursor.execute('CREATE TABLE {}()'.format(table_name)) + + special_phrases_importer._grant_access_to_webuser(phrase_class, phrase_type) + + assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, phrase_class, phrase_type) + +def test_create_place_classtype_table_and_indexes( + temp_db_conn, def_config, placex_table, getorcreate_amenity_funcs, + getorcreate_amenityoperator_funcs, special_phrases_importer): + """ + Test that _create_place_classtype_table_and_indexes() + create the right place_classtype tables and place_id indexes + and centroid indexes and grant access to the web user + for the given set of pairs. + """ + pairs = set([('class1', 'type1'), ('class2', 'type2')]) + + special_phrases_importer._create_place_classtype_table_and_indexes(pairs) + + for pair in pairs: + assert check_table_exist(temp_db_conn, pair[0], pair[1]) + assert check_placeid_and_centroid_indexes(temp_db_conn, pair[0], pair[1]) + assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, pair[0], pair[1]) + +def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer, + getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs): + """ + Test that _process_xml_content() process the given xml content right + by executing the right SQL functions for amenities and + by returning the right set of pairs. + """ + class_test = 'aerialway' + type_test = 'zip_line' + + #Converted output set to a dict for easy assert further. + results = dict(special_phrases_importer._process_xml_content(get_test_xml_wiki_content(), 'en')) + + assert check_amenities_with_op(temp_db_conn) + assert check_amenities_without_op(temp_db_conn) + assert results[class_test] and type_test in results.values() + +def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table, + getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs): + """ + Check that the main import_from_wiki() method is well executed. + It should create the place_classtype table, the place_id and centroid indexes, + grand access to the web user and executing the SQL functions for amenities. 
+ """ + monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content) + special_phrases_importer.import_from_wiki(['en']) + + class_test = 'aerialway' + type_test = 'zip_line' + + assert check_table_exist(temp_db_conn, class_test, type_test) + assert check_placeid_and_centroid_indexes(temp_db_conn, class_test, type_test) + assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test) + assert check_amenities_with_op(temp_db_conn) + assert check_amenities_without_op(temp_db_conn) + +def mock_get_wiki_content(lang): + """ + Mock the _get_wiki_content() method to return + static xml test file content. + """ + return get_test_xml_wiki_content() + +def get_test_xml_wiki_content(): + """ + return the content of the static xml test file. + """ + xml_test_content_path = (TEST_BASE_DIR / 'testdata' / 'special_phrases_test_content.txt').resolve() + with open(xml_test_content_path) as xml_content_reader: + return xml_content_reader.read() + +def check_table_exist(temp_db_conn, phrase_class, phrase_type): + """ + Verify that the place_classtype table exists for the given + phrase_class and phrase_type. + """ + table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type) + + with temp_db_conn.cursor() as temp_db_cursor: + temp_db_cursor.execute(""" + SELECT * + FROM information_schema.tables + WHERE table_type='BASE TABLE' + AND table_name='{}'""".format(table_name)) + return temp_db_cursor.fetchone() + +def check_grant_access(temp_db_conn, user, phrase_class, phrase_type): + """ + Check that the web user has been granted right access to the + place_classtype table of the given phrase_class and phrase_type. + """ + table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type) + + with temp_db_conn.cursor() as temp_db_cursor: + temp_db_cursor.execute(""" + SELECT * FROM information_schema.role_table_grants + WHERE table_name='{}' + AND grantee='{}' + AND privilege_type='SELECT'""".format(table_name, user)) + return temp_db_cursor.fetchone() + +def check_placeid_and_centroid_indexes(temp_db_conn, phrase_class, phrase_type): + """ + Check that the place_id index and centroid index exist for the + place_classtype table of the given phrase_class and phrase_type. + """ + index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type) + + return ( + temp_db_conn.index_exists(index_prefix + 'centroid') + and + temp_db_conn.index_exists(index_prefix + 'place_id') + ) + +def check_amenities_with_op(temp_db_conn): + """ + Check that the test table for the SQL function getorcreate_amenityoperator() + contains more than one value (so that the SQL function was call more than one time). + """ + with temp_db_conn.cursor() as temp_db_cursor: + temp_db_cursor.execute("SELECT * FROM temp_with_operator") + return len(temp_db_cursor.fetchall()) > 1 + +def check_amenities_without_op(temp_db_conn): + """ + Check that the test table for the SQL function getorcreate_amenity() + contains more than one value (so that the SQL function was call more than one time). + """ + with temp_db_conn.cursor() as temp_db_cursor: + temp_db_cursor.execute("SELECT * FROM temp_without_operator") + return len(temp_db_cursor.fetchall()) > 1 + +@pytest.fixture +def special_phrases_importer(temp_db_conn, def_config, temp_phplib_dir_with_migration): + """ + Return an instance of SpecialPhrasesImporter. 
+ """ + return SpecialPhrasesImporter(def_config, temp_phplib_dir_with_migration, temp_db_conn) + +@pytest.fixture +def temp_phplib_dir_with_migration(): + """ + Return temporary phpdir with migration subdirectory and + PhraseSettingsToJson.php script inside. + """ + migration_file = (TEST_BASE_DIR / '..' / 'lib-php' / 'migration' + / 'PhraseSettingsToJson.php').resolve() + with tempfile.TemporaryDirectory() as phpdir: + (Path(phpdir) / 'migration').mkdir() + migration_dest_path = (Path(phpdir) / 'migration' / 'PhraseSettingsToJson.php').resolve() + copyfile(migration_file, migration_dest_path) + + yield Path(phpdir) + +@pytest.fixture +def make_strandard_name_func(temp_db_cursor): + temp_db_cursor.execute(""" + CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT AS $$ + BEGIN + RETURN trim(name); --Basically return only the trimed name for the tests + END; + $$ LANGUAGE plpgsql IMMUTABLE;""") + +@pytest.fixture +def getorcreate_amenity_funcs(temp_db_cursor, make_strandard_name_func): + temp_db_cursor.execute(""" + CREATE TABLE temp_without_operator(op TEXT); + + CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT, + lookup_class text, lookup_type text) + RETURNS void as $$ + BEGIN + INSERT INTO temp_without_operator VALUES('no_operator'); + END; + $$ LANGUAGE plpgsql""") + +@pytest.fixture +def getorcreate_amenityoperator_funcs(temp_db_cursor, make_strandard_name_func): + temp_db_cursor.execute(""" + CREATE TABLE temp_with_operator(op TEXT); + + CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, normalized_word TEXT, + lookup_class text, lookup_type text, op text) + RETURNS void as $$ + BEGIN + INSERT INTO temp_with_operator VALUES(op); + END; + $$ LANGUAGE plpgsql""") \ No newline at end of file diff --git a/test/testdata/special_phrases_test_content.txt b/test/testdata/special_phrases_test_content.txt new file mode 100644 index 00000000..bc8c65d0 --- /dev/null +++ b/test/testdata/special_phrases_test_content.txt @@ -0,0 +1,78 @@ + + +OpenStreetMap Wiki +wiki +https://wiki.openstreetmap.org/wiki/Main_Page +MediaWiki 1.35.1 +first-letter + +Media +Special + +Talk +User +User talk +Wiki +Wiki talk +File +File talk +MediaWiki +MediaWiki talk +Template +Template talk +Help +Help talk +Category +Category talk +Item +Item talk +Property +Property talk +DE +DE talk +FR +FR talk +ES +ES talk +IT +IT talk +NL +NL talk +RU +RU talk +JA +JA talk +TimedText +TimedText talk +Module +Module talk +Gadget +Gadget talk +Gadget definition +Gadget definition talk + + + +Nominatim/Special Phrases/EN +0 +67365 + +2100424 +2100422 +2021-01-27T20:29:53Z + +Violaine Do +88152 + + +/* en */ add coworking amenity +2100424 +wikitext +text/x-wiki + +== en == {| class="wikitable sortable" |- ! Word / Phrase !! Key !! Value !! Operator !! 
Plural |- | Zip Line || aerialway || zip_line || - || N |- | Zip Lines || aerialway || zip_line || - || Y |- | Zip Line in || aerialway || zip_line || in || N |- | Zip Lines in || aerialway || zip_line || in || Y |- | Zip Line near || aerialway || zip_line || near || N |- | Zip Lines near || aerialway || zip_line || near || Y |- | Zip Wire || aerialway || zip_line || - || N |- | Zip Wires || aerialway || zip_line || - || Y |- | Zip Wire in || aerialway || zip_line || in || N |- | Zip Wires in || aerialway || zip_line || in || Y |- | Zip Wire near || aerialway || zip_line || near || N |} [[Category:Word list]] + +cst5x7tt58izti1pxzgljf27tx8qjcj + + + \ No newline at end of file
diff --git a/test/testfiles/phrase-settings.json b/test/testfiles/phrase-settings.json new file mode 100644 index 00000000..e69de29b
diff --git a/settings/phrase_settings.php b/test/testfiles/phrase_settings.php similarity index 100% rename from settings/phrase_settings.php rename to test/testfiles/phrase_settings.php
diff --git a/test/testfiles/random_file.html b/test/testfiles/random_file.html new file mode 100644 index 00000000..e69de29b
diff --git a/vagrant/Install-on-Centos-7.sh b/vagrant/Install-on-Centos-7.sh index 24d88926..610bbb04 100755 --- a/vagrant/Install-on-Centos-7.sh +++ b/vagrant/Install-on-Centos-7.sh @@ -40,9 +40,9 @@ php-pgsql php php-intl libpqxx-devel \ proj-epsg bzip2-devel proj-devel boost-devel \ python3-pip python3-setuptools python3-devel \ - expat-devel zlib-devel + expat-devel zlib-devel libicu-devel - pip3 install --user psycopg2 python-dotenv psutil Jinja2 + pip3 install --user psycopg2 python-dotenv psutil Jinja2 PyICU #
diff --git a/vagrant/Install-on-Centos-8.sh b/vagrant/Install-on-Centos-8.sh index 859c48b9..17d07aec 100755 --- a/vagrant/Install-on-Centos-8.sh +++ b/vagrant/Install-on-Centos-8.sh @@ -33,9 +33,9 @@ php-pgsql php php-intl php-json libpq-devel \ bzip2-devel proj-devel boost-devel \ python3-pip python3-setuptools python3-devel \ - expat-devel zlib-devel + expat-devel zlib-devel libicu-devel - pip3 install --user psycopg2 python-dotenv psutil Jinja2 + pip3 install --user psycopg2 python-dotenv psutil Jinja2 PyICU #
diff --git a/vagrant/Install-on-Ubuntu-18.sh b/vagrant/Install-on-Ubuntu-18.sh index 5cbbd583..b90da871 100755 --- a/vagrant/Install-on-Ubuntu-18.sh +++ b/vagrant/Install-on-Ubuntu-18.sh @@ -29,8 +29,8 @@ export DEBIAN_FRONTEND=noninteractive #DOCS: libbz2-dev libpq-dev libproj-dev \ postgresql-server-dev-10 postgresql-10-postgis-2.4 \ postgresql-contrib-10 postgresql-10-postgis-scripts \ - php php-pgsql php-intl python3-pip \ - python3-psycopg2 python3-psutil python3-jinja2 git + php php-pgsql php-intl libicu-dev python3-pip \ + python3-psycopg2 python3-psutil python3-jinja2 python3-icu git # The python-dotenv package that comes with Ubuntu 18.04 is too old, so # install the latest version from pip:
diff --git a/vagrant/Install-on-Ubuntu-20.sh b/vagrant/Install-on-Ubuntu-20.sh index 0649c9a6..d04f5796 100755 --- a/vagrant/Install-on-Ubuntu-20.sh +++ b/vagrant/Install-on-Ubuntu-20.sh @@ -32,8 +32,8 @@ export DEBIAN_FRONTEND=noninteractive #DOCS: libbz2-dev libpq-dev libproj-dev \ postgresql-server-dev-12 postgresql-12-postgis-3 \ postgresql-contrib-12 postgresql-12-postgis-3-scripts \ - php php-pgsql php-intl python3-dotenv \ - python3-psycopg2 python3-psutil python3-jinja2 git + php php-pgsql php-intl libicu-dev python3-dotenv \ + python3-psycopg2 python3-psutil python3-jinja2 python3-icu git # # System Configuration
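For reference, every row of the special-phrases wiki table in the test data above is matched by the occurence_pattern regular expression that SpecialPhrasesImporter compiles in its constructor. A small self-contained illustration (standard library only; the row is taken from the test content):

    # Parse one wiki-table row with the importer's regex.
    import re

    occurence_pattern = re.compile(
        r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])')

    row = '| Zip Line near || aerialway || zip_line || near || N'
    label, phrase_class, phrase_type, operator, plural = occurence_pattern.findall(row)[0]
    # -> ('Zip Line near', 'aerialway', 'zip_line', 'near', 'N')

Because the operator is 'near', this row goes through getorcreate_amenityoperator(); rows whose operator column is '-' go through getorcreate_amenity() instead, and each distinct class/type pair additionally ends up as a place_classtype_aerialway_zip_line style table with centroid and place_id indexes.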