From 118858a55e5ec522d870842532d26ff0276c85ba Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 17 Aug 2021 23:11:47 +0200 Subject: [PATCH] rename legacy_icu tokenizer to icu tokenizer The new icu tokenizer is now no longer compatible with the old legacy tokenizer in terms of data structures. Therefore there is also no longer a need to refer to the legacy tokenizer in the name. --- CMakeLists.txt | 2 +- docs/admin/Tokenizers.md | 6 ++++++ ...egacy_icu_tokenizer.php => icu_tokenizer.php} | 0 ...egacy_icu_tokenizer.sql => icu_tokenizer.sql} | 0 ...{legacy_icu_tokenizer.py => icu_tokenizer.py} | 8 ++++---- ...acy_icu_tokenizer.yaml => icu_tokenizer.yaml} | 0 test/Makefile | 2 +- test/bdd/steps/nominatim_environment.py | 2 +- test/bdd/steps/steps_db_ops.py | 2 +- ...nizer_legacy_icu.py => test_tokenizer_icu.py} | 16 ++++++++-------- 10 files changed, 22 insertions(+), 16 deletions(-) rename lib-php/tokenizer/{legacy_icu_tokenizer.php => icu_tokenizer.php} (100%) rename lib-sql/tokenizer/{legacy_icu_tokenizer.sql => icu_tokenizer.sql} (100%) rename nominatim/tokenizer/{legacy_icu_tokenizer.py => icu_tokenizer.py} (98%) rename settings/{legacy_icu_tokenizer.yaml => icu_tokenizer.yaml} (100%) rename test/python/{test_tokenizer_legacy_icu.py => test_tokenizer_icu.py} (96%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b2d7b11..ef76a4af 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,6 +258,6 @@ install(FILES settings/env.defaults settings/import-address.style settings/import-full.style settings/import-extratags.style - settings/legacy_icu_tokenizer.yaml + settings/icu_tokenizer.yaml settings/icu-rules/extended-unicode-to-asccii.yaml DESTINATION ${NOMINATIM_CONFIGDIR}) diff --git a/docs/admin/Tokenizers.md b/docs/admin/Tokenizers.md index f3454f67..6f8898c8 100644 --- a/docs/admin/Tokenizers.md +++ b/docs/admin/Tokenizers.md @@ -52,6 +52,12 @@ The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to normalize names and queries. It also offers configurable decomposition and abbreviation handling. +To enable the tokenizer add the following line to your project configuration: + +``` +NOMINATIM_TOKENIZER=icu +``` + ### How it works On import the tokenizer processes names in the following four stages: diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/icu_tokenizer.php similarity index 100% rename from lib-php/tokenizer/legacy_icu_tokenizer.php rename to lib-php/tokenizer/icu_tokenizer.php diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql similarity index 100% rename from lib-sql/tokenizer/legacy_icu_tokenizer.sql rename to lib-sql/tokenizer/icu_tokenizer.sql diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py similarity index 98% rename from nominatim/tokenizer/legacy_icu_tokenizer.py rename to nominatim/tokenizer/icu_tokenizer.py index 44034f84..cb411204 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -52,7 +52,7 @@ class LegacyICUTokenizer(AbstractTokenizer): if config.TOKENIZER_CONFIG: cfgfile = Path(config.TOKENIZER_CONFIG) else: - cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml' + cfgfile = config.config_dir / 'icu_tokenizer.yaml' loader = ICURuleLoader(cfgfile) self.naming_rules = ICUNameProcessorRules(loader=loader) @@ -88,7 +88,7 @@ class LegacyICUTokenizer(AbstractTokenizer): with connect(self.dsn) as conn: max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ) sqlp = SQLPreprocessor(conn, config) - sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql', + sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql', max_word_freq=max_word_freq) @@ -98,7 +98,7 @@ class LegacyICUTokenizer(AbstractTokenizer): self.init_from_project() if self.naming_rules is None: - return "Configuration for tokenizer 'legacy_icu' are missing." + return "Configuration for tokenizer 'icu' are missing." return None @@ -130,7 +130,7 @@ class LegacyICUTokenizer(AbstractTokenizer): @define('CONST_Max_Word_Frequency', {self.max_word_frequency}); @define('CONST_Term_Normalization_Rules', "{self.term_normalization}"); @define('CONST_Transliteration', "{self.naming_rules.search_rules}"); - require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');""")) + require_once('{phpdir}/tokenizer/icu_tokenizer.php');""")) def _save_config(self, config): diff --git a/settings/legacy_icu_tokenizer.yaml b/settings/icu_tokenizer.yaml similarity index 100% rename from settings/legacy_icu_tokenizer.yaml rename to settings/icu_tokenizer.yaml diff --git a/test/Makefile b/test/Makefile index b8afdf9b..6dd9a349 100644 --- a/test/Makefile +++ b/test/Makefile @@ -5,7 +5,7 @@ bdd: cd bdd && behave -DREMOVE_TEMPLATE=1 icu: - cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=legacy_icu + cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=icu php: cd php && phpunit ./ diff --git a/test/bdd/steps/nominatim_environment.py b/test/bdd/steps/nominatim_environment.py index 1deb43f3..76f90cfa 100644 --- a/test/bdd/steps/nominatim_environment.py +++ b/test/bdd/steps/nominatim_environment.py @@ -201,7 +201,7 @@ class NominatimEnvironment: self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve())) self.run_nominatim('freeze') - if self.tokenizer != 'legacy_icu': + if self.tokenizer != 'icu': phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve()) run_script(['psql', '-d', self.api_test_db, '-f', phrase_file]) else: diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py index ac61fc67..d1f27235 100644 --- a/test/bdd/steps/steps_db_ops.py +++ b/test/bdd/steps/steps_db_ops.py @@ -280,7 +280,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes): plist.sort() with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - if nctx.tokenizer == 'legacy_icu': + if nctx.tokenizer == 'icu': cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)", (plist,)) else: diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_icu.py similarity index 96% rename from test/python/test_tokenizer_legacy_icu.py rename to test/python/test_tokenizer_icu.py index ed489662..5ec434b6 100644 --- a/test/python/test_tokenizer_legacy_icu.py +++ b/test/python/test_tokenizer_icu.py @@ -6,7 +6,7 @@ import yaml import pytest -from nominatim.tokenizer import legacy_icu_tokenizer +from nominatim.tokenizer import icu_tokenizer from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.db import properties @@ -26,7 +26,7 @@ def test_config(def_config, tmp_path): sqldir = tmp_path / 'sql' sqldir.mkdir() (sqldir / 'tokenizer').mkdir() - (sqldir / 'tokenizer' / 'legacy_icu_tokenizer.sql').write_text("SELECT 'a'") + (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'") shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'), str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql')) @@ -41,7 +41,7 @@ def tokenizer_factory(dsn, tmp_path, property_table, (tmp_path / 'tokenizer').mkdir() def _maker(): - return legacy_icu_tokenizer.create(dsn, tmp_path / 'tokenizer') + return icu_tokenizer.create(dsn, tmp_path / 'tokenizer') return _maker @@ -57,7 +57,7 @@ def db_prop(temp_db_conn): @pytest.fixture def analyzer(tokenizer_factory, test_config, monkeypatch, temp_db_with_extensions, tmp_path): - sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_icu_tokenizer.sql' + sql = tmp_path / 'sql' / 'tokenizer' / 'icu_tokenizer.sql' sql.write_text("SELECT 'a';") monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();') @@ -146,8 +146,8 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop): tok = tokenizer_factory() tok.init_new_db(test_config) - assert db_prop(legacy_icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();' - assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) is not None + assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();' + assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) is not None def test_init_word_table(tokenizer_factory, test_config, place_row, word_table): @@ -187,11 +187,11 @@ def test_update_sql_functions(db_prop, temp_db_cursor, tok.init_new_db(test_config) monkeypatch.undo() - assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133' + assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133' table_factory('test', 'txt TEXT') - func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_icu_tokenizer.sql' + func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql' func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}')""") tok.update_sql_functions(test_config) -- 2.45.1