From: Sarah Hoffmann Date: Wed, 11 May 2022 06:52:49 +0000 (+0200) Subject: Merge pull request #2707 from lonvia/make-icu-tokenizer-the-default X-Git-Tag: v4.1.0~44 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/5ff35d998459260e60bcb01aa7302f4706d043b1?hp=b332b1ae236d91738e7b41c1cf05ea9ff7c6329c Merge pull request #2707 from lonvia/make-icu-tokenizer-the-default Make ICU tokenizer the default --- diff --git a/.github/actions/build-nominatim/action.yml b/.github/actions/build-nominatim/action.yml index 757decd4..042166ad 100644 --- a/.github/actions/build-nominatim/action.yml +++ b/.github/actions/build-nominatim/action.yml @@ -5,6 +5,10 @@ inputs: description: 'Version of Ubuntu to install on' required: false default: '20' + cmake-args: + description: 'Additional options to hand to cmake' + required: false + default: '' runs: using: "composite" @@ -21,18 +25,13 @@ runs: shell: bash env: UBUNTUVER: ${{ inputs.ubuntu }} - - - name: Download dependencies - run: | - if [ ! 
-f country_grid.sql.gz ]; then - wget --no-verbose https://www.nominatim.org/data/country_grid.sql.gz - fi - cp country_grid.sql.gz Nominatim/data/country_osm_grid.sql.gz - shell: bash + CMAKE_ARGS: ${{ inputs.cmake-args }} - name: Configure - run: mkdir build && cd build && cmake ../Nominatim + run: mkdir build && cd build && cmake $CMAKE_ARGS ../Nominatim shell: bash + env: + CMAKE_ARGS: ${{ inputs.cmake-args }} - name: Build run: | diff --git a/.github/actions/setup-postgresql/action.yml b/.github/actions/setup-postgresql/action.yml index 060a6789..19a19e17 100644 --- a/.github/actions/setup-postgresql/action.yml +++ b/.github/actions/setup-postgresql/action.yml @@ -22,7 +22,7 @@ runs: - name: Install PostgreSQL run: | - sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER} postgresql-server-dev-${PGVER} + sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER} shell: bash env: PGVER: ${{ inputs.postgresql-version }} diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 6ebf1ab9..a08a995f 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -113,19 +113,9 @@ jobs: working-directory: Nominatim/test/bdd - icu-test: + legacy-test: needs: create-archive - strategy: - matrix: - ubuntu: [20] - include: - - ubuntu: 20 - postgresql: 13 - postgis: 3 - pytest: py.test-3 - php: 7.4 - - runs-on: ubuntu-${{ matrix.ubuntu }}.04 + runs-on: ubuntu-20.04 steps: - uses: actions/download-artifact@v2 @@ -138,35 +128,27 @@ jobs: - name: Setup PHP uses: shivammathur/setup-php@v2 with: - php-version: ${{ matrix.php }} - coverage: xdebug - tools: phpunit, phpcs, 
composer - - - uses: actions/setup-python@v2 - with: - python-version: 3.6 - if: matrix.ubuntu == 18 + php-version: 7.4 - uses: ./Nominatim/.github/actions/setup-postgresql with: - postgresql-version: ${{ matrix.postgresql }} - postgis-version: ${{ matrix.postgis }} + postgresql-version: 13 + postgis-version: 3 + + - name: Install Postgresql server dev + run: sudo apt-get install postgresql-server-dev-13 - uses: ./Nominatim/.github/actions/build-nominatim with: - ubuntu: ${{ matrix.ubuntu }} + ubuntu: 20 + cmake-args: -DBUILD_MODULE=on - name: Install test prerequsites run: sudo apt-get install -y -qq python3-behave - if: matrix.ubuntu == 20 - - - name: Install test prerequsites - run: pip3 install behave==1.2.6 - if: matrix.ubuntu == 18 - - name: BDD tests (icu tokenizer) + - name: BDD tests (legacy tokenizer) run: | - behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3 + behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy --format=progress3 working-directory: Nominatim/test/bdd diff --git a/CMakeLists.txt b/CMakeLists.txt index af7dbc2a..8360d549 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,7 +44,7 @@ endif() set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database") set(BUILD_API on CACHE BOOL "Build everything for the API server") -set(BUILD_MODULE on CACHE BOOL "Build PostgreSQL module") +set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer") set(BUILD_TESTS on CACHE BOOL "Build test suite") set(BUILD_DOCS on CACHE BOOL "Build documentation") set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page") diff --git a/docs/admin/Installation.md b/docs/admin/Installation.md index 8c4c670b..f5411604 100644 --- a/docs/admin/Installation.md +++ b/docs/admin/Installation.md @@ -158,6 +158,17 @@ make sudo make install ``` +!!! 
warning + The default installation no longer compiles the PostgreSQL module that + is needed for the legacy tokenizer from older Nominatim versions. If you + are upgrading an older database or want to run the + [legacy tokenizer](../customize/Tokenizers.md#legacy-tokenizer) for + some other reason, you need to enable the PostgreSQL module via + cmake: `cmake -DBUILD_MODULE=on ../Nominatim`. To compile the module + you need to have the server development headers for PostgreSQL installed. + On Ubuntu/Debian run: `sudo apt install postgresql-server-dev-<postgresql version>` + + Nominatim installs itself into `/usr/local` per default. To choose a different installation directory add `-DCMAKE_INSTALL_PREFIX=<install root>` to the cmake command. Make sure that the `bin` directory is available in your path diff --git a/docs/admin/Migration.md b/docs/admin/Migration.md index 11ee7f05..950f7e19 100644 --- a/docs/admin/Migration.md +++ b/docs/admin/Migration.md @@ -17,6 +17,14 @@ breaking changes. **Please read them before running the migration.** ## 4.0.0 -> master +### ICU tokenizer is the new default + +Nominatim now installs the [ICU tokenizer](../customize/Tokenizers.md#icu-tokenizer) +by default. This only has an effect on newly installed databases. When +updating an older database, it keeps its installed tokenizer. If you still +run with the legacy tokenizer, make sure to compile Nominatim with the +PostgreSQL module, see [Installation](Installation.md#building-nominatim). + ### geocodejson output changed The `type` field of the geocodejson output has changed. It now contains diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md index d849eb48..19d867dd 100644 --- a/docs/customize/Tokenizers.md +++ b/docs/customize/Tokenizers.md @@ -19,7 +19,22 @@ they can be configured. The legacy tokenizer implements the analysis algorithms of older Nominatim versions. It uses a special Postgresql module to normalize names and queries. -This tokenizer is currently the default. 
+This tokenizer is automatically installed and used when upgrading an older +database. It should not be used for new installations anymore. + +### Compiling the PostgreSQL module + +The tokenizer needs a special C module for PostgreSQL which is not compiled +by default. If you need the legacy tokenizer, compile Nominatim as follows: + +``` +mkdir build +cd build +cmake -DBUILD_MODULE=on ../Nominatim +make +``` + +### Enabling the tokenizer To enable the tokenizer add the following line to your project configuration: @@ -47,6 +62,7 @@ normalization functions are hard-coded. The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to normalize names and queries. It also offers configurable decomposition and abbreviation handling. +This tokenizer is currently the default. To enable the tokenizer add the following line to your project configuration: diff --git a/nominatim/config.py b/nominatim/config.py index 13d9cd8a..a3f91055 100644 --- a/nominatim/config.py +++ b/nominatim/config.py @@ -187,7 +187,7 @@ class Configuration: if configfile.suffix in ('.yaml', '.yml'): result = self._load_from_yaml(configfile) elif configfile.suffix == '.json': - with configfile.open('r') as cfg: + with configfile.open('r', encoding='utf-8') as cfg: result = json.load(cfg) else: raise UsageError(f"Config file '{configfile}' has unknown format.") diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index b553dbc6..9c7138ce 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -187,7 +187,7 @@ class LegacyICUTokenizer(AbstractTokenizer): @define('CONST_Max_Word_Frequency', 10000000); @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}"); @define('CONST_Transliteration', "{self.loader.get_search_rules()}"); - require_once('{phpdir}/tokenizer/icu_tokenizer.php');""")) + require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8') def _save_config(self): diff --git 
a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 3b8f7569..97ce6d16 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -255,7 +255,7 @@ class LegacyTokenizer(AbstractTokenizer): @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY}); @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}"); require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php'); - """.format(config))) + """.format(config)), encoding='utf-8') def _init_db_tables(self, config): diff --git a/settings/env.defaults b/settings/env.defaults index e5dfe4a6..3115f438 100644 --- a/settings/env.defaults +++ b/settings/env.defaults @@ -21,8 +21,8 @@ NOMINATIM_DATABASE_MODULE_PATH= # Tokenizer used for normalizing and parsing queries and names. # The tokenizer is set up during import and cannot be changed afterwards # without a reimport. -# Currently available tokenizers: legacy -NOMINATIM_TOKENIZER="legacy" +# Currently available tokenizers: icu, legacy +NOMINATIM_TOKENIZER="icu" # Number of occurrences of a word before it is considered frequent. # Similar to the concept of stop words. 
Frequent partial words get ignored diff --git a/test/bdd/environment.py b/test/bdd/environment.py index 0acc73b4..ee07e602 100644 --- a/test/bdd/environment.py +++ b/test/bdd/environment.py @@ -59,5 +59,5 @@ def after_scenario(context, scenario): def before_tag(context, tag): if tag == 'fail-legacy': - if context.config.userdata['TOKENIZER'] in (None, 'legacy'): + if context.config.userdata['TOKENIZER'] == 'legacy': context.scenario.skip("Not implemented in legacy tokenizer") diff --git a/test/bdd/steps/nominatim_environment.py b/test/bdd/steps/nominatim_environment.py index 7de32e48..70a03e6e 100644 --- a/test/bdd/steps/nominatim_environment.py +++ b/test/bdd/steps/nominatim_environment.py @@ -207,7 +207,7 @@ class NominatimEnvironment: self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve())) self.run_nominatim('freeze') - if self.tokenizer != 'icu': + if self.tokenizer == 'legacy': phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve()) run_script(['psql', '-d', self.api_test_db, '-f', phrase_file]) else: diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py index e02cad8f..4c711b72 100644 --- a/test/bdd/steps/steps_db_ops.py +++ b/test/bdd/steps/steps_db_ops.py @@ -266,7 +266,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes): plist.sort() with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - if nctx.tokenizer == 'icu': + if nctx.tokenizer != 'legacy': cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)", (plist,)) else: diff --git a/test/python/conftest.py b/test/python/conftest.py index f4581bf9..40526295 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -211,11 +211,6 @@ def osmline_table(temp_db_with_extensions, table_factory): country_code VARCHAR(2)""") -@pytest.fixture -def word_table(temp_db_conn): - return mocks.MockWordTable(temp_db_conn) - - @pytest.fixture def sql_preprocessor_cfg(tmp_path, table_factory, 
temp_db_with_extensions): table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, ))) diff --git a/test/python/mocks.py b/test/python/mocks.py index 2cd2e3e3..9c6ef532 100644 --- a/test/python/mocks.py +++ b/test/python/mocks.py @@ -14,7 +14,7 @@ import psycopg2.extras from nominatim.db import properties # This must always point to the mock word table for the default tokenizer. -from mock_legacy_word_table import MockLegacyWordTable as MockWordTable +from mock_icu_word_table import MockIcuWordTable as MockWordTable class MockPlacexTable: """ A placex table for testing. diff --git a/test/python/tools/test_database_import.py b/test/python/tools/test_database_import.py index 8ac31bc0..68d19a07 100644 --- a/test/python/tools/test_database_import.py +++ b/test/python/tools/test_database_import.py @@ -179,7 +179,7 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory, w @pytest.mark.parametrize("threads", (1, 5)) def test_load_data(dsn, place_row, placex_table, osmline_table, - word_table, temp_db_cursor, threads): + temp_db_cursor, threads): for func in ('precompute_words', 'getorcreate_housenumber_id', 'make_standard_name'): temp_db_cursor.execute(f"""CREATE FUNCTION {func} (src TEXT) RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL diff --git a/test/python/tools/test_migration.py b/test/python/tools/test_migration.py index 8fef0dc1..d102b97d 100644 --- a/test/python/tools/test_migration.py +++ b/test/python/tools/test_migration.py @@ -14,6 +14,8 @@ from nominatim.tools import migration from nominatim.errors import UsageError import nominatim.version +from mock_legacy_word_table import MockLegacyWordTable + class DummyTokenizer: def update_sql_functions(self, config): @@ -26,6 +28,10 @@ def postprocess_mock(monkeypatch): monkeypatch.setattr(migration.tokenizer_factory, 'get_tokenizer_for_db', lambda *args: DummyTokenizer()) +@pytest.fixture +def legacy_word_table(temp_db_conn): + return MockLegacyWordTable(temp_db_conn) + def 
test_no_migration_old_versions(temp_db_with_extensions, table_factory, def_config): table_factory('country_name', 'name HSTORE, country_code TEXT') @@ -156,7 +162,7 @@ def test_add_nominatim_property_table_repeat(temp_db_conn, temp_db_cursor, def test_change_housenumber_transliteration(temp_db_conn, temp_db_cursor, - word_table, placex_table): + legacy_word_table, placex_table): placex_table.add(housenumber='3A') temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) diff --git a/test/python/tools/test_postcodes.py b/test/python/tools/test_postcodes.py index 556d6457..bdfe3094 100644 --- a/test/python/tools/test_postcodes.py +++ b/test/python/tools/test_postcodes.py @@ -65,7 +65,7 @@ def tokenizer(): return dummy_tokenizer.DummyTokenizer(None, None) @pytest.fixture -def postcode_table(temp_db_conn, placex_table, word_table): +def postcode_table(temp_db_conn, placex_table): return MockPostcodeTable(temp_db_conn) diff --git a/vagrant/Install-on-Ubuntu-18.sh b/vagrant/Install-on-Ubuntu-18.sh index 40ee7ba8..3537bcf4 100755 --- a/vagrant/Install-on-Ubuntu-18.sh +++ b/vagrant/Install-on-Ubuntu-18.sh @@ -25,10 +25,10 @@ export DEBIAN_FRONTEND=noninteractive #DOCS: sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \ libboost-filesystem-dev libexpat1-dev zlib1g-dev\ libbz2-dev libpq-dev libproj-dev \ - postgresql-server-dev-10 postgresql-10-postgis-2.4 \ + postgresql-10-postgis-2.4 \ postgresql-contrib-10 postgresql-10-postgis-scripts \ php php-pgsql php-intl libicu-dev python3-pip \ - python3-psutil python3-jinja2 python3-icu git + python3-psutil python3-jinja2 python3-yaml python3-icu git # Some of the Python packages that come with Ubuntu 18.04 are too old, so # install the latest version from pip: diff --git a/vagrant/Install-on-Ubuntu-20.sh b/vagrant/Install-on-Ubuntu-20.sh index cdfb20f2..1ea180e8 100755 --- a/vagrant/Install-on-Ubuntu-20.sh +++ b/vagrant/Install-on-Ubuntu-20.sh @@ -24,11 +24,11 @@ export 
DEBIAN_FRONTEND=noninteractive #DOCS: sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \ libboost-filesystem-dev libexpat1-dev zlib1g-dev \ libbz2-dev libpq-dev libproj-dev \ - postgresql-server-dev-12 postgresql-12-postgis-3 \ + postgresql-12-postgis-3 \ postgresql-contrib-12 postgresql-12-postgis-3-scripts \ php php-pgsql php-intl libicu-dev python3-dotenv \ python3-psycopg2 python3-psutil python3-jinja2 \ - python3-icu python3-datrie git + python3-icu python3-datrie python3-yaml git # # System Configuration