From: Sarah Hoffmann <lonvia@denofr.de>
Date: Wed, 11 May 2022 06:52:49 +0000 (+0200)
Subject: Merge pull request #2707 from lonvia/make-icu-tokenizer-the-default
X-Git-Tag: v4.1.0~44
X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/5ff35d998459260e60bcb01aa7302f4706d043b1?hp=b332b1ae236d91738e7b41c1cf05ea9ff7c6329c

Merge pull request #2707 from lonvia/make-icu-tokenizer-the-default

Make ICU tokenizer the default
---

diff --git a/.github/actions/build-nominatim/action.yml b/.github/actions/build-nominatim/action.yml
index 757decd4..042166ad 100644
--- a/.github/actions/build-nominatim/action.yml
+++ b/.github/actions/build-nominatim/action.yml
@@ -5,6 +5,10 @@ inputs:
         description: 'Version of Ubuntu to install on'
         required: false
         default: '20'
+    cmake-args:
+        description: 'Additional options to hand to cmake'
+        required: false
+        default: ''
 
 runs:
     using: "composite"
@@ -21,18 +25,13 @@ runs:
           shell: bash
           env:
             UBUNTUVER: ${{ inputs.ubuntu }}
-
-        - name: Download dependencies
-          run: |
-              if [ ! -f country_grid.sql.gz ]; then
-                  wget --no-verbose https://www.nominatim.org/data/country_grid.sql.gz
-              fi
-              cp country_grid.sql.gz Nominatim/data/country_osm_grid.sql.gz
-          shell: bash
+            CMAKE_ARGS: ${{ inputs.cmake-args }}
 
         - name: Configure
-          run: mkdir build && cd build && cmake ../Nominatim
+          run: mkdir build && cd build && cmake $CMAKE_ARGS ../Nominatim
           shell: bash
+          env:
+            CMAKE_ARGS: ${{ inputs.cmake-args }}
 
         - name: Build
           run: |
diff --git a/.github/actions/setup-postgresql/action.yml b/.github/actions/setup-postgresql/action.yml
index 060a6789..19a19e17 100644
--- a/.github/actions/setup-postgresql/action.yml
+++ b/.github/actions/setup-postgresql/action.yml
@@ -22,7 +22,7 @@ runs:
 
         - name: Install PostgreSQL
           run: |
-              sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER} postgresql-server-dev-${PGVER}
+              sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER}
           shell: bash
           env:
               PGVER: ${{ inputs.postgresql-version }}
diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index 6ebf1ab9..a08a995f 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -113,19 +113,9 @@ jobs:
               working-directory: Nominatim/test/bdd
 
 
-    icu-test:
+    legacy-test:
         needs: create-archive
-        strategy:
-            matrix:
-                ubuntu: [20]
-                include:
-                    - ubuntu: 20
-                      postgresql: 13
-                      postgis: 3
-                      pytest: py.test-3
-                      php: 7.4
-
-        runs-on: ubuntu-${{ matrix.ubuntu }}.04
+        runs-on: ubuntu-20.04
 
         steps:
             - uses: actions/download-artifact@v2
@@ -138,35 +128,27 @@ jobs:
             - name: Setup PHP
               uses: shivammathur/setup-php@v2
               with:
-                  php-version: ${{ matrix.php }}
-                  coverage: xdebug
-                  tools: phpunit, phpcs, composer
-
-            - uses: actions/setup-python@v2
-              with:
-                python-version: 3.6
-              if: matrix.ubuntu == 18
+                  php-version: 7.4
 
             - uses: ./Nominatim/.github/actions/setup-postgresql
               with:
-                  postgresql-version: ${{ matrix.postgresql }}
-                  postgis-version: ${{ matrix.postgis }}
+                  postgresql-version: 13
+                  postgis-version: 3
+
+            - name: Install Postgresql server dev
+              run: sudo apt-get install postgresql-server-dev-13
 
             - uses: ./Nominatim/.github/actions/build-nominatim
               with:
-                  ubuntu: ${{ matrix.ubuntu }}
+                  ubuntu: 20
+                  cmake-args: -DBUILD_MODULE=on
 
             - name: Install test prerequsites
               run: sudo apt-get install -y -qq python3-behave
-              if: matrix.ubuntu == 20
-
-            - name: Install test prerequsites
-              run: pip3 install behave==1.2.6
-              if: matrix.ubuntu == 18
 
-            - name: BDD tests (icu tokenizer)
+            - name: BDD tests (legacy tokenizer)
               run: |
-                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
+                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy --format=progress3
               working-directory: Nominatim/test/bdd
 
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index af7dbc2a..8360d549 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,7 +44,7 @@ endif()
 
 set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database")
 set(BUILD_API on CACHE BOOL "Build everything for the API server")
-set(BUILD_MODULE on CACHE BOOL "Build PostgreSQL module")
+set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
 set(BUILD_TESTS on CACHE BOOL "Build test suite")
 set(BUILD_DOCS on CACHE BOOL "Build documentation")
 set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")
diff --git a/docs/admin/Installation.md b/docs/admin/Installation.md
index 8c4c670b..f5411604 100644
--- a/docs/admin/Installation.md
+++ b/docs/admin/Installation.md
@@ -158,6 +158,17 @@ make
 sudo make install
 ```
 
+!!! warning
+    The default installation no longer compiles the PostgreSQL module that
+    is needed for the legacy tokenizer from older Nominatim versions. If you
+    are upgrading an older database or want to run the
+    [legacy tokenizer](../customize/Tokenizers.md#legacy-tokenizer) for
+    some other reason, you need to enable the PostgreSQL module via
+    cmake: `cmake -DBUILD_MODULE=on ../Nominatim`. To compile the module
+    you need to have the server development headers for PostgreSQL installed.
+    On Ubuntu/Debian run: `sudo apt install postgresql-server-dev-<postgresql version>`.
+
+
 Nominatim installs itself into `/usr/local` per default. To choose a different
 installation directory add `-DCMAKE_INSTALL_PREFIX=<install root>` to the
 cmake command. Make sure that the `bin` directory is available in your path
diff --git a/docs/admin/Migration.md b/docs/admin/Migration.md
index 11ee7f05..950f7e19 100644
--- a/docs/admin/Migration.md
+++ b/docs/admin/Migration.md
@@ -17,6 +17,14 @@ breaking changes. **Please read them before running the migration.**
 
 ## 4.0.0 -> master
 
+### ICU tokenizer is the new default
+
+Nominatim now installs the [ICU tokenizer](../customize/Tokenizers.md#icu-tokenizer)
+by default. This only has an effect on newly installed databases. An older
+database keeps its installed tokenizer when it is updated. If you still
+run with the legacy tokenizer, make sure to compile Nominatim with the
+PostgreSQL module, see [Installation](Installation.md#building-nominatim).
+
 ### geocodejson output changed
 
 The `type` field of the geocodejson output has changed. It now contains
diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md
index d849eb48..19d867dd 100644
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@@ -19,7 +19,22 @@ they can be configured.
 
 The legacy tokenizer implements the analysis algorithms of older Nominatim
 versions. It uses a special Postgresql module to normalize names and queries.
-This tokenizer is currently the default.
+This tokenizer is automatically installed and used when upgrading an older
+database. It should not be used for new installations anymore.
+
+### Compiling the PostgreSQL module
+
+The tokenizer needs a special C module for PostgreSQL which is not compiled
+by default. If you need the legacy tokenizer, compile Nominatim as follows:
+
+```
+mkdir build
+cd build
+cmake -DBUILD_MODULE=on ../Nominatim
+make
+```
+
+### Enabling the tokenizer
 
 To enable the tokenizer add the following line to your project configuration:
 
@@ -47,6 +62,7 @@ normalization functions are hard-coded.
 The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
 normalize names and queries. It also offers configurable decomposition and
 abbreviation handling.
+This tokenizer is currently the default.
 
 To enable the tokenizer add the following line to your project configuration:
 
diff --git a/nominatim/config.py b/nominatim/config.py
index 13d9cd8a..a3f91055 100644
--- a/nominatim/config.py
+++ b/nominatim/config.py
@@ -187,7 +187,7 @@ class Configuration:
         if configfile.suffix in ('.yaml', '.yml'):
             result = self._load_from_yaml(configfile)
         elif configfile.suffix == '.json':
-            with configfile.open('r') as cfg:
+            with configfile.open('r', encoding='utf-8') as cfg:
                 result = json.load(cfg)
         else:
             raise UsageError(f"Config file '{configfile}' has unknown format.")
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index b553dbc6..9c7138ce 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -187,7 +187,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
                 @define('CONST_Max_Word_Frequency', 10000000);
                 @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                 @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
 
 
     def _save_config(self):
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index 3b8f7569..97ce6d16 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -255,7 +255,7 @@ class LegacyTokenizer(AbstractTokenizer):
                 @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
                 @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
                 require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-                """.format(config)))
+                """.format(config)), encoding='utf-8')
 
 
     def _init_db_tables(self, config):
diff --git a/settings/env.defaults b/settings/env.defaults
index e5dfe4a6..3115f438 100644
--- a/settings/env.defaults
+++ b/settings/env.defaults
@@ -21,8 +21,8 @@ NOMINATIM_DATABASE_MODULE_PATH=
 # Tokenizer used for normalizing and parsing queries and names.
 # The tokenizer is set up during import and cannot be changed afterwards
 # without a reimport.
-# Currently available tokenizers: legacy
-NOMINATIM_TOKENIZER="legacy"
+# Currently available tokenizers: icu, legacy
+NOMINATIM_TOKENIZER="icu"
 
 # Number of occurrences of a word before it is considered frequent.
 # Similar to the concept of stop words. Frequent partial words get ignored
diff --git a/test/bdd/environment.py b/test/bdd/environment.py
index 0acc73b4..ee07e602 100644
--- a/test/bdd/environment.py
+++ b/test/bdd/environment.py
@@ -59,5 +59,5 @@ def after_scenario(context, scenario):
 
 def before_tag(context, tag):
     if tag == 'fail-legacy':
-        if context.config.userdata['TOKENIZER'] in (None, 'legacy'):
+        if context.config.userdata['TOKENIZER'] == 'legacy':
             context.scenario.skip("Not implemented in legacy tokenizer")
diff --git a/test/bdd/steps/nominatim_environment.py b/test/bdd/steps/nominatim_environment.py
index 7de32e48..70a03e6e 100644
--- a/test/bdd/steps/nominatim_environment.py
+++ b/test/bdd/steps/nominatim_environment.py
@@ -207,7 +207,7 @@ class NominatimEnvironment:
                     self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
                     self.run_nominatim('freeze')
 
-                    if self.tokenizer != 'icu':
+                    if self.tokenizer == 'legacy':
                         phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
                         run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
                     else:
diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py
index e02cad8f..4c711b72 100644
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@@ -266,7 +266,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
     plist.sort()
 
     with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        if nctx.tokenizer == 'icu':
+        if nctx.tokenizer != 'legacy':
             cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                         (plist,))
         else:
diff --git a/test/python/conftest.py b/test/python/conftest.py
index f4581bf9..40526295 100644
--- a/test/python/conftest.py
+++ b/test/python/conftest.py
@@ -211,11 +211,6 @@ def osmline_table(temp_db_with_extensions, table_factory):
                      country_code VARCHAR(2)""")
 
 
-@pytest.fixture
-def word_table(temp_db_conn):
-    return mocks.MockWordTable(temp_db_conn)
-
-
 @pytest.fixture
 def sql_preprocessor_cfg(tmp_path, table_factory, temp_db_with_extensions):
     table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
diff --git a/test/python/mocks.py b/test/python/mocks.py
index 2cd2e3e3..9c6ef532 100644
--- a/test/python/mocks.py
+++ b/test/python/mocks.py
@@ -14,7 +14,7 @@ import psycopg2.extras
 from nominatim.db import properties
 
 # This must always point to the mock word table for the default tokenizer.
-from mock_legacy_word_table import MockLegacyWordTable as MockWordTable
+from mock_icu_word_table import MockIcuWordTable as MockWordTable
 
 class MockPlacexTable:
     """ A placex table for testing.
diff --git a/test/python/tools/test_database_import.py b/test/python/tools/test_database_import.py
index 8ac31bc0..68d19a07 100644
--- a/test/python/tools/test_database_import.py
+++ b/test/python/tools/test_database_import.py
@@ -179,7 +179,7 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory, w
 
 @pytest.mark.parametrize("threads", (1, 5))
 def test_load_data(dsn, place_row, placex_table, osmline_table,
-                   word_table, temp_db_cursor, threads):
+                   temp_db_cursor, threads):
     for func in ('precompute_words', 'getorcreate_housenumber_id', 'make_standard_name'):
         temp_db_cursor.execute(f"""CREATE FUNCTION {func} (src TEXT)
                                   RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
diff --git a/test/python/tools/test_migration.py b/test/python/tools/test_migration.py
index 8fef0dc1..d102b97d 100644
--- a/test/python/tools/test_migration.py
+++ b/test/python/tools/test_migration.py
@@ -14,6 +14,8 @@ from nominatim.tools import migration
 from nominatim.errors import UsageError
 import nominatim.version
 
+from mock_legacy_word_table import MockLegacyWordTable
+
 class DummyTokenizer:
 
     def update_sql_functions(self, config):
@@ -26,6 +28,10 @@ def postprocess_mock(monkeypatch):
     monkeypatch.setattr(migration.tokenizer_factory, 'get_tokenizer_for_db',
                         lambda *args: DummyTokenizer())
 
+@pytest.fixture
+def legacy_word_table(temp_db_conn):
+    return MockLegacyWordTable(temp_db_conn)
+
 
 def test_no_migration_old_versions(temp_db_with_extensions, table_factory, def_config):
     table_factory('country_name', 'name HSTORE, country_code TEXT')
@@ -156,7 +162,7 @@ def test_add_nominatim_property_table_repeat(temp_db_conn, temp_db_cursor,
 
 
 def test_change_housenumber_transliteration(temp_db_conn, temp_db_cursor,
-                                            word_table, placex_table):
+                                            legacy_word_table, placex_table):
     placex_table.add(housenumber='3A')
 
     temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
diff --git a/test/python/tools/test_postcodes.py b/test/python/tools/test_postcodes.py
index 556d6457..bdfe3094 100644
--- a/test/python/tools/test_postcodes.py
+++ b/test/python/tools/test_postcodes.py
@@ -65,7 +65,7 @@ def tokenizer():
     return dummy_tokenizer.DummyTokenizer(None, None)
 
 @pytest.fixture
-def postcode_table(temp_db_conn, placex_table, word_table):
+def postcode_table(temp_db_conn, placex_table):
     return MockPostcodeTable(temp_db_conn)
 
 
diff --git a/vagrant/Install-on-Ubuntu-18.sh b/vagrant/Install-on-Ubuntu-18.sh
index 40ee7ba8..3537bcf4 100755
--- a/vagrant/Install-on-Ubuntu-18.sh
+++ b/vagrant/Install-on-Ubuntu-18.sh
@@ -25,10 +25,10 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
     sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
                         libboost-filesystem-dev libexpat1-dev zlib1g-dev\
                         libbz2-dev libpq-dev libproj-dev \
-                        postgresql-server-dev-10 postgresql-10-postgis-2.4 \
+                        postgresql-10-postgis-2.4 \
                         postgresql-contrib-10 postgresql-10-postgis-scripts \
                         php php-pgsql php-intl libicu-dev python3-pip \
-                        python3-psutil python3-jinja2 python3-icu git
+                        python3-psutil python3-jinja2 python3-yaml python3-icu git
 
 # Some of the Python packages that come with Ubuntu 18.04 are too old, so
 # install the latest version from pip:
diff --git a/vagrant/Install-on-Ubuntu-20.sh b/vagrant/Install-on-Ubuntu-20.sh
index cdfb20f2..1ea180e8 100755
--- a/vagrant/Install-on-Ubuntu-20.sh
+++ b/vagrant/Install-on-Ubuntu-20.sh
@@ -24,11 +24,11 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
     sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
                         libboost-filesystem-dev libexpat1-dev zlib1g-dev \
                         libbz2-dev libpq-dev libproj-dev \
-                        postgresql-server-dev-12 postgresql-12-postgis-3 \
+                        postgresql-12-postgis-3 \
                         postgresql-contrib-12 postgresql-12-postgis-3-scripts \
                         php php-pgsql php-intl libicu-dev python3-dotenv \
                         python3-psycopg2 python3-psutil python3-jinja2 \
-                        python3-icu python3-datrie git
+                        python3-icu python3-datrie python3-yaml git
 
 #
 # System Configuration