git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 26 Oct 2021 15:24:33 +0000 (17:24 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Tue, 26 Oct 2021 15:24:33 +0000 (17:24 +0200)
.github/workflows/ci-tests.yml
docs/admin/Import.md
lib-php/SearchDescription.php
lib-php/TokenPartial.php
nominatim/clicmd/setup.py
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/legacy_tokenizer.py
test/bdd/api/search/params.feature
test/bdd/api/search/queries.feature

index b70ea80f1bf102127c9787eb712859a407cb32c7..23d640d7f084a9347c963db14ab1c9b5bacd8edd 100644 (file)
@@ -123,11 +123,6 @@ jobs:
               working-directory: Nominatim/test/bdd
               if: matrix.ubuntu == 18
 
-            - name: BDD tests (icu tokenizer)
-              run: |
-                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
-              working-directory: Nominatim/test/bdd
-
             - name: Upload coverage to Codecov
               uses: codecov/codecov-action@v1
               with:
@@ -140,6 +135,63 @@ jobs:
               if: matrix.ubuntu == 20
 
 
+    icu-test:
+        needs: create-archive
+        strategy:
+            matrix:
+                ubuntu: [20]
+                include:
+                    - ubuntu: 20
+                      postgresql: 13
+                      postgis: 3
+                      pytest: py.test-3
+                      php: 7.4
+
+        runs-on: ubuntu-${{ matrix.ubuntu }}.04
+
+        steps:
+            - uses: actions/download-artifact@v2
+              with:
+                  name: full-source
+
+            - name: Unpack Nominatim
+              run: tar xf nominatim-src.tar.bz2
+
+            - name: Setup PHP
+              uses: shivammathur/setup-php@v2
+              with:
+                  php-version: ${{ matrix.php }}
+                  coverage: xdebug
+                  tools: phpunit, phpcs, composer
+
+            - uses: actions/setup-python@v2
+              with:
+                python-version: 3.6
+              if: matrix.ubuntu == 18
+
+            - uses: ./Nominatim/.github/actions/setup-postgresql
+              with:
+                  postgresql-version: ${{ matrix.postgresql }}
+                  postgis-version: ${{ matrix.postgis }}
+
+            - uses: ./Nominatim/.github/actions/build-nominatim
+              with:
+                  ubuntu: ${{ matrix.ubuntu }}
+
+            - name: Install test prerequsites
+              run: sudo apt-get install -y -qq python3-behave
+              if: matrix.ubuntu == 20
+
+            - name: Install test prerequsites
+              run: pip3 install behave==1.2.6
+              if: matrix.ubuntu == 18
+
+            - name: BDD tests (icu tokenizer)
+              run: |
+                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
+              working-directory: Nominatim/test/bdd
+
+
     install:
         runs-on: ubuntu-latest
         needs: create-archive
index 576c0097516b707fc353dbffadce301996913af8..7ebebde3559b925970b0557748c61f0d7f13ddac 100644 (file)
@@ -271,20 +271,7 @@ reverse query, e.g. `http://localhost:8088/reverse.php?lat=27.1750090510034&lon=
 To run Nominatim via webservers like Apache or nginx, please read the
 [Deployment chapter](Deployment.md).
 
-## Tuning the database
-
-Accurate word frequency information for search terms helps PostgreSQL's query
-planner to make the right decisions. Recomputing them can improve the performance
-of forward geocoding in particular under high load. To recompute word counts run:
-
-```sh
-nominatim refresh --word-counts
-```
-
-This will take a couple of hours for a full planet installation. You can
-also defer that step to a later point in time when you realise that
-performance becomes an issue. Just make sure that updates are stopped before
-running this function.
+## Adding search through category phrases
 
 If you want to be able to search for places by their type through
 [special phrases](https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases)
index ee8bbc0c9be81304c9531760aac3751366a6a689..fa622e62634c0d4c9c29c58bc70ba478f6012ca6 100644 (file)
@@ -19,6 +19,8 @@ class SearchDescription
     private $aName = array();
     /// True if the name is rare enough to force index use on name.
     private $bRareName = false;
+    /// True if the name requires to be accompanied by address terms.
+    private $bNameNeedsAddress = false;
     /// List of word ids making up the address of the object.
     private $aAddress = array();
     /// List of word ids that appear in the name but should be ignored.
@@ -113,6 +115,9 @@ class SearchDescription
                 return false;
             }
         }
+        if ($this->bNameNeedsAddress && empty($this->aAddress)) {
+            return false;
+        }
 
         return true;
     }
@@ -231,6 +236,7 @@ class SearchDescription
     {
         $this->aName[$iId] = $iId;
         $this->bRareName = $bRareName;
+        $this->bNameNeedsAddress = false;
     }
 
     /**
@@ -240,11 +246,19 @@ class SearchDescription
      * @param integer iID            ID of term to add.
      * @param bool bSearchable       Term should be used to search for result
      *                               (i.e. term is not a stop word).
+     * @param bool bNeedsAddress     True if the term is too unspecific to be used
+     *                               in a stand-alone search without an address
+     *                               to narrow down the search.
      * @param integer iPhraseNumber  Index of phrase, where the partial term
      *                               appears.
      */
-    public function addPartialNameToken($iId, $bSearchable, $iPhraseNumber)
+    public function addPartialNameToken($iId, $bSearchable, $bNeedsAddress, $iPhraseNumber)
     {
+        if (empty($this->aName)) {
+            $this->bNameNeedsAddress = $bNeedsAddress;
+        } else {
+            $this->bNameNeedsAddress |= $bNeedsAddress;
+        }
         if ($bSearchable) {
             $this->aName[$iId] = $iId;
         } else {
@@ -310,6 +324,7 @@ class SearchDescription
     {
         $this->aAddress = array_merge($this->aAddress, $this->aName);
         $this->bRareName = false;
+        $this->bNameNeedsAddress = true;
         $this->aName = array($iId => $iId);
         $this->iNamePhrase = -1;
     }
index 131bb2a3b48a52484df6817e1e39deeb8d3db90b..112154b2cf59526cf8432be726a4471ceb772b9c 100644 (file)
@@ -90,6 +90,7 @@ class Partial
             $oNewSearch->addPartialNameToken(
                 $this->iId,
                 $this->iSearchNameCount < CONST_Max_Word_Frequency,
+                $this->iSearchNameCount > CONST_Search_NameOnlySearchFrequencyThreshold,
                 $oPosition->getPhrase()
             );
 
index 27847920b80a434b1f4df0b1b4ac0b7b17ba5b24..9a2a73de0d5aab3e88a253fd28778ded8ba0ab22 100644 (file)
@@ -125,16 +125,15 @@ class SetupAll:
                 freeze.drop_update_tables(conn)
         tokenizer.finalize_import(args.config)
 
+        LOG.warning('Recompute word counts')
+        tokenizer.update_statistics()
 
         webdir = args.project_dir / 'website'
         LOG.warning('Setup website at %s', webdir)
         with connect(args.config.get_libpq_dsn()) as conn:
             refresh.setup_website(webdir, args.config, conn)
 
-        with connect(args.config.get_libpq_dsn()) as conn:
-            SetupAll._set_database_date(conn)
-            properties.set_property(conn, 'database_version',
-                                    '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
+        SetupAll._set_database_date(args.config.get_libpq_dsn())
 
         return 0
 
@@ -197,12 +196,16 @@ class SetupAll:
 
 
     @staticmethod
-    def _set_database_date(conn):
+    def _set_database_date(dsn):
         """ Determine the database date and set the status accordingly.
         """
-        try:
-            dbdate = status.compute_database_date(conn)
-            status.set_status(conn, dbdate)
-            LOG.info('Database is at %s.', dbdate)
-        except Exception as exc: # pylint: disable=broad-except
-            LOG.error('Cannot determine date of database: %s', exc)
+        with connect(dsn) as conn:
+            try:
+                dbdate = status.compute_database_date(conn)
+                status.set_status(conn, dbdate)
+                LOG.info('Database is at %s.', dbdate)
+            except Exception as exc: # pylint: disable=broad-except
+                LOG.error('Cannot determine date of database: %s', exc)
+
+            properties.set_property(conn, 'database_version',
+                                    '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
index e7ee57ad197b5879187a5070a97c07e544c5f8f1..3331a3210aaba70d49b602299c2ce9e88238a3a0 100644 (file)
@@ -99,18 +99,19 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
-            with conn.cursor() as cur:
-                cur.drop_table("word_frequencies")
-                LOG.info("Computing word frequencies")
-                cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                 SELECT unnest(name_vector) as id, count(*)
-                                 FROM search_name GROUP BY id""")
-                cur.execute("CREATE INDEX ON word_frequencies(id)")
-                LOG.info("Update word table with recomputed frequencies")
-                cur.execute("""UPDATE word
-                               SET info = info || jsonb_build_object('count', count)
-                               FROM word_frequencies WHERE word_id = id""")
-                cur.drop_table("word_frequencies")
+            if conn.table_exists('search_name'):
+                with conn.cursor() as cur:
+                    cur.drop_table("word_frequencies")
+                    LOG.info("Computing word frequencies")
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute("CREATE INDEX ON word_frequencies(id)")
+                    LOG.info("Update word table with recomputed frequencies")
+                    cur.execute("""UPDATE word
+                                   SET info = info || jsonb_build_object('count', count)
+                                   FROM word_frequencies WHERE word_id = id""")
+                    cur.drop_table("word_frequencies")
             conn.commit()
 
 
index d901a68d2e53f77e5c96210c11ede863e7e5e36f..0edcdccaaa8a6b7471b4eb38fb710f7db4924d71 100644 (file)
@@ -190,18 +190,19 @@ class LegacyTokenizer(AbstractTokenizer):
         """ Recompute the frequency of full words.
         """
         with connect(self.dsn) as conn:
-            with conn.cursor() as cur:
-                cur.drop_table("word_frequencies")
-                LOG.info("Computing word frequencies")
-                cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                 SELECT unnest(name_vector) as id, count(*)
-                                 FROM search_name GROUP BY id""")
-                cur.execute("CREATE INDEX ON word_frequencies(id)")
-                LOG.info("Update word table with recomputed frequencies")
-                cur.execute("""UPDATE word SET search_name_count = count
-                               FROM word_frequencies
-                               WHERE word_token like ' %' and word_id = id""")
-                cur.drop_table("word_frequencies")
+            if conn.table_exists('search_name'):
+                with conn.cursor() as cur:
+                    cur.drop_table("word_frequencies")
+                    LOG.info("Computing word frequencies")
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute("CREATE INDEX ON word_frequencies(id)")
+                    LOG.info("Update word table with recomputed frequencies")
+                    cur.execute("""UPDATE word SET search_name_count = count
+                                   FROM word_frequencies
+                                   WHERE word_token like ' %' and word_id = id""")
+                    cur.drop_table("word_frequencies")
             conn.commit()
 
     def name_analyzer(self):
index 9c4d1f10b3cd2ae639c2715439b0c6430bc3c610..b3df7d1952b1ca4a90d383ce6db5361f695b7363 100644 (file)
@@ -169,9 +169,9 @@ Feature: Search queries
         Then at most 50 results are returned
 
     Scenario: Limit number of search results
-        When sending json search query "schloss"
+        When sending json search query "landstr"
         Then more than 4 results are returned
-        When sending json search query "schloss"
+        When sending json search query "landstr"
           | limit |
           | 4 |
         Then exactly 4 results are returned
@@ -227,7 +227,7 @@ Feature: Search queries
           | place | village |
 
     Scenario Outline: Search with polygon threshold (json)
-        When sending json search query "switzerland"
+        When sending json search query "triesenberg"
           | polygon_geojson | polygon_threshold |
           | 1               | <th> |
         Then at least 1 result is returned
@@ -241,7 +241,7 @@ Feature: Search queries
         | 999 |
 
     Scenario Outline: Search with polygon threshold (xml)
-        When sending xml search query "switzerland"
+        When sending xml search query "triesenberg"
           | polygon_geojson | polygon_threshold |
           | 1               | <th> |
         Then at least 1 result is returned
@@ -255,7 +255,7 @@ Feature: Search queries
         | 999 |
 
     Scenario Outline: Search with invalid polygon threshold (xml)
-        When sending xml search query "switzerland"
+        When sending xml search query "triesenberg"
           | polygon_geojson | polygon_threshold |
           | 1               | <th> |
         Then a HTTP 400 is returned
@@ -355,11 +355,11 @@ Feature: Search queries
         | geokml             |
 
     Scenario: Search along a route
-        When sending json search query "schloss" with address
+        When sending json search query "rathaus" with address
         Then result addresses contain
           | ID | town |
-          | 0  | Vaduz |
-        When sending json search query "schloss" with address
+          | 0  | Schaan |
+        When sending json search query "rathaus" with address
           | bounded | routewidth | route                              |
           | 1       | 0.1        |  9.54353,47.11772,9.54314,47.11894 |
         Then result addresses contain
index 6d697ef96fe7ea3a3cded93f086d8e4f96ab1174..8b70dac188ad968e36a9f2d80c3fb21021064b0b 100644 (file)
@@ -66,7 +66,7 @@ Feature: Search queries
          | way      | ^697,.* |
 
     Scenario: Search with class-type feature
-        When sending jsonv2 search query "Hotel in California"
+        When sending jsonv2 search query "bars in ebenholz"
         Then results contain
           | place_rank |
           | 30 |