git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 26 Oct 2021 15:24:33 +0000 (17:24 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Tue, 26 Oct 2021 15:24:33 +0000 (17:24 +0200)
.github/workflows/ci-tests.yml
docs/admin/Import.md
lib-php/SearchDescription.php
lib-php/TokenPartial.php
nominatim/clicmd/setup.py
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/legacy_tokenizer.py
test/bdd/api/search/params.feature
test/bdd/api/search/queries.feature

index b70ea80f1bf102127c9787eb712859a407cb32c7..23d640d7f084a9347c963db14ab1c9b5bacd8edd 100644 (file)
@@ -123,11 +123,6 @@ jobs:
               working-directory: Nominatim/test/bdd
               if: matrix.ubuntu == 18
 
-            - name: BDD tests (icu tokenizer)
-              run: |
-                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
-              working-directory: Nominatim/test/bdd
-
             - name: Upload coverage to Codecov
               uses: codecov/codecov-action@v1
               with:
@@ -140,6 +135,63 @@ jobs:
               if: matrix.ubuntu == 20
 
 
+    icu-test:
+        needs: create-archive
+        strategy:
+            matrix:
+                ubuntu: [20]
+                include:
+                    - ubuntu: 20
+                      postgresql: 13
+                      postgis: 3
+                      pytest: py.test-3
+                      php: 7.4
+
+        runs-on: ubuntu-${{ matrix.ubuntu }}.04
+
+        steps:
+            - uses: actions/download-artifact@v2
+              with:
+                  name: full-source
+
+            - name: Unpack Nominatim
+              run: tar xf nominatim-src.tar.bz2
+
+            - name: Setup PHP
+              uses: shivammathur/setup-php@v2
+              with:
+                  php-version: ${{ matrix.php }}
+                  coverage: xdebug
+                  tools: phpunit, phpcs, composer
+
+            - uses: actions/setup-python@v2
+              with:
+                python-version: 3.6
+              if: matrix.ubuntu == 18
+
+            - uses: ./Nominatim/.github/actions/setup-postgresql
+              with:
+                  postgresql-version: ${{ matrix.postgresql }}
+                  postgis-version: ${{ matrix.postgis }}
+
+            - uses: ./Nominatim/.github/actions/build-nominatim
+              with:
+                  ubuntu: ${{ matrix.ubuntu }}
+
+            - name: Install test prerequsites
+              run: sudo apt-get install -y -qq python3-behave
+              if: matrix.ubuntu == 20
+
+            - name: Install test prerequsites
+              run: pip3 install behave==1.2.6
+              if: matrix.ubuntu == 18
+
+            - name: BDD tests (icu tokenizer)
+              run: |
+                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
+              working-directory: Nominatim/test/bdd
+
+
     install:
         runs-on: ubuntu-latest
         needs: create-archive
index 576c0097516b707fc353dbffadce301996913af8..7ebebde3559b925970b0557748c61f0d7f13ddac 100644 (file)
@@ -271,20 +271,7 @@ reverse query, e.g. `http://localhost:8088/reverse.php?lat=27.1750090510034&lon=
 To run Nominatim via webservers like Apache or nginx, please read the
 [Deployment chapter](Deployment.md).
 
-## Tuning the database
-
-Accurate word frequency information for search terms helps PostgreSQL's query
-planner to make the right decisions. Recomputing them can improve the performance
-of forward geocoding in particular under high load. To recompute word counts run:
-
-```sh
-nominatim refresh --word-counts
-```
-
-This will take a couple of hours for a full planet installation. You can
-also defer that step to a later point in time when you realise that
-performance becomes an issue. Just make sure that updates are stopped before
-running this function.
+## Adding search through category phrases
 
 If you want to be able to search for places by their type through
 [special phrases](https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases)
index ee8bbc0c9be81304c9531760aac3751366a6a689..fa622e62634c0d4c9c29c58bc70ba478f6012ca6 100644 (file)
@@ -19,6 +19,8 @@ class SearchDescription
     private $aName = array();
     /// True if the name is rare enough to force index use on name.
     private $bRareName = false;
+    /// True if the name requires to be accompanied by address terms.
+    private $bNameNeedsAddress = false;
     /// List of word ids making up the address of the object.
     private $aAddress = array();
     /// List of word ids that appear in the name but should be ignored.
@@ -113,6 +115,9 @@ class SearchDescription
                 return false;
             }
         }
+        if ($this->bNameNeedsAddress && empty($this->aAddress)) {
+            return false;
+        }
 
         return true;
     }
@@ -231,6 +236,7 @@ class SearchDescription
     {
         $this->aName[$iId] = $iId;
         $this->bRareName = $bRareName;
+        $this->bNameNeedsAddress = false;
     }
 
     /**
@@ -240,11 +246,19 @@ class SearchDescription
      * @param integer iID            ID of term to add.
      * @param bool bSearchable       Term should be used to search for result
      *                               (i.e. term is not a stop word).
+     * @param bool bNeedsAddress     True if the term is too unspecific to be used
+     *                               in a stand-alone search without an address
+     *                               to narrow down the search.
      * @param integer iPhraseNumber  Index of phrase, where the partial term
      *                               appears.
      */
-    public function addPartialNameToken($iId, $bSearchable, $iPhraseNumber)
+    public function addPartialNameToken($iId, $bSearchable, $bNeedsAddress, $iPhraseNumber)
     {
+        if (empty($this->aName)) {
+            $this->bNameNeedsAddress = $bNeedsAddress;
+        } else {
+            $this->bNameNeedsAddress |= $bNeedsAddress;
+        }
         if ($bSearchable) {
             $this->aName[$iId] = $iId;
         } else {
@@ -310,6 +324,7 @@ class SearchDescription
     {
         $this->aAddress = array_merge($this->aAddress, $this->aName);
         $this->bRareName = false;
+        $this->bNameNeedsAddress = true;
         $this->aName = array($iId => $iId);
         $this->iNamePhrase = -1;
     }
index 131bb2a3b48a52484df6817e1e39deeb8d3db90b..112154b2cf59526cf8432be726a4471ceb772b9c 100644 (file)
@@ -90,6 +90,7 @@ class Partial
             $oNewSearch->addPartialNameToken(
                 $this->iId,
                 $this->iSearchNameCount < CONST_Max_Word_Frequency,
+                $this->iSearchNameCount > CONST_Search_NameOnlySearchFrequencyThreshold,
                 $oPosition->getPhrase()
             );
 
index 27847920b80a434b1f4df0b1b4ac0b7b17ba5b24..9a2a73de0d5aab3e88a253fd28778ded8ba0ab22 100644 (file)
@@ -125,16 +125,15 @@ class SetupAll:
                 freeze.drop_update_tables(conn)
         tokenizer.finalize_import(args.config)
 
+        LOG.warning('Recompute word counts')
+        tokenizer.update_statistics()
 
         webdir = args.project_dir / 'website'
         LOG.warning('Setup website at %s', webdir)
         with connect(args.config.get_libpq_dsn()) as conn:
             refresh.setup_website(webdir, args.config, conn)
 
-        with connect(args.config.get_libpq_dsn()) as conn:
-            SetupAll._set_database_date(conn)
-            properties.set_property(conn, 'database_version',
-                                    '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
+        SetupAll._set_database_date(args.config.get_libpq_dsn())
 
         return 0
 
@@ -197,12 +196,16 @@ class SetupAll:
 
 
     @staticmethod
-    def _set_database_date(conn):
+    def _set_database_date(dsn):
         """ Determine the database date and set the status accordingly.
         """
-        try:
-            dbdate = status.compute_database_date(conn)
-            status.set_status(conn, dbdate)
-            LOG.info('Database is at %s.', dbdate)
-        except Exception as exc: # pylint: disable=broad-except
-            LOG.error('Cannot determine date of database: %s', exc)
+        with connect(dsn) as conn:
+            try:
+                dbdate = status.compute_database_date(conn)
+                status.set_status(conn, dbdate)
+                LOG.info('Database is at %s.', dbdate)
+            except Exception as exc: # pylint: disable=broad-except
+                LOG.error('Cannot determine date of database: %s', exc)
+
+            properties.set_property(conn, 'database_version',
+                                    '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
index e7ee57ad197b5879187a5070a97c07e544c5f8f1..3331a3210aaba70d49b602299c2ce9e88238a3a0 100644 (file)
@@ -99,18 +99,19 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
-            with conn.cursor() as cur:
-                cur.drop_table("word_frequencies")
-                LOG.info("Computing word frequencies")
-                cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                 SELECT unnest(name_vector) as id, count(*)
-                                 FROM search_name GROUP BY id""")
-                cur.execute("CREATE INDEX ON word_frequencies(id)")
-                LOG.info("Update word table with recomputed frequencies")
-                cur.execute("""UPDATE word
-                               SET info = info || jsonb_build_object('count', count)
-                               FROM word_frequencies WHERE word_id = id""")
-                cur.drop_table("word_frequencies")
+            if conn.table_exists('search_name'):
+                with conn.cursor() as cur:
+                    cur.drop_table("word_frequencies")
+                    LOG.info("Computing word frequencies")
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute("CREATE INDEX ON word_frequencies(id)")
+                    LOG.info("Update word table with recomputed frequencies")
+                    cur.execute("""UPDATE word
+                                   SET info = info || jsonb_build_object('count', count)
+                                   FROM word_frequencies WHERE word_id = id""")
+                    cur.drop_table("word_frequencies")
             conn.commit()
 
 
index d901a68d2e53f77e5c96210c11ede863e7e5e36f..0edcdccaaa8a6b7471b4eb38fb710f7db4924d71 100644 (file)
@@ -190,18 +190,19 @@ class LegacyTokenizer(AbstractTokenizer):
         """ Recompute the frequency of full words.
         """
         with connect(self.dsn) as conn:
-            with conn.cursor() as cur:
-                cur.drop_table("word_frequencies")
-                LOG.info("Computing word frequencies")
-                cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                 SELECT unnest(name_vector) as id, count(*)
-                                 FROM search_name GROUP BY id""")
-                cur.execute("CREATE INDEX ON word_frequencies(id)")
-                LOG.info("Update word table with recomputed frequencies")
-                cur.execute("""UPDATE word SET search_name_count = count
-                               FROM word_frequencies
-                               WHERE word_token like ' %' and word_id = id""")
-                cur.drop_table("word_frequencies")
+            if conn.table_exists('search_name'):
+                with conn.cursor() as cur:
+                    cur.drop_table("word_frequencies")
+                    LOG.info("Computing word frequencies")
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute("CREATE INDEX ON word_frequencies(id)")
+                    LOG.info("Update word table with recomputed frequencies")
+                    cur.execute("""UPDATE word SET search_name_count = count
+                                   FROM word_frequencies
+                                   WHERE word_token like ' %' and word_id = id""")
+                    cur.drop_table("word_frequencies")
             conn.commit()
 
     def name_analyzer(self):
index 9c4d1f10b3cd2ae639c2715439b0c6430bc3c610..b3df7d1952b1ca4a90d383ce6db5361f695b7363 100644 (file)
@@ -169,9 +169,9 @@ Feature: Search queries
         Then at most 50 results are returned
 
     Scenario: Limit number of search results
-        When sending json search query "schloss"
+        When sending json search query "landstr"
         Then more than 4 results are returned
-        When sending json search query "schloss"
+        When sending json search query "landstr"
           | limit |
           | 4 |
         Then exactly 4 results are returned
@@ -227,7 +227,7 @@ Feature: Search queries
           | place | village |
 
     Scenario Outline: Search with polygon threshold (json)
-        When sending json search query "switzerland"
+        When sending json search query "triesenberg"
           | polygon_geojson | polygon_threshold |
           | 1               | <th> |
         Then at least 1 result is returned
@@ -241,7 +241,7 @@ Feature: Search queries
         | 999 |
 
     Scenario Outline: Search with polygon threshold (xml)
-        When sending xml search query "switzerland"
+        When sending xml search query "triesenberg"
           | polygon_geojson | polygon_threshold |
           | 1               | <th> |
         Then at least 1 result is returned
@@ -255,7 +255,7 @@ Feature: Search queries
         | 999 |
 
     Scenario Outline: Search with invalid polygon threshold (xml)
-        When sending xml search query "switzerland"
+        When sending xml search query "triesenberg"
           | polygon_geojson | polygon_threshold |
           | 1               | <th> |
         Then a HTTP 400 is returned
@@ -355,11 +355,11 @@ Feature: Search queries
         | geokml             |
 
     Scenario: Search along a route
-        When sending json search query "schloss" with address
+        When sending json search query "rathaus" with address
         Then result addresses contain
           | ID | town |
-          | 0  | Vaduz |
-        When sending json search query "schloss" with address
+          | 0  | Schaan |
+        When sending json search query "rathaus" with address
           | bounded | routewidth | route                              |
           | 1       | 0.1        |  9.54353,47.11772,9.54314,47.11894 |
         Then result addresses contain
index 6d697ef96fe7ea3a3cded93f086d8e4f96ab1174..8b70dac188ad968e36a9f2d80c3fb21021064b0b 100644 (file)
@@ -66,7 +66,7 @@ Feature: Search queries
          | way      | ^697,.* |
 
     Scenario: Search with class-type feature
-        When sending jsonv2 search query "Hotel in California"
+        When sending jsonv2 search query "bars in ebenholz"
         Then results contain
           | place_rank |
           | 30 |