git.openstreetmap.org Git - nominatim.git/commitdiff
only run analyze on indexing when work was done
author: Sarah Hoffmann <lonvia@denofr.de>
Wed, 28 Sep 2022 08:22:54 +0000 (10:22 +0200)
committer: Sarah Hoffmann <lonvia@denofr.de>
Wed, 28 Sep 2022 08:22:54 +0000 (10:22 +0200)
This speeds up processing when continuing indexing after it was
interrupted.

nominatim/indexer/indexer.py
nominatim/indexer/progress.py

index 5425c8ffaf212ada260b57932c0566c6611ac2c5..233423f03c6a202ec088cfeb0fe7ac26c79db01f 100644 (file)
@@ -128,58 +128,64 @@ class Indexer:
                     with conn.cursor() as cur:
                         cur.execute('ANALYZE')
 
                     with conn.cursor() as cur:
                         cur.execute('ANALYZE')
 
-            self.index_by_rank(0, 4)
-            _analyze()
+            if self.index_by_rank(0, 4) > 0:
+                _analyze()
 
 
-            self.index_boundaries(0, 30)
-            _analyze()
+            if self.index_boundaries(0, 30) > 100:
+                _analyze()
 
 
-            self.index_by_rank(5, 25)
-            _analyze()
+            if self.index_by_rank(5, 25) > 100:
+                _analyze()
 
 
-            self.index_by_rank(26, 30)
-            _analyze()
+            if self.index_by_rank(26, 30) > 1000:
+                _analyze()
 
 
-            self.index_postcodes()
-            _analyze()
+            if self.index_postcodes() > 100:
+                _analyze()
 
 
 
 
-    def index_boundaries(self, minrank: int, maxrank: int) -> None:
+    def index_boundaries(self, minrank: int, maxrank: int) -> int:
         """ Index only administrative boundaries within the given rank range.
         """
         """ Index only administrative boundaries within the given rank range.
         """
+        total = 0
         LOG.warning("Starting indexing boundaries using %s threads",
                     self.num_threads)
 
         with self.tokenizer.name_analyzer() as analyzer:
             for rank in range(max(minrank, 4), min(maxrank, 26)):
         LOG.warning("Starting indexing boundaries using %s threads",
                     self.num_threads)
 
         with self.tokenizer.name_analyzer() as analyzer:
             for rank in range(max(minrank, 4), min(maxrank, 26)):
-                self._index(runners.BoundaryRunner(rank, analyzer))
+                total += self._index(runners.BoundaryRunner(rank, analyzer))
 
 
-    def index_by_rank(self, minrank: int, maxrank: int) -> None:
+        return total
+
+    def index_by_rank(self, minrank: int, maxrank: int) -> int:
         """ Index all entries of placex in the given rank range (inclusive)
             in order of their address rank.
 
             When rank 30 is requested then also interpolations and
             places with address rank 0 will be indexed.
         """
         """ Index all entries of placex in the given rank range (inclusive)
             in order of their address rank.
 
             When rank 30 is requested then also interpolations and
             places with address rank 0 will be indexed.
         """
+        total = 0
         maxrank = min(maxrank, 30)
         LOG.warning("Starting indexing rank (%i to %i) using %i threads",
                     minrank, maxrank, self.num_threads)
 
         with self.tokenizer.name_analyzer() as analyzer:
             for rank in range(max(1, minrank), maxrank + 1):
         maxrank = min(maxrank, 30)
         LOG.warning("Starting indexing rank (%i to %i) using %i threads",
                     minrank, maxrank, self.num_threads)
 
         with self.tokenizer.name_analyzer() as analyzer:
             for rank in range(max(1, minrank), maxrank + 1):
-                self._index(runners.RankRunner(rank, analyzer), 20 if rank == 30 else 1)
+                total += self._index(runners.RankRunner(rank, analyzer), 20 if rank == 30 else 1)
 
             if maxrank == 30:
 
             if maxrank == 30:
-                self._index(runners.RankRunner(0, analyzer))
-                self._index(runners.InterpolationRunner(analyzer), 20)
+                total += self._index(runners.RankRunner(0, analyzer))
+                total += self._index(runners.InterpolationRunner(analyzer), 20)
+
+        return total
 
 
 
 
-    def index_postcodes(self) -> None:
+    def index_postcodes(self) -> int:
         """Index the entries of the location_postcode table.
         """
         LOG.warning("Starting indexing postcodes using %s threads", self.num_threads)
 
         """Index the entries of the location_postcode table.
         """
         LOG.warning("Starting indexing postcodes using %s threads", self.num_threads)
 
-        self._index(runners.PostcodeRunner(), 20)
+        return self._index(runners.PostcodeRunner(), 20)
 
 
     def update_status_table(self) -> None:
 
 
     def update_status_table(self) -> None:
@@ -191,7 +197,7 @@ class Indexer:
 
             conn.commit()
 
 
             conn.commit()
 
-    def _index(self, runner: runners.Runner, batch: int = 1) -> None:
+    def _index(self, runner: runners.Runner, batch: int = 1) -> int:
         """ Index a single rank or table. `runner` describes the SQL to use
             for indexing. `batch` describes the number of objects that
             should be processed with a single SQL statement
         """ Index a single rank or table. `runner` describes the SQL to use
             for indexing. `batch` describes the number of objects that
             should be processed with a single SQL statement
@@ -233,4 +239,4 @@ class Indexer:
 
                 conn.commit()
 
 
                 conn.commit()
 
-        progress.done()
+        return progress.done()
index 177c262b702e19eec30c08383737670c1414ce74..33df37fbaec06df141462adafa14be012a95f043 100644 (file)
@@ -55,7 +55,7 @@ class ProgressLogger:
 
         self.next_info += int(places_per_sec) * self.log_interval
 
 
         self.next_info += int(places_per_sec) * self.log_interval
 
-    def done(self) -> None:
+    def done(self) -> int:
         """ Print final statistics about the progress.
         """
         rank_end_time = datetime.now()
         """ Print final statistics about the progress.
         """
         rank_end_time = datetime.now()
@@ -70,3 +70,5 @@ class ProgressLogger:
         LOG.warning("Done %d/%d in %d @ %.3f per second - FINISHED %s\n",
                     self.done_places, self.total_places, int(diff_seconds),
                     places_per_sec, self.name)
         LOG.warning("Done %d/%d in %d @ %.3f per second - FINISHED %s\n",
                     self.done_places, self.total_places, int(diff_seconds),
                     places_per_sec, self.name)
+
+        return self.done_places