implementation of 'nominatim index'
[nominatim.git] / nominatim / indexer / indexer.py
1 """
2 Main work horse for indexing (computing addresses) the database.
3 """
4 # pylint: disable=C0111
5 import logging
6 import select
7
8 import psycopg2
9
10 from .progress import ProgressLogger
11 from ..db.async_connection import DBConnection
12
13 LOG = logging.getLogger()
14
class RankRunner:
    """ Returns SQL commands for indexing one rank within the placex table.
    """

    def __init__(self, rank):
        self.rank = rank

    def name(self):
        return "rank {}".format(self.rank)

    def sql_count_objects(self):
        return """SELECT count(*) FROM placex
                  WHERE rank_address = {} and indexed_status > 0
               """.format(self.rank)

    def sql_get_objects(self):
        return """SELECT place_id FROM placex
                  WHERE indexed_status > 0 and rank_address = {}
                  ORDER BY geometry_sector""".format(self.rank)

    @staticmethod
    def sql_index_place(ids):
        return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
               .format(','.join((str(i) for i in ids)))


class InterpolationRunner:
    """ Returns SQL commands for indexing the address interpolation table
        location_property_osmline.
    """

    @staticmethod
    def name():
        return "interpolation lines (location_property_osmline)"

    @staticmethod
    def sql_count_objects():
        return """SELECT count(*) FROM location_property_osmline
                  WHERE indexed_status > 0"""

    @staticmethod
    def sql_get_objects():
        return """SELECT place_id FROM location_property_osmline
                  WHERE indexed_status > 0
                  ORDER BY geometry_sector"""

    @staticmethod
    def sql_index_place(ids):
        return """UPDATE location_property_osmline
                  SET indexed_status = 0 WHERE place_id IN ({})"""\
               .format(','.join((str(i) for i in ids)))

class BoundaryRunner:
    """ Returns SQL commands for indexing the administrative boundaries
        of a certain rank.
    """

    def __init__(self, rank):
        self.rank = rank

    def name(self):
        return "boundaries rank {}".format(self.rank)

    def sql_count_objects(self):
        return """SELECT count(*) FROM placex
                  WHERE indexed_status > 0
                    AND rank_search = {}
                    AND class = 'boundary' and type = 'administrative'""".format(self.rank)

    def sql_get_objects(self):
        return """SELECT place_id FROM placex
                  WHERE indexed_status > 0 and rank_search = {}
                        and class = 'boundary' and type = 'administrative'
                  ORDER BY partition, admin_level""".format(self.rank)

    @staticmethod
    def sql_index_place(ids):
        return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
               .format(','.join((str(i) for i in ids)))

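# Note added for clarity (not part of the original module): the runner classes
# above only generate SQL; the actual address computation is assumed to happen
# inside the database, via triggers that react to the indexed_status update.
# For instance, RankRunner.sql_index_place([1, 2, 3]) produces
#
#     UPDATE placex SET indexed_status = 0 WHERE place_id IN (1,2,3)
#
# so a whole batch of places is handed to the database in a single statement.
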
class Indexer:
    """ Main indexing routine.
    """

    def __init__(self, dsn, num_threads):
        # One synchronous connection for counting and fetching place ids plus
        # a pool of asynchronous worker connections that execute the indexing
        # statements.
        self.conn = psycopg2.connect(dsn)
        self.threads = [DBConnection(dsn) for _ in range(num_threads)]

    def index_boundaries(self, minrank, maxrank):
        """ Index administrative boundaries by their search rank.
        """
        LOG.warning("Starting indexing boundaries using %s threads",
                    len(self.threads))

        for rank in range(max(minrank, 5), min(maxrank, 26)):
            self.index(BoundaryRunner(rank))

    def index_by_rank(self, minrank, maxrank):
        """ Run classic indexing by rank.
        """
        maxrank = min(maxrank, 30)
        LOG.warning("Starting indexing rank (%i to %i) using %i threads",
                    minrank, maxrank, len(self.threads))

        for rank in range(max(1, minrank), maxrank):
            self.index(RankRunner(rank))

        if maxrank == 30:
            # The full range was requested: finish with rank 0, the
            # interpolation lines and the numerous rank-30 objects, the
            # latter two in batches of 20 places per statement.
            self.index(RankRunner(0))
            self.index(InterpolationRunner(), 20)
            self.index(RankRunner(30), 20)
        else:
            self.index(RankRunner(maxrank))

    def index(self, obj, batch=1):
        """ Index a single rank or table. `obj` describes the SQL to use
            for indexing. `batch` describes the number of objects that
            should be processed with a single SQL statement.
        """
        LOG.warning("Starting %s (using batch size %s)", obj.name(), batch)

        cur = self.conn.cursor()
        cur.execute(obj.sql_count_objects())

        total_tuples = cur.fetchone()[0]
        LOG.debug("Total number of rows: %i", total_tuples)

        cur.close()

        progress = ProgressLogger(obj.name(), total_tuples)

        if total_tuples > 0:
            # A named cursor gives a server-side cursor, so the place ids are
            # streamed in chunks rather than loaded into memory all at once.
            cur = self.conn.cursor(name='places')
            cur.execute(obj.sql_get_objects())

            next_thread = self.find_free_thread()
            while True:
                places = [p[0] for p in cur.fetchmany(batch)]
                if not places:
                    break

                LOG.debug("Processing places: %s", str(places))
                thread = next(next_thread)

                thread.perform(obj.sql_index_place(places))
                progress.add(len(places))

            cur.close()

            for thread in self.threads:
                thread.wait()

        progress.done()

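    # Design note added for clarity (not part of the original module): handing
    # the DBConnection objects directly to select.select() below relies on the
    # assumption that they expose a fileno() method for the underlying
    # PostgreSQL socket, which lets the generator sleep until at least one
    # worker connection has finished its current statement.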
    def find_free_thread(self):
        """ Generator that returns the next connection that is free for
            sending a query.
        """
        ready = self.threads
        command_stat = 0

        while True:
            for thread in ready:
                if thread.is_done():
                    command_stat += 1
                    yield thread

            # Refresh the connections occasionally to avoid potential
            # memory leaks in PostgreSQL.
            if command_stat > 100000:
                for thread in self.threads:
                    while not thread.is_done():
                        thread.wait()
                    thread.connect()
                command_stat = 0
                ready = self.threads
            else:
                ready, _, _ = select.select(self.threads, [], [])

        assert False, "Unreachable code"
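
# Usage sketch added for illustration (not part of the original module): this
# is roughly how the 'nominatim index' command is expected to drive the class.
# The DSN and thread count are made-up examples, and because of the relative
# imports above the module has to be imported from within the nominatim
# package:
#
#     from nominatim.indexer.indexer import Indexer
#
#     indexer = Indexer('dbname=nominatim', num_threads=4)
#     indexer.index_boundaries(0, 30)   # administrative boundaries first
#     indexer.index_by_rank(0, 30)      # then the remaining ranks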