# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Public interface to the search code.
"""
from typing import List, Any, Optional, Iterator, Tuple, Dict
import itertools
import re
import datetime as dt
import difflib

from nominatim.api.connection import SearchConnection
from nominatim.api.types import SearchDetails
from nominatim.api.results import SearchResult, SearchResults, add_result_details
from nominatim.api.search.token_assignment import yield_token_assignments
from nominatim.api.search.db_search_builder import SearchBuilder, build_poi_search, wrap_near_search
from nominatim.api.search.db_searches import AbstractSearch
from nominatim.api.search.query_analyzer_factory import make_query_analyzer, AbstractQueryAnalyzer
from nominatim.api.search.query import Phrase, QueryStruct
from nominatim.api.logging import log

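# A minimal usage sketch of this public interface (illustrative only; it
# assumes an open SearchConnection `conn` and that PhraseType is imported
# alongside Phrase from nominatim.api.search.query):
#
#     from nominatim.api.search.query import Phrase, PhraseType
#
#     geocoder = ForwardGeocoder(conn, SearchDetails(max_results=10), timeout=30)
#     results = await geocoder.lookup([Phrase(PhraseType.NONE, 'Birmingham, UK')])
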
class ForwardGeocoder:
    """ Main class responsible for place search.
    """

    def __init__(self, conn: SearchConnection,
                 params: SearchDetails, timeout: Optional[int]) -> None:
        self.conn = conn
        self.params = params
        self.timeout = dt.timedelta(seconds=timeout or 1000000)
        self.query_analyzer: Optional[AbstractQueryAnalyzer] = None


    @property
    def limit(self) -> int:
        """ Return the configured maximum number of search results.
        """
        return self.params.max_results


    async def build_searches(self,
                             phrases: List[Phrase]) -> Tuple[QueryStruct, List[AbstractSearch]]:
        """ Analyse the query and return the tokenized query and list of
            possible searches over it.
        """
        if self.query_analyzer is None:
            self.query_analyzer = await make_query_analyzer(self.conn)

        query = await self.query_analyzer.analyze_query(phrases)

        searches: List[AbstractSearch] = []
        if query.num_token_slots() > 0:
            # 2. Compute all possible search interpretations
            log().section('Compute abstract searches')
            search_builder = SearchBuilder(query, self.params)
            num_searches = 0
            for assignment in yield_token_assignments(query):
                searches.extend(search_builder.build(assignment))
                if num_searches < len(searches):
                    log().table_dump('Searches for assignment',
                                     _dump_searches(searches, query, num_searches))
                num_searches = len(searches)
            searches.sort(key=lambda s: (s.penalty, s.SEARCH_PRIO))

        return query, searches

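    # Note on the sort key above: searches are ordered primarily by penalty;
    # SEARCH_PRIO (defined on the AbstractSearch subclasses) only breaks ties
    # between searches of equal penalty, so preferred search types run first.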

    async def execute_searches(self, query: QueryStruct,
                               searches: List[AbstractSearch]) -> SearchResults:
        """ Run the abstract searches against the database until a result
            is found.
        """
        log().section('Execute database searches')
        results: Dict[Any, SearchResult] = {}

        end_time = dt.datetime.now() + self.timeout

        min_ranking = searches[0].penalty + 2.0
        prev_penalty = 0.0
        for i, search in enumerate(searches):
            if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 20):
                break
            log().table_dump(f"{i + 1}. Search", _dump_searches([search], query))
            log().var_dump('Params', self.params)
            lookup_results = await search.lookup(self.conn, self.params)
            for result in lookup_results:
                rhash = (result.source_table, result.place_id,
                         result.housenumber, result.country_code)
                prevresult = results.get(rhash)
                if prevresult:
                    prevresult.accuracy = min(prevresult.accuracy, result.accuracy)
                else:
                    results[rhash] = result
                min_ranking = min(min_ranking, result.accuracy * 1.2)
            log().result_dump('Results', ((r.accuracy, r) for r in lookup_results))
            prev_penalty = search.penalty
            if dt.datetime.now() >= end_time:
                break

        return SearchResults(results.values())

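    # Note on the early-exit logic above: every incoming result tightens
    # min_ranking to 1.2 times its accuracy, so (to pick illustrative numbers)
    # a hit with accuracy 0.5 caps min_ranking at 0.6 and stops any later,
    # strictly more expensive search whose penalty exceeds that cap.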

    def sort_and_cut_results(self, results: SearchResults) -> SearchResults:
        """ Remove badly matching results, sort by ranking and
            limit to the configured number of results.
        """
        if results:
            min_ranking = min(r.ranking for r in results)
            results = SearchResults(r for r in results if r.ranking < min_ranking + 0.5)
            results.sort(key=lambda r: r.ranking)

        if results:
            min_rank = results[0].rank_search
            results = SearchResults(r for r in results
                                    if r.ranking + 0.05 * (r.rank_search - min_rank)
                                       < min_ranking + 0.5)

            results = SearchResults(results[:self.limit])

        return results

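    # Worked example for the cut-off above (illustrative numbers): with a
    # best ranking of 1.0 the threshold is 1.5, so a result ranked 1.3 at the
    # same rank_search is kept, while one ranked 1.3 that sits 5 search ranks
    # lower is dropped (1.3 + 0.05 * 5 = 1.55 >= 1.5).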

    def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
        """ Adjust the accuracy of the localized results according to how
            well they match the original query.
        """
        assert self.query_analyzer is not None
        qwords = [word for phrase in query.source
                       for word in re.split('[, ]+', phrase.text) if word]
        if not qwords:
            return

        for result in results:
            # Negative importance indicates ordering by distance, which is
            # more important than word matching.
            if not result.display_name\
               or (result.importance is not None and result.importance < 0):
                continue
            distance = 0.0
            norm = self.query_analyzer.normalize_text(' '.join((result.display_name,
                                                                result.country_code or '')))
            words = set((w for w in norm.split(' ') if w))
            if not words:
                continue
            for qword in qwords:
                wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words)
                if wdist < 0.5:
                    distance += len(qword)
                else:
                    distance += (1.0 - wdist) * len(qword)
            # Compensate for the fact that the tokenizer does not yet
            # apply a match penalty to country names.
            # Temporary hack that needs to be removed!
            if result.rank_address == 4:
                distance *= 2
            result.accuracy += distance * 0.4 / sum(len(w) for w in qwords)

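    # Worked example for the word-distance penalty above (a sketch, relying
    # on standard difflib behaviour): a query word that matches a result word
    # exactly has quick_ratio() == 1.0 and adds nothing, while a word whose
    # best match scores 0.75 adds 0.25 * len(word) to the distance.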

    async def lookup_pois(self, categories: List[Tuple[str, str]],
                          phrases: List[Phrase]) -> SearchResults:
        """ Look up places by category. If phrases are given, a place
            search over the phrases is executed first and only places
            close to its results are returned.
        """
        log().function('forward_lookup_pois', categories=categories, params=self.params)

        if phrases:
            query, searches = await self.build_searches(phrases)

            if query:
                searches = [wrap_near_search(categories, s) for s in searches[:50]]
                results = await self.execute_searches(query, searches)
                await add_result_details(self.conn, results, self.params)
                log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
                results = self.sort_and_cut_results(results)
            else:
                results = SearchResults()
        else:
            search = build_poi_search(categories, self.params.countries)
            results = await search.lookup(self.conn, self.params)
            await add_result_details(self.conn, results, self.params)

        log().result_dump('Final Results', ((r.accuracy, r) for r in results))

        return results

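    # Example call (a sketch; the category tuples follow the (key, value)
    # form that build_poi_search and wrap_near_search expect):
    #
    #     results = await geocoder.lookup_pois([('amenity', 'restaurant')],
    #                                          phrases=[])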

    async def lookup(self, phrases: List[Phrase]) -> SearchResults:
        """ Look up a single free-text query.
        """
        log().function('forward_lookup', phrases=phrases, params=self.params)
        results = SearchResults()

        if self.params.is_impossible():
            return results

        query, searches = await self.build_searches(phrases)

        if searches:
            # Execute SQL until an appropriate result is found.
            results = await self.execute_searches(query, searches[:50])
            await add_result_details(self.conn, results, self.params)
            log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
            self.rerank_by_query(query, results)
            log().result_dump('Results after reranking', ((r.accuracy, r) for r in results))
            results = self.sort_and_cut_results(results)
            log().result_dump('Final Results', ((r.accuracy, r) for r in results))

        return results

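    # The free-text pipeline above runs in order: tokenize the phrases
    # (build_searches), execute the candidate searches (execute_searches),
    # enrich the hits (add_result_details), rerank them against the raw
    # query (rerank_by_query) and finally trim (sort_and_cut_results).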

# pylint: disable=invalid-name,too-many-locals
def _dump_searches(searches: List[AbstractSearch], query: QueryStruct,
                   start: int = 0) -> Iterator[Optional[List[Any]]]:
    yield ['Penalty', 'Lookups', 'Housenr', 'Postcode', 'Countries',
           'Qualifier', 'Category', 'Rankings']

    def tk(tl: List[int]) -> str:
        tstr = [f"{query.find_lookup_word_by_id(t)}({t})" for t in tl]

        return f"[{','.join(tstr)}]"

    def fmt_ranking(f: Any) -> str:
        if not f:
            return ''
        ranks = ','.join((f"{tk(r.tokens)}^{r.penalty:.3g}" for r in f.rankings))
        if len(ranks) > 100:
            ranks = ranks[:100] + '...'
        return f"{f.column}({ranks},def={f.default:.3g})"

    def fmt_lookup(l: Any) -> str:
        if not l:
            return ''

        return f"{l.lookup_type}({l.column}{tk(l.tokens)})"


    def fmt_cstr(c: Any) -> str:
        if not c:
            return ''

        return f'{c[0]}^{c[1]}'

    for search in searches[start:]:
        fields = ('lookups', 'rankings', 'countries', 'housenumbers',
                  'postcodes', 'qualifiers')
        if hasattr(search, 'search'):
            iters = itertools.zip_longest([f"{search.penalty:.3g}"],
                                          *(getattr(search.search, attr, []) for attr in fields),
                                          getattr(search, 'categories', []),
                                          fillvalue='')
        else:
            iters = itertools.zip_longest([f"{search.penalty:.3g}"],
                                          *(getattr(search, attr, []) for attr in fields),
                                          [],
                                          fillvalue='')
        for penalty, lookup, rank, cc, hnr, pc, qual, cat in iters:
            yield [penalty, fmt_lookup(lookup), fmt_cstr(hnr),
                   fmt_cstr(pc), fmt_cstr(cc), fmt_cstr(qual), fmt_cstr(cat), fmt_ranking(rank)]
        yield None