]> git.openstreetmap.org Git - nominatim.git/blob - nominatim/api/search/token_assignment.py
do not split names from typed phrases
[nominatim.git] / nominatim / api / search / token_assignment.py
1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2023 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Create query interpretations where each vertice in the query is assigned
9 a specific function (expressed as a token type).
10 """
11 from typing import Optional, List, Iterator
12 import dataclasses
13
14 import nominatim.api.search.query as qmod
15 from nominatim.api.logging import log
16
17 # pylint: disable=too-many-return-statements,too-many-branches
18
19 @dataclasses.dataclass
20 class TypedRange:
21     """ A token range for a specific type of tokens.
22     """
23     ttype: qmod.TokenType
24     trange: qmod.TokenRange
25
26
27 PENALTY_TOKENCHANGE = {
28     qmod.BreakType.START: 0.0,
29     qmod.BreakType.END: 0.0,
30     qmod.BreakType.PHRASE: 0.0,
31     qmod.BreakType.WORD: 0.1,
32     qmod.BreakType.PART: 0.2,
33     qmod.BreakType.TOKEN: 0.4
34 }
35
36 TypedRangeSeq = List[TypedRange]
37
38 @dataclasses.dataclass
39 class TokenAssignment: # pylint: disable=too-many-instance-attributes
40     """ Representation of a possible assignment of token types
41         to the tokens in a tokenized query.
42     """
43     penalty: float = 0.0
44     name: Optional[qmod.TokenRange] = None
45     address: List[qmod.TokenRange] = dataclasses.field(default_factory=list)
46     housenumber: Optional[qmod.TokenRange] = None
47     postcode: Optional[qmod.TokenRange] = None
48     country: Optional[qmod.TokenRange] = None
49     category: Optional[qmod.TokenRange] = None
50     qualifier: Optional[qmod.TokenRange] = None
51
52
53     @staticmethod
54     def from_ranges(ranges: TypedRangeSeq) -> 'TokenAssignment':
55         """ Create a new token assignment from a sequence of typed spans.
56         """
57         out = TokenAssignment()
58         for token in ranges:
59             if token.ttype == qmod.TokenType.PARTIAL:
60                 out.address.append(token.trange)
61             elif token.ttype == qmod.TokenType.HOUSENUMBER:
62                 out.housenumber = token.trange
63             elif token.ttype == qmod.TokenType.POSTCODE:
64                 out.postcode = token.trange
65             elif token.ttype == qmod.TokenType.COUNTRY:
66                 out.country = token.trange
67             elif token.ttype == qmod.TokenType.CATEGORY:
68                 out.category = token.trange
69             elif token.ttype == qmod.TokenType.QUALIFIER:
70                 out.qualifier = token.trange
71         return out
72
73
74 class _TokenSequence:
75     """ Working state used to put together the token assignements.
76
77         Represents an intermediate state while traversing the tokenized
78         query.
79     """
80     def __init__(self, seq: TypedRangeSeq,
81                  direction: int = 0, penalty: float = 0.0) -> None:
82         self.seq = seq
83         self.direction = direction
84         self.penalty = penalty
85
86
87     def __str__(self) -> str:
88         seq = ''.join(f'[{r.trange.start} - {r.trange.end}: {r.ttype.name}]' for r in self.seq)
89         return f'{seq} (dir: {self.direction}, penalty: {self.penalty})'
90
91
92     @property
93     def end_pos(self) -> int:
94         """ Return the index of the global end of the current sequence.
95         """
96         return self.seq[-1].trange.end if self.seq else 0
97
98
99     def has_types(self, *ttypes: qmod.TokenType) -> bool:
100         """ Check if the current sequence contains any typed ranges of
101             the given types.
102         """
103         return any(s.ttype in ttypes for s in self.seq)
104
105
106     def is_final(self) -> bool:
107         """ Return true when the sequence cannot be extended by any
108             form of token anymore.
109         """
110         # Country and category must be the final term for left-to-right
111         return len(self.seq) > 1 and \
112                self.seq[-1].ttype in (qmod.TokenType.COUNTRY, qmod.TokenType.CATEGORY)
113
114
115     def appendable(self, ttype: qmod.TokenType) -> Optional[int]:
116         """ Check if the give token type is appendable to the existing sequence.
117
118             Returns None if the token type is not appendable, otherwise the
119             new direction of the sequence after adding such a type. The
120             token is not added.
121         """
122         if ttype == qmod.TokenType.WORD:
123             return None
124
125         if not self.seq:
126             # Append unconditionally to the empty list
127             if ttype == qmod.TokenType.COUNTRY:
128                 return -1
129             if ttype in (qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
130                 return 1
131             return self.direction
132
133         # Name tokens are always acceptable and don't change direction
134         if ttype == qmod.TokenType.PARTIAL:
135             return self.direction
136
137         # Other tokens may only appear once
138         if self.has_types(ttype):
139             return None
140
141         if ttype == qmod.TokenType.HOUSENUMBER:
142             if self.direction == 1:
143                 if len(self.seq) == 1 and self.seq[0].ttype == qmod.TokenType.QUALIFIER:
144                     return None
145                 if len(self.seq) > 2 \
146                    or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
147                     return None # direction left-to-right: housenumber must come before anything
148             elif self.direction == -1 \
149                  or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
150                 return -1 # force direction right-to-left if after other terms
151
152             return self.direction
153
154         if ttype == qmod.TokenType.POSTCODE:
155             if self.direction == -1:
156                 if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
157                     return None
158                 return -1
159             if self.direction == 1:
160                 return None if self.has_types(qmod.TokenType.COUNTRY) else 1
161             if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
162                 return 1
163             return self.direction
164
165         if ttype == qmod.TokenType.COUNTRY:
166             return None if self.direction == -1 else 1
167
168         if ttype == qmod.TokenType.CATEGORY:
169             return self.direction
170
171         if ttype == qmod.TokenType.QUALIFIER:
172             if self.direction == 1:
173                 if (len(self.seq) == 1
174                     and self.seq[0].ttype in (qmod.TokenType.PARTIAL, qmod.TokenType.CATEGORY)) \
175                    or (len(self.seq) == 2
176                        and self.seq[0].ttype == qmod.TokenType.CATEGORY
177                        and self.seq[1].ttype == qmod.TokenType.PARTIAL):
178                     return 1
179                 return None
180             if self.direction == -1:
181                 return -1
182
183             tempseq = self.seq[1:] if self.seq[0].ttype == qmod.TokenType.CATEGORY else self.seq
184             if len(tempseq) == 0:
185                 return 1
186             if len(tempseq) == 1 and self.seq[0].ttype == qmod.TokenType.HOUSENUMBER:
187                 return None
188             if len(tempseq) > 1 or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
189                 return -1
190             return 0
191
192         return None
193
194
195     def advance(self, ttype: qmod.TokenType, end_pos: int,
196                 btype: qmod.BreakType) -> Optional['_TokenSequence']:
197         """ Return a new token sequence state with the given token type
198             extended.
199         """
200         newdir = self.appendable(ttype)
201         if newdir is None:
202             return None
203
204         if not self.seq:
205             newseq = [TypedRange(ttype, qmod.TokenRange(0, end_pos))]
206             new_penalty = 0.0
207         else:
208             last = self.seq[-1]
209             if btype != qmod.BreakType.PHRASE and last.ttype == ttype:
210                 # extend the existing range
211                 newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))]
212                 new_penalty = 0.0
213             else:
214                 # start a new range
215                 newseq = list(self.seq) + [TypedRange(ttype,
216                                                       qmod.TokenRange(last.trange.end, end_pos))]
217                 new_penalty = PENALTY_TOKENCHANGE[btype]
218
219         return _TokenSequence(newseq, newdir, self.penalty + new_penalty)
220
221
222     def _adapt_penalty_from_priors(self, priors: int, new_dir: int) -> bool:
223         if priors == 2:
224             self.penalty += 1.0
225         elif priors > 2:
226             if self.direction == 0:
227                 self.direction = new_dir
228             else:
229                 return False
230
231         return True
232
233
234     def recheck_sequence(self) -> bool:
235         """ Check that the sequence is a fully valid token assignment
236             and addapt direction and penalties further if necessary.
237
238             This function catches some impossible assignments that need
239             forward context and can therefore not be exluded when building
240             the assignment.
241         """
242         # housenumbers may not be further than 2 words from the beginning.
243         # If there are two words in front, give it a penalty.
244         hnrpos = next((i for i, tr in enumerate(self.seq)
245                        if tr.ttype == qmod.TokenType.HOUSENUMBER),
246                       None)
247         if hnrpos is not None:
248             if self.direction != -1:
249                 priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TokenType.PARTIAL)
250                 if not self._adapt_penalty_from_priors(priors, -1):
251                     return False
252             if self.direction != 1:
253                 priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TokenType.PARTIAL)
254                 if not self._adapt_penalty_from_priors(priors, 1):
255                     return False
256
257         return True
258
259
260     def _get_assignments_postcode(self, base: TokenAssignment,
261                                   query_len: int)  -> Iterator[TokenAssignment]:
262         """ Yield possible assignments of Postcode searches with an
263             address component.
264         """
265         assert base.postcode is not None
266
267         if (base.postcode.start == 0 and self.direction != -1)\
268            or (base.postcode.end == query_len and self.direction != 1):
269             log().comment('postcode search')
270             # <address>,<postcode> should give preference to address search
271             if base.postcode.start == 0:
272                 penalty = self.penalty
273                 self.direction = -1 # name searches are only possbile backwards
274             else:
275                 penalty = self.penalty + 0.1
276                 self.direction = 1 # name searches are only possbile forwards
277             yield dataclasses.replace(base, penalty=penalty)
278
279
280     def _get_assignments_address_forward(self, base: TokenAssignment,
281                                          query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
282         """ Yield possible assignments of address searches with
283             left-to-right reading.
284         """
285         first = base.address[0]
286
287         log().comment('first word = name')
288         yield dataclasses.replace(base, penalty=self.penalty,
289                                   name=first, address=base.address[1:])
290
291         # To paraphrase:
292         #  * if another name term comes after the first one and before the
293         #    housenumber
294         #  * a qualifier comes after the name
295         #  * the containing phrase is strictly typed
296         if (base.housenumber and first.end < base.housenumber.start)\
297            or (base.qualifier and base.qualifier > first)\
298            or (query.nodes[first.start].ptype != qmod.PhraseType.NONE):
299             return
300
301         penalty = self.penalty
302
303         # Penalty for:
304         #  * <name>, <street>, <housenumber> , ...
305         #  * queries that are comma-separated
306         if (base.housenumber and base.housenumber > first) or len(query.source) > 1:
307             penalty += 0.25
308
309         for i in range(first.start + 1, first.end):
310             name, addr = first.split(i)
311             log().comment(f'split first word = name ({i - first.start})')
312             yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:],
313                                       penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
314
315
316     def _get_assignments_address_backward(self, base: TokenAssignment,
317                                           query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
318         """ Yield possible assignments of address searches with
319             right-to-left reading.
320         """
321         last = base.address[-1]
322
323         if self.direction == -1 or len(base.address) > 1:
324             log().comment('last word = name')
325             yield dataclasses.replace(base, penalty=self.penalty,
326                                       name=last, address=base.address[:-1])
327
328         # To paraphrase:
329         #  * if another name term comes before the last one and after the
330         #    housenumber
331         #  * a qualifier comes before the name
332         #  * the containing phrase is strictly typed
333         if (base.housenumber and last.start > base.housenumber.end)\
334            or (base.qualifier and base.qualifier < last)\
335            or (query.nodes[last.start].ptype != qmod.PhraseType.NONE):
336             return
337
338         penalty = self.penalty
339         if base.housenumber and base.housenumber < last:
340             penalty += 0.4
341         if len(query.source) > 1:
342             penalty += 0.25
343
344         for i in range(last.start + 1, last.end):
345             addr, name = last.split(i)
346             log().comment(f'split last word = name ({i - last.start})')
347             yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr],
348                                       penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
349
350
351     def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
352         """ Yield possible assignments for the current sequence.
353
354             This function splits up general name assignments into name
355             and address and yields all possible variants of that.
356         """
357         base = TokenAssignment.from_ranges(self.seq)
358
359         num_addr_tokens = sum(t.end - t.start for t in base.address)
360         if num_addr_tokens > 50:
361             return
362
363         # Postcode search (postcode-only search is covered in next case)
364         if base.postcode is not None and base.address:
365             yield from self._get_assignments_postcode(base, query.num_token_slots())
366
367         # Postcode or country-only search
368         if not base.address:
369             if not base.housenumber and (base.postcode or base.country or base.category):
370                 log().comment('postcode/country search')
371                 yield dataclasses.replace(base, penalty=self.penalty)
372         else:
373             # <postcode>,<address> should give preference to postcode search
374             if base.postcode and base.postcode.start == 0:
375                 self.penalty += 0.1
376
377             # Right-to-left reading of the address
378             if self.direction != -1:
379                 yield from self._get_assignments_address_forward(base, query)
380
381             # Left-to-right reading of the address
382             if self.direction != 1:
383                 yield from self._get_assignments_address_backward(base, query)
384
385             # variant for special housenumber searches
386             if base.housenumber:
387                 yield dataclasses.replace(base, penalty=self.penalty)
388
389
390 def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
391     """ Return possible word type assignments to word positions.
392
393         The assignments are computed from the concrete tokens listed
394         in the tokenized query.
395
396         The result includes the penalty for transitions from one word type to
397         another. It does not include penalties for transitions within a
398         type.
399     """
400     todo = [_TokenSequence([], direction=0 if query.source[0].ptype == qmod.PhraseType.NONE else 1)]
401
402     while todo:
403         state = todo.pop()
404         node = query.nodes[state.end_pos]
405
406         for tlist in node.starting:
407             newstate = state.advance(tlist.ttype, tlist.end, node.btype)
408             if newstate is not None:
409                 if newstate.end_pos == query.num_token_slots():
410                     if newstate.recheck_sequence():
411                         log().var_dump('Assignment', newstate)
412                         yield from newstate.get_assignments(query)
413                 elif not newstate.is_final():
414                     todo.append(newstate)