]> git.openstreetmap.org Git - nominatim.git/blob - test/python/api/search/test_token_assignment.py
Merge pull request #3260 from lonvia/improve-catgeory-search
[nominatim.git] / test / python / api / search / test_token_assignment.py
1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2023 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Test for creation of token assignments from tokenized queries.
9 """
10 import pytest
11
12 from nominatim.api.search.query import QueryStruct, Phrase, PhraseType, BreakType, TokenType, TokenRange, Token
13 from nominatim.api.search.token_assignment import yield_token_assignments, TokenAssignment, PENALTY_TOKENCHANGE
14
class MyToken(Token):
    """Token subclass used as a stand-in for every token in these tests."""

    def get_category(self):
        """Return the same fixed two-element tuple for every instance."""
        fixed_category = ('this', 'that')
        return fixed_category
18
19
def make_query(*args):
    """Build a QueryStruct from a sequence of node descriptions.

    Every positional argument is indexable as
    ``(break type, phrase type, token list)``; the token list holds
    ``(end, token type)`` pairs. The position of an argument is used as the
    start of the TokenRange of each of its tokens.

    The query is created with a single phrase typed after the first
    argument. The break type of the first argument is not passed to
    `add_node`; only the second and later arguments add nodes, followed by
    one closing END node. All tokens share one MyToken instance.
    """
    query = QueryStruct([Phrase(args[0][1], '')])
    shared_token = MyToken(3.0, 45, 1, 'foo', True)

    # Nodes for everything after the first description, then the end node.
    for break_type, phrase_type, _ in args[1:]:
        query.add_node(break_type, phrase_type)
    query.add_node(BreakType.END, PhraseType.NONE)

    # Attach the tokens; each one starts at the index of its description.
    for start_pos, node in enumerate(args):
        for end_pos, token_type in node[2]:
            token_range = TokenRange(start_pos, end_pos)
            query.add_token(token_range, token_type, shared_token)

    return query
33
34
def check_assignments(actual, *expected):
    """Assert that `actual` yields exactly the `expected` items.

    Order does not matter but multiplicity does: every item produced by
    `actual` must match one not-yet-matched entry of `expected`, and no
    expected entry may remain unmatched once `actual` is exhausted.

    Raises AssertionError on the first unexpected item or, at the end,
    when expected entries were never produced.
    """
    todo = list(expected)
    for assignment in actual:
        assert assignment in todo, f"Unexpected assignment: {assignment}"
        todo.remove(assignment)

    # Report only the entries that were never produced rather than the
    # complete expectation list, so the failure points at what is missing.
    assert not todo, f"Missing assignments: {todo}"
42
43
def test_query_with_missing_tokens():
    """A query that has nodes but no tokens yields no assignments."""
    query = QueryStruct([Phrase(PhraseType.NONE, '')])
    query.add_node(BreakType.END, PhraseType.NONE)

    assignments = list(yield_token_assignments(query))

    assert assignments == []
49
50
def test_one_word_query():
    """One position with PARTIAL, WORD and HOUSENUMBER tokens gives a single name assignment."""
    tokens = [(1, TokenType.PARTIAL),
              (1, TokenType.WORD),
              (1, TokenType.HOUSENUMBER)]
    query = make_query((BreakType.START, PhraseType.NONE, tokens))

    expected = [TokenAssignment(name=TokenRange(0, 1))]

    assert list(yield_token_assignments(query)) == expected
59
60
def test_single_postcode():
    """A lone POSTCODE token is assigned as the postcode."""
    tokens = [(1, TokenType.POSTCODE)]
    query = make_query((BreakType.START, PhraseType.NONE, tokens))

    expected = [TokenAssignment(postcode=TokenRange(0, 1))]

    assert list(yield_token_assignments(query)) == expected
67
68
def test_single_country_name():
    """A lone COUNTRY token is assigned as the country."""
    tokens = [(1, TokenType.COUNTRY)]
    query = make_query((BreakType.START, PhraseType.NONE, tokens))

    expected = [TokenAssignment(country=TokenRange(0, 1))]

    assert list(yield_token_assignments(query)) == expected
75
76
def test_single_word_poi_search():
    """With CATEGORY and QUALIFIER on the same position only the category assignment is produced."""
    tokens = [(1, TokenType.CATEGORY),
              (1, TokenType.QUALIFIER)]
    query = make_query((BreakType.START, PhraseType.NONE, tokens))

    expected = [TokenAssignment(category=TokenRange(0, 1))]

    assert list(yield_token_assignments(query)) == expected
84
85
@pytest.mark.parametrize('btype', [BreakType.WORD, BreakType.PART, BreakType.TOKEN])
def test_multiple_simple_words(btype):
    """Three PARTIAL tokens joined by the same break type.

    Expected are the reading of all three as the name (no penalty) and
    four name/address splits, each carrying PENALTY_TOKENCHANGE[btype].
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (btype, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
                       (btype, PhraseType.NONE, [(3, TokenType.PARTIAL)]))

    split_penalty = PENALTY_TOKENCHANGE[btype]

    # (name range, address range) of every penalised split.
    splits = [((0, 2), (2, 3)),
              ((0, 1), (1, 3)),
              ((1, 3), (0, 1)),
              ((2, 3), (0, 2))]

    expected = [TokenAssignment(name=TokenRange(0, 3))]
    for name_bounds, address_bounds in splits:
        expected.append(TokenAssignment(penalty=split_penalty,
                                        name=TokenRange(*name_bounds),
                                        address=[TokenRange(*address_bounds)]))

    check_assignments(yield_token_assignments(query), *expected)
105
106
def test_multiple_words_respect_phrase_break():
    """Two PARTIAL tokens separated by a PHRASE break.

    Only the two name/address splits are expected, without penalty; no
    assignment spanning both positions is listed.
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]))

    first, second = TokenRange(0, 1), TokenRange(1, 2)
    expected = [TokenAssignment(name=first, address=[second]),
                TokenAssignment(name=second, address=[first])]

    check_assignments(yield_token_assignments(query), *expected)
116
117
def test_housenumber_and_street():
    """HOUSENUMBER followed by a PARTIAL token after a PHRASE break.

    The partial token is expected once as the name and once as the
    address, each time together with the house number.
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.HOUSENUMBER)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]))

    number, street = TokenRange(0, 1), TokenRange(1, 2)
    expected = [TokenAssignment(name=street, housenumber=number),
                TokenAssignment(address=[street], housenumber=number)]

    check_assignments(yield_token_assignments(query), *expected)
127
128
def test_housenumber_and_street_backwards():
    """PARTIAL token followed by a HOUSENUMBER after a PHRASE break.

    The partial token is expected once as the name and once as the
    address, each time together with the house number.
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.HOUSENUMBER)]))

    street, number = TokenRange(0, 1), TokenRange(1, 2)
    expected = [TokenAssignment(name=street, housenumber=number),
                TokenAssignment(address=[street], housenumber=number)]

    check_assignments(yield_token_assignments(query), *expected)
138
139
def test_housenumber_and_postcode():
    """PARTIAL, HOUSENUMBER, PARTIAL, POSTCODE separated by WORD breaks.

    Both expected assignments carry a penalty of approximately 0.3 and
    differ only in whether the first partial token is the name or part
    of the address.
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.HOUSENUMBER)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(4, TokenType.POSTCODE)]))

    with_name = TokenAssignment(penalty=pytest.approx(0.3),
                                name=TokenRange(0, 1),
                                housenumber=TokenRange(1, 2),
                                address=[TokenRange(2, 3)],
                                postcode=TokenRange(3, 4))
    address_only = TokenAssignment(penalty=pytest.approx(0.3),
                                   housenumber=TokenRange(1, 2),
                                   address=[TokenRange(0, 1), TokenRange(2, 3)],
                                   postcode=TokenRange(3, 4))

    check_assignments(yield_token_assignments(query), with_name, address_only)
156
def test_postcode_and_housenumber():
    """PARTIAL, POSTCODE, PARTIAL, HOUSENUMBER separated by WORD breaks.

    Both expected assignments carry a penalty of approximately 0.3 and
    differ only in whether the second partial token is the name or part
    of the address.
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.POSTCODE)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(4, TokenType.HOUSENUMBER)]))

    with_name = TokenAssignment(penalty=pytest.approx(0.3),
                                name=TokenRange(2, 3),
                                housenumber=TokenRange(3, 4),
                                address=[TokenRange(0, 1)],
                                postcode=TokenRange(1, 2))
    address_only = TokenAssignment(penalty=pytest.approx(0.3),
                                   housenumber=TokenRange(3, 4),
                                   address=[TokenRange(0, 1), TokenRange(2, 3)],
                                   postcode=TokenRange(1, 2))

    check_assignments(yield_token_assignments(query), with_name, address_only)
173
174
def test_country_housenumber_postcode():
    """COUNTRY, PARTIAL, HOUSENUMBER, POSTCODE in that order: no assignment is expected."""
    nodes = [(BreakType.START, PhraseType.NONE, [(1, TokenType.COUNTRY)]),
             (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
             (BreakType.WORD, PhraseType.NONE, [(3, TokenType.HOUSENUMBER)]),
             (BreakType.WORD, PhraseType.NONE, [(4, TokenType.POSTCODE)])]
    query = make_query(*nodes)

    # No expected entries: any yielded assignment fails the check.
    check_assignments(yield_token_assignments(query))
182
183
@pytest.mark.parametrize('ttype', [TokenType.POSTCODE, TokenType.COUNTRY,
                                   TokenType.CATEGORY, TokenType.QUALIFIER])
def test_housenumber_with_only_special_terms(ttype):
    """A HOUSENUMBER followed only by a token of type `ttype`: no assignment is expected."""
    nodes = [(BreakType.START, PhraseType.NONE, [(1, TokenType.HOUSENUMBER)]),
             (BreakType.WORD, PhraseType.NONE, [(2, ttype)])]
    query = make_query(*nodes)

    # No expected entries: any yielded assignment fails the check.
    check_assignments(yield_token_assignments(query))
191
192
@pytest.mark.parametrize('ttype', [TokenType.POSTCODE, TokenType.HOUSENUMBER, TokenType.COUNTRY])
def test_multiple_special_tokens(ttype):
    """The same special token type on both sides of a PARTIAL token: no assignment is expected."""
    nodes = [(BreakType.START, PhraseType.NONE, [(1, ttype)]),
             (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
             (BreakType.PHRASE, PhraseType.NONE, [(3, ttype)])]
    query = make_query(*nodes)

    # No expected entries: any yielded assignment fails the check.
    check_assignments(yield_token_assignments(query))
200
201
def test_housenumber_many_phrases():
    """Three PARTIAL phrases, then a HOUSENUMBER phrase with a trailing PARTIAL word.

    Expected are two assignments with penalty 0.1: one using the word
    after the house number as the name, and one putting every partial
    token into the address.
    """
    q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                   (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
                   (BreakType.PHRASE, PhraseType.NONE, [(3, TokenType.PARTIAL)]),
                   (BreakType.PHRASE, PhraseType.NONE, [(4, TokenType.HOUSENUMBER)]),
                   (BreakType.WORD, PhraseType.NONE, [(5, TokenType.PARTIAL)]))

    # The call parentheses already continue the lines; no backslashes needed.
    check_assignments(yield_token_assignments(q),
                      TokenAssignment(penalty=0.1,
                                      name=TokenRange(4, 5),
                                      housenumber=TokenRange(3, 4),
                                      address=[TokenRange(0, 1), TokenRange(1, 2),
                                               TokenRange(2, 3)]),
                      TokenAssignment(penalty=0.1,
                                      housenumber=TokenRange(3, 4),
                                      address=[TokenRange(0, 1), TokenRange(1, 2),
                                               TokenRange(2, 3), TokenRange(4, 5)]))
219
220
def test_country_at_beginning():
    """COUNTRY followed by PARTIAL: one assignment with penalty 0.1 is expected."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.COUNTRY)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)]))

    expected = TokenAssignment(penalty=0.1,
                               name=TokenRange(1, 2),
                               country=TokenRange(0, 1))

    check_assignments(yield_token_assignments(query), expected)
228
229
def test_country_at_end():
    """PARTIAL followed by COUNTRY: one assignment with penalty 0.1 is expected."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.COUNTRY)]))

    expected = TokenAssignment(penalty=0.1,
                               name=TokenRange(0, 1),
                               country=TokenRange(1, 2))

    check_assignments(yield_token_assignments(query), expected)
237
238
def test_country_in_middle():
    """A COUNTRY token between two PARTIAL tokens: no assignment is expected."""
    nodes = [(BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
             (BreakType.WORD, PhraseType.NONE, [(2, TokenType.COUNTRY)]),
             (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)])]
    query = make_query(*nodes)

    # No expected entries: any yielded assignment fails the check.
    check_assignments(yield_token_assignments(query))
245
246
def test_postcode_with_designation():
    """POSTCODE followed by a PARTIAL token after a PHRASE break.

    The partial token as the name is expected with penalty 0.1; as the
    address it is expected without penalty.
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.POSTCODE)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]))

    as_name = TokenAssignment(penalty=0.1,
                              name=TokenRange(1, 2),
                              postcode=TokenRange(0, 1))
    as_address = TokenAssignment(postcode=TokenRange(0, 1),
                                 address=[TokenRange(1, 2)])

    check_assignments(yield_token_assignments(query), as_name, as_address)
256
257
def test_postcode_with_designation_backwards():
    """PARTIAL token followed by a POSTCODE after a PHRASE break.

    The partial token as the name is expected without penalty; as the
    address it is expected with penalty 0.1.
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.POSTCODE)]))

    as_name = TokenAssignment(name=TokenRange(0, 1),
                              postcode=TokenRange(1, 2))
    as_address = TokenAssignment(penalty=0.1,
                                 postcode=TokenRange(1, 2),
                                 address=[TokenRange(0, 1)])

    check_assignments(yield_token_assignments(query), as_name, as_address)
267
268
def test_category_at_beginning():
    """CATEGORY followed by PARTIAL: one assignment with penalty 0.1 is expected."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.CATEGORY)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)]))

    expected = TokenAssignment(penalty=0.1,
                               name=TokenRange(1, 2),
                               category=TokenRange(0, 1))

    check_assignments(yield_token_assignments(query), expected)
276
277
def test_category_at_end():
    """PARTIAL followed by CATEGORY: one assignment with penalty 0.1 is expected."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.CATEGORY)]))

    expected = TokenAssignment(penalty=0.1,
                               name=TokenRange(0, 1),
                               category=TokenRange(1, 2))

    check_assignments(yield_token_assignments(query), expected)
285
286
def test_category_in_middle():
    """A CATEGORY token between two PARTIAL tokens: no assignment is expected."""
    nodes = [(BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
             (BreakType.WORD, PhraseType.NONE, [(2, TokenType.CATEGORY)]),
             (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)])]
    query = make_query(*nodes)

    # No expected entries: any yielded assignment fails the check.
    check_assignments(yield_token_assignments(query))
293
294
def test_qualifier_at_beginning():
    """QUALIFIER followed by two PARTIAL tokens.

    Expected are both partial tokens as the name (penalty 0.1) and the
    split into name plus address (penalty 0.2), each with the qualifier.
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.QUALIFIER)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]))

    full_name = TokenAssignment(penalty=0.1,
                                name=TokenRange(1, 3),
                                qualifier=TokenRange(0, 1))
    name_and_address = TokenAssignment(penalty=0.2,
                                       name=TokenRange(1, 2),
                                       qualifier=TokenRange(0, 1),
                                       address=[TokenRange(2, 3)])

    check_assignments(yield_token_assignments(query), full_name, name_and_address)
307
308
def test_qualifier_after_name():
    """A QUALIFIER token with two PARTIAL tokens on either side.

    Expected are two assignments with penalty 0.2: either pair of partial
    tokens is the name while the other pair forms the address.
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.QUALIFIER)]),
                       (BreakType.WORD, PhraseType.NONE, [(4, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(5, TokenType.PARTIAL)]))

    name_first = TokenAssignment(penalty=0.2,
                                 name=TokenRange(0, 2),
                                 qualifier=TokenRange(2, 3),
                                 address=[TokenRange(3, 5)])
    name_last = TokenAssignment(penalty=0.2,
                                name=TokenRange(3, 5),
                                qualifier=TokenRange(2, 3),
                                address=[TokenRange(0, 2)])

    check_assignments(yield_token_assignments(query), name_first, name_last)
324
325
def test_qualifier_before_housenumber():
    """QUALIFIER, HOUSENUMBER, PARTIAL in that order: no assignment is expected."""
    nodes = [(BreakType.START, PhraseType.NONE, [(1, TokenType.QUALIFIER)]),
             (BreakType.WORD, PhraseType.NONE, [(2, TokenType.HOUSENUMBER)]),
             (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)])]
    query = make_query(*nodes)

    # No expected entries: any yielded assignment fails the check.
    check_assignments(yield_token_assignments(query))
332
333
def test_qualifier_after_housenumber():
    """HOUSENUMBER, QUALIFIER, PARTIAL in that order: no assignment is expected."""
    nodes = [(BreakType.START, PhraseType.NONE, [(1, TokenType.HOUSENUMBER)]),
             (BreakType.WORD, PhraseType.NONE, [(2, TokenType.QUALIFIER)]),
             (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)])]
    query = make_query(*nodes)

    # No expected entries: any yielded assignment fails the check.
    check_assignments(yield_token_assignments(query))