# git.openstreetmap.org Git - nominatim.git/blob - settings/icu_tokenizer.yaml
# postcodes: introduce a default pattern for countries without postcodes
# [nominatim.git] / settings / icu_tokenizer.yaml
# Normalization rules, written in ICU transform-rule syntax. The list is
# ordered; later rules see the output of earlier ones.
# NOTE(review): presumably applied to names and queries before token
# analysis -- confirm against the tokenizer documentation.
normalization:
    # Lowercase everything.
    - ":: lower ()"
    # Convert Simplified Chinese script to Traditional.
    - ":: Hans-Hant"
    # `!include` is a custom tag, not core YAML. Presumably the loader
    # splices in the rules from the named file -- confirm.
    - !include icu-rules/unicode-digits-to-decimal.yaml
    # Spell out the numero sign and its "n + degree/ordinal" look-alikes.
    - "'№' > 'no'"
    - "'n°' > 'no'"
    - "'nº' > 'no'"
    # Ordinal indicators become their base letters.
    - "ª > a"
    - "º > o"
    # Punctuation, symbols and U+02BC (modifier letter apostrophe) become
    # a single space.
    - "[[:Punctuation:][:Symbol:]\u02bc]  > ' '"
    # German eszett is unambiguously equal to double s.
    - "ß > 'ss'"
    # Delete everything that is neither alphanumeric, a virama nor a space.
    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
    # Delete modifier letters (general category Lm).
    - "[:Lm:] >"
    # The next three transforms are filtered to number characters only:
    # transliterate to Latin, then to ASCII, then decompose (NFD).
    - ":: [[:Number:]] Latin ()"
    - ":: [[:Number:]] Ascii ();"
    - ":: [[:Number:]] NFD ();"
    # Delete nonspacing marks and format characters (general category Cf).
    - "[[:Nonspacing Mark:] [:Cf:]] >;"
    # Collapse every run of whitespace into one space.
    - "[:Space:]+ > ' '"
# Transliteration rules, written in ICU transform-rule syntax. The list is
# ordered; the final output contains only a-z, 0-9 and whitespace.
transliteration:
    # Transliterate any script to Latin.
    - ":: Latin ()"
    # `!include` is a custom tag, not core YAML. Presumably the loader
    # splices in the rules from the named file -- confirm.
    # NOTE(review): the spelling "asccii" is part of the include path and
    # must match the file on disk, so it is deliberately left untouched.
    - !include icu-rules/extended-unicode-to-asccii.yaml
    # Reduce to ASCII, decompose (NFD) and lowercase what is left.
    - ":: Ascii ()"
    - ":: NFD ()"
    - ":: lower ()"
    # Delete every character that is not a-z, 0-9 or whitespace.
    - "[^a-z0-9[:Space:]] >"
    # Recompose (NFC).
    - ":: NFC ()"
# Sanitizer steps, listed in the order given here. Each entry names a
# `step` and carries that step's own options.
sanitizers:
    - step: clean-housenumbers
      # Kinds of entries this step looks at.
      filter-kind:
        - housenumber
        - conscriptionnumber
        - streetnumber
      # Regex matching values that contain a comma-separated part made of
      # three or more characters that are neither digits nor commas.
      # NOTE(review): presumably such values are treated as names instead
      # of house numbers -- confirm against the sanitizer documentation.
      # Single-quoted so the backslashes and brackets stay literal.
      convert-to-name:
        - '(\A|.*,)[^\d,]{3,}(,.*|\Z)'
    - step: clean-postcodes
      # Canonical boolean spelling; same value as the YAML 1.1 word `yes`.
      convert-to-address: true
      # 3 to 12 characters out of A-Z, 0-9, hyphen and space.
      # Must be quoted: a plain scalar starting with `[` is read as a flow
      # sequence and the trailing `{3,12}` is then a syntax error.
      default-pattern: "[A-Z0-9- ]{3,12}"
    - step: split-name-list
    - step: strip-brace-terms
    - step: tag-analyzer-by-language
      filter-kind: [".*name.*"]
      # Language codes; each one has a matching `id` under token-analysis.
      # "no" (Norwegian) must be quoted: YAML 1.1 loaders resolve a bare
      # `no` to boolean false.
      whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,"no",pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
      use-defaults: all
      mode: append

# Token analyzers. Apart from the first two entries there is one entry per
# language code of the tag-analyzer-by-language whitelist above; each pulls
# its variant rules from icu-rules/variants-<id>.yaml via the custom
# `!include` tag.
token-analysis:
    # Entry without an `id`.
    # NOTE(review): presumably the default analyzer -- confirm.
    - analyzer: generic
    - id: "@housenumber"
      analyzer: housenumbers
    - id: bg
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-bg.yaml
    - id: ca
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ca.yaml
    - id: cs
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-cs.yaml
    - id: da
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-da.yaml
    - id: de
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-de.yaml
      # Each umlaut is listed with two replacements: itself and its
      # two-letter spelling.
      mutations:
          - pattern: ä
            replacements: ["ä", "ae"]
          - pattern: ö
            replacements: ["ö", "oe"]
          - pattern: ü
            replacements: ["ü", "ue"]
    - id: el
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-el.yaml
    - id: en
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-en.yaml
    - id: es
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-es.yaml
    - id: et
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-et.yaml
    - id: eu
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-eu.yaml
    - id: fi
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-fi.yaml
    - id: fr
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-fr.yaml
    - id: gl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-gl.yaml
    - id: hu
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-hu.yaml
    - id: it
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-it.yaml
    - id: ja
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ja.yaml
    - id: mg
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-mg.yaml
    - id: ms
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ms.yaml
    - id: nl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-nl.yaml
    # Quoted: a bare `no` is boolean false for YAML 1.1 loaders, which
    # would leave this analyzer without a usable string id.
    - id: "no"
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-no.yaml
    - id: pl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-pl.yaml
    - id: pt
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-pt.yaml
    - id: ro
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ro.yaml
    - id: ru
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ru.yaml
    - id: sk
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sk.yaml
    - id: sl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sl.yaml
    - id: sv
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sv.yaml
    - id: tr
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-tr.yaml
    - id: uk
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-uk.yaml
    - id: vi
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-vi.yaml