# Source: git.openstreetmap.org Git - nominatim.git/blob - settings/icu_tokenizer.yaml
# Revision: Merge pull request #2684 from lonvia/translit-keep-spacing-marks
# Path: [nominatim.git] / settings / icu_tokenizer.yaml
# Normalization rules, written in ICU transform-rule syntax.
# NOTE(review): presumably applied in the order listed - confirm with the
# tokenizer that consumes this file.
normalization:
    # Lowercase the input.
    - ":: lower ()"
    # Convert Simplified Han (Hans) to Traditional Han (Hant).
    - ":: Hans-Hant"
    # Rules pulled in from a separate file via the custom !include tag
    # (by its name: maps Unicode digits to decimal digits).
    - !include icu-rules/unicode-digits-to-decimal.yaml
    # Spell out the various "numero" abbreviations as plain 'no'.
    - "'№' > 'no'"
    - "'n°' > 'no'"
    - "'nº' > 'no'"
    # Ordinal indicators become their base letters.
    - "ª > a"
    - "º > o"
    # Punctuation, symbols and U+02BC (modifier letter apostrophe) are
    # replaced by a space. Double quotes are required here so that the
    # \u02bc escape is interpreted by the YAML parser.
    - "[[:Punctuation:][:Symbol:]\u02bc]  > ' '"
    - "ß > 'ss'" # German eszett is unambiguously equal to double ss
    # Drop everything that is not alphanumeric, a virama-class combining
    # character or whitespace.
    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
    # Drop modifier letters (Unicode general category Lm).
    - "[:Lm:] >"
    # The next three transforms are filtered to number characters only:
    # transliterate to Latin, then to ASCII, then decompose (NFD).
    - ":: [[:Number:]] Latin ()"
    - ":: [[:Number:]] Ascii ();"
    - ":: [[:Number:]] NFD ();"
    # Remove nonspacing marks and format characters (Cf); spacing marks
    # are not matched by this rule and therefore kept.
    - "[[:Nonspacing Mark:] [:Cf:]] >;"
    # Collapse each run of whitespace into a single space.
    - "[:Space:]+ > ' '"
# Transliteration rules, written in ICU transform-rule syntax. The final
# filter rule leaves only the characters a-z, 0-9 and whitespace.
# NOTE(review): presumably applied in the order listed - confirm with the
# tokenizer that consumes this file.
transliteration:
    # Transliterate any script to Latin.
    - ":: Latin ()"
    # Additional rules from a separate file via the custom !include tag.
    # The spelling "asccii" (sic) has to match the file name on disk; do
    # not correct it here without renaming the file as well.
    - !include icu-rules/extended-unicode-to-asccii.yaml
    # Reduce to ASCII, decompose (NFD) and lowercase.
    - ":: Ascii ()"
    - ":: NFD ()"
    - ":: lower ()"
    # Remove every character that is not a-z, 0-9 or whitespace.
    - "[^a-z0-9[:Space:]] >"
    # Recompose (NFC).
    - ":: NFC ()"
# Sanitizer steps; each entry names a step plus its optional settings.
# NOTE(review): presumably executed in the order listed - confirm with the
# tokenizer that consumes this file.
sanitizers:
    - step: clean-housenumbers
      # Tag kinds this step is restricted to.
      filter-kind:
        - housenumber
        - conscriptionnumber
        - streetnumber
      # Regular expression matching values in which at least one
      # comma-separated item consists of three or more characters, none
      # of them a digit. Single-quoted so that YAML keeps the backslashes
      # and brackets literal.
      # NOTE(review): presumably such values are handled as names instead
      # of house numbers - confirm against the sanitizer implementation.
      convert-to-name:
        - '(\A|.*,)[^\d,]{3,}(,.*|\Z)'
    - step: split-name-list
    - step: strip-brace-terms
    - step: tag-analyzer-by-language
      filter-kind: [".*name.*"]
      # Every entry here has a matching analyzer id under token-analysis.
      # "no" (Norwegian) MUST stay quoted: as a plain scalar it is read as
      # the boolean false by YAML 1.1 loaders and the language code is lost.
      whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,"no",pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
      use-defaults: all
      mode: append
# Token analyzers. Apart from the first two entries, there is one
# variant-only analyzer per language code, each loading its variant rules
# from icu-rules/variants-<id>.yaml.
token-analysis:
    # Entry without an id.
    # NOTE(review): presumably the default analyzer - confirm.
    - analyzer: generic
    - id: "@housenumber"
      analyzer: housenumbers
    - id: bg
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-bg.yaml
    - id: ca
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ca.yaml
    - id: cs
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-cs.yaml
    - id: da
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-da.yaml
    - id: de
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-de.yaml
      # Each umlaut is kept as is and additionally offered in its
      # two-letter spelling.
      mutations:
          - pattern: ä
            replacements: ["ä", "ae"]
          - pattern: ö
            replacements: ["ö", "oe"]
          - pattern: ü
            replacements: ["ü", "ue"]
    - id: el
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-el.yaml
    - id: en
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-en.yaml
    - id: es
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-es.yaml
    - id: et
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-et.yaml
    - id: eu
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-eu.yaml
    - id: fi
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-fi.yaml
    - id: fr
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-fr.yaml
    - id: gl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-gl.yaml
    - id: hu
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-hu.yaml
    - id: it
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-it.yaml
    - id: ja
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ja.yaml
    - id: mg
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-mg.yaml
    - id: ms
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ms.yaml
    - id: nl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-nl.yaml
    # Quoted on purpose: a plain `no` is the boolean false for YAML 1.1
    # loaders, which would register this analyzer under the wrong id.
    - id: "no"
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-no.yaml
    - id: pl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-pl.yaml
    - id: pt
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-pt.yaml
    - id: ro
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ro.yaml
    - id: ru
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ru.yaml
    - id: sk
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sk.yaml
    - id: sl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sl.yaml
    - id: sv
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sv.yaml
    - id: tr
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-tr.yaml
    - id: uk
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-uk.yaml
    - id: vi
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-vi.yaml