git.openstreetmap.org Git - nominatim.git/commitdiff
improve normalization
author Sarah Hoffmann <lonvia@denofr.de>
Sat, 26 Jun 2021 17:38:08 +0000 (19:38 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Sun, 4 Jul 2021 08:28:20 +0000 (10:28 +0200)
Make sure all special symbols are removed during normalization already.
Those won't be interpreted in any way because they are unlikely to be
searched for.

settings/icu-rules/extended-unicode-to-asccii.yaml
settings/icu-rules/unicode-digits-to-decimal.yaml [new file with mode: 0644]
settings/legacy_icu_tokenizer.yaml

index 921874f50d17674140acf511610870c5e678990e..959774d2b0511a130e038bbe9503d0de0b90e310 100644 (file)
@@ -1,4 +1,4 @@
-- ":: Latin ()"
+- "'ł' > 'l'"
 - "'ª' > 'a'"
 - "'µ' > 'u'"
 - "'º' > 'o'"
diff --git a/settings/icu-rules/unicode-digits-to-decimal.yaml b/settings/icu-rules/unicode-digits-to-decimal.yaml
new file mode 100644 (file)
index 0000000..55b3274
--- /dev/null
@@ -0,0 +1,24 @@
+- "[๐žฅ๐’ ฿€๐–ญ๊ค€๐–ฉ ๐‘“๐‘‘๐‘‹ฐ๐‘„ถ๊ฉ๊˜ แฑ€แญแฎฐแ แŸ แฅ†เผ เป๊งฐแ‚แชแช€แง๐‘ต๊ฏฐแฑ๐‘ฑ๐‘œฐ๐‘›€๐‘™๐‘‡๊ง๊ฃเทฆ๐‘ฆ๏ผ๐Ÿถ๐Ÿ˜๐Ÿฌ๐ŸŽ๐Ÿขโ‚€โ“ฟโ“ชโฐ] > 0"
+- "[๐žฅ‘๐’ก฿๐–ญ‘๊ค๐–ฉก๐‘“‘๐‘‘‘๐‘‹ฑ๐‘„ท๊ฉ‘๊˜กแฑแญ‘แฎฑแ ‘แŸกแฅ‡เผกเป‘๊งฑแ‚‘แช‘แชแง‘๐‘ต‘๊ฏฑแฑ‘๐‘ฑ‘๐‘œฑ๐‘›๐‘™‘๐‘‡‘๊ง‘๊ฃ‘เทง๐‘ง๏ผ‘๐Ÿท๐Ÿ™๐Ÿญ๐Ÿ๐Ÿฃโ‚ยนโ‘ โ‘ดโ’ˆโถโž€โžŠโ“ต] > 1"
+- "[๐žฅ’๐’ข฿‚๐–ญ’๊ค‚๐–ฉข๐‘“’๐‘‘’๐‘‹ฒ๐‘„ธ๊ฉ’๊˜ขแฑ‚แญ’แฎฒแ ’แŸขแฅˆเผขเป’๊งฒแ‚’แช’แช‚แง’๐‘ต’๊ฏฒแฑ’๐‘ฑ’๐‘œฒ๐‘›‚๐‘™’๐‘‡’๊ง’๊ฃ’เทจ๐‘จ๏ผ’๐Ÿธ๐Ÿš๐Ÿฎ๐Ÿ๐Ÿคโ‚‚ยฒโ‘กโ‘ตโ’‰โทโžโž‹โ“ถ] > 2"
+- "[๐žฅ“๐’ฃ฿ƒ๐–ญ“๊คƒ๐–ฉฃ๐‘““๐‘‘“๐‘‹ณ๐‘„น๊ฉ“๊˜ฃแฑƒแญ“แฎณแ “แŸฃแฅ‰เผฃเป“๊งณแ‚“แช“แชƒแง“๐‘ต“๊ฏณแฑ“๐‘ฑ“๐‘œณ๐‘›ƒ๐‘™“๐‘‡“๊ง“๊ฃ“เทฉ๐‘ฉ๏ผ“๐Ÿน๐Ÿ›๐Ÿฏ๐Ÿ‘๐Ÿฅโ‚ƒยณโ‘ขโ‘ถโ’Šโธโž‚โžŒโ“ท] > 3"
+- "[๐žฅ”๐’ค฿„๐–ญ”๊ค„๐–ฉค๐‘“”๐‘‘”๐‘‹ด๐‘„บ๊ฉ”๊˜คแฑ„แญ”แฎดแ ”แŸคแฅŠเผคเป”๊งดแ‚”แช”แช„แง”๐‘ต”๊ฏดแฑ”๐‘ฑ”๐‘œด๐‘›„๐‘™”๐‘‡”๊ง”๊ฃ”เทช๐‘ช๏ผ”๐Ÿบ๐Ÿœ๐Ÿฐ๐Ÿ’๐Ÿฆโ‚„โดโ‘ฃโ‘ทโ’‹โนโžƒโžโ“ธ] > 4"
+- "[๐žฅ•๐’ฅ฿…๐–ญ•๊ค…๐–ฉฅ๐‘“•๐‘‘•๐‘‹ต๐‘„ป๊ฉ•๊˜ฅแฑ…แญ•แฎตแ •แŸฅแฅ‹เผฅเป•๊งตแ‚•แช•แช…แง•๐‘ต•๊ฏตแฑ•๐‘ฑ•๐‘œต๐‘›…๐‘™•๐‘‡•๊ง•๊ฃ•เทซ๐‘ซ๏ผ•๐Ÿป๐Ÿ๐Ÿฑ๐Ÿ“๐Ÿงโ‚…โตโ‘คโ‘ธโ’Œโบโž„โžŽโ“น] > 5"
+- "[๐žฅ–๐’ฆ฿†๐–ญ–๊ค†๐–ฉฆ๐‘“–๐‘‘–๐‘‹ถ๐‘„ผ๊ฉ–๊˜ฆแฑ†แญ–แฎถแ –แŸฆแฅŒเผฆเป–๊งถแ‚–แช–แช†แง–๐‘ต–๊ฏถแฑ–๐‘ฑ–๐‘œถ๐‘›†๐‘™–๐‘‡–๊ง–๊ฃ–เทฌ๐‘ฌ๏ผ–๐Ÿผ๐Ÿž๐Ÿฒ๐Ÿ”๐Ÿจโ‚†โถโ‘ฅโ‘นโ’โปโž…โžโ“บ] > 6"
+- "[๐žฅ—๐’ง฿‡๐–ญ—๊ค‡๐–ฉง๐‘“—๐‘‘—๐‘‹ท๐‘„ฝ๊ฉ—๊˜งแฑ‡แญ—แฎทแ —แŸงแฅเผงเป—๊งทแ‚—แช—แช‡แง—๐‘ต—๊ฏทแฑ—๐‘ฑ—๐‘œท๐‘›‡๐‘™—๐‘‡—๊ง—๊ฃ—เทญ๐‘ญ๏ผ—๐Ÿฝ๐ŸŸ๐Ÿณ๐Ÿ•๐Ÿฉโ‚‡โทโ‘ฆโ‘บโ’Žโผโž†โžโ“ป] > 7"
+- "[๐žฅ˜๐’จ฿ˆ๐–ญ˜๊คˆ๐–ฉจ๐‘“˜๐‘‘˜๐‘‹ธ๐‘„พ๊ฉ˜๊˜จแฑˆแญ˜แฎธแ ˜แŸจแฅŽเผจเป˜๊งธแ‚˜แช˜แชˆแง˜๐‘ต˜๊ฏธแฑ˜๐‘ฑ˜๐‘œธ๐‘›ˆ๐‘™˜๐‘‡˜๊ง˜๊ฃ˜เทฎ๐‘ฎ๏ผ˜๐Ÿพ๐Ÿ ๐Ÿด๐Ÿ–๐Ÿชโ‚ˆโธโ‘งโ‘ปโ’โฝโž‡โž‘โ“ผ] > 8"
+- "[๐žฅ™๐’ฉ฿‰๐–ญ™๊ค‰๐–ฉฉ๐‘“™๐‘‘™๐‘‹น๐‘„ฟ๊ฉ™๊˜ฉแฑ‰แญ™แฎนแ ™แŸฉแฅเผฉเป™๊งนแ‚™แช™แช‰แง™๐‘ต™๊ฏนแฑ™๐‘ฑ™๐‘œน๐‘›‰๐‘™™๐‘‡™๊ง™๊ฃ™เทฏ๐‘ฏ๏ผ™๐Ÿฟ๐Ÿก๐Ÿต๐Ÿ—๐Ÿซโ‚‰โนโ‘จโ‘ผโ’โพโžˆโž’โ“ฝ] > 9"
+- "[𑜺⑩⑽⒑❿➉➓⓾] > '10'"
+- "[⑪⑾⒒⓫] > '11'"
+- "[⑫⑿⒓⓬] > '12'"
+- "[⑬⒀⒔⓭] > '13'"
+- "[⑭⒁⒕⓮] > '14'"
+- "[⑮⒂⒖⓯] > '15'"
+- "[⑯⒃⒗⓰] > '16'"
+- "[⑰⒄⒘⓱] > '17'"
+- "[⑱⒅⒙⓲] > '18'"
+- "[⑲⒆⒚⓳] > '19'"
+- "[𑜻⑳⒇⒛⓴] > '20'"
+- "⅐ > ' 1/7'"
+- "⅑ > ' 1/9'"
+- "⅒  > ' 1/10'"
index a3f1c02735238b21d96af8d9b5f3bbeaa7629749..7972b156d455d8841daa020328252edef9da9056 100644 (file)
@@ -1,20 +1,29 @@
 normalization:
-    - ":: NFD ()"
-    - "[[:Nonspacing Mark:] [:Cf:]] >"
     - ":: lower ()"
+    - !include icu-rules/unicode-digits-to-decimal.yaml
+    - "'№' > 'no'"
+    - "'n°' > 'no'"
+    - "'nº' > 'no'"
+    - "ª > a"
+    - "º > o"
+    - "[[:Punctuation:][:Symbol:]]  > ' '"
     - "ß > 'ss'" # German eszett is unambiguously equal to double ss
-    - "[[:Punctuation:][:Space:]]+ > ' '"
-    - ":: NFC ()"
+    - "[^[:Letter:] [:Number:] [:Space:]] >"
+    - "[:Lm:] >"
+    - ":: [[:Number:]] Latin ()"
+    - ":: [[:Number:]] Ascii ();"
+    - ":: [[:Number:]] NFD ();"
+    - "[[:Nonspacing Mark:] [:Cf:]] >;"
+    - "[:Space:]+ > ' '"
 transliteration:
+    - ":: Latin ()"
     - !include icu-rules/extended-unicode-to-asccii.yaml
     - ":: Ascii ()"
     - ":: NFD ()"
-    - "'' >"
-    - "[[:Nonspacing Mark:] [:Cf:]] >"
     - "[^[:Ascii:]] >"
     - ":: lower ()"
-    - "[[:Punctuation:][:Space:]]+ > ' '"
     - ":: NFC ()"
+    - "[:Space:]+ > ' '"
 variants:
   - words:
     - ~hal => hal