git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/legacy_icu_tokenizer.py
icu tokenizer: move transliteration rules in separate file
[nominatim.git] / nominatim / tokenizer / legacy_icu_tokenizer.py
index 2bd22c7207cb3f3cbf2de920a0a5887d67afd04d..689318d7e87bc79024b5b0c645db4252f19b62ae 100644 (file)
@@ -58,7 +58,7 @@ class LegacyICUTokenizer:
             cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
 
         rules = json.loads(cfgfile.read_text())
-        self.transliteration = ';'.join(rules['normalization']) + ';'
+        self._load_transliteration(rules['normalization'], cfgfile.parent)
         self.abbreviations = rules["abbreviations"]
         self.normalization = config.TERM_NORMALIZATION
 
@@ -70,6 +70,12 @@ class LegacyICUTokenizer:
             self._init_db_tables(config)
 
 
+    def _load_transliteration(self, rules, cfg_path):  # sets self.transliteration from the 'normalization' config entry
+        if isinstance(rules, str):  # a string names a rule file relative to cfg_path; read as UTF-8 so decoding does not depend on the locale
+            self.transliteration = (cfg_path / rules).read_text(encoding='utf-8').replace('\n', ' ')
+        else:  # otherwise rules is an iterable of rule strings: join with ';' and terminate with ';'
+            self.transliteration = ';'.join(rules) + ';'
+
     def init_from_project(self):
         """ Initialise the tokenizer from the project directory.
         """