complete tests for rule loader

author Sarah Hoffmann <lonvia@denofr.de>

Thu, 10 Jun 2021 08:06:49 +0000 (10:06 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Sun, 4 Jul 2021 08:28:20 +0000 (10:28 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Thu, 10 Jun 2021 08:06:49 +0000 (10:06 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Sun, 4 Jul 2021 08:28:20 +0000 (10:28 +0200)
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py

index af53e825426bc1c1fcaed5fb1b4895e5a250719e..960148890d0fe0a7ba465706d00a3097dbaf2d74 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -3,7 +3,6 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
  from collections import Counter
-import io
  import itertools
  import logging
  import re
@@ -178,7 +177,7 @@ class LegacyICUTokenizer:
  
                  with conn.cursor() as cur:
                      copystr.copy_out(cur, 'word',
-                                      columns=['word_token', 'search_name_count'])
+                                     columns=['word_token', 'search_name_count'])
                      cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                     WHERE word_id is null""")
  
diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py

index abbc92423f4d9b1f44941b90f16534491c9dd2b6..51927eaacf420e0745f21ff2f76b82c29ed7f3dc 100644 (file)
--- a/test/python/test_tokenizer_icu_rule_loader.py
+++ b/test/python/test_tokenizer_icu_rule_loader.py
@@ -63,6 +63,22 @@ def test_missing_normalization(tmp_path, section):
      with pytest.raises(UsageError):
          ICURuleLoader(fpath)
  
+@pytest.mark.parametrize("abbr", ["simple",
+                                  "double => arrow => bad",
+                                  "bad = > arrow"])
+def test_bad_abbreviation_syntax(tmp_path, abbr):
+    fpath = tmp_path / ('test_config.yaml')
+    fpath.write_text(dedent("""\
+        normalization:
+        transliteration:
+        compound_suffixes:
+        abbreviations:
+         - {}
+        """.format(abbr)))
+
+    with pytest.raises(UsageError):  # constructing the loader is expected to fail
+        ICURuleLoader(fpath)
+
  
  def test_get_search_rules(cfgfile):
      fpath = cfgfile(['strasse', 'straße', 'weg'],
@@ -105,18 +121,54 @@ def test_get_transliteration_rules(cfgfile):
      assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
  
  
-def test_get_synonym_pairs(cfgfile):
-    fpath = cfgfile(['Weg', 'Strasse'],
+def test_get_replacement_pairs_multi_to(cfgfile):
+    fpath = cfgfile(['Pfad', 'Strasse'],
                      ['Strasse => str,st'])
  
-    loader = ICURuleLoader(fpath)
+    repl = ICURuleLoader(fpath).get_replacement_pairs()
+
+    assert [(a, sorted(b)) for a, b in repl] == \
+             [(' strasse ', [' st ', ' str ', ' strasse ']),
+              ('strasse ', [' st ', ' str ', ' strasse ']),
+              ('pfad ', [' pfad ']),
+              ('str ' , [' str ']),
+              ('st ' , [' st '])]
+
+
+def test_get_replacement_pairs_multi_from(cfgfile):
+    fpath = cfgfile([], ['saint,Sainte => st'])
  
-    repl = loader.get_replacement_pairs()
+    repl = ICURuleLoader(fpath).get_replacement_pairs()
+
+    assert [(a, sorted(b)) for a, b in repl] == \
+             [(' sainte ', [' sainte ', ' st ']),
+              (' saint ', [' saint ', ' st '])]
+
+
+def test_get_replacement_pairs_cross_abbreviations(cfgfile):
+    fpath = cfgfile([], ['saint,Sainte => st',
+                         'sainte => ste'])
+
+    repl = ICURuleLoader(fpath).get_replacement_pairs()
+
+    assert [(a, sorted(b)) for a, b in repl] == \
+             [(' sainte ', [' sainte ', ' st ', ' ste ']),
+              (' saint ', [' saint ', ' st '])]
+
+
+@pytest.mark.parametrize("abbr", ["missing to =>",
+                                  "  => missing from",
+                                  "=>"])
+def test_abbreviation_missing_side(tmp_path, abbr):  # distinct name: must not shadow test_bad_abbreviation_syntax
+    fpath = tmp_path / ('test_config.yaml')
+    fpath.write_text(dedent("""\
+        normalization:
+        transliteration:
+        compound_suffixes:
+        abbreviations:
+         - {}
+        """.format(abbr)))
  
-    assert sorted(((a, sorted(b)) for a, b in repl)) == \
-             sorted([(' strasse ', [' st ', ' str ', ' strasse ']),
-                     ('strasse ', [' st ', ' str ', ' strasse ']),
-                     ('st ' , [' st ']),
-                     ('str ' , [' str ']),
-                     ('weg ', [' weg '])])
+    repl = ICURuleLoader(fpath).get_replacement_pairs()
  
+    assert repl == []  # a rule with an empty side is expected to yield no pairs
author	Sarah Hoffmann <lonvia@denofr.de>
	Thu, 10 Jun 2021 08:06:49 +0000 (10:06 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Sun, 4 Jul 2021 08:28:20 +0000 (10:28 +0200)
nominatim/tokenizer/legacy_icu_tokenizer.py		patch \| blob \| history
test/python/test_tokenizer_icu_rule_loader.py		patch \| blob \| history