convert word info column to json before copying

author Sarah Hoffmann <lonvia@denofr.de>

Wed, 21 Jul 2021 09:37:14 +0000 (11:37 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 21 Jul 2021 09:37:14 +0000 (11:37 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py

index 9a4a41a581661ced3048797b7ae1ff98d613dad0..bb7faa25767f2a55066494a448a3fc00aa5b6025 100644 (file)
--- a/nominatim/db/utils.py
+++ b/nominatim/db/utils.py
@@ -65,6 +65,7 @@ _SQL_TRANSLATION = {ord(u'\\'): u'\\\\',
                      ord(u'\t'): u'\\t',
                      ord(u'\n'): u'\\n'}
  
+
  class CopyBuffer:
      """ Data collector for the copy_from command.
      """
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py

index 14fa5b609456c51ee4a7f9a35f6e5bf9908636c7..e019ef671a73b5f980bf47eededac1b359dfd0d3 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -4,6 +4,7 @@ libICU instead of the PostgreSQL module.
  """
  from collections import Counter
  import itertools
+import json
  import logging
  import re
  from textwrap import dedent
@@ -173,7 +174,7 @@ class LegacyICUTokenizer:
              # copy them back into the word table
              with CopyBuffer() as copystr:
                  for k, v in words.items():
-                    copystr.add('w', k, {'count': v})
+                    copystr.add('w', k, json.dumps({'count': v}))
  
                  with conn.cursor() as cur:
                      copystr.copy_out(cur, 'word',
@@ -287,7 +288,7 @@ class LegacyICUNameAnalyzer:
                          to_delete.append(word)
                      else:
                          copystr.add(self.name_processor.get_search_normalized(postcode),
-                                    'P', {'postcode': postcode})
+                                    'P', json.dumps({'postcode': postcode}))
  
                  if to_delete:
                      cur.execute("""DELETE FROM WORD
@@ -337,8 +338,8 @@ class LegacyICUNameAnalyzer:
                  term = self.name_processor.get_search_normalized(word)
                  if term:
                      copystr.add(term, 'S',
-                                {'word': word, 'class': cls, 'type': typ,
-                                 'op': oper if oper in ('in', 'near') else None})
+                                json.dumps({'word': word, 'class': cls, 'type': typ,
+                                            'op': oper if oper in ('in', 'near') else None}))
                      added += 1
  
              copystr.copy_out(cursor, 'word',
diff --git a/test/python/test_db_utils.py b/test/python/test_db_utils.py

index 545cc58f633448096fbd2f212a19e69160ae01ff..9eea7ed101eb1421ad67e69124634d87076f1ab2 100644 (file)
--- a/test/python/test_db_utils.py
+++ b/test/python/test_db_utils.py
@@ -1,6 +1,8 @@
  """
  Tests for DB utility functions in db.utils
  """
+import json
+
  import pytest
  
  import nominatim.db.utils as db_utils
@@ -115,3 +117,38 @@ class TestCopyBuffer:
  
  
  
+class TestCopyBufferJson:
+    TABLE_NAME = 'copytable'
+
+    @pytest.fixture(autouse=True)
+    def setup_test_table(self, table_factory):
+        table_factory(self.TABLE_NAME, 'colA INT, colB JSONB')
+
+
+    def table_rows(self, cursor):
+        cursor.execute('SELECT * FROM ' + self.TABLE_NAME)
+        results = {k: v for k,v in cursor}
+
+        assert len(results) == cursor.rowcount
+
+        return results
+
+
+    def test_json_object(self, temp_db_cursor):
+        with db_utils.CopyBuffer() as buf:
+            buf.add(1, json.dumps({'test': 'value', 'number': 1}))
+
+            buf.copy_out(temp_db_cursor, self.TABLE_NAME)
+
+        assert self.table_rows(temp_db_cursor) == \
+                   {1: {'test': 'value', 'number': 1}}
+
+
+    def test_json_object_special_chras(self, temp_db_cursor):
+        with db_utils.CopyBuffer() as buf:
+            buf.add(1, json.dumps({'te\tst': 'va\nlue', 'nu"mber': None}))
+
+            buf.copy_out(temp_db_cursor, self.TABLE_NAME)
+
+        assert self.table_rows(temp_db_cursor) == \
+                   {1: {'te\tst': 'va\nlue', 'nu"mber': None}}
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 21 Jul 2021 09:37:14 +0000 (11:37 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
nominatim/db/utils.py		patch | blob | history
nominatim/tokenizer/legacy_icu_tokenizer.py		patch | blob | history
test/python/test_db_utils.py		patch | blob | history