git.openstreetmap.org Git - nominatim.git/commitdiff
Resolve conflicts
author    AntoJvlt <antonin.jolivat@gmail.com>
Mon, 17 May 2021 11:52:35 +0000 (13:52 +0200)
committer AntoJvlt <antonin.jolivat@gmail.com>
Mon, 17 May 2021 11:52:35 +0000 (13:52 +0200)
docs/admin/Import.md
nominatim/tokenizer/legacy_icu_tokenizer.py
nominatim/tokenizer/legacy_tokenizer.py
test/bdd/steps/nominatim_environment.py
test/python/dummy_tokenizer.py
test/python/test_cli.py
test/python/test_tokenizer_legacy.py
test/python/test_tokenizer_legacy_icu.py

Simple merge
index e07602d90aea7192939d0abf2c0c36240a56c2b3,7205ddefab0c449ec33da6610fe98edb8cfb48ba..156e99ece67f156d463f5d7e84858dcb7b80c027
@@@ -285,28 -295,47 +295,47 @@@ class LegacyICUNameAnalyzer
  
          return self.transliterator.transliterate(hnr)
  
-     def add_postcodes_from_db(self):
-         """ Add postcodes from the location_postcode table to the word table.
+     def update_postcodes_from_db(self):
+         """ Update postcode tokens in the word table from the location_postcode
+             table.
          """
+         to_delete = []
          copystr = io.StringIO()
          with self.conn.cursor() as cur:
-             cur.execute("SELECT distinct(postcode) FROM location_postcode")
-             for (postcode, ) in cur:
-                 copystr.write(postcode)
-                 copystr.write('\t ')
-                 copystr.write(self.transliterator.transliterate(postcode))
-                 copystr.write('\tplace\tpostcode\t0\n')
-             copystr.seek(0)
-             cur.copy_from(copystr, 'word',
-                           columns=['word', 'word_token', 'class', 'type',
-                                    'search_name_count'])
-             # Don't really need an ID for postcodes....
-             # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-             #                WHERE word_id is null and type = 'postcode'""")
+             # This finds us the rows in location_postcode and word that are
+             # missing in the other table.
+             cur.execute("""SELECT * FROM
+                             (SELECT pc, word FROM
+                               (SELECT distinct(postcode) as pc FROM location_postcode) p
+                               FULL JOIN
+                               (SELECT word FROM word
+                                 WHERE class ='place' and type = 'postcode') w
+                               ON pc = word) x
+                            WHERE pc is null or word is null""")
+             for postcode, word in cur:
+                 if postcode is None:
+                     to_delete.append(word)
+                 else:
+                     copystr.write(postcode)
+                     copystr.write('\t ')
+                     copystr.write(self.transliterator.transliterate(postcode))
+                     copystr.write('\tplace\tpostcode\t0\n')
+             if to_delete:
+                 cur.execute("""DELETE FROM WORD
+                                WHERE class ='place' and type = 'postcode'
+                                      and word = any(%s)
+                             """, (to_delete, ))
+             if copystr.getvalue():
+                 copystr.seek(0)
+                 cur.copy_from(copystr, 'word',
+                               columns=['word', 'word_token', 'class', 'type',
+                                        'search_name_count'])
  
  
 -    def update_special_phrases(self, phrases):
 +    def update_special_phrases(self, phrases, should_replace):
          """ Replace the search index for special phrases with the new phrases.
          """
          norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
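
The hunk above (nominatim/tokenizer/legacy_icu_tokenizer.py) replaces the one-way add_postcodes_from_db() with a two-way sync against location_postcode. For readers skimming the diff, here is a minimal standalone sketch (not part of this commit) of that reconciliation pattern, assuming a psycopg2 connection and the table layout shown in the diff; the ICU transliterator is stubbed with a placeholder callable:

import io
import psycopg2


def sync_postcode_tokens(dsn, transliterate=lambda pc: pc.lower()):
    # 'transliterate' stands in for the analyzer's ICU transliterator; any
    # callable mapping a postcode to its token form will do for this sketch.
    with psycopg2.connect(dsn) as conn:
        with conn.cursor() as cur:
            # Find rows present in only one of location_postcode / word.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class = 'place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")
            to_delete = []
            copystr = io.StringIO()
            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)   # token no longer backed by a postcode
                else:
                    # columns: word, word_token, class, type, search_name_count
                    copystr.write('\t'.join((postcode, ' ' + transliterate(postcode),
                                             'place', 'postcode', '0')) + '\n')
            if to_delete:
                cur.execute("""DELETE FROM word
                               WHERE class = 'place' and type = 'postcode'
                                     and word = any(%s)""", (to_delete, ))
            if copystr.getvalue():
                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'search_name_count'])

Obsolete tokens are removed with a single DELETE ... = any(%s), and new ones are bulk-loaded via COPY, mirroring the approach taken in the hunk.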
index 5bd45c51284f211ffc78b4fa4f25a5e169a19d2e,3808c68e069f3f00b7f76bb84847d7b43ccb4ba0..4c03678d12e0c95ab613d44f2c152febd2e6294c
@@@ -305,16 -305,54 +305,54 @@@ class LegacyNameAnalyzer
          return self.normalizer.transliterate(phrase)
  
  
-     def add_postcodes_from_db(self):
-         """ Add postcodes from the location_postcode table to the word table.
+     @staticmethod
+     def normalize_postcode(postcode):
+         """ Convert the postcode to a standardized form.
+             This function must yield exactly the same result as the SQL function
+             'token_normalized_postcode()'.
+         """
+         return postcode.strip().upper()
+ 
+ 
+     def update_postcodes_from_db(self):
+         """ Update postcode tokens in the word table from the location_postcode
+             table.
          """
          with self.conn.cursor() as cur:
-             cur.execute("""SELECT count(create_postcode_id(pc))
-                            FROM (SELECT distinct(postcode) as pc
-                                  FROM location_postcode) x""")
+             # This finds us the rows in location_postcode and word that are
+             # missing in the other table.
+             cur.execute("""SELECT * FROM
+                             (SELECT pc, word FROM
+                               (SELECT distinct(postcode) as pc FROM location_postcode) p
+                               FULL JOIN
+                               (SELECT word FROM word
+                                 WHERE class ='place' and type = 'postcode') w
+                               ON pc = word) x
+                            WHERE pc is null or word is null""")
+             to_delete = []
+             to_add = []
+             for postcode, word in cur:
+                 if postcode is None:
+                     to_delete.append(word)
+                 else:
+                     to_add.append(postcode)
+             if to_delete:
+                 cur.execute("""DELETE FROM WORD
+                                WHERE class ='place' and type = 'postcode'
+                                      and word = any(%s)
+                             """, (to_delete, ))
+             if to_add:
+                 cur.execute("""SELECT count(create_postcode_id(pc))
+                                FROM unnest(%s) as pc
+                             """, (to_add, ))
  
  
 -    def update_special_phrases(self, phrases):
 +    def update_special_phrases(self, phrases, should_replace):
          """ Replace the search index for special phrases with the new phrases.
          """
          norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
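
The new normalize_postcode() above is documented to yield exactly the same result as the SQL function token_normalized_postcode(). A small, hypothetical parity check (not part of this commit) that one could run against a database with that function installed, assuming a psycopg2-style connection; the sample postcodes are made up for illustration:

def check_postcode_normalization(conn, samples=(' ab1 2cd ', 'e1 6an', '75001')):
    # Per the docstring contract, the SQL result must equal strip().upper().
    with conn.cursor() as cur:
        for pc in samples:
            cur.execute("SELECT token_normalized_postcode(%s)", (pc, ))
            assert cur.fetchone()[0] == pc.strip().upper()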
index 2e61a24524c992e8cc41161fe7fffc5d6e11d6a4,0a86ba8d1598752a00af622e3466f4deede31c0e..18e322caef3d5642939698ed5d83c22726058150
@@@ -51,10 -51,13 +51,13 @@@ class DummyNameAnalyzer
      def close(self):
          pass
  
-     def add_postcodes_from_db(self):
+     def normalize_postcode(self, postcode):
+         return postcode
+ 
+     def update_postcodes_from_db(self):
          pass
  
 -    def update_special_phrases(self, phrases):
 +    def update_special_phrases(self, phrases, should_replace):
          self.analyser_cache['special_phrases'] = phrases
  
      def add_country_names(self, code, names):
Simple merge
index 801471723c4b72b5be4c4f6938c188b059639408,15ae50a4ce94175b78fd444d97c9bb0a4e5ab2e5..76b51f717e93e8ca08de78433cdd3d31d15a8dad
@@@ -209,10 -221,9 +221,9 @@@ def test_update_special_phrase_empty_ta
          ("König bei", "amenity", "royal", "near"),
          ("Könige", "amenity", "royal", "-"),
          ("strasse", "highway", "primary", "in")
 -    ])
 +    ], True)
  
-     assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
-                                      FROM word WHERE class != 'place'""") \
+     assert word_table.get_special() \
                 == set(((' könig bei', 'könig bei', 'amenity', 'royal', 'near'),
                         (' könige', 'könige', 'amenity', 'royal', None),
                         (' strasse', 'strasse', 'highway', 'primary', 'in')))
  
  def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor,
                                            make_standard_name):
-     temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
-                               VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
-                                      (' bar', 'bar', 'highway', 'road', null)""")
+     word_table.add_special(' foo', 'foo', 'amenity', 'prison', 'in')
+     word_table.add_special(' bar', 'bar', 'highway', 'road', None)
  
-     assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+     assert word_table.count_special() == 2
  
 -    analyzer.update_special_phrases([])
 +    analyzer.update_special_phrases([], True)
  
-     assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+     assert word_table.count_special() == 0
  
  
- def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor,
-                                       make_standard_name):
-     temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
-                               VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
-                                      (' bar', 'bar', 'highway', 'road', null)""")
 +def test_update_special_phrases_no_replace(analyzer, word_table, temp_db_cursor,
 +                                          make_standard_name):
 +    temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
 +                              VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
 +                                     (' bar', 'bar', 'highway', 'road', null)""")
 +
 +    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
 +
 +    analyzer.update_special_phrases([], False)
 +
 +    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
 +
 +
+ def test_update_special_phrase_modify(analyzer, word_table, make_standard_name):
+     word_table.add_special(' foo', 'foo', 'amenity', 'prison', 'in')
+     word_table.add_special(' bar', 'bar', 'highway', 'road', None)
  
-     assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+     assert word_table.count_special() == 2
  
      analyzer.update_special_phrases([
        ('prison', 'amenity', 'prison', 'in'),
        ('bar', 'highway', 'road', '-'),
        ('garden', 'leisure', 'garden', 'near')
 -    ])
 +    ], True)
  
-     assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
-                                      FROM word WHERE class != 'place'""") \
+     assert word_table.get_special() \
                 == set(((' prison', 'prison', 'amenity', 'prison', 'in'),
                         (' bar', 'bar', 'highway', 'road', None),
                         (' garden', 'garden', 'leisure', 'garden', 'near')))
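
The rewritten tests rely on a word_table fixture with add_special(), count_special() and get_special() helpers that live elsewhere in the test suite and are not part of this diff. A plausible sketch of such a helper, assuming a psycopg2 connection, purely to show what the assertions above operate on; the class name and exact signatures here are illustrative:

class WordTableHelper:
    def __init__(self, conn):
        self.conn = conn

    def add_special(self, word_token, word, cls, typ, op):
        # Insert one special-phrase row into the word table.
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, word, class, type, operator)
                           VALUES (%s, %s, %s, %s, %s)""",
                        (word_token, word, cls, typ, op))
        self.conn.commit()

    def count_special(self):
        # Special phrases are all rows whose class is not 'place'.
        with self.conn.cursor() as cur:
            cur.execute("SELECT count(*) FROM word WHERE class != 'place'")
            return cur.fetchone()[0]

    def get_special(self):
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word, class, type, operator
                           FROM word WHERE class != 'place'""")
            return set(cur)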