From: Sarah Hoffmann Date: Wed, 23 Nov 2022 16:58:42 +0000 (+0100) Subject: Merge pull request #2902 from lonvia/tiger-county-sanitizer X-Git-Tag: v4.2.0~4 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/8dfdf64dd5b3f0d88a011654076a0a041b7b252c?hp=55ee08f42b5e2b503a06f03c11a4fb3daf99ad29 Merge pull request #2902 from lonvia/tiger-county-sanitizer Tiger county sanitizer --- diff --git a/.pylintrc b/.pylintrc index e8609407..e62371c6 100644 --- a/.pylintrc +++ b/.pylintrc @@ -15,4 +15,4 @@ ignored-classes=NominatimArgs,closing # typed Python is enabled. See also https://github.com/PyCQA/pylint/issues/5273 disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use,not-context-manager -good-names=i,x,y,fd,db,cc +good-names=i,x,y,m,fd,db,cc diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md index c563b201..58606c29 100644 --- a/docs/customize/Tokenizers.md +++ b/docs/customize/Tokenizers.md @@ -213,6 +213,15 @@ The following is a list of sanitizers that are shipped with Nominatim. rendering: heading_level: 6 +##### clean-tiger-tags + +::: nominatim.tokenizer.sanitizers.clean_tiger_tags + selection: + members: False + rendering: + heading_level: 6 + + #### Token Analysis diff --git a/nominatim/tokenizer/sanitizers/clean_tiger_tags.py b/nominatim/tokenizer/sanitizers/clean_tiger_tags.py new file mode 100644 index 00000000..9698a326 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/clean_tiger_tags.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Sanitizer that preprocesses tags from the TIGER import. 
+ +It makes the following changes: + +* remove state reference from tiger:county +""" +from typing import Callable +import re + +from nominatim.tokenizer.sanitizers.base import ProcessInfo +from nominatim.tokenizer.sanitizers.config import SanitizerConfig + +COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]') + +def _clean_tiger_county(obj: ProcessInfo) -> None: + """ Remove the state reference from tiger:county tags. + + This transforms a name like 'Hamilton, AL' into 'Hamilton'. + If no state reference is detected at the end, the name is left as is. + """ + if not obj.address: + return + + for item in obj.address: + if item.kind == 'tiger' and item.suffix == 'county': + m = COUNTY_MATCH.fullmatch(item.name) + if m: + item.name = m[1] + # Switch kind and suffix, the split left them reversed. + item.kind = 'county' + item.suffix = 'tiger' + + return + + +def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]: + """ Create a function that removes the state reference from tiger:county tags. + """ + return _clean_tiger_county diff --git a/settings/flex-base.lua b/settings/flex-base.lua index 19f4e27b..fe3ce32a 100644 --- a/settings/flex-base.lua +++ b/settings/flex-base.lua @@ -347,16 +347,6 @@ function process_tags(o) local is_interpolation = o:grab_address{match=INTERPOLATION_TAGS} > 0 - if ADD_TIGER_COUNTY then - local v = o:grab_tag('tiger:county') - if v ~= nil then - v, num = v:gsub(',.*', ' county') - if num == 0 then - v = v .. 
' county' - end - o:set_address('tiger:county', v) - end - end o:grab_address{match=ADDRESS_TAGS} if is_interpolation then diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 212fdcb9..16339970 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -35,6 +35,7 @@ sanitizers: - step: clean-postcodes convert-to-address: yes default-pattern: "[A-Z0-9- ]{3,12}" + - step: clean-tiger-tags - step: split-name-list - step: strip-brace-terms - step: tag-analyzer-by-language diff --git a/settings/import-extratags.lua b/settings/import-extratags.lua index 535af3c8..7b1880ef 100644 --- a/settings/import-extratags.lua +++ b/settings/import-extratags.lua @@ -123,8 +123,7 @@ HOUSENUMBER_TAGS = tag_match{keys = {'addr:housenumber', 'addr:conscriptionnumbe INTERPOLATION_TAGS = tag_match{keys = {'addr:interpolation'}} -ADDRESS_TAGS = tag_match{keys = {'addr:*', 'is_in:*'}} -ADD_TIGER_COUNTY = true +ADDRESS_TAGS = tag_match{keys = {'addr:*', 'is_in:*', 'tiger:county'}} SAVE_EXTRA_MAINS = true diff --git a/test/bdd/osm2pgsql/import/tags.feature b/test/bdd/osm2pgsql/import/tags.feature index 1f6857f2..60d241fe 100644 --- a/test/bdd/osm2pgsql/import/tags.feature +++ b/test/bdd/osm2pgsql/import/tags.feature @@ -166,20 +166,6 @@ Feature: Tag evaluation | N10003:place | place | island | - Scenario: Shorten tiger:county tags - When loading osm data - """ - n11001 Tplace=village,tiger:county=Feebourgh%2c%%20%AL - n11002 Tplace=village,addr:state=Alabama,tiger:county=Feebourgh%2c%%20%AL - n11003 Tplace=village,tiger:county=Feebourgh - """ - Then place contains exactly - | object | class | address | - | N11001 | place | 'tiger:county': 'Feebourgh county' | - | N11002 | place | 'tiger:county': 'Feebourgh county', 'state': 'Alabama' | - | N11003 | place | 'tiger:county': 'Feebourgh county' | - - Scenario: Building fallbacks When loading osm data """ diff --git a/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py 
b/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py new file mode 100644 index 00000000..fc17ad24 --- /dev/null +++ b/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for the sanitizer that cleans up TIGER tags. +""" +import pytest + +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer +from nominatim.data.place_info import PlaceInfo + +class TestCleanTigerTags: + + @pytest.fixture(autouse=True) + def setup_country(self, def_config): + self.config = def_config + + + def run_sanitizer_on(self, addr): + place = PlaceInfo({'address': addr}) + _, outaddr = PlaceSanitizer([{'step': 'clean-tiger-tags'}], self.config).process_names(place) + + return sorted([(p.name, p.kind, p.suffix) for p in outaddr]) + + @pytest.mark.parametrize('inname,outname', [('Hamilton, AL', 'Hamilton'), + ('Little, Borough, CA', 'Little, Borough')]) + def test_well_formatted(self, inname, outname): + assert self.run_sanitizer_on({'tiger:county': inname})\ + == [(outname, 'county', 'tiger')] + + + @pytest.mark.parametrize('name', ('Hamilton', 'Big, Road', '')) + def test_badly_formatted(self, name): + assert self.run_sanitizer_on({'tiger:county': name})\ + == [(name, 'county', 'tiger')] + + + def test_unmatched(self): + assert self.run_sanitizer_on({'tiger:country': 'US'})\ + == [('US', 'tiger', 'country')]