X-Git-Url: https://git.openstreetmap.org/nominatim.git/blobdiff_plain/ab6a99677293c9ddb96a717f03b8a009e98ab955..40b87bbadf2f6eabdc94a125b83a83cdc7fecc4e:/module/nominatim.c diff --git a/module/nominatim.c b/module/nominatim.c index 15553153..54632f76 100644 --- a/module/nominatim.c +++ b/module/nominatim.c @@ -1,12 +1,22 @@ +/** + * SPDX-License-Identifier: GPL-2.0-only + * + * This file is part of Nominatim. (https://nominatim.org) + * + * Copyright (C) 2022 by the Nominatim developer community. + * For a full list of authors see the git log. + */ #include "postgres.h" #include "fmgr.h" #include "mb/pg_wchar.h" #include -#ifdef PG_MODULE_MAGIC -PG_MODULE_MAGIC; +#if PG_MAJORVERSION_NUM > 15 +#include "varatt.h" #endif +PG_MODULE_MAGIC; + Datum transliteration( PG_FUNCTION_ARGS ); Datum gettokenstring( PG_FUNCTION_ARGS ); void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int); @@ -54,7 +64,8 @@ transliteration( PG_FUNCTION_ARGS ) wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int)); // Based on pg_utf2wchar_with_len from wchar.c - while (sourcedatalength > 0 && *sourcedata) + // Postgresql strings are not zero terminalted + while (sourcedatalength > 0) { if ((*sourcedata & 0x80) == 0) { @@ -68,7 +79,7 @@ transliteration( PG_FUNCTION_ARGS ) c1 = *sourcedata++ & 0x1f; c2 = *sourcedata++ & 0x3f; *wchardata = (c1 << 6) | c2; - wchardata++; + if (*wchardata < 65536) wchardata++; sourcedatalength -= 2; } else if ((*sourcedata & 0xf0) == 0xe0) @@ -78,7 +89,7 @@ transliteration( PG_FUNCTION_ARGS ) c2 = *sourcedata++ & 0x3f; c3 = *sourcedata++ & 0x3f; *wchardata = (c1 << 12) | (c2 << 6) | c3; - wchardata++; + if (*wchardata < 65536) wchardata++; sourcedatalength -= 3; } else if ((*sourcedata & 0xf8) == 0xf0) @@ -89,7 +100,7 @@ transliteration( PG_FUNCTION_ARGS ) c3 = *sourcedata++ & 0x3f; c4 = *sourcedata++ & 0x3f; *wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4; - wchardata++; + if (*wchardata < 65536) wchardata++; sourcedatalength -= 4; } else if ((*sourcedata & 0xfc) == 0xf8) @@ -97,17 +108,20 @@ transliteration( PG_FUNCTION_ARGS ) // table does not extend beyond 4 char long, just skip if (sourcedatalength < 5) break; sourcedatalength -= 5; + sourcedata += 5; } else if ((*sourcedata & 0xfe) == 0xfc) { // table does not extend beyond 4 char long, just skip if (sourcedatalength < 6) break; sourcedatalength -= 6; + sourcedata += 6; } else { // assume lenngth 1, silently drop bogus characters sourcedatalength--; + sourcedata += 1; } } *wchardata = 0; @@ -139,12 +153,12 @@ transliteration( PG_FUNCTION_ARGS ) resultdata++; } } - else + /*else { ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ), errmsg( "missing char: %i\n", *wchardata ))); - } + }*/ wchardata++; } @@ -153,17 +167,18 @@ transliteration( PG_FUNCTION_ARGS ) PG_RETURN_TEXT_P(result); } +// Set isspace=1 if the replacement _only_ adds a space before the search string. I.e. to == " " + from void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace) { char *p; - // Search string is too long to be pressent + // Search string is too long to be present if (fromlen > *len) return; p = strstr(buffer, from); while(p) { - if (!isspace || *(p-1) != ' ') + if (!isspace || (p > buffer && *(p-1) != ' ')) { (*changes)++; if (tolen != fromlen) memmove(p+tolen, p+fromlen, *len-(p-buffer)+1); @@ -226,7 +241,7 @@ gettokenstring( PG_FUNCTION_ARGS ) sourcedata = (unsigned char *)VARDATA(source); sourcedatalength = VARSIZE(source) - VARHDRSZ; - // Buffer for doing the replace in - string could get slightly longer (double is mastive overkill) + // Buffer for doing the replace in - string could get slightly longer (double is massive overkill) buffer = (char *)palloc((sourcedatalength*2)*sizeof(char)); memcpy(buffer+1, sourcedata, sourcedatalength); buffer[0] = 32; @@ -248,7 +263,6 @@ gettokenstring( PG_FUNCTION_ARGS ) str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0); str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0); str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0); - str_replace(buffer, &len, &changes, " e ", 3, " ", 1, 0); str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0); // 'the' (and similar)