module/nominatim.c

   1 /**
   2  * SPDX-License-Identifier: GPL-2.0-only
   3  *
   4  * This file is part of Nominatim. (https://nominatim.org)
   5  *
   6  * Copyright (C) 2022 by the Nominatim developer community.
   7  * For a full list of authors see the git log.
   8  */
   9 #include "postgres.h"
  10 #include "fmgr.h"
  11 #include "mb/pg_wchar.h"
  12 #include <utfasciitable.h>
  13
  14 #if PG_MAJORVERSION_NUM > 15
  15 #include "varatt.h"
  16 #endif
  17
  18 PG_MODULE_MAGIC;
  19
  20 Datum transliteration( PG_FUNCTION_ARGS );
  21 Datum gettokenstring( PG_FUNCTION_ARGS );
  22 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int);
  23 void str_dupspaces(char* buffer);
  24
  25 PG_FUNCTION_INFO_V1( transliteration );
  26 Datum
  27 transliteration( PG_FUNCTION_ARGS )
  28 {
  29         static char * ascii = UTFASCII;
  30         static uint16 asciilookup[65536] = UTFASCIILOOKUP;
  31         char * asciipos;
  32
  33         text *source;
  34         unsigned char *sourcedata;
  35         int sourcedatalength;
  36
  37         unsigned int c1,c2,c3,c4;
  38         unsigned int * wchardata;
  39         unsigned int * wchardatastart;
  40
  41         text *result;
  42         unsigned char *resultdata;
  43         int resultdatalength;
  44         int iLen;
  45
  46         if (GetDatabaseEncoding() != PG_UTF8)
  47         {
  48                 ereport(ERROR,
  49                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
  50                                          errmsg("requires UTF8 database encoding")));
  51         }
  52
  53         if (PG_ARGISNULL(0))
  54         {
  55                 PG_RETURN_NULL();
  56         }
  57
  58         // The original string
  59         source = PG_GETARG_TEXT_P(0);
  60         sourcedata = (unsigned char *)VARDATA(source);
  61         sourcedatalength = VARSIZE(source) - VARHDRSZ;
  62
  63         // Intermediate wchar version of string
  64         wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int));
  65
  66         // Based on pg_utf2wchar_with_len from wchar.c
  67         // Postgresql strings are not zero terminalted
  68         while (sourcedatalength > 0)
  69         {
  70                 if ((*sourcedata & 0x80) == 0)
  71                 {
  72                         *wchardata = *sourcedata++;
  73                         wchardata++;
  74                         sourcedatalength--;
  75                 }
  76                 else if ((*sourcedata & 0xe0) == 0xc0)
  77                 {
  78                         if (sourcedatalength < 2) break;
  79                         c1 = *sourcedata++ & 0x1f;
  80                         c2 = *sourcedata++ & 0x3f;
  81                         *wchardata = (c1 << 6) | c2;
  82                         if (*wchardata < 65536) wchardata++;
  83                         sourcedatalength -= 2;
  84                 }
  85                 else if ((*sourcedata & 0xf0) == 0xe0)
  86                 {
  87                         if (sourcedatalength < 3) break;
  88                         c1 = *sourcedata++ & 0x0f;
  89                         c2 = *sourcedata++ & 0x3f;
  90                         c3 = *sourcedata++ & 0x3f;
  91                         *wchardata = (c1 << 12) | (c2 << 6) | c3;
  92                         if (*wchardata < 65536) wchardata++;
  93                         sourcedatalength -= 3;
  94                 }
  95                 else if ((*sourcedata & 0xf8) == 0xf0)
  96                 {
  97                         if (sourcedatalength < 4) break;
  98                         c1 = *sourcedata++ & 0x07;
  99                         c2 = *sourcedata++ & 0x3f;
 100                         c3 = *sourcedata++ & 0x3f;
 101                         c4 = *sourcedata++ & 0x3f;
 102                         *wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
 103                         if (*wchardata < 65536) wchardata++;
 104                         sourcedatalength -= 4;
 105                 }
 106                 else if ((*sourcedata & 0xfc) == 0xf8)
 107                 {
 108                         // table does not extend beyond 4 char long, just skip
 109                         if (sourcedatalength < 5) break;
 110                         sourcedatalength -= 5;
 111                         sourcedata += 5;
 112                 }
 113                 else if ((*sourcedata & 0xfe) == 0xfc)
 114                 {
 115                         // table does not extend beyond 4 char long, just skip
 116                         if (sourcedatalength < 6) break;
 117                         sourcedatalength -= 6;
 118                         sourcedata += 6;
 119                 }
 120                 else
 121                 {
 122                         // assume lenngth 1, silently drop bogus characters
 123                         sourcedatalength--;
 124                         sourcedata += 1;
 125                 }
 126         }
 127         *wchardata = 0;
 128
 129         // calc the length of transliteration string
 130         resultdatalength = 0;
 131         wchardata = wchardatastart;
 132         while(*wchardata)
 133         {
 134                 if (*(asciilookup + *wchardata) > 0) resultdatalength += *(ascii + *(asciilookup + *wchardata));
 135                 wchardata++;
 136         }
 137
 138         // allocate & create the result
 139         result = (text *)palloc(resultdatalength + VARHDRSZ);
 140         SET_VARSIZE(result, resultdatalength + VARHDRSZ);
 141         resultdata = (unsigned char *)VARDATA(result);
 142
 143         wchardata = wchardatastart;
 144         while(*wchardata)
 145         {
 146                 if (*(asciilookup + *wchardata) > 0)
 147                 {
 148                         asciipos = ascii + *(asciilookup + *wchardata);
 149                         for(iLen = *asciipos; iLen > 0; iLen--)
 150                         {
 151                                 asciipos++;
 152                                 *resultdata = *asciipos;
 153                                 resultdata++;
 154                         }
 155                 }
 156                 /*else
 157                 {
 158                         ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
 159                               errmsg( "missing char: %i\n", *wchardata )));
 160
 161                 }*/
 162                 wchardata++;
 163         }
 164
 165         pfree(wchardatastart);
 166
 167         PG_RETURN_TEXT_P(result);
 168 }
 169
 170 // Set isspace=1 if the replacement _only_ adds a space before the search string.  I.e. to == " " + from
 171 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
 172 {
 173         char *p;
 174
 175         // Search string is too long to be present
 176         if (fromlen > *len) return;
 177
 178         p = strstr(buffer, from);
 179         while(p)
 180         {
 181                 if (!isspace || (p > buffer && *(p-1) != ' '))
 182                 {
 183                         (*changes)++;
 184                         if (tolen != fromlen) memmove(p+tolen, p+fromlen, *len-(p-buffer)+1);
 185                         memcpy(p, to, tolen);
 186                         *len += tolen - fromlen;
 187                 }
 188                 p = strstr(p+1, from);
 189         }
 190 }
 191
 192 void str_dupspaces(char* buffer)
 193 {
 194         char *out;
 195         int wasspace;
 196
 197         out = buffer;
 198         wasspace = 0;
 199         while(*buffer)
 200         {
 201                 if (wasspace && *buffer != ' ') wasspace = 0;
 202                 if (!wasspace)
 203                 {
 204                         *out = *buffer;
 205                         out++;
 206                         wasspace = (*buffer == ' ');
 207                 }
 208                 buffer++;
 209         }
 210         *out = 0;
 211 }
 212
 213 PG_FUNCTION_INFO_V1( gettokenstring );
 214 Datum
 215 gettokenstring( PG_FUNCTION_ARGS )
 216 {
 217         text *source;
 218         unsigned char *sourcedata;
 219         int sourcedatalength;
 220
 221         char * buffer;
 222         int len;
 223         int changes;
 224
 225         text *result;
 226
 227         if (GetDatabaseEncoding() != PG_UTF8)
 228         {
 229                 ereport(ERROR,
 230                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 231                                          errmsg("requires UTF8 database encoding")));
 232         }
 233
 234         if (PG_ARGISNULL(0))
 235         {
 236                 PG_RETURN_NULL();
 237         }
 238
 239         // The original string
 240         source = PG_GETARG_TEXT_P(0);
 241         sourcedata = (unsigned char *)VARDATA(source);
 242         sourcedatalength = VARSIZE(source) - VARHDRSZ;
 243
 244         // Buffer for doing the replace in - string could get slightly longer (double is massive overkill)
 245         buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
 246         memcpy(buffer+1, sourcedata, sourcedatalength);
 247         buffer[0] = 32;
 248         buffer[sourcedatalength+1] = 32;
 249         buffer[sourcedatalength+2] = 0;
 250         len = sourcedatalength+3;
 251
 252         changes = 1;
 253         str_dupspaces(buffer);
 254         while(changes)
 255         {
 256                 changes = 0;
 257                 #include <tokenstringreplacements.inc>
 258                 str_dupspaces(buffer);
 259         }
 260
 261         // 'and' in various languages
 262         str_replace(buffer, &len, &changes, " and ", 5, " ", 1, 0);
 263         str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
 264         str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
 265         str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
 266         str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);
 267
 268         // 'the' (and similar)
 269         str_replace(buffer, &len, &changes, " the ", 5, " ", 1, 0);
 270         str_replace(buffer, &len, &changes, " der ", 5, " ", 1, 0);
 271         str_replace(buffer, &len, &changes, " den ", 5, " ", 1, 0);
 272         str_replace(buffer, &len, &changes, " die ", 5, " ", 1, 0);
 273         str_replace(buffer, &len, &changes, " das ", 5, " ", 1, 0);
 274         str_replace(buffer, &len, &changes, " la ", 4, " ", 1, 0);
 275         str_replace(buffer, &len, &changes, " le ", 4, " ", 1, 0);
 276         str_replace(buffer, &len, &changes, " el ", 4, " ", 1, 0);
 277         str_replace(buffer, &len, &changes, " il ", 4, " ", 1, 0);
 278
 279         // german
 280         str_replace(buffer, &len, &changes, "ae", 2, "a", 1, 0);
 281         str_replace(buffer, &len, &changes, "oe", 2, "o", 1, 0);
 282         str_replace(buffer, &len, &changes, "ue", 2, "u", 1, 0);
 283         str_replace(buffer, &len, &changes, "sss", 3, "ss", 2, 0);
 284         str_replace(buffer, &len, &changes, "ih", 2, "i", 1, 0);
 285         str_replace(buffer, &len, &changes, "eh", 2, "e", 1, 0);
 286
 287         // russian
 288         str_replace(buffer, &len, &changes, "ie", 2, "i", 1, 0);
 289         str_replace(buffer, &len, &changes, "yi", 2, "i", 1, 0);
 290
 291         // allocate & create the result
 292         len--;// Drop the terminating zero
 293         result = (text *)palloc(len + VARHDRSZ);
 294         SET_VARSIZE(result, len + VARHDRSZ);
 295         memcpy(VARDATA(result), buffer, len);
 296
 297         pfree(buffer);
 298
 299         PG_RETURN_TEXT_P(result);
 300 }
 301