]> git.openstreetmap.org Git - nominatim.git/blob - module/nominatim.c
Merge pull request #3367 from lonvia/address-word-counts
[nominatim.git] / module / nominatim.c
1 /**
2  * SPDX-License-Identifier: GPL-2.0-only
3  *
4  * This file is part of Nominatim. (https://nominatim.org)
5  *
6  * Copyright (C) 2022 by the Nominatim developer community.
7  * For a full list of authors see the git log.
8  */
9 #include "postgres.h"
10 #include "fmgr.h"
11 #include "mb/pg_wchar.h"
12 #include <utfasciitable.h>
13
14 #if PG_MAJORVERSION_NUM > 15
15 #include "varatt.h"
16 #endif
17
18 PG_MODULE_MAGIC;
19
20 Datum transliteration( PG_FUNCTION_ARGS );
21 Datum gettokenstring( PG_FUNCTION_ARGS );
22 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int);
23 void str_dupspaces(char* buffer);
24
25 PG_FUNCTION_INFO_V1( transliteration );
26 Datum
27 transliteration( PG_FUNCTION_ARGS )
28 {
29         static char * ascii = UTFASCII;
30         static uint16 asciilookup[65536] = UTFASCIILOOKUP;
31         char * asciipos;
32
33         text *source;
34         unsigned char *sourcedata;
35         int sourcedatalength;
36
37         unsigned int c1,c2,c3,c4;
38         unsigned int * wchardata;
39         unsigned int * wchardatastart;
40
41         text *result;
42         unsigned char *resultdata;
43         int resultdatalength;
44         int iLen;
45
46         if (GetDatabaseEncoding() != PG_UTF8) 
47         {
48                 ereport(ERROR,
49                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
50                                          errmsg("requires UTF8 database encoding")));
51         }
52
53         if (PG_ARGISNULL(0))
54         {
55                 PG_RETURN_NULL();
56         }
57
58         // The original string
59         source = PG_GETARG_TEXT_P(0);
60         sourcedata = (unsigned char *)VARDATA(source);
61         sourcedatalength = VARSIZE(source) - VARHDRSZ;
62
63         // Intermediate wchar version of string
64         wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int));
65
66         // Based on pg_utf2wchar_with_len from wchar.c
67         // Postgresql strings are not zero terminalted
68         while (sourcedatalength > 0)
69         {
70                 if ((*sourcedata & 0x80) == 0)
71                 {
72                         *wchardata = *sourcedata++;
73                         wchardata++;
74                         sourcedatalength--;
75                 }
76                 else if ((*sourcedata & 0xe0) == 0xc0)
77                 {
78                         if (sourcedatalength < 2) break;
79                         c1 = *sourcedata++ & 0x1f;
80                         c2 = *sourcedata++ & 0x3f;
81                         *wchardata = (c1 << 6) | c2;
82                         if (*wchardata < 65536) wchardata++;
83                         sourcedatalength -= 2;
84                 }
85                 else if ((*sourcedata & 0xf0) == 0xe0)
86                 {
87                         if (sourcedatalength < 3) break;
88                         c1 = *sourcedata++ & 0x0f;
89                         c2 = *sourcedata++ & 0x3f;
90                         c3 = *sourcedata++ & 0x3f;
91                         *wchardata = (c1 << 12) | (c2 << 6) | c3;
92                         if (*wchardata < 65536) wchardata++;
93                         sourcedatalength -= 3;
94                 }
95                 else if ((*sourcedata & 0xf8) == 0xf0)
96                 {
97                         if (sourcedatalength < 4) break;
98                         c1 = *sourcedata++ & 0x07;
99                         c2 = *sourcedata++ & 0x3f;
100                         c3 = *sourcedata++ & 0x3f;
101                         c4 = *sourcedata++ & 0x3f;
102                         *wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
103                         if (*wchardata < 65536) wchardata++;
104                         sourcedatalength -= 4;
105                 }
106                 else if ((*sourcedata & 0xfc) == 0xf8)
107                 {
108                         // table does not extend beyond 4 char long, just skip
109                         if (sourcedatalength < 5) break;
110                         sourcedatalength -= 5;
111                         sourcedata += 5;
112                 }
113                 else if ((*sourcedata & 0xfe) == 0xfc)
114                 {
115                         // table does not extend beyond 4 char long, just skip
116                         if (sourcedatalength < 6) break;
117                         sourcedatalength -= 6;
118                         sourcedata += 6;
119                 }
120                 else
121                 {
122                         // assume lenngth 1, silently drop bogus characters
123                         sourcedatalength--;
124                         sourcedata += 1;
125                 }
126         }
127         *wchardata = 0;
128
129         // calc the length of transliteration string
130         resultdatalength = 0;
131         wchardata = wchardatastart;
132         while(*wchardata)
133         {
134                 if (*(asciilookup + *wchardata) > 0) resultdatalength += *(ascii + *(asciilookup + *wchardata));
135                 wchardata++;
136         }
137
138         // allocate & create the result
139         result = (text *)palloc(resultdatalength + VARHDRSZ);
140         SET_VARSIZE(result, resultdatalength + VARHDRSZ);
141         resultdata = (unsigned char *)VARDATA(result);
142
143         wchardata = wchardatastart;
144         while(*wchardata)
145         {
146                 if (*(asciilookup + *wchardata) > 0)
147                 {
148                         asciipos = ascii + *(asciilookup + *wchardata);
149                         for(iLen = *asciipos; iLen > 0; iLen--)
150                         {
151                                 asciipos++;
152                                 *resultdata = *asciipos;
153                                 resultdata++;
154                         }
155                 }
156                 /*else
157                 {
158                         ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
159                               errmsg( "missing char: %i\n", *wchardata )));
160                         
161                 }*/
162                 wchardata++;
163         }
164
165         pfree(wchardatastart);
166
167         PG_RETURN_TEXT_P(result);
168 }
169
// Replace occurrences of `from` with `to` inside the NUL-terminated `buffer`, in place.
//
//   buffer   writable string; the caller must provide enough capacity for any growth
//   len      in/out: caller's size bookkeeping for buffer; used for the early
//            "cannot match" check and adjusted by (tolen - fromlen) per replacement
//   changes  in/out: incremented once per replacement made
//   from     search string (NUL-terminated), fromlen is its length
//   to       replacement bytes, tolen is their count (need not be NUL-terminated)
//   isspace  Set isspace=1 if the replacement _only_ adds a space before the search
//            string.  I.e. to == " " + from.  A match is then left alone when it is
//            at the very start of the buffer or already preceded by a space.
//
// The search resumes one character after the start of each match, whether or not
// that match was replaced.
void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
{
        char *p;

        // Search string is too long to be present
        if (fromlen > *len) return;

        p = strstr(buffer, from);
        while(p)
        {
                if (!isspace || (p > buffer && *(p-1) != ' '))
                {
                        (*changes)++;
                        // Shift exactly the remaining tail including its terminator.
                        // Deriving the count from *len would copy bytes that lie
                        // behind the terminator and could run past the buffer.
                        if (tolen != fromlen) memmove(p+tolen, p+fromlen, strlen(p+fromlen)+1);
                        memcpy(p, to, tolen);
                        *len += tolen - fromlen;
                }
                p = strstr(p+1, from);
        }
}
191
// Collapse every run of consecutive space characters in the NUL-terminated
// `buffer` into a single space. Works in place; the string never grows.
void str_dupspaces(char* buffer)
{
        char *dst = buffer;
        char *src;
        int prev_was_space = 0;

        for (src = buffer; *src != '\0'; src++)
        {
                int is_space = (*src == ' ');

                // second and later spaces of a run are skipped
                if (is_space && prev_was_space) continue;

                *dst++ = *src;
                prev_was_space = is_space;
        }
        *dst = '\0';
}
212
213 PG_FUNCTION_INFO_V1( gettokenstring );
214 Datum
215 gettokenstring( PG_FUNCTION_ARGS )
216 {
217         text *source;
218         unsigned char *sourcedata;
219         int sourcedatalength;
220
221         char * buffer;
222         int len;
223         int changes;
224
225         text *result;
226
227         if (GetDatabaseEncoding() != PG_UTF8) 
228         {
229                 ereport(ERROR,
230                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
231                                          errmsg("requires UTF8 database encoding")));
232         }
233
234         if (PG_ARGISNULL(0))
235         {
236                 PG_RETURN_NULL();
237         }
238
239         // The original string
240         source = PG_GETARG_TEXT_P(0);
241         sourcedata = (unsigned char *)VARDATA(source);
242         sourcedatalength = VARSIZE(source) - VARHDRSZ;
243
244         // Buffer for doing the replace in - string could get slightly longer (double is massive overkill)
245         buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
246         memcpy(buffer+1, sourcedata, sourcedatalength);
247         buffer[0] = 32;
248         buffer[sourcedatalength+1] = 32;
249         buffer[sourcedatalength+2] = 0;
250         len = sourcedatalength+3;
251
252         changes = 1;
253         str_dupspaces(buffer);
254         while(changes)
255         {
256                 changes = 0;
257                 #include <tokenstringreplacements.inc>
258                 str_dupspaces(buffer);
259         }
260
261         // 'and' in various languages
262         str_replace(buffer, &len, &changes, " and ", 5, " ", 1, 0);
263         str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
264         str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
265         str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
266         str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);
267
268         // 'the' (and similar)
269         str_replace(buffer, &len, &changes, " the ", 5, " ", 1, 0);
270         str_replace(buffer, &len, &changes, " der ", 5, " ", 1, 0);
271         str_replace(buffer, &len, &changes, " den ", 5, " ", 1, 0);
272         str_replace(buffer, &len, &changes, " die ", 5, " ", 1, 0);
273         str_replace(buffer, &len, &changes, " das ", 5, " ", 1, 0);
274         str_replace(buffer, &len, &changes, " la ", 4, " ", 1, 0);
275         str_replace(buffer, &len, &changes, " le ", 4, " ", 1, 0);
276         str_replace(buffer, &len, &changes, " el ", 4, " ", 1, 0);
277         str_replace(buffer, &len, &changes, " il ", 4, " ", 1, 0);
278
279         // german
280         str_replace(buffer, &len, &changes, "ae", 2, "a", 1, 0);
281         str_replace(buffer, &len, &changes, "oe", 2, "o", 1, 0);
282         str_replace(buffer, &len, &changes, "ue", 2, "u", 1, 0);
283         str_replace(buffer, &len, &changes, "sss", 3, "ss", 2, 0);
284         str_replace(buffer, &len, &changes, "ih", 2, "i", 1, 0);
285         str_replace(buffer, &len, &changes, "eh", 2, "e", 1, 0);
286
287         // russian
288         str_replace(buffer, &len, &changes, "ie", 2, "i", 1, 0);
289         str_replace(buffer, &len, &changes, "yi", 2, "i", 1, 0);
290
291         // allocate & create the result
292         len--;// Drop the terminating zero
293         result = (text *)palloc(len + VARHDRSZ);
294         SET_VARSIZE(result, len + VARHDRSZ);
295         memcpy(VARDATA(result), buffer, len);
296
297         pfree(buffer);
298
299         PG_RETURN_TEXT_P(result);
300 }
301