]> git.openstreetmap.org Git - nominatim.git/blob - utils/importWikipedia.php
README: tiny markdown syntax error
[nominatim.git] / utils / importWikipedia.php
<?php

require_once(CONST_BasePath.'/lib/init-cmd.php');
ini_set('memory_limit', '800M');

// Command line specification handed to getCmdOpt(): the first entry is the
// usage description, each following array declares one option.
// NOTE(review): the option tuples are assumed to be (long name, short name,
// min occurrences, max occurrences, min values, max values, type, help text);
// confirm against getCmdOpt() in lib/init-cmd.php.
$aCMDOptions = array(
    'Import wikipedia article data and link it to OSM objects',
    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),

    array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
    // Kept for backward compatibility; nothing in this script reads it.
    array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
    // The parsing branch further down reads $aCMDResult['parse-wikipedia'] and
    // uses it as the remainder of "page_id % 10", so it needs an integer value.
    array('parse-wikipedia', '', 0, 1, 1, 1, 'int', 'Parse wikipedia articles whose page_id modulo 10 equals the given value'),
    array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
);
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
18
19 /*
20 $sTestPageText = <<<EOD
21 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
22 {{ Infobox Amusement park
23 | name = Six Flags Great Adventure
24 | image = [[File:SixFlagsGreatAdventure logo.png]]
25 | caption = Six Flags Great Adventure logo
26 | location = [[Jackson, New Jersey|Jackson]]
27 | location2 = New Jersey
28 | location3 = United States
29 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
30 | season = March/April through October/November
31 | opening_date = July 1, 1974
32 | previous_names = Great Adventure
33 | area_acre = 2200
34 | rides = 45 park admission rides
35 | coasters = 12
36 | water_rides = 2
37 | owner = [[Six Flags]]
38 | general_manager =
39 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
40 }}
41 EOD;
42 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
43 exit;
44 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
45 */
46 /*
47
48     $a = array();
49     $a[] = 'test';
50
51     $oDB &= getDB();
52
53     if ($aCMDResult['drop-tables'])
54     {
55         $oDB->query('DROP TABLE wikipedia_article');
56         $oDB->query('DROP TABLE wikipedia_link');
57     }
58 */
59
// --create-tables: create the wikipedia_article and wikipedia_link tables.
if ($aCMDResult['create-tables']) {
    // No database handle is opened earlier in the script, so fetch one here in
    // the same way the other command branches do.
    $oDB =& getDB();

    // Each nowdoc must be closed with its own "EOD;" line; otherwise the PHP
    // statements that follow become part of the SQL text.
    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_article (
    language text NOT NULL,
    title text NOT NULL,
    langcount integer,
    othercount integer,
    totalcount integer,
    lat double precision,
    lon double precision,
    importance double precision,
    title_en text,
    osm_type character(1),
    osm_id bigint,
    infobox_type text,
    population bigint,
    website text
);
EOD;
    $oDB->query($sSQL);

    $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");

    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_link (
  from_id INTEGER,
  to_name text
  );
EOD;
    $oDB->query($sSQL);
}
90
91
/**
 * Convert a coordinate given as degrees, minutes and seconds into signed
 * decimal degrees.
 *
 * @param mixed $iDegrees Degrees part; cast to float.
 * @param mixed $iMinutes Minutes part; cast to float (null counts as 0).
 * @param mixed $fSeconds Seconds part; cast to float (null counts as 0).
 * @param mixed $sNSEW    Hemisphere letter, case-insensitive. 'S' and 'W'
 *                        yield a negative result, anything else (including
 *                        null or an empty string) a positive one.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
{
    // Callers in this file pass @$aParams[...] for optional parts, which is
    // null when the key is missing. Cast before upper-casing because passing
    // null to strtoupper() is deprecated as of PHP 8.1.
    $sHemisphere = strtoupper((string)$sNSEW);
    $iSign = ($sHemisphere == 'S' || $sHemisphere == 'W') ? -1 : 1;

    return $iSign * ((float)$iDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600);
}
97
98
/**
 * Extract all templates ("{{name|param|...}}") from a piece of wikitext.
 *
 * Newlines are replaced by spaces, HTML comments and <math> sections are
 * removed, and the remaining text is split on the markers "{{", "}}", "[[",
 * "]]" and "|". The pieces are then fed through a stack based state machine
 * with the states body, template, templateparam, link and linksynonim.
 * Plain body text and links outside of templates are recognised but
 * discarded, because only the templates are returned.
 *
 * @param string $sPageText Raw wikitext of a page.
 *
 * @return array List of array(template name, parameter list). The parameter
 *               list holds the raw parameter strings with the most recently
 *               seen parameter first. A nested template is emitted before the
 *               template that contains it.
 */
function _parseWikipediaContent($sPageText)
{
    $sPageText = str_replace("\n", ' ', $sPageText);
    $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
    $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);

    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);

    $aTemplates = array();

    // Innermost open template is always at index 0: array(name, params).
    $aTemplateStack = array();
    // Innermost parser state is always at index 0; 'body' stays at the bottom.
    $aState = array('body');
    // Target and display text of the link currently being read.
    $sLinkPage = '';
    $sLinkSyn = '';

    foreach ($aPageText as $sPart) {
        switch ($sPart) {
            case '{{':
                array_unshift($aTemplateStack, array('', array()));
                array_unshift($aState, 'template');
                break;
            case '}}':
                // A closing marker outside of a template is ignored.
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    $aTemplates[] = array_shift($aTemplateStack);
                    array_shift($aState);
                }
                break;
            case '[[':
                $sLinkPage = '';
                $sLinkSyn = '';
                array_unshift($aState, 'link');
                break;
            case ']]':
                if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
                    // Without an explicit display text the target is shown.
                    if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
                    if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);

                    // Leave the link and hand its text to the enclosing state.
                    array_shift($aState);
                    switch ($aState[0]) {
                        case 'template':
                            // NOTE(review): $sPart is the literal "]]" here, so
                            // the marker itself is appended to the template
                            // name; possibly $sLinkSyn was intended. Behaviour
                            // kept as is.
                            $aTemplateStack[0][0] .= trim($sPart);
                            break;
                        case 'templateparam':
                            $aTemplateStack[0][1][0] .= $sLinkSyn;
                            break;
                        case 'link':
                            // NOTE(review): same as above, appends "]]".
                            $sLinkPage .= trim($sPart);
                            break;
                        case 'linksynonim':
                            $sLinkSyn .= $sPart;
                            break;
                        case 'body':
                            // Body text is not part of the result.
                            break;
                        default:
                            var_dump($aState, $aTemplateStack, $sPart, $aPageText);
                            fail('unknown state');
                    }
                }
                break;
            case '|':
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    // Create a new template parameter
                    $aState[0] = 'templateparam';
                    array_unshift($aTemplateStack[0][1], '');
                }
                if ($aState[0] == 'link') $aState[0] = 'linksynonim';
                break;
            default:
                // Ordinary text: append it to whatever is currently open.
                switch ($aState[0]) {
                    case 'template':
                        $aTemplateStack[0][0] .= trim($sPart);
                        break;
                    case 'templateparam':
                        $aTemplateStack[0][1][0] .= $sPart;
                        break;
                    case 'link':
                        $sLinkPage .= trim($sPart);
                        break;
                    case 'linksynonim':
                        $sLinkSyn .= $sPart;
                        break;
                    case 'body':
                        // Body text is not part of the result.
                        break;
                    default:
                        var_dump($aState, $aPageText);
                        fail('unknown state');
                }
                break;
        }
    }
    return $aTemplates;
}
197
/**
 * Derive article properties from the templates of a wikipedia page.
 *
 * For every property the first template that provides a usable value wins.
 * Coordinates are only searched while no latitude has been found yet; within
 * one template a later matching notation overrides an earlier one.
 *
 * @param array $aTemplates List of array(template name, parameter list) as
 *                          returned by _parseWikipediaContent(). The parameter
 *                          list is reversed before use, because that parser
 *                          prepends each new parameter.
 *
 * @return array Associative array that may contain the keys sOfficialName,
 *               iPopulation, sWebsite, sTopLevelDomain, sInfoboxType, fLat
 *               and fLon. An sInfoboxType starting with '#' was guessed from
 *               the first template with more than ten parameters.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();

    // Population parameters in order of preference.
    $aPopulationKeys = array('population', 'population_total', 'population_urban', 'population_estimate');

    foreach ($aTemplates as $aTemplate) {
        // "key = value" parameters are stored by key, all others positionally.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            $iPos = strpos($sParam, '=');
            if ($iPos === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && isset($aParams['official_name']) && $aParams['official_name']) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        if (!isset($aPageProperties['iPopulation'])) {
            foreach ($aPopulationKeys as $sKey) {
                // The value has to start with a number; thousands separators
                // are stripped before the conversion.
                if (isset($aParams[$sKey]) && $aParams[$sKey] && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                    $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams[$sKey]);
                    break;
                }
            }
        }

        if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website']) {
            // Accepts a bare URL or an external link "[url label]".
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'], ':/'.'/') === false) {
                    $aPageProperties['sWebsite'] = 'http:/'.'/'.$aPageProperties['sWebsite'];
                }
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld']) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') {
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // Everything below only looks for a lat/lon.
        if (isset($aPageProperties['fLat'])) {
            continue;
        }

        if (isset($aParams['latd']) && isset($aParams['longd'])) {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['latd'], @$aParams['latm'], @$aParams['lats'], @$aParams['latNS']);
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['longd'], @$aParams['longm'], @$aParams['longs'], @$aParams['longEW']);
        }

        // Both the latitude and the longitude degrees have to be present.
        if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['lat_degrees'], @$aParams['lat_minutes'], @$aParams['lat_seconds'], @$aParams['lat_direction']);
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['long_degrees'], @$aParams['long_minutes'], @$aParams['long_seconds'], @$aParams['long_direction']);
        }

        if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
            if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
                $aPageProperties['fLat'] = (float)$aParams['latitude'];
                $aPageProperties['fLon'] = (float)$aParams['longitude'];
            }
        }

        if (strtolower($aTemplate[0]) == 'coord') {
            // The position of the N/S marker tells which notation is used.
            // The longitude fields are checked inside each branch, so that an
            // incomplete template is skipped instead of being misread by one
            // of the later branches.
            if (isset($aParams[3]) && (strtoupper($aParams[3]) == 'N' || strtoupper($aParams[3]) == 'S')) {
                // degrees | minutes | seconds | N/S | degrees | minutes | seconds | E/W
                if (isset($aParams[4], $aParams[5], $aParams[6], $aParams[7])) {
                    $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
                    $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
                }
            } elseif (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && (strtoupper($aParams[2]) == 'N' || strtoupper($aParams[2]) == 'S')) {
                // degrees | minutes | N/S | degrees | minutes | E/W
                if (isset($aParams[3], $aParams[4], $aParams[5])) {
                    $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
                    $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
                }
            } elseif (isset($aParams[0]) && isset($aParams[1]) && (strtoupper($aParams[1]) == 'N' || strtoupper($aParams[1]) == 'S')) {
                // decimal degrees | N/S | decimal degrees | E/W
                if (isset($aParams[2], $aParams[3])) {
                    $aPageProperties['fLat'] = (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0];
                    $aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2];
                }
            } elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
                // signed decimal latitude | signed decimal longitude
                $aPageProperties['fLat'] = (float)$aParams[0];
                $aPageProperties['fLon'] = (float)$aParams[1];
            }
        }

        if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
            $aParams['Latitude'] = str_replace('&nbsp;', ' ', $aParams['Latitude']);
            $aParams['Longitude'] = str_replace('&nbsp;', ' ', $aParams['Longitude']);

            // A range ("a to b") is reduced to the mean of its two ends.
            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch)) {
                $aPageProperties['fLat'] =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch)) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }

            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch)) {
                $aPageProperties['fLon'] =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch)) {
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }
        }
    }

    // The guessed type is only used when no real infobox was found and is
    // marked with a leading '#'.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        unset($aPageProperties['sPossibleInfoboxType']);
    }
    return $aPageProperties;
}
305
// --parse-wikipedia N: parse the articles whose page_id modulo 10 equals N and
// store infobox type, population, website and coordinates in wikipedia_article.
if (isset($aCMDResult['parse-wikipedia']) && $aCMDResult['parse-wikipedia'] !== false) {
    $oDB =& getDB();

    // The value comes straight from the command line and ends up inside the
    // SQL text, so force it to an integer first.
    $iPageBucket = (int)$aCMDResult['parse-wikipedia'];

    $sSQL = 'select page_title from content where page_namespace = 0 and page_id %10 = ';
    $sSQL .= $iPageBucket;
    $sSQL .= ' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))';
    $aArticleNames = $oDB->getCol($sSQL);
    /* $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0
        and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\'
        and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
     */
    foreach ($aArticleNames as $sArticleName) {
        $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
        $aP = _templatesToProperties(_parseWikipediaContent($sPageText));

        // Every update below targets the same row.
        $sUpdate = 'update wikipedia_article set ';
        $sWhere = ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';

        if (isset($aP['sInfoboxType'])) {
            $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
            $oDB->query($sUpdate.'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\''.$sWhere);
        }
        if (isset($aP['iPopulation'])) {
            $oDB->query($sUpdate.'population = \''.pg_escape_string((string)$aP['iPopulation']).'\''.$sWhere);
        }
        if (isset($aP['sWebsite'])) {
            $oDB->query($sUpdate.'website = \''.pg_escape_string($aP['sWebsite']).'\''.$sWhere);
        }
        // _templatesToProperties() can find a latitude without a longitude, so
        // both are required. Positions where both values compare equal to
        // zero are skipped.
        if (isset($aP['fLat']) && isset($aP['fLon']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) {
            if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
            echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
            $sSQL = $sUpdate;
            $sSQL .= 'lat = \''.pg_escape_string((string)$aP['fLat']).'\',';
            $sSQL .= 'lon = \''.pg_escape_string((string)$aP['fLon']).'\'';
            $sSQL .= $sWhere;
            $oDB->query($sSQL);
        }
    }
}
350
351
/**
 * Start-element callback for the XML parser reading a Nominatim response.
 *
 * Appends the attributes of every PLACE element to the global
 * $aNominatRecords list; all other elements are ignored.
 *
 * @param mixed  $hParser Parser that invokes the callback (unused).
 * @param string $sName   Element name as reported by the parser.
 * @param array  $aAttr   Attributes of the element.
 *
 * @return void
 */
function nominatimXMLStart($hParser, $sName, $aAttr)
{
    global $aNominatRecords;

    if ($sName == 'PLACE') {
        $aNominatRecords[] = $aAttr;
    }
}
361
362
/**
 * End-element callback for the XML parser reading a Nominatim response.
 *
 * It is registered together with nominatimXMLStart() via
 * xml_set_element_handler() and deliberately does nothing.
 *
 * @param mixed  $hParser Parser that invokes the callback (unused).
 * @param string $sName   Name of the element being closed (unused).
 *
 * @return void
 */
function nominatimXMLEnd($hParser, $sName)
{
    // All data is collected when an element starts; nothing to do at its end.
    return;
}
366
367
/**
 * Build the "&viewbox=" URL fragment for a square around a point.
 *
 * @param mixed $fLat      Latitude of the centre (numeric or numeric string).
 * @param mixed $fLon      Longitude of the centre (numeric or numeric string).
 * @param float $fHalfSize Half of the edge length in degrees.
 *
 * @return string Fragment in the order left, top, right, bottom.
 */
function _nominatimViewboxParam($fLat, $fLon, $fHalfSize)
{
    return '&viewbox='.($fLon-$fHalfSize).','.($fLat+$fHalfSize).','.($fLon+$fHalfSize).','.($fLat-$fHalfSize);
}

/**
 * Request a Nominatim XML search result and collect its PLACE elements.
 *
 * The URL is dumped before the request, as a trace of what is being asked.
 * The elements are gathered by nominatimXMLStart() in the global
 * $aNominatRecords, which is reset here before parsing.
 *
 * @param string $sURL Complete search URL.
 *
 * @return array List of attribute arrays, one per PLACE element; empty when
 *               the request failed.
 */
function _fetchNominatimRecords($sURL)
{
    global $aNominatRecords;

    var_dump($sURL);
    $aNominatRecords = array();

    $sXML = file_get_contents($sURL);
    if ($sXML === false) {
        // A failed request is treated like an empty result.
        return $aNominatRecords;
    }

    $hXMLParser = xml_parser_create();
    xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
    xml_parse($hXMLParser, $sXML, true);
    xml_parser_free($hXMLParser);

    return $aNominatRecords;
}

// --link: look up each not yet linked article in Nominatim and store the OSM
// object of the first result that lies close enough to the article position.
// The flag is tested for truthiness like 'create-tables'; isset() would also
// be true for a flag that is present but false.
if (!empty($aCMDResult['link'])) {
    $oDB =& getDB();
    $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");

    // If you point this script at production OSM you will be blocked
    // (the host name below is a placeholder that has to be edited).
    $sNominatimBaseURL = 'http://SEVERNAME/search.php';

    foreach ($aWikiArticles as $aRecord) {
        $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);

        $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
        $sNear = '&nearlat='.$aRecord['lat'].'&nearlon='.$aRecord['lon'];

        echo "\n-- ".$aRecord['name'].', '.$aRecord['infobox_type']."\n";

        // $fMaxDist is the largest accepted distance (in degrees) between the
        // article position and a search result. $bUnknown marks types for
        // which the limit is derived from the rank of the result instead.
        $fMaxDist = 0.0000001;
        $bUnknown = false;
        switch (strtolower((string)$aRecord['infobox_type'])) {
            case 'former country':
                continue 2;
            case 'sea':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= _nominatimViewboxParam($aRecord['lat'], $aRecord['lon'], $fMaxDist);
                break;
            case 'country':
            case 'island':
            case 'islands':
            case 'continent':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= '&featuretype=country';
                $sURL .= _nominatimViewboxParam($aRecord['lat'], $aRecord['lon'], $fMaxDist);
                break;
            case 'prefecture japan':
                $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
                // intentionally no break
            case 'state':
            case '#us state':
            case 'county':
            case 'u.s. state':
            case 'u.s. state symbols':
            case 'german state':
            case 'province or territory of canada':
            case 'indian jurisdiction':
            case 'province':
            case 'french region':
            case 'region of italy':
            case 'kommune':
            case '#australia state or territory':
            case 'russian federal subject':
                $fMaxDist = 4;
                $sURL .= '&featuretype=state';
                $sURL .= _nominatimViewboxParam($aRecord['lat'], $aRecord['lon'], $fMaxDist);
                break;
            case 'protected area':
                $fMaxDist = 1;
                $sURL .= $sNear;
                $sURL .= _nominatimViewboxParam($aRecord['lat'], $aRecord['lon'], $fMaxDist);
                break;
            case 'settlement':
                $bUnknown = true;
                // intentionally no break
            case 'french commune':
            case 'italian comune':
            case 'uk place':
            case 'australian place':
            case 'german place':
            case '#geobox':
            case 'u.s. county':
            case 'municipality':
            case 'city japan':
            case 'russian inhabited locality':
            case 'finnish municipality/land area':
            case 'england county':
            case 'israel municipality':
            case 'russian city':
            case 'city':
                $fMaxDist = 0.2;
                $sURL .= '&featuretype=settlement';
                $sURL .= _nominatimViewboxParam($aRecord['lat'], $aRecord['lon'], 0.5);
                break;
            case 'mountain':
            case 'mountain pass':
            case 'river':
            case 'lake':
            case 'airport':
                $fMaxDist = 0.2;
                $sURL .= _nominatimViewboxParam($aRecord['lat'], $aRecord['lon'], 0.5);
                break;
            case 'ship begin':
                $fMaxDist = 0.1;
                $sURL .= _nominatimViewboxParam($aRecord['lat'], $aRecord['lon'], 0.01);
                $sURL .= $sNear;
                break;
            case 'road':
            case 'university':
            case 'company':
            case 'department':
                $fMaxDist = 0.005;
                $sURL .= _nominatimViewboxParam($aRecord['lat'], $aRecord['lon'], 0.01);
                $sURL .= '&bounded=1';
                $sURL .= $sNear;
                break;
            default:
                $bUnknown = true;
                $fMaxDist = 0.005;
                $sURL .= _nominatimViewboxParam($aRecord['lat'], $aRecord['lon'], 0.01);
                // $sURL .= "&bounded=1";
                $sURL .= $sNear;
                echo '-- Unknown: '.$aRecord['infobox_type']."\n";
                break;
        }

        $aNominatRecords = _fetchNominatimRecords($sURL.'&q='.urlencode($aRecord['name']));

        if (!isset($aNominatRecords[0])) {
            // Nothing found: retry with the part of the name in front of the
            // first "(" or ",".
            $aNameParts = preg_split('#[(,]#', $aRecord['name']);
            if (count($aNameParts) > 1) {
                $aNominatRecords = _fetchNominatimRecords($sURL.'&q='.urlencode(trim($aNameParts[0])));
            }
        }

        // assume first is best/right
        foreach ($aNominatRecords as $aFound) {
            $fLatDiff = $aRecord['lat']-$aFound['LAT'];
            $fLonDiff = $aRecord['lon']-$aFound['LON'];
            $fDiff = sqrt($fLatDiff * $fLatDiff + $fLonDiff * $fLonDiff);

            if ($bUnknown) {
                // If it was an unknown type base it on the rank of the found result
                $iRank = (int)$aFound['PLACE_RANK'];
                if ($iRank <= 4) $fMaxDist = 2;
                elseif ($iRank <= 8) $fMaxDist = 1;
                elseif ($iRank <= 10) $fMaxDist = 0.8;
                elseif ($iRank <= 12) $fMaxDist = 0.6;
                elseif ($iRank <= 17) $fMaxDist = 0.2;
                elseif ($iRank <= 18) $fMaxDist = 0.1;
                elseif ($iRank <= 22) $fMaxDist = 0.02;
                else $fMaxDist = 0.001;
            }

            echo '-- FOUND "'.substr($aFound['DISPLAY_NAME'], 0, 50);
            echo '", '.$aFound['CLASS'].', '.$aFound['TYPE'];
            echo ', '.$aFound['PLACE_RANK'].', '.$aFound['OSM_TYPE'];
            echo " (dist:$fDiff, max:$fMaxDist)\n";

            if ($fDiff > $fMaxDist) {
                echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aFound['LAT'].' & '.$aRecord['lon'].','.$aFound['LON']." \n";
                continue;
            }

            // The OSM reference comes from a remote response and is placed
            // into the SQL text, so only a known type and a purely numeric id
            // are accepted; anything else would produce a broken statement.
            $sOsmType = '';
            switch ($aFound['OSM_TYPE']) {
                case 'relation':
                    $sOsmType = 'R';
                    break;
                case 'way':
                    $sOsmType = 'W';
                    break;
                case 'node':
                    $sOsmType = 'N';
                    break;
            }
            if ($sOsmType === '' || !isset($aFound['OSM_ID']) || !preg_match('#^-?[0-9]+$#', $aFound['OSM_ID'])) {
                echo "-- Skipping result without usable OSM type/id\n";
                continue;
            }

            $sSQL = "update wikipedia_article set osm_type='".$sOsmType."'";
            $sSQL .= ', osm_id='.$aFound['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
            $oDB->query($sSQL);
            break;
        }
    }
}