]> git.openstreetmap.org Git - nominatim.git/blob - utils/importWikipedia.php
case statement need : instead of ;. Added more breaks
[nominatim.git] / utils / importWikipedia.php
#!/usr/bin/php -Cq
<?php

// Load the site settings first, then the command-line support code.
require_once dirname(__DIR__).'/settings/settings.php';
require_once CONST_BasePath.'/lib/init-cmd.php';

// Raise the PHP memory ceiling to 800M for this run.
ini_set('memory_limit', '800M');

// Option table handed to getCmdOpt(): a leading description string followed by
// one array per option.
// NOTE(review): the meaning of the numeric columns is defined by getCmdOpt()
// in the included library - confirm there before adding new options.
$aCMDOptions = array(
    "Create and setup nominatim search system",
    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),

    array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
    array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
    array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
);

// Parse argv; the result is delivered through $aCMDResult.
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
20
21 /*
22 $sTestPageText = <<<EOD
23 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
24 {{ Infobox Amusement park
25 | name = Six Flags Great Adventure
26 | image = [[File:SixFlagsGreatAdventure logo.png]]
27 | caption = Six Flags Great Adventure logo
28 | location = [[Jackson, New Jersey|Jackson]]
29 | location2 = New Jersey
30 | location3 = United States
31 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
32 | season = March/April through October/November
33 | opening_date = July 1, 1974
34 | previous_names = Great Adventure
35 | area_acre = 2200
36 | rides = 45 park admission rides
37 | coasters = 12
38 | water_rides = 2
39 | owner = [[Six Flags]]
40 | general_manager = 
41 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
42 }}
43 EOD;
44 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
45 exit;
46 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
47 */
48 /*
49
50     $a = array();
51     $a[] = 'test';
52
53     $oDB &= getDB();
54
55     if ($aCMDResult['drop-tables'])
56     {
57         $oDB->query('DROP TABLE wikipedia_article');
58         $oDB->query('DROP TABLE wikipedia_link');
59     }
60 */
61
// --create-tables: create the wikipedia_article and wikipedia_link tables.
if ($aCMDResult['create-tables']) {
    // Obtain the database handle the same way the other sections of this script do.
    $oDB =& getDB();

    // Each nowdoc must be closed by its own "EOD;" line; without it the PHP
    // statements that follow would become part of the SQL string.
    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_article (
    language text NOT NULL,
    title text NOT NULL,
    langcount integer,
    othercount integer,
    totalcount integer,
    lat double precision,
    lon double precision,
    importance double precision,
    title_en text,
    osm_type character(1),
    osm_id bigint,
    infobox_type text,
    population bigint,
    website text
);
EOD;
    $oDB->query($sSQL);

    // Add the geometry column "location" (SRID 4326, 2 dimensions) to the new table.
    $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");

    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_link (
  from_id INTEGER,
  to_name text
  );
EOD;
    $oDB->query($sSQL);
}
92
/**
 * Convert a degrees / minutes / seconds coordinate to signed decimal degrees.
 *
 * All numeric parts are cast to float, so numeric strings and null are accepted.
 *
 * @param mixed $iDegrees Degrees part.
 * @param mixed $iMinutes Minutes part (optional).
 * @param mixed $fSeconds Seconds part (optional).
 * @param mixed $sNSEW    Hemisphere letter. 'S' and 'W' (any case, surrounding
 *                        whitespace ignored) negate the result; any other value,
 *                        including null or '', leaves it positive.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
{
    // Callers hand over null when a template has no direction parameter; cast
    // before strtoupper() (null arguments are deprecated since PHP 8.1) and
    // strip stray whitespace so ' w ' is still recognised.
    $sDirection = strtoupper(trim((string)$sNSEW));
    $iSign = ($sDirection == 'S' || $sDirection == 'W') ? -1 : 1;

    return $iSign * ((float)$iDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600);
}
98
/**
 * Extract all templates ({{...}}) from a piece of wikitext.
 *
 * The text is flattened to a single line, HTML comments and <math> blocks are
 * removed, and the remainder is split on the delimiters "{{", "}}", "[[", "]]"
 * and "|". A small state machine then walks the pieces:
 *
 *  - 'body'          plain page text (ignored, only templates are returned)
 *  - 'template'      inside "{{", collecting the template name
 *  - 'templateparam' inside a template, after the first "|"
 *  - 'link'          inside "[[", collecting the link target
 *  - 'linksynonim'   inside a link, after the first "|" (the display text)
 *
 * A closed link contributes its display text (or its target when there is no
 * display text, or the file name for "Image:" links) to whatever encloses it.
 *
 * @param string $sPageText Raw wikitext.
 *
 * @return array List of templates in the order they were closed. Each entry is
 *               array(name, params) where params holds the raw parameter
 *               strings ("key=value" or positional) with the LAST parameter
 *               first.
 */
function _parseWikipediaContent($sPageText)
{
    $sPageText = str_replace("\n", ' ', $sPageText);
    $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
    $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);

    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);

    $aTemplates = array();

    // Innermost open template / link is always at index 0 of its stack.
    $aTemplateStack = array();  // entries: array(name, params)
    $aLinkStack = array();      // entries: array(target, display text)
    $aState = array('body');

    foreach ($aPageText as $sPart) {
        // Text that has to be appended to the current context after the
        // delimiter handling; null means "nothing to append".
        $sText = null;

        switch ($sPart) {
        case '{{':
            array_unshift($aTemplateStack, array('', array()));
            array_unshift($aState, 'template');
            break;
        case '}}':
            // A "}}" while a link is still open is ignored.
            if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                $aTemplates[] = array_shift($aTemplateStack);
                array_shift($aState);
            }
            break;
        case '[[':
            // Each link gets its own buffers so that a nested link cannot
            // overwrite the text collected for the enclosing one.
            array_unshift($aLinkStack, array('', ''));
            array_unshift($aState, 'link');
            break;
        case ']]':
            if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
                list($sLinkPage, $sLinkSyn) = array_shift($aLinkStack);
                if ($sLinkSyn === '') $sLinkSyn = $sLinkPage;
                if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);

                array_shift($aState);
                // Hand the link's text (not the "]]" delimiter) to the
                // enclosing context.
                $sText = $sLinkSyn;
            }
            break;
        case '|':
            if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                // Start a new template parameter
                $aState[0] = 'templateparam';
                array_unshift($aTemplateStack[0][1], '');
            } elseif ($aState[0] == 'link') {
                $aState[0] = 'linksynonim';
            }
            break;
        default:
            $sText = $sPart;
            break;
        }

        if ($sText === null) continue;

        switch ($aState[0]) {
        case 'template':
            $aTemplateStack[0][0] .= trim($sText);
            break;
        case 'templateparam':
            $aTemplateStack[0][1][0] .= $sText;
            break;
        case 'link':
            $aLinkStack[0][0] .= trim($sText);
            break;
        case 'linksynonim':
            $aLinkStack[0][1] .= $sText;
            break;
        case 'body':
            // Body text is not part of the result.
            break;
        default:
            var_dump($aState, $aTemplateStack, $sPart, $aPageText);
            fail('unknown state');
        }
    }
    return $aTemplates;
}
197
/**
 * Return $aParams[$sKey] when it is set, otherwise $mDefault.
 */
function _paramOrDefault($aParams, $sKey, $mDefault)
{
    return isset($aParams[$sKey]) ? $aParams[$sKey] : $mDefault;
}

/**
 * Derive page-level properties from the templates found on a wiki page.
 *
 * For every property the first template that provides it wins. Possible keys
 * of the result: sOfficialName, iPopulation, sWebsite, sTopLevelDomain,
 * sInfoboxType, fLat, fLon.
 *
 * @param array $aTemplates List of array(name, params) as produced by
 *                          _parseWikipediaContent(); params are raw strings in
 *                          reverse order.
 *
 * @return array Associative array of the properties that could be determined.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();
    foreach ($aTemplates as $aTemplate) {
        // Restore the parameter order and split "key = value" parameters;
        // parameters without '=' stay positional.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            $iPos = strpos($sParam, '=');
            if ($iPos === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && isset($aParams['official_name']) && $aParams['official_name']) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        // Population: try the known parameter names in order of preference.
        // Thousands separators are dropped before the integer cast.
        foreach (array('population', 'population_total', 'population_urban', 'population_estimate') as $sKey) {
            if (isset($aPageProperties['iPopulation'])) break;
            if (isset($aParams[$sKey]) && $aParams[$sKey] && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams[$sKey]);
            }
        }

        if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website']) {
            // Accept "[url label]", "[url]" or a bare url; keep only the url.
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'], ':/'.'/') === false) {
                    $aPageProperties['sWebsite'] = 'http:/'.'/'.$aPageProperties['sWebsite'];
                }
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld']) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') {
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && sizeof($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // Coordinates: only look while no earlier template has supplied a latitude.
        if (isset($aPageProperties['fLat'])) continue;

        if (isset($aParams['latd']) && isset($aParams['longd'])) {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                $aParams['latd'],
                _paramOrDefault($aParams, 'latm', 0),
                _paramOrDefault($aParams, 'lats', 0),
                _paramOrDefault($aParams, 'latNS', 'N')
            );
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                $aParams['longd'],
                _paramOrDefault($aParams, 'longm', 0),
                _paramOrDefault($aParams, 'longs', 0),
                _paramOrDefault($aParams, 'longEW', 'E')
            );
        }
        // Both the latitude AND the longitude degrees have to be present.
        if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                $aParams['lat_degrees'],
                _paramOrDefault($aParams, 'lat_minutes', 0),
                _paramOrDefault($aParams, 'lat_seconds', 0),
                _paramOrDefault($aParams, 'lat_direction', 'N')
            );
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                $aParams['long_degrees'],
                _paramOrDefault($aParams, 'long_minutes', 0),
                _paramOrDefault($aParams, 'long_seconds', 0),
                _paramOrDefault($aParams, 'long_direction', 'E')
            );
        }
        if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
            if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
                $aPageProperties['fLat'] = (float)$aParams['latitude'];
                $aPageProperties['fLon'] = (float)$aParams['longitude'];
            }
        }
        if (strtolower($aTemplate[0]) == 'coord') {
            // Positional forms of {{coord}}; every branch first makes sure all
            // the positions it reads actually exist.
            if (isset($aParams[3]) && isset($aParams[7]) && (strtoupper($aParams[3]) == 'N' || strtoupper($aParams[3]) == 'S')) {
                // d|m|s|NS|d|m|s|EW
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
            } elseif (isset($aParams[2]) && isset($aParams[5]) && (strtoupper($aParams[2]) == 'N' || strtoupper($aParams[2]) == 'S')) {
                // d|m|NS|d|m|EW
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
            } elseif (isset($aParams[1]) && isset($aParams[3]) && (strtoupper($aParams[1]) == 'N' || strtoupper($aParams[1]) == 'S')) {
                // d|NS|d|EW
                $aPageProperties['fLat'] = (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0];
                $aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2];
            } elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
                // decimal lat|lon
                $aPageProperties['fLat'] = (float)$aParams[0];
                $aPageProperties['fLon'] = (float)$aParams[1];
            }
        }
        if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
            // Textual form, either a single value ("47° 30′ N") or a range
            // ("47° N to 49° N"), in which case the midpoint is used.
            $sLatitude = str_replace('&nbsp;', ' ', $aParams['Latitude']);
            $sLongitude = str_replace('&nbsp;', ' ', $aParams['Longitude']);
            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                $aPageProperties['fLat'] =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }

            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                $aPageProperties['fLon'] =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }
        }
    }

    // Without a real infobox fall back to the "possible" type, marked with '#'.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }
    return $aPageProperties;
}
305
// --parse-wikipedia: parse the stored article text and copy infobox type,
// population, website and coordinates into wikipedia_article.
// NOTE(review): the option table above declares 'parse-articles', not
// 'parse-wikipedia' - confirm which name getCmdOpt() actually delivers.
if (isset($aCMDResult['parse-wikipedia'])) {
    $oDB =& getDB();

    // The option value selects the articles with page_id % 10 == value. Cast to
    // int so nothing but a number can end up in the SQL text.
    $iShard = (int)$aCMDResult['parse-wikipedia'];
    $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$iShard.' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))');
//      $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
    foreach ($aArticleNames as $sArticleName) {
        $sTitleSQL = '\''.pg_escape_string($sArticleName).'\'';
        $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = '.$sTitleSQL);
        $aP = _templatesToProperties(_parseWikipediaContent($sPageText));

        // Collect one "column = value" fragment per update to run.
        $aAssignments = array();
        if (isset($aP['sInfoboxType'])) {
            $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
            $aAssignments[] = 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
        }
        if (isset($aP['iPopulation'])) {
            $aAssignments[] = 'population = \''.pg_escape_string((string)$aP['iPopulation']).'\'';
        }
        if (isset($aP['sWebsite'])) {
            $aAssignments[] = 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
        }
        // Coordinates are only usable as a pair; a latitude without longitude
        // would otherwise write an empty string into a numeric column.
        if (isset($aP['fLat']) && isset($aP['fLon']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) {
            if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
            echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
            $aAssignments[] = 'lat = \''.pg_escape_string((string)$aP['fLat']).'\','
                .'lon = \''.pg_escape_string((string)$aP['fLon']).'\'';
        }

        foreach ($aAssignments as $sAssignment) {
            $sSQL = 'update wikipedia_article set '.$sAssignment;
            $sSQL .= ' where language = \'en\' and title = '.$sTitleSQL.';';
            $oDB->query($sSQL);
        }
    }
}
344
/**
 * Start-of-element callback for the XML parser.
 *
 * Every PLACE element has its attribute array appended to the global
 * $aNominatRecords list; all other elements are ignored.
 *
 * @param mixed  $hParser Parser handle (unused).
 * @param string $sName   Element name as reported by the parser.
 * @param array  $aAttr   Attributes of the element.
 */
function nominatimXMLStart($hParser, $sName, $aAttr)
{
    global $aNominatRecords;

    if ($sName == 'PLACE') {
        $aNominatRecords[] = $aAttr;
    }
}
354
/**
 * End-of-element callback for the XML parser; intentionally does nothing.
 *
 * @param mixed  $hParser Parser handle (unused).
 * @param string $sName   Element name (unused).
 */
function nominatimXMLEnd($hParser, $sName)
{
    // No work is needed when an element closes.
}
358
359
/**
 * Build the "&viewbox=" URL fragment for a box of +/- $fDelta degrees around
 * the record's lat/lon (order: left, top, right, bottom).
 */
function _nominatimViewbox($aRecord, $fDelta)
{
    return "&viewbox=".($aRecord['lon']-$fDelta).",".($aRecord['lat']+$fDelta).",".($aRecord['lon']+$fDelta).",".($aRecord['lat']-$fDelta);
}

/**
 * Build the "&nearlat=...&nearlon=..." URL fragment for the record's position.
 */
function _nominatimNear($aRecord)
{
    return "&nearlat=".$aRecord['lat']."&nearlon=".$aRecord['lon'];
}

/**
 * Fetch a search URL and return the attribute arrays of all PLACE elements in
 * the XML response. Returns an empty array when the request fails.
 */
function _fetchNominatimRecords($sURL)
{
    // nominatimXMLStart() collects its results in this global.
    global $aNominatRecords;
    $aNominatRecords = array();

    var_Dump($sURL);
    $sXML = file_get_contents($sURL);
    if ($sXML === false) {
        echo "-- Request failed: ".$sURL."\n";
        return array();
    }

    $hXMLParser = xml_parser_create();
    xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
    xml_parse($hXMLParser, $sXML, true);
    xml_parser_free($hXMLParser);

    return $aNominatRecords;
}

// --link: try to find the OSM object belonging to each wikipedia article by
// searching for its title and accepting the first result that is close enough
// to the article's coordinates.
if (isset($aCMDResult['link'])) {
    $oDB =& getDB();
    $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");

    // If you point this script at production OSM you will be blocked
    $sNominatimBaseURL = 'http://SEVERNAME/search.php';

    foreach ($aWikiArticles as $aRecord) {
        $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);

        $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';

        echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";

        // $fMaxDist is the largest accepted distance (in degrees) between the
        // article position and a search result; the infobox type decides how
        // generous it is and how the search is restricted.
        $fMaxDist = 0.0000001;
        $bUnknown = false;
        switch (strtolower((string)$aRecord['infobox_type'])) {
        case 'former country':
            // skip this article entirely
            continue 2;
        case 'sea':
            $fMaxDist = 60; // effectively turn it off
            $sURL .= _nominatimViewbox($aRecord, $fMaxDist);
            break;
        case 'country':
        case 'island':
        case 'islands':
        case 'continent':
            $fMaxDist = 60; // effectively turn it off
            $sURL .= "&featuretype=country";
            $sURL .= _nominatimViewbox($aRecord, $fMaxDist);
            break;
        case 'prefecture japan':
            $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
            break;
        case 'state':
        case '#us state':
        case 'county':
        case 'u.s. state':
        case 'u.s. state symbols':
        case 'german state':
        case 'province or territory of canada':
        case 'indian jurisdiction':
        case 'province':
        case 'french region':
        case 'region of italy':
        case 'kommune':
        case '#australia state or territory':
        case 'russian federal subject':
            $fMaxDist = 4;
            $sURL .= "&featuretype=state";
            $sURL .= _nominatimViewbox($aRecord, $fMaxDist);
            break;
        case 'protected area':
            $fMaxDist = 1;
            $sURL .= _nominatimNear($aRecord);
            $sURL .= _nominatimViewbox($aRecord, $fMaxDist);
            break;
        case 'settlement':
            // distance limit is derived from the rank of each result below
            $bUnknown = true;
            break;
        case 'french commune':
        case 'italian comune':
        case 'uk place':
        case 'australian place':
        case 'german place':
        case '#geobox':
        case 'u.s. county':
        case 'municipality':
        case 'city japan':
        case 'russian inhabited locality':
        case 'finnish municipality/land area':
        case 'england county':
        case 'israel municipality':
        case 'russian city':
        case 'city':
            $fMaxDist = 0.2;
            $sURL .= "&featuretype=settlement";
            $sURL .= _nominatimViewbox($aRecord, 0.5);
            break;
        case 'mountain':
        case 'mountain pass':
        case 'river':
        case 'lake':
        case 'airport':
            $fMaxDist = 0.2;
            $sURL .= _nominatimViewbox($aRecord, 0.5);
            break;
        case 'ship begin':
            $fMaxDist = 0.1;
            $sURL .= _nominatimViewbox($aRecord, 0.01);
            $sURL .= _nominatimNear($aRecord);
            break;
        case 'road':
        case 'university':
        case 'company':
        case 'department':
            $fMaxDist = 0.005;
            $sURL .= _nominatimViewbox($aRecord, 0.01);
            $sURL .= "&bounded=1";
            $sURL .= _nominatimNear($aRecord);
            break;
        default:
            $bUnknown = true;
            $fMaxDist = 0.005;
            $sURL .= _nominatimViewbox($aRecord, 0.01);
//              $sURL .= "&bounded=1";
            $sURL .= _nominatimNear($aRecord);
            echo "-- Unknown: ".$aRecord['infobox_type']."\n";
            break;
        }

        $aNominatRecords = _fetchNominatimRecords($sURL.'&q='.urlencode($aRecord['name']));

        if (!isset($aNominatRecords[0])) {
            // Nothing found: retry with the part of the name in front of the
            // first '(' or ','.
            $aNameParts = preg_split('#[(,]#', $aRecord['name']);
            if (sizeof($aNameParts) > 1) {
                $aNominatRecords = _fetchNominatimRecords($sURL.'&q='.urlencode(trim($aNameParts[0])));
            }
        }

        // assume first is best/right
        foreach ($aNominatRecords as $aFound) {
            $fLatDiff = $aRecord['lat']-$aFound['LAT'];
            $fLonDiff = $aRecord['lon']-$aFound['LON'];
            $fDiff = sqrt($fLatDiff * $fLatDiff + $fLonDiff * $fLonDiff);

            if ($bUnknown) {
                // If it was an unknown type base it on the rank of the found result
                $iRank = (int)$aFound['PLACE_RANK'];
                if ($iRank <= 4) $fMaxDist = 2;
                elseif ($iRank <= 8) $fMaxDist = 1;
                elseif ($iRank <= 10) $fMaxDist = 0.8;
                elseif ($iRank <= 12) $fMaxDist = 0.6;
                elseif ($iRank <= 17) $fMaxDist = 0.2;
                elseif ($iRank <= 18) $fMaxDist = 0.1;
                elseif ($iRank <= 22) $fMaxDist = 0.02;
                else $fMaxDist = 0.001;
            }
            echo "-- FOUND \"".substr($aFound['DISPLAY_NAME'], 0, 50)."\", ".$aFound['CLASS'].", ".$aFound['TYPE'].", ".$aFound['PLACE_RANK'].", ".$aFound['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";

            if ($fDiff > $fMaxDist) {
                echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aFound['LAT'].' & '.$aRecord['lon'].','.$aFound['LON']." \n";
                continue;
            }

            // The values come from an HTTP response, so only accept a known
            // OSM type and a purely numeric id before building the SQL.
            $sOsmType = '';
            switch ($aFound['OSM_TYPE']) {
            case 'relation': $sOsmType = 'R'; break;
            case 'way': $sOsmType = 'W'; break;
            case 'node': $sOsmType = 'N'; break;
            }
            if ($sOsmType === '' || !isset($aFound['OSM_ID']) || !preg_match('#^-?[0-9]+$#', $aFound['OSM_ID'])) {
                echo "-- Skipping result without usable OSM type/id\n";
                continue;
            }

            $sSQL = "update wikipedia_article set osm_type='".$sOsmType."'";
            $sSQL .= ", osm_id=".$aFound['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
            $oDB->query($sSQL);
            break;
        }
    }
}