// Bootstrap: load the site settings and the shared command-line initialisation,
// then raise the PHP memory limit to 800M for this run.
4 require_once(dirname(dirname(__FILE__)).'/settings/settings.php');
5 require_once(CONST_BasePath.'/lib/init-cmd.php');
6 ini_set('memory_limit', '800M');
// Option table entries: a description string followed by one array per option.
// Presumably these belong to the $aCMDOptions array handed to getCmdOpt() below
// (the opening line of the array is not visible here).
9 "Create and setup nominatim search system",
10 array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
11 array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
12 array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
14 array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
15 array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
16 array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
// Parse argv into $aCMDResult.
// NOTE(review): later code reads $aCMDResult['drop-tables'] and
// $aCMDResult['parse-wikipedia'], but no such entries are visible in the
// option table above (lines may be missing from this view) - confirm.
18 getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
// Sample wiki markup (a Coord template plus an amusement-park infobox). It is
// run through _parseWikipediaContent() and _templatesToProperties() and the
// result is var_dump'ed further down.
21 $sTestPageText = <<<EOD
22 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
23 {{ Infobox Amusement park
24 | name = Six Flags Great Adventure
25 | image = [[File:SixFlagsGreatAdventure logo.png]]
26 | caption = Six Flags Great Adventure logo
27 | location = [[Jackson, New Jersey|Jackson]]
28 | location2 = New Jersey
29 | location3 = United States
30 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
31 | season = March/April through October/November
32 | opening_date = July 1, 1974
33 | previous_names = Great Adventure
35 | rides = 45 park admission rides
38 | owner = [[Six Flags]]
40 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
43 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
45 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
// drop-tables: remove both wikipedia tables.
54 if ($aCMDResult['drop-tables'])
56 $oDB->query('DROP TABLE wikipedia_article');
57 $oDB->query('DROP TABLE wikipedia_link');
// create-tables: create wikipedia_article (with a PostGIS 'location' geometry
// column added via AddGeometryColumn, SRID 4326) and wikipedia_link.
// Most column definitions are not visible in this view.
61 if ($aCMDResult['create-tables'])
64 CREATE TABLE wikipedia_article (
65 language text NOT NULL,
72 importance double precision,
74 osm_type character(1),
82 $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");
85 CREATE TABLE wikipedia_link (
/**
 * Convert a degrees/minutes/seconds coordinate with a compass direction into
 * signed decimal degrees.
 *
 * The numeric parts are cast to float, so numeric strings and null (treated
 * as 0) are accepted. The result is negated for the 'S' and 'W' directions;
 * any other direction value, including an empty or missing one, leaves the
 * sign positive.
 *
 * NOTE(review): the sign is applied to the sum as a whole, so a negative
 * degree value combined with non-zero minutes/seconds moves towards zero
 * (-33, 30 gives -32.5). Callers in this file appear to rely on the direction
 * letter instead - confirm before changing.
 *
 * @param mixed $iDegrees Whole or fractional degrees.
 * @param mixed $iMinutes Arc minutes (optional).
 * @param mixed $fSeconds Arc seconds (optional).
 * @param mixed $sNSEW    Compass direction letter, case-insensitive; may be null.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes=0, $fSeconds=0, $sNSEW='N')
{
    // Callers pass @$aParams[...] for optional template fields, which yields
    // null when the field is absent. Cast first so strtoupper() never sees
    // null (deprecated since PHP 8.1), and trim so ' w ' is still recognised.
    $sDirection = strtoupper(trim((string)$sNSEW));
    $iSign = ($sDirection === 'S' || $sDirection === 'W') ? -1 : 1;

    $fMagnitude = (float)$iDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600;

    return $iSign * $fMagnitude;
}
// Tokenise wiki markup and walk it with a small state machine that collects
// templates ({{...}}) and links ([[...]]).
// $sPageText: raw page markup.
// The return statement is not visible in this view; callers pass the result
// straight to _templatesToProperties(), so presumably $aTemplates is returned
// - confirm.
99 function _parseWikipediaContent($sPageText)
// Flatten to a single line, then drop HTML comments and <math> spans.
101 $sPageText = str_replace("\n", ' ', $sPageText);
102 $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
103 $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);
// Split on {{ }} [[ ]] and |, keeping the delimiters as tokens of their own.
105 $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);
107 $aPageProperties = array();
109 $aTemplates = array();
// Both stacks grow at the front: index 0 is always the innermost template /
// the current state.
112 $aTemplateStack = array();
113 $aState = array('body');
114 foreach($aPageText as $i => $sPart)
// Open a template: push an empty (name, params) pair and enter 'template'
// state. The case label is not visible here - presumably the '{{' token.
119 array_unshift($aTemplateStack, array('', array()));
120 array_unshift($aState, 'template');
// Close a template: pop it off the stack and record it in $aTemplates.
123 if ($aState[0] == 'template' || $aState[0] == 'templateparam')
125 $aTemplate = array_shift($aTemplateStack);
126 array_shift($aState);
128 $aTemplates[] = $aTemplate;
// Open a link (case label not visible - presumably the '[[' token).
135 array_unshift($aState, 'link');
// Close a link: the display text defaults to the page name; for 'Image:' links
// the display text becomes the page name with the prefix removed.
// NOTE(review): $sLinkPage, $sLinkSyn, $aLinks and $sPageBody are not
// initialised in the visible code - confirm they are set on lines missing here.
138 if ($aState[0] == 'link' || $aState[0] == 'linksynonim')
140 if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
141 if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);
143 $aLinks[] = array($sLinkPage, $sLinkSyn);
145 array_shift($aState);
// After the link is closed its display text is appended to whatever encloses
// it: template name, current template parameter, link page or page body.
149 $aTemplateStack[0][0] .= trim($sPart);
151 case 'templateparam':
152 $aTemplateStack[0][1][0] .= $sLinkSyn;
155 $sLinkPage .= trim($sPart);
161 $sPageBody .= $sLinkSyn;
// NOTE(review): $sPageName is not defined anywhere in the visible code.
164 var_dump($aState, $sPageName, $aTemplateStack, $sPart, $aPageText);
165 fail('unknown state');
// Separator handling (case label not visible - presumably the '|' token).
170 if ($aState[0] == 'template' || $aState[0] == 'templateparam')
172 // Create a new template paramater
173 $aState[0] = 'templateparam';
// New parameters are inserted at the FRONT, so a template's parameter list is
// stored in reverse source order.
174 array_unshift($aTemplateStack[0][1], '');
176 if ($aState[0] == 'link') $aState[0] = 'linksynonim';
// Plain text token: append it to the target selected by the current state.
182 $aTemplateStack[0][0] .= trim($sPart);
184 case 'templateparam':
185 $aTemplateStack[0][1][0] .= $sPart;
188 $sLinkPage .= trim($sPart);
194 $sPageBody .= $sPart;
197 var_dump($aState, $aPageText);
198 fail('unknown state');
/**
 * Convert a degrees/minutes/seconds/direction group of named template
 * parameters to decimal degrees.
 *
 * Minutes, seconds and direction are optional: absent keys fall back to
 * 0, 0 and 'N' (positive sign) rather than being read with error suppression.
 *
 * @param array  $aParams Template parameters keyed by name.
 * @param string $sDegKey Key of the degrees value; the caller must have checked it is set.
 * @param string $sMinKey Key of the optional minutes value.
 * @param string $sSecKey Key of the optional seconds value.
 * @param string $sDirKey Key of the optional direction letter.
 *
 * @return float Result of degreesAndMinutesToDecimal() for the group.
 */
function _templateDmsToDecimal($aParams, $sDegKey, $sMinKey, $sSecKey, $sDirKey)
{
    return degreesAndMinutesToDecimal(
        $aParams[$sDegKey],
        isset($aParams[$sMinKey]) ? $aParams[$sMinKey] : 0,
        isset($aParams[$sSecKey]) ? $aParams[$sSecKey] : 0,
        isset($aParams[$sDirKey]) ? $aParams[$sDirKey] : 'N'
    );
}

/**
 * Reduce a list of parsed wiki templates to a flat set of page properties.
 *
 * For every property the first template that supplies a usable value wins.
 *
 * @param array $aTemplates List of templates, each array(name, parameters).
 *                          Parameters are raw 'key = value' or positional
 *                          strings stored in reverse source order.
 *
 * @return array Any of the keys sOfficialName, iPopulation, sWebsite,
 *               sTopLevelDomain, sInfoboxType, fLat and fLon. fLat and fLon
 *               are always set together. An sInfoboxType starting with '#'
 *               was guessed from the first template with more than 10
 *               parameters because no 'infobox' template was found.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();
    // Checked in this order within each template.
    $aPopulationKeys = array('population', 'population_total', 'population_urban', 'population_estimate');
    // Non-breaking space as HTML entity and as raw UTF-8; the free-text
    // coordinate patterns below only match plain spaces.
    $aNbsp = array('&nbsp;', "\xC2\xA0");

    foreach ($aTemplates as $aTemplate) {
        // Split 'key = value' parameters; anything without '=' is positional.
        // array_reverse() restores source order for the positional indexes.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            $iPos = strpos($sParam, '=');
            if ($iPos === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && !empty($aParams['official_name'])) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        // Population: value must start with digits; thousands separators are stripped.
        foreach ($aPopulationKeys as $sKey) {
            if (isset($aPageProperties['iPopulation'])) {
                break;
            }
            if (!empty($aParams[$sKey]) && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                $aPageProperties['iPopulation'] = (int)str_replace(array(',','.'), '', $aParams[$sKey]);
            }
        }

        // Website: take the URL part of "[url label]" and default the scheme to http.
        if (!isset($aPageProperties['sWebsite']) && !empty($aParams['website'])) {
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $sWebsite = $aMatch[1];
                if (strpos($sWebsite, ':/'.'/') === false) {
                    $sWebsite = 'http:/'.'/'.$sWebsite;
                }
                $aPageProperties['sWebsite'] = $sWebsite;
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && !empty($aParams['cctld'])) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[',']','.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') {
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // Coordinates: once found, later templates are not consulted.
        if (isset($aPageProperties['fLat'])) {
            continue;
        }

        if (isset($aParams['latd']) && isset($aParams['longd'])) {
            $aPageProperties['fLat'] = _templateDmsToDecimal($aParams, 'latd', 'latm', 'lats', 'latNS');
            $aPageProperties['fLon'] = _templateDmsToDecimal($aParams, 'longd', 'longm', 'longs', 'longEW');
        }

        // Both halves are required (the second test used to repeat 'lat_degrees').
        if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
            $aPageProperties['fLat'] = _templateDmsToDecimal($aParams, 'lat_degrees', 'lat_minutes', 'lat_seconds', 'lat_direction');
            $aPageProperties['fLon'] = _templateDmsToDecimal($aParams, 'long_degrees', 'long_minutes', 'long_seconds', 'long_direction');
        }

        if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
            if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
                $aPageProperties['fLat'] = (float)$aParams['latitude'];
                $aPageProperties['fLon'] = (float)$aParams['longitude'];
            }
        }

        if (strtolower($aTemplate[0]) == 'coord') {
            // The position of the N/S letter identifies the positional layout.
            // A recognised layout with missing longitude fields is skipped
            // rather than read past the end of the parameter list.
            $aNS = array('N', 'S');
            if (isset($aParams[3]) && in_array(strtoupper($aParams[3]), $aNS, true)) {
                // deg | min | sec | N/S | deg | min | sec | E/W
                if (isset($aParams[0], $aParams[1], $aParams[2], $aParams[4], $aParams[5], $aParams[6], $aParams[7])) {
                    $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
                    $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
                }
            } elseif (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && in_array(strtoupper($aParams[2]), $aNS, true)) {
                // deg | min | N/S | deg | min | E/W
                if (isset($aParams[3], $aParams[4], $aParams[5])) {
                    $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
                    $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
                }
            } elseif (isset($aParams[0]) && isset($aParams[1]) && in_array(strtoupper($aParams[1]), $aNS, true)) {
                // deg | N/S | deg | E/W
                if (isset($aParams[2], $aParams[3])) {
                    $aPageProperties['fLat'] = (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0];
                    $aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2];
                }
            } elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
                // decimal lat | decimal lon
                $aPageProperties['fLat'] = (float)$aParams[0];
                $aPageProperties['fLon'] = (float)$aParams[1];
            }
        }

        if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
            // Free-text values such as "12° 30′ N" or a range "10° N to 14° N";
            // a range is reduced to its midpoint.
            $sLatText = str_replace($aNbsp, ' ', $aParams['Latitude']);
            $sLonText = str_replace($aNbsp, ' ', $aParams['Longitude']);
            $fTextLat = null;
            $fTextLon = null;

            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatText, $aMatch)) {
                $fTextLat =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatText, $aMatch)) {
                $fTextLat = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }

            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLonText, $aMatch)) {
                $fTextLon =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLonText, $aMatch)) {
                $fTextLon = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }

            // Only accept a complete pair, so callers never see fLat without
            // fLon and a later template can still supply the coordinates.
            if ($fTextLat !== null && $fTextLon !== null) {
                $aPageProperties['fLat'] = $fTextLat;
                $aPageProperties['fLon'] = $fTextLon;
            }
        }
    }

    // Promote the guessed type when no real infobox was seen; '#' marks it as a guess.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }

    return $aPageProperties;
}
// parse-wikipedia: re-parse stored article text and build UPDATE statements
// for wikipedia_article (infobox type, population, website, lat/lon).
// The option value selects articles by page_id % 10.
// NOTE(review): the option table visible at the top of the file declares
// 'parse-articles', not 'parse-wikipedia' - confirm which key is intended.
// NOTE(review): this section tests isset() while the table sections test the
// option's truthiness - confirm the difference is intended.
// NOTE(review): the option value is concatenated into the SQL unescaped.
350 if (isset($aCMDResult['parse-wikipedia']))
// Candidate articles: main namespace, text mentions {{Coord or both 'lat' and 'lon'.
353 $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$aCMDResult['parse-wikipedia'].' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))');
354 // $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
355 foreach($aArticleNames as $sArticleName)
357 $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
358 $aP = _templatesToProperties(_parseWikipediaContent($sPageText));
// Each property below gets its own UPDATE keyed on (language 'en', title).
// The statements that execute $sSQL are not visible in this view.
360 if (isset($aP['sInfoboxType']))
// Collapse runs of whitespace in the type name before storing it.
362 $aP['sInfoboxType'] = preg_replace('#\\s+#',' ',$aP['sInfoboxType']);
363 $sSQL = 'update wikipedia_article set ';
364 $sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
365 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
368 if (isset($aP['iPopulation']))
370 $sSQL = 'update wikipedia_article set ';
371 $sSQL .= 'population = \''.pg_escape_string($aP['iPopulation']).'\'';
372 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
375 if (isset($aP['sWebsite']))
377 $sSQL = 'update wikipedia_article set ';
378 $sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
379 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// Skip coordinates where both values compare equal to '-0'. This is a loose
// comparison, so any zero latitude paired with a zero longitude is skipped.
// NOTE(review): $aP['fLon'] is read without an isset() check.
382 if (isset($aP['fLat']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0'))
384 if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
// Progress line: title|infobox type|lat|lon
385 echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
386 $sSQL = 'update wikipedia_article set ';
387 $sSQL .= 'lat = \''.pg_escape_string($aP['fLat']).'\',';
388 $sSQL .= 'lon = \''.pg_escape_string($aP['fLon']).'\'';
389 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// Start-element callback registered through xml_set_element_handler() in the
// link section. Appends an element's attribute array to the global
// $aNominatRecords list.
// $hParser: the XML parser; $sName: element name; $aAttr: element attributes.
// NOTE(review): lines are missing between the global statement and the append
// in this view, so any filtering on $sName cannot be seen - confirm.
395 function nominatimXMLStart($hParser, $sName, $aAttr)
397 global $aNominatRecords;
401 $aNominatRecords[] = $aAttr;
// End-element callback registered through xml_set_element_handler() in the
// link section. Its body is not visible in this view.
406 function nominatimXMLEnd($hParser, $sName)
// link: try to match geocoded Wikipedia articles to OSM objects by searching a
// Nominatim instance for the article name and accepting a result that lies
// close enough to the article's coordinates.
// NOTE(review): isset() is true whenever the key exists, whatever its value -
// confirm this matches how getCmdOpt() fills $aCMDResult.
411 if (isset($aCMDResult['link']))
// Work list: English articles that have coordinates but no OSM link yet,
// most important first, capped at 200000 rows.
414 $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");
416 // If you point this script at production OSM you will be blocked
// NOTE(review): 'SEVERNAME' looks like a placeholder that must be edited
// before the script can run - confirm.
417 $sNominatimBaseURL = 'http://SEVERNAME/search.php';
419 foreach($aWikiArticles as $aRecord)
// Article titles use underscores; search with spaces instead.
421 $aRecord['name'] = str_replace('_',' ',$aRecord['title']);
423 $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
425 echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";
// Default half-width (in degrees) of the search viewbox; some infobox types
// below widen it.
426 $fMaxDist = 0.0000001;
// Tune the search parameters by infobox type. Several case labels and break
// statements are not visible in this view.
428 switch(strtolower($aRecord['infobox_type']))
430 case 'former country':
433 $fMaxDist = 60; // effectively turn it off
434 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
440 $fMaxDist = 60; // effectively turn it off
441 $sURL .= "&featuretype=country";
442 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
444 case 'prefecture japan':
445 $aRecord['name'] = trim(str_replace(' Prefecture',' ', $aRecord['name']));
450 case 'u.s. state symbols':
// NOTE(review): the next two labels end in ';' instead of ':'. PHP accepts
// both, but it is inconsistent with the other labels.
452 case 'province or territory of canada';
453 case 'indian jurisdiction';
455 case 'french region':
456 case 'region of italy':
458 case '#australia state or territory':
459 case 'russian federal subject':
461 $sURL .= "&featuretype=state";
462 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
464 case 'protected area':
466 $sURL .= "&nearlat=".$aRecord['lat'];
467 $sURL .= "&nearlon=".$aRecord['lon'];
468 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
472 case 'french commune':
// NOTE(review): 'italian comune' is listed twice; the second label is redundant.
473 case 'italian comune':
475 case 'italian comune':
476 case 'australian place':
482 case 'russian inhabited locality':
483 case 'finnish municipality/land area':
484 case 'england county':
485 case 'israel municipality':
489 $sURL .= "&featuretype=settlement";
490 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
493 case 'mountain pass':
498 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
// NOTE(review): $aTypes is assigned here but never read in the visible code.
502 $aTypes = array('wreck');
503 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
504 $sURL .= "&nearlat=".$aRecord['lat'];
505 $sURL .= "&nearlon=".$aRecord['lon'];
512 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
513 $sURL .= "&bounded=1";
514 $sURL .= "&nearlat=".$aRecord['lat'];
515 $sURL .= "&nearlon=".$aRecord['lon'];
// Branch that reports the infobox type as unknown (label not visible -
// presumably the default case).
520 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
521 // $sURL .= "&bounded=1";
522 $sURL .= "&nearlat=".$aRecord['lat'];
523 $sURL .= "&nearlon=".$aRecord['lon'];
524 echo "-- Unknown: ".$aRecord['infobox_type']."\n";
// First attempt: search for the full article name.
527 $sNameURL = $sURL.'&q='.urlencode($aRecord['name']);
// NOTE(review): the return value of file_get_contents() (false on failure) is
// passed to the XML parser unchecked.
530 $sXML = file_get_contents($sNameURL);
// The start-element handler fills the global $aNominatRecords while parsing.
532 $aNominatRecords = array();
533 $hXMLParser = xml_parser_create();
534 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
535 xml_parse($hXMLParser, $sXML, true);
536 xml_parser_free($hXMLParser);
// Second attempt when nothing was found: retry with only the part of the name
// before the first '(' or ','.
538 if (!isset($aNominatRecords[0]))
540 $aNameParts = preg_split('#[(,]#',$aRecord['name']);
541 if (sizeof($aNameParts) > 1)
543 $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0]));
545 $sXML = file_get_contents($sNameURL);
547 $aNominatRecords = array();
548 $hXMLParser = xml_parser_create();
549 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
550 xml_parse($hXMLParser, $sXML, true);
// The trailing '#' on the next line starts an empty comment and has no effect.
551 xml_parser_free($hXMLParser);#
555 // assume first is best/right
556 for($i = 0; $i < sizeof($aNominatRecords); $i++)
// Straight-line distance in degrees between the article and the result.
558 $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']);
559 $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']);
560 $fDiff = sqrt($fDiff);
// NOTE(review): no condition on the infobox type is visible here, so the
// rank-based limit appears to replace $fMaxDist for every result, not only
// for unknown types - confirm.
562 // If it was an unknown type base it on the rank of the found result
563 $iRank = (int)$aNominatRecords[$i]['PLACE_RANK'];
564 if ($iRank <= 4) $fMaxDist = 2;
565 elseif ($iRank <= 8) $fMaxDist = 1;
566 elseif ($iRank <= 10) $fMaxDist = 0.8;
567 elseif ($iRank <= 12) $fMaxDist = 0.6;
568 elseif ($iRank <= 17) $fMaxDist = 0.2;
569 elseif ($iRank <= 18) $fMaxDist = 0.1;
570 elseif ($iRank <= 22) $fMaxDist = 0.02;
// The last two branches assign the same limit.
571 elseif ($iRank <= 26) $fMaxDist = 0.001;
572 else $fMaxDist = 0.001;
574 echo "-- FOUND \"".substr($aNominatRecords[$i]['DISPLAY_NAME'],0,50)."\", ".$aNominatRecords[$i]['CLASS'].", ".$aNominatRecords[$i]['TYPE'].", ".$aNominatRecords[$i]['PLACE_RANK'].", ".$aNominatRecords[$i]['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";
575 if ($fDiff > $fMaxDist)
577 echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n";
// Record the match as an OSM type letter plus id on the article row.
// NOTE(review): no default branch is visible, so an unexpected OSM_TYPE would
// leave "osm_type=" without a value.
// NOTE(review): OSM_ID comes from the remote response and is concatenated into
// the SQL unescaped.
581 $sSQL = "update wikipedia_article set osm_type=";
582 switch($aNominatRecords[$i]['OSM_TYPE'])
584 case 'relation': $sSQL .= "'R'"; break;
585 case 'way': $sSQL .= "'W'"; break;
586 case 'node': $sSQL .= "'N'"; break;
588 $sSQL .= ", osm_id=".$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";