// Bootstrap: load the site settings and the shared command-line initialisation code,
// then raise PHP's memory limit to 800M for this script.
4 require_once(dirname(dirname(__FILE__)).'/settings/settings.php');
5 require_once(CONST_BasePath.'/lib/init-cmd.php');
6 ini_set('memory_limit', '800M');
// Command-line option table handed to getCmdOpt() below: a description string followed by
// one array per option. Each option array appears to start with the long name and the short
// flag and to end with the help text; the meaning of the middle fields is defined by
// getCmdOpt() in init-cmd.php (not visible here).
10 'Create and setup nominatim search system',
11 array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
12 array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
13 array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
15 array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
16 array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
17 array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
// Parse argv against the option table; the parsed values land in $aCMDResult.
19 getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
// Sample wikitext (a Coord template followed by an amusement-park infobox) that is run through
// _parseWikipediaContent() and _templatesToProperties() and dumped, as a manual smoke test.
// NOTE(review): the heredoc terminator is not visible in this excerpt — confirm whether this
// section is live code or sits inside a commented-out region.
22 $sTestPageText = <<<EOD
23 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
24 {{ Infobox Amusement park
25 | name = Six Flags Great Adventure
26 | image = [[File:SixFlagsGreatAdventure logo.png]]
27 | caption = Six Flags Great Adventure logo
28 | location = [[Jackson, New Jersey|Jackson]]
29 | location2 = New Jersey
30 | location3 = United States
31 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
32 | season = March/April through October/November
33 | opening_date = July 1, 1974
34 | previous_names = Great Adventure
36 | rides = 45 park admission rides
39 | owner = [[Six Flags]]
41 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
44 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
46 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
// Drop the two wikipedia tables when the 'drop-tables' option is set.
// NOTE(review): 'drop-tables' is not among the option definitions visible in this excerpt and is
// read without isset() — confirm the option exists or that this section is disabled.
55 if ($aCMDResult['drop-tables'])
57 $oDB->query('DROP TABLE wikipedia_article');
58 $oDB->query('DROP TABLE wikipedia_link');
// Create the wikipedia_article and wikipedia_link tables when 'create-tables' is set; a
// 'location' geometry column (SRID 4326) is added to wikipedia_article via AddGeometryColumn.
// Only part of the column list is visible in this excerpt.
62 if ($aCMDResult['create-tables']) {
64 CREATE TABLE wikipedia_article (
65 language text NOT NULL,
72 importance double precision,
74 osm_type character(1),
82 $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");
85 CREATE TABLE wikipedia_link (
/**
 * Convert a degrees / minutes / seconds coordinate into signed decimal degrees.
 *
 * Every argument is cast before use, so numeric strings and null are accepted
 * (null counts as 0 for the numeric parts and as "no hemisphere" for $sNSEW).
 *
 * @param mixed $iDegrees Whole degrees; may itself be negative.
 * @param mixed $iMinutes Arc minutes, added to the magnitude of the degrees.
 * @param mixed $fSeconds Arc seconds, added to the magnitude of the degrees.
 * @param mixed $sNSEW    Hemisphere letter, case-insensitive. 'S' and 'W' negate
 *                        the result; anything else leaves the sign untouched.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
{
    // Callers may pass null for a missing hemisphere; cast so strtoupper() always gets a string.
    $sNSEW = strtoupper((string)$sNSEW);

    $fDegrees = (float)$iDegrees;
    $fFraction = (float)$iMinutes / 60 + (float)$fSeconds / 3600;

    // Minutes and seconds extend the magnitude of the angle, so they have to follow the
    // sign of the degrees: -40 degrees 30 minutes is -40.5, not -39.5.
    $fValue = ($fDegrees < 0) ? $fDegrees - $fFraction : $fDegrees + $fFraction;

    return ($sNSEW == 'S' || $sNSEW == 'W' ? -1 : 1) * $fValue;
}
// Tokenise raw wikitext and walk it with a small state machine, collecting the templates
// ({{name|param|...}}) it contains; links ([[page|synonym]]) are tracked along the way.
// NOTE(review): several conditions, case labels and the return statement are not visible in
// this excerpt; callers pass the result to _templatesToProperties(), so it presumably returns
// $aTemplates — confirm.
101 function _parseWikipediaContent($sPageText)
// Flatten the text onto one line, then strip HTML comments and <math> sections.
103 $sPageText = str_replace("\n", ' ', $sPageText);
104 $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
105 $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);
// Split on the delimiters {{ }} [[ ]] and |, keeping each delimiter as its own token.
107 $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);
109 $aPageProperties = array();
111 $aTemplates = array();
// Both stacks grow at the front: index 0 is always the innermost open template / current state.
// A template entry is array(name, params) with the newest parameter at params[0].
114 $aTemplateStack = array();
115 $aState = array('body');
116 foreach ($aPageText as $i => $sPart) {
// Open a new, empty template and enter the 'template' state (presumably on a '{{' token).
119 array_unshift($aTemplateStack, array('', array()));
120 array_unshift($aState, 'template');
// Close the innermost template and record it (presumably on a '}}' token).
123 if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
124 $aTemplate = array_shift($aTemplateStack);
125 array_shift($aState);
127 $aTemplates[] = $aTemplate;
// Enter the 'link' state (presumably on a '[[' token).
133 array_unshift($aState, 'link');
// Close a link: default the synonym to the page name, record the link, then append the link
// text to whatever the enclosing state is collecting.
136 if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
137 if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
138 if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);
140 $aLinks[] = array($sLinkPage, $sLinkSyn);
142 array_shift($aState);
143 switch ($aState[0]) {
145 $aTemplateStack[0][0] .= trim($sPart);
147 case 'templateparam':
148 $aTemplateStack[0][1][0] .= $sLinkSyn;
151 $sLinkPage .= trim($sPart);
157 $sPageBody .= $sLinkSyn;
160 var_dump($aState, $sPageName, $aTemplateStack, $sPart, $aPageText);
161 fail('unknown state');
// Separator handling (presumably on a '|' token): inside a template it starts a new parameter,
// inside a link it switches from the page name to the synonym.
166 if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
167 // Create a new template parameter
168 $aState[0] = 'templateparam';
169 array_unshift($aTemplateStack[0][1], '');
171 if ($aState[0] == 'link') $aState[0] = 'linksynonim';
// Plain text token: append it to whatever the current state is collecting.
174 switch ($aState[0]) {
176 $aTemplateStack[0][0] .= trim($sPart);
178 case 'templateparam':
179 $aTemplateStack[0][1][0] .= $sPart;
182 $sLinkPage .= trim($sPart);
188 $sPageBody .= $sPart;
191 var_dump($aState, $aPageText);
192 fail('unknown state');
/**
 * Derive page-level properties from the templates found in a Wikipedia article.
 *
 * Each template is array(name, rawParams). The raw parameter list is reversed
 * before interpretation; entries containing '=' become named parameters, the
 * rest become positional ones (0, 1, 2, ...). For every property the first
 * template that supplies it wins.
 *
 * @param array $aTemplates List of array(string $sName, string[] $aRawParams).
 *
 * @return array Any subset of the keys sOfficialName, iPopulation, sWebsite,
 *               sTopLevelDomain, sInfoboxType, fLat and fLon.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();

    // Checked in this order; the first one that starts with a number is used.
    $aPopulationKeys = array('population', 'population_total', 'population_urban', 'population_estimate');

    // Optional parts of the two named degrees/minutes/seconds schemes. Missing parts count
    // as zero and a missing hemisphere as positive.
    $aDmsDefaults = array(
                     'latm' => 0, 'lats' => 0, 'latNS' => 'N',
                     'longm' => 0, 'longs' => 0, 'longEW' => 'E',
                     'lat_minutes' => 0, 'lat_seconds' => 0, 'lat_direction' => 'N',
                     'long_minutes' => 0, 'long_seconds' => 0, 'long_direction' => 'E',
                    );

    foreach ($aTemplates as $aTemplate) {
        // Split the raw parameters into named ("key = value") and positional ones.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            $iPos = strpos($sParam, '=');
            if ($iPos === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos + 1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && !empty($aParams['official_name'])) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        if (!isset($aPageProperties['iPopulation'])) {
            foreach ($aPopulationKeys as $sKey) {
                if (!empty($aParams[$sKey]) && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                    // Thousands separators are dropped before the integer cast.
                    $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams[$sKey]);
                    break;
                }
            }
        }

        if (!isset($aPageProperties['sWebsite']) && !empty($aParams['website'])) {
            // Accepts a bare URL or an external link of the form "[url label]".
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $sWebsite = $aMatch[1];
                if (strpos($sWebsite, '://') === false) {
                    $sWebsite = 'http://'.$sWebsite;
                }
                $aPageProperties['sWebsite'] = $sWebsite;
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && !empty($aParams['cctld'])) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') {
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // Do we have a lat/lon? The schemes below are tried in sequence within one template,
        // so a later scheme overrides an earlier one.
        if (!isset($aPageProperties['fLat'])) {
            $aDms = $aParams + $aDmsDefaults;

            if (isset($aParams['latd']) && isset($aParams['longd'])) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aDms['latd'], $aDms['latm'], $aDms['lats'], $aDms['latNS']);
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aDms['longd'], $aDms['longm'], $aDms['longs'], $aDms['longEW']);
            }

            // Both halves must be present; testing lat_degrees alone would read a
            // non-existent long_degrees and store a longitude of 0.
            if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aDms['lat_degrees'], $aDms['lat_minutes'], $aDms['lat_seconds'], $aDms['lat_direction']);
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aDms['long_degrees'], $aDms['long_minutes'], $aDms['long_seconds'], $aDms['long_direction']);
            }

            if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
                if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
                    $aPageProperties['fLat'] = (float)$aParams['latitude'];
                    $aPageProperties['fLon'] = (float)$aParams['longitude'];
                }
            }

            if (strtolower($aTemplate[0]) == 'coord') {
                // The position of the N/S letter tells which positional layout is in use.
                $sDir1 = isset($aParams[1]) ? strtoupper($aParams[1]) : '';
                $sDir2 = isset($aParams[2]) ? strtoupper($aParams[2]) : '';
                $sDir3 = isset($aParams[3]) ? strtoupper($aParams[3]) : '';

                if ($sDir3 == 'N' || $sDir3 == 'S') {
                    // d|m|s|NS|d|m|s|EW - only usable when the longitude half is complete.
                    if (isset($aParams[0], $aParams[1], $aParams[2], $aParams[4], $aParams[5], $aParams[6], $aParams[7])) {
                        $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
                        $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
                    }
                } elseif (isset($aParams[0]) && isset($aParams[1]) && ($sDir2 == 'N' || $sDir2 == 'S')) {
                    // d|m|NS|d|m|EW
                    if (isset($aParams[3], $aParams[4], $aParams[5])) {
                        $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
                        $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
                    }
                } elseif (isset($aParams[0]) && ($sDir1 == 'N' || $sDir1 == 'S')) {
                    // decimal|NS|decimal|EW
                    if (isset($aParams[2], $aParams[3])) {
                        $aPageProperties['fLat'] = ($sDir1 == 'N' ? 1 : -1) * (float)$aParams[0];
                        $aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E' ? 1 : -1) * (float)$aParams[2];
                    }
                } elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
                    // signed decimal latitude|signed decimal longitude
                    $aPageProperties['fLat'] = (float)$aParams[0];
                    $aPageProperties['fLon'] = (float)$aParams[1];
                }
            }

            if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
                // Normalise non-breaking spaces (raw UTF-8 or HTML entity) so that the patterns
                // below, which expect plain spaces, can match.
                $aNonBreaking = array("\xC2\xA0", '&nbsp;');
                $sLatitude = str_replace($aNonBreaking, ' ', $aParams['Latitude']);
                $sLongitude = str_replace($aNonBreaking, ' ', $aParams['Longitude']);

                // A range ("a to b") is reduced to its midpoint.
                if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                    $aPageProperties['fLat'] =
                        (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
                } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                    $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
                }

                if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                    $aPageProperties['fLon'] =
                        (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
                } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                    $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
                }
            }
        }
    }

    // Without a real infobox, fall back to the guessed type, marked with a leading '#'.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }

    return $aPageProperties;
}
308 if (isset($aCMDResult['parse-wikipedia'])) {
// Select the titles of the namespace-0 pages in one shard (page_id modulo 10) whose content
// mentions a Coord template, or both "lat" and "lon". The shard number comes from the command
// line, so it is cast to int before being placed into the SQL text.
$sSQL = 'select page_title from content where page_namespace = 0 and page_id %10 = ';
$sSQL .= (int)$aCMDResult['parse-wikipedia'];
$sSQL .= ' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))';
$aArticleNames = $oDB->getCol($sSQL);
// Disabled alternative query restricted to a single article title, kept for debugging.
314 /* $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0
315 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\'
316 and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
// For every selected article: fetch its text, extract the template properties and build one
// UPDATE statement per property found for the matching English wikipedia_article row.
// NOTE(review): the statements that execute each built $sSQL are not visible in this excerpt.
318 foreach ($aArticleNames as $sArticleName) {
319 $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
320 $aP = _templatesToProperties(_parseWikipediaContent($sPageText));
322 if (isset($aP['sInfoboxType'])) {
// Collapse runs of whitespace in the infobox type to single spaces before storing it.
323 $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
324 $sSQL = 'update wikipedia_article set ';
325 $sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
326 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
329 if (isset($aP['iPopulation'])) {
330 $sSQL = 'update wikipedia_article set ';
331 $sSQL .= 'population = \''.pg_escape_string($aP['iPopulation']).'\'';
332 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
335 if (isset($aP['sWebsite'])) {
336 $sSQL = 'update wikipedia_article set ';
337 $sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
338 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// Store the coordinate unless both latitude and longitude compare equal to '-0'
// (presumably the result of an empty coordinate template — confirm).
341 if (isset($aP['fLat']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) {
342 if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
// Progress line: title|infobox type|lat|lon
343 echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
344 $sSQL = 'update wikipedia_article set ';
345 $sSQL .= 'lat = \''.pg_escape_string($aP['fLat']).'\',';
346 $sSQL .= 'lon = \''.pg_escape_string($aP['fLon']).'\'';
347 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// Start-element callback installed with xml_set_element_handler(): appends the attribute array
// of an element to the global $aNominatRecords list.
// NOTE(review): any filtering on $sName is not visible in this excerpt — confirm which elements
// are collected.
354 function nominatimXMLStart($hParser, $sName, $aAttr)
356 global $aNominatRecords;
359 $aNominatRecords[] = $aAttr;
// End-element callback installed with xml_set_element_handler() alongside nominatimXMLStart.
// NOTE(review): its body is not visible in this excerpt.
365 function nominatimXMLEnd($hParser, $sName)
// 'link' mode: for English articles that have a latitude but no OSM object yet, query a
// Nominatim search endpoint by name and, when a result lies close enough to the article's
// coordinate, build an UPDATE that stores the result's OSM type and id on the article row.
// NOTE(review): many case labels and closing braces of this section are not visible in this
// excerpt; the comments below describe only the visible statements.
370 if (isset($aCMDResult['link'])) {
372 $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");
374 // If you point this script at production OSM you will be blocked
// NOTE(review): 'SEVERNAME' is a placeholder host (looks like a typo for SERVERNAME) and must
// be replaced with a real Nominatim host before running.
375 $sNominatimBaseURL = 'http://SEVERNAME/search.php';
377 foreach ($aWikiArticles as $aRecord) {
// Article titles use underscores; the search name uses spaces.
378 $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);
380 $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
382 echo "\n-- ".$aRecord['name'].', '.$aRecord['infobox_type']."\n";
// Maximum accepted distance (in degrees) between article and search result; the tiny default
// is replaced per infobox type below.
383 $fMaxDist = 0.0000001;
// Choose feature type, view box and distance limit from the article's infobox type.
385 switch (strtolower($aRecord['infobox_type'])) {
386 case 'former country':
389 $fMaxDist = 60; // effectively turn it off
390 $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
396 $fMaxDist = 60; // effectively turn it off
397 $sURL .= '&featuretype=country';
398 $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
400 case 'prefecture japan':
401 $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
402 // intentionally no break
407 case 'u.s. state symbols':
409 case 'province or territory of canada':
410 case 'indian jurisdiction':
412 case 'french region':
413 case 'region of italy':
415 case '#australia state or territory':
416 case 'russian federal subject':
418 $sURL .= '&featuretype=state';
419 $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
421 case 'protected area':
423 $sURL .= '&nearlat='.$aRecord['lat'];
424 $sURL .= '&nearlon='.$aRecord['lon'];
425 $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
429 // intentionally no break
430 case 'french commune':
431 case 'italian comune':
// NOTE(review): 'italian comune' is listed twice in this group; the second label is redundant.
433 case 'italian comune':
434 case 'australian place':
440 case 'russian inhabited locality':
441 case 'finnish municipality/land area':
442 case 'england county':
443 case 'israel municipality':
447 $sURL .= '&featuretype=settlement';
448 $sURL .= '&viewbox='.($aRecord['lon']-0.5).','.($aRecord['lat']+0.5).','.($aRecord['lon']+0.5).','.($aRecord['lat']-0.5);
451 case 'mountain pass':
456 $sURL .= '&viewbox='.($aRecord['lon']-0.5).','.($aRecord['lat']+0.5).','.($aRecord['lon']+0.5).','.($aRecord['lat']-0.5);
460 $aTypes = array('wreck');
461 $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01);
462 $sURL .= '&nearlat='.$aRecord['lat'];
463 $sURL .= '&nearlon='.$aRecord['lon'];
470 $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01);
471 $sURL .= '&bounded=1';
472 $sURL .= '&nearlat='.$aRecord['lat'];
473 $sURL .= '&nearlon='.$aRecord['lon'];
478 $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01);
479 // $sURL .= "&bounded=1";
480 $sURL .= '&nearlat='.$aRecord['lat'];
481 $sURL .= '&nearlon='.$aRecord['lon'];
482 echo '-- Unknown: '.$aRecord['infobox_type']."\n";
// First attempt: search for the full article name.
// NOTE(review): the return value of file_get_contents() is passed on unchecked.
485 $sNameURL = $sURL.'&q='.urlencode($aRecord['name']);
488 $sXML = file_get_contents($sNameURL);
// Parse the response; nominatimXMLStart fills $aNominatRecords with element attributes.
490 $aNominatRecords = array();
491 $hXMLParser = xml_parser_create();
492 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
493 xml_parse($hXMLParser, $sXML, true);
494 xml_parser_free($hXMLParser);
// Nothing found: retry with only the part of the name before the first '(' or ','.
496 if (!isset($aNominatRecords[0])) {
497 $aNameParts = preg_split('#[(,]#', $aRecord['name']);
498 if (count($aNameParts) > 1) {
499 $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0]));
501 $sXML = file_get_contents($sNameURL);
503 $aNominatRecords = array();
504 $hXMLParser = xml_parser_create();
505 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
506 xml_parse($hXMLParser, $sXML, true);
507 xml_parser_free($hXMLParser);
511 // assume first is best/right
512 for ($i = 0; $i < count($aNominatRecords); $i++) {
// Straight-line distance, in degrees, between the article coordinate and the search result.
513 $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']);
514 $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']);
515 $fDiff = sqrt($fDiff);
517 // If it was an unknown type base it on the rank of the found result
// Lower place ranks get a more generous distance limit.
518 $iRank = (int)$aNominatRecords[$i]['PLACE_RANK'];
519 if ($iRank <= 4) $fMaxDist = 2;
520 elseif ($iRank <= 8) $fMaxDist = 1;
521 elseif ($iRank <= 10) $fMaxDist = 0.8;
522 elseif ($iRank <= 12) $fMaxDist = 0.6;
523 elseif ($iRank <= 17) $fMaxDist = 0.2;
524 elseif ($iRank <= 18) $fMaxDist = 0.1;
525 elseif ($iRank <= 22) $fMaxDist = 0.02;
526 elseif ($iRank <= 26) $fMaxDist = 0.001;
527 else $fMaxDist = 0.001;
529 echo '-- FOUND "'.substr($aNominatRecords[$i]['DISPLAY_NAME'], 0, 50);
530 echo '", '.$aNominatRecords[$i]['CLASS'].', '.$aNominatRecords[$i]['TYPE'];
531 echo ', '.$aNominatRecords[$i]['PLACE_RANK'].', '.$aNominatRecords[$i]['OSM_TYPE'];
532 echo " (dist:$fDiff, max:$fMaxDist)\n";
533 if ($fDiff > $fMaxDist) {
534 echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n";
// Close enough: build the UPDATE that links the article to the OSM object.
536 $sSQL = 'update wikipedia_article set osm_type=';
537 switch ($aNominatRecords[$i]['OSM_TYPE']) {
// NOTE(review): OSM_ID comes from the remote XML response and is appended to the SQL text
// without escaping or an integer cast.
548 $sSQL .= ', osm_id='.$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";