3 * SPDX-License-Identifier: GPL-2.0-only
5 * This file is part of Nominatim. (https://nominatim.org)
7 * Copyright (C) 2022 by the Nominatim developer community.
8 * For a full list of authors see the git log.
14 * A word list creator based on simple splitting by space.
16 * Creates possible permutations of split phrases by finding all combination
17 * of splitting the phrase on space boundaries.
21 const MAX_WORDSET_LEN = 20;
22 const MAX_WORDSETS = 100;
24 // The phrase as a list of simple terms (without spaces).
28 * Create a new word list
30 * @param string sPhrase Phrase to create the word list from. The phrase is
31 * expected to be normalised, so that there are no
34 public function __construct($sPhrase)
36 if (strlen($sPhrase) > 0) {
37 $this->aWords = explode(' ', $sPhrase);
39 $this->aWords = array();
44 * Get all possible tokens that are present in this word list.
46 * @return array The list of string tokens in the word list.
48 public function getTokens()
51 $iNumWords = count($this->aWords);
53 for ($i = 0; $i < $iNumWords; $i++) {
54 $sPhrase = $this->aWords[$i];
55 $aTokens[$sPhrase] = $sPhrase;
57 for ($j = $i + 1; $j < $iNumWords; $j++) {
58 $sPhrase .= ' '.$this->aWords[$j];
59 $aTokens[$sPhrase] = $sPhrase;
67 * Compute all possible permutations of phrase splits that result in
68 * words which are in the token list.
70 public function getWordSets($oTokens)
72 $iNumWords = count($this->aWords);
74 if ($iNumWords == 0) {
78 // Caches the word set for the partial phrase up to word i.
79 $aSetCache = array_fill(0, $iNumWords, array());
81 // Initialise first element of cache. There can only be the word.
82 if ($oTokens->containsAny($this->aWords[0])) {
83 $aSetCache[0][] = array($this->aWords[0]);
86 // Now do the next elements using what we already have.
87 for ($i = 1; $i < $iNumWords; $i++) {
88 for ($j = $i; $j > 0; $j--) {
89 $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
90 if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
91 $aPartial = array($sPartial);
92 foreach ($aSetCache[$j - 1] as $aSet) {
93 if (count($aSet) < SimpleWordList::MAX_WORDSET_LEN) {
94 $aSetCache[$i][] = array_merge($aSet, $aPartial);
97 if (count($aSetCache[$i]) > 2 * SimpleWordList::MAX_WORDSETS) {
100 array('\Nominatim\SimpleWordList', 'cmpByArraylen')
102 $aSetCache[$i] = array_slice(
105 SimpleWordList::MAX_WORDSETS
111 // finally the current full phrase
112 $sPartial = $this->aWords[0].' '.$sPartial;
113 if ($oTokens->containsAny($sPartial)) {
114 $aSetCache[$i][] = array($sPartial);
118 $aWordSets = $aSetCache[$iNumWords - 1];
119 usort($aWordSets, array('\Nominatim\SimpleWordList', 'cmpByArraylen'));
120 return array_slice($aWordSets, 0, SimpleWordList::MAX_WORDSETS);
123 public static function cmpByArraylen($aA, $aB)
128 if ($iALen == $iBLen) {
132 return ($iALen < $iBLen) ? -1 : 1;
135 public function debugInfo()
137 return $this->aWords;