]> git.openstreetmap.org Git - nominatim.git/blobdiff - lib-php/SimpleWordList.php
php: make word list a first-class object
[nominatim.git] / lib-php / SimpleWordList.php
diff --git a/lib-php/SimpleWordList.php b/lib-php/SimpleWordList.php
new file mode 100644 (file)
index 0000000..3dd9bda
--- /dev/null
@@ -0,0 +1,131 @@
+<?php
+
+namespace Nominatim;
+
+/**
+ * A word list creator based on simple splitting by space.
+ *
+ * Creates possible permutations of split phrases by finding all combination
+ * of splitting the phrase on space boundaries.
+ */
+class SimpleWordList
+{
+    const MAX_WORDSET_LEN = 20;
+    const MAX_WORDSETS = 100;
+
+    // The phrase as a list of simple terms (without spaces).
+    private $aWords;
+
+    /**
+     * Create a new word list
+     *
+     * @param string sPhrase  Phrase to create the word list from. The phrase is
+     *                        expected to be normalised, so that there are no
+     *                        subsequent spaces.
+     */
+    public function __construct($sPhrase)
+    {
+        if (strlen($sPhrase) > 0) {
+            $this->aWords = explode(' ', $sPhrase);
+        } else {
+            $this->aWords = array();
+        }
+    }
+
+    /**
+     * Get all possible tokens that are present in this word list.
+     *
+     * @return array The list of string tokens in the word list.
+     */
+    public function getTokens()
+    {
+        $aTokens = array();
+        $iNumWords = count($this->aWords);
+
+        for ($i = 0; $i < $iNumWords; $i++) {
+            $sPhrase = $this->aWords[$i];
+            $aTokens[$sPhrase] = $sPhrase;
+
+            for ($j = $i + 1; $j < $iNumWords; $j++) {
+                $sPhrase .= ' '.$this->aWords[$j];
+                $aTokens[$sPhrase] = $sPhrase;
+            }
+        }
+
+        return $aTokens;
+    }
+
+    /**
+     * Compute all possible permutations of phrase splits that result in
+     * words which are in the token list.
+     */
+    public function getWordSets($oTokens)
+    {
+        $iNumWords = count($this->aWords);
+
+        if ($iNumWords == 0) {
+            return null;
+        }
+
+        // Caches the word set for the partial phrase up to word i.
+        $aSetCache = array_fill(0, $iNumWords, array());
+
+        // Initialise first element of cache. There can only be the word.
+        if ($oTokens->containsAny($this->aWords[0])) {
+            $aSetCache[0][] = array($this->aWords[0]);
+        }
+
+        // Now do the next elements using what we already have.
+        for ($i = 1; $i < $iNumWords; $i++) {
+            for ($j = $i; $j > 0; $j--) {
+                $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
+                if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
+                    $aPartial = array($sPartial);
+                    foreach ($aSetCache[$j - 1] as $aSet) {
+                        if (count($aSet) < SimpleWordList::MAX_WORDSET_LEN) {
+                            $aSetCache[$i][] = array_merge($aSet, $aPartial);
+                        }
+                    }
+                    if (count($aSetCache[$i]) > 2 * SimpleWordList::MAX_WORDSETS) {
+                        usort(
+                            $aSetCache[$i],
+                            array('\Nominatim\SimpleWordList', 'cmpByArraylen')
+                        );
+                        $aSetCache[$i] = array_slice(
+                            $aSetCache[$i],
+                            0,
+                            SimpleWordList::MAX_WORDSETS
+                        );
+                    }
+                }
+            }
+
+            // finally the current full phrase
+            $sPartial = $this->aWords[0].' '.$sPartial;
+            if ($oTokens->containsAny($sPartial)) {
+                $aSetCache[$i][] = array($sPartial);
+            }
+        }
+
+        $aWordSets = $aSetCache[$iNumWords - 1];
+        usort($aWordSets, array('\Nominatim\SimpleWordList', 'cmpByArraylen'));
+        return array_slice($aWordSets, 0, SimpleWordList::MAX_WORDSETS);
+    }
+
+    public static function cmpByArraylen($aA, $aB)
+    {
+        $iALen = count($aA);
+        $iBLen = count($aB);
+
+        if ($iALen == $iBLen) {
+            return 0;
+        }
+
+        return ($iALen < $iBLen) ? -1 : 1;
+    }
+
+    public function debugInfo()
+    {
+        return $this->aWords;
+    }
+}