From: Sarah Hoffmann <lonvia@denofr.de>
Date: Sun, 15 Oct 2017 16:07:55 +0000 (+0200)
Subject: Merge pull request #814 from lonvia/phrase-as-a-class
X-Git-Tag: v3.1.0~43
X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/fcf7fcee03c8d9a67e1ecace61df81aa81201936?hp=7ea1ef3feb587b58f74784e6802c2e74d90e22cc

Merge pull request #814 from lonvia/phrase-as-a-class

Make phrases a class and add early checking of token validity
---

diff --git a/lib/Geocode.php b/lib/Geocode.php
index 16919bb8..be543012 100644
--- a/lib/Geocode.php
+++ b/lib/Geocode.php
@@ -3,6 +3,7 @@
 namespace Nominatim;
 
 require_once(CONST_BasePath.'/lib/PlaceLookup.php');
+require_once(CONST_BasePath.'/lib/Phrase.php');
 require_once(CONST_BasePath.'/lib/ReverseGeocode.php');
 require_once(CONST_BasePath.'/lib/SearchDescription.php');
 require_once(CONST_BasePath.'/lib/SearchContext.php');
@@ -668,7 +669,7 @@ class Geocode
         return $aSearchResults;
     }
 
-    public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery)
+    public function getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bIsStructured)
     {
         /*
              Calculate all searches using aValidTokens i.e.
@@ -683,15 +684,11 @@ class Geocode
          */
         $iGlobalRank = 0;
 
-        foreach ($aPhrases as $iPhrase => $aPhrase) {
+        foreach ($aPhrases as $iPhrase => $oPhrase) {
             $aNewPhraseSearches = array();
-            if ($bStructuredPhrases) {
-                $sPhraseType = $aPhraseTypes[$iPhrase];
-            } else {
-                $sPhraseType = '';
-            }
+            $sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : '';
 
-            foreach ($aPhrase['wordsets'] as $iWordSet => $aWordset) {
+            foreach ($oPhrase->getWordSets() as $iWordSet => $aWordset) {
                 // Too many permutations - too expensive
                 if ($iWordSet > 120) break;
 
@@ -710,17 +707,8 @@ class Geocode
                         // If the token is valid
                         if (isset($aValidTokens[' '.$sToken])) {
                             foreach ($aValidTokens[' '.$sToken] as $aSearchTerm) {
-                                // Recheck if the original word shows up in the query.
-                                $bWordInQuery = false;
-                                if (isset($aSearchTerm['word']) && $aSearchTerm['word']) {
-                                    $bWordInQuery = strpos(
-                                        $sNormQuery,
-                                        $this->normTerm($aSearchTerm['word'])
-                                    ) !== false;
-                                }
                                 $aNewSearches = $oCurrentSearch->extendWithFullTerm(
                                     $aSearchTerm,
-                                    $bWordInQuery,
                                     isset($aValidTokens[$sToken])
                                       && strpos($sToken, ' ') === false,
                                     $sPhraseType,
@@ -746,9 +734,8 @@ class Geocode
                             foreach ($aValidTokens[$sToken] as $aSearchTerm) {
                                 $aNewSearches = $oCurrentSearch->extendWithPartialTerm(
                                     $aSearchTerm,
-                                    $bStructuredPhrases,
+                                    $bIsStructured,
                                     $iPhrase,
-                                    $aWordFrequencyScores,
                                     isset($aValidTokens[' '.$sToken]) ? $aValidTokens[' '.$sToken] : array()
                                 );
 
@@ -806,7 +793,7 @@ class Geocode
         // Revisit searches, drop bad searches and give penalty to unlikely combinations.
         $aGroupedSearches = array();
         foreach ($aSearches as $oSearch) {
-            if (!$oSearch->isValidSearch($this->aCountryCodes)) {
+            if (!$oSearch->isValidSearch()) {
                 continue;
             }
 
@@ -955,10 +942,10 @@ class Geocode
             // Split query into phrases
             // Commas are used to reduce the search space by indicating where phrases split
             if ($this->aStructuredQuery) {
-                $aPhrases = $this->aStructuredQuery;
+                $aInPhrases = $this->aStructuredQuery;
                 $bStructuredPhrases = true;
             } else {
-                $aPhrases = explode(',', $sQuery);
+                $aInPhrases = explode(',', $sQuery);
                 $bStructuredPhrases = false;
             }
 
@@ -967,25 +954,19 @@ class Geocode
             // Get all 'sets' of words
             // Generate a complete list of all
             $aTokens = array();
-            foreach ($aPhrases as $iPhrase => $sPhrase) {
-                $aPhrase = chksql(
-                    $this->oDB->getRow("SELECT make_standard_name('".pg_escape_string($sPhrase)."') as string"),
+            $aPhrases = array();
+            foreach ($aInPhrases as $iPhrase => $sPhrase) {
+                $sPhrase = chksql(
+                    $this->oDB->getOne('SELECT make_standard_name('.getDBQuoted($sPhrase).')'),
                     "Cannot normalize query string (is it a UTF-8 string?)"
                 );
-                if (trim($aPhrase['string'])) {
-                    $aPhrases[$iPhrase] = $aPhrase;
-                    $aPhrases[$iPhrase]['words'] = explode(' ', $aPhrases[$iPhrase]['string']);
-                    $aPhrases[$iPhrase]['wordsets'] = getWordSets($aPhrases[$iPhrase]['words'], 0);
-                    $aTokens = array_merge($aTokens, getTokensFromSets($aPhrases[$iPhrase]['wordsets']));
-                } else {
-                    unset($aPhrases[$iPhrase]);
+                if (trim($sPhrase)) {
+                    $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
+                    $oPhrase->addTokens($aTokens);
+                    $aPhrases[] = $oPhrase;
                 }
             }
 
-            // Reindex phrases - we make assumptions later on that they are numerically keyed in order
-            $aPhraseTypes = array_keys($aPhrases);
-            $aPhrases = array_values($aPhrases);
-
             if (sizeof($aTokens)) {
                 // Check which tokens we have, get the ID numbers
                 $sSQL = 'SELECT word_id, word_token, word, class, type, country_code, operator, search_name_count';
@@ -999,22 +980,29 @@ class Geocode
                     $this->oDB->getAll($sSQL),
                     "Could not get word tokens."
                 );
-                $aPossibleMainWordIDs = array();
                 $aWordFrequencyScores = array();
                 foreach ($aDatabaseWords as $aToken) {
-                    // Very special case - require 2 letter country param to match the country code found
-                    if ($bStructuredPhrases && $aToken['country_code'] && !empty($this->aStructuredQuery['country'])
-                        && strlen($this->aStructuredQuery['country']) == 2 && strtolower($this->aStructuredQuery['country']) != $aToken['country_code']
+                    // Filter country tokens that do not match restricted countries.
+                    if ($this->aCountryCodes
+                        && $aToken['country_code']
+                        && !in_array($aToken['country_code'], $this->aCountryCodes)
                     ) {
                         continue;
                     }
 
+                    // Special terms need to appear in their normalized form.
+                    if ($aToken['word'] && $aToken['class']) {
+                        $sNormWord = $this->normTerm($aToken['word']);
+                        if (strpos($sNormQuery, $sNormWord) === false) {
+                            continue;
+                        }
+                    }
+
                     if (isset($aValidTokens[$aToken['word_token']])) {
                         $aValidTokens[$aToken['word_token']][] = $aToken;
                     } else {
                         $aValidTokens[$aToken['word_token']] = array($aToken);
                     }
-                    if (!$aToken['class'] && !$aToken['country_code']) $aPossibleMainWordIDs[$aToken['word_id']] = 1;
                     $aWordFrequencyScores[$aToken['word_id']] = $aToken['search_name_count'] + 1;
                 }
                 if (CONST_Debug) var_Dump($aPhrases, $aValidTokens);
@@ -1046,19 +1034,18 @@ class Geocode
                 // Any words that have failed completely?
                 // TODO: suggestions
 
-                $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery);
+                $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bStructuredPhrases);
 
                 if ($this->bReverseInPlan) {
                     // Reverse phrase array and also reverse the order of the wordsets in
                     // the first and final phrase. Don't bother about phrases in the middle
                     // because order in the address doesn't matter.
                     $aPhrases = array_reverse($aPhrases);
-                    $aPhrases[0]['wordsets'] = getInverseWordSets($aPhrases[0]['words'], 0);
+                    $aPhrases[0]->invertWordSets();
                     if (sizeof($aPhrases) > 1) {
-                        $aFinalPhrase = end($aPhrases);
-                        $aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0);
+                        $aPhrases[sizeof($aPhrases)-1]->invertWordSets();
                     }
-                    $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery);
+                    $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, false);
 
                     foreach ($aGroupedSearches as $aSearches) {
                         foreach ($aSearches as $aSearch) {
@@ -1288,8 +1275,7 @@ class Geocode
 
             $aResult['name'] = $aResult['langaddress'];
 
-            if ($oCtx->hasNearPoint())
-            {
+            if ($oCtx->hasNearPoint()) {
                 $aResult['importance'] = 0.001;
                 $aResult['foundorder'] = $aResult['addressimportance'];
             } else {
diff --git a/lib/Phrase.php b/lib/Phrase.php
new file mode 100644
index 00000000..b39079d9
--- /dev/null
+++ b/lib/Phrase.php
@@ -0,0 +1,116 @@
+<?php
+
+namespace Nominatim;
+
+/**
+ * Segment of a query string.
+ *
+ * The parts of a query string are usually separated by commas.
+ */
+class Phrase
+{
+    const MAX_DEPTH = 7;
+
+    // Complete phrase as a string.
+    private $sPhrase;
+    // Element type for structured searches.
+    private $sPhraseType;
+    // Space-separated words of the phrase.
+    private $aWords;
+    // Possible segmentations of the phrase.
+    private $aWordSets;
+
+
+    public function __construct($sPhrase, $sPhraseType)
+    {
+        $this->sPhrase = trim($sPhrase);
+        $this->sPhraseType = $sPhraseType;
+        $this->aWords = explode(' ', $this->sPhrase);
+        $this->aWordSets = $this->createWordSets($this->aWords, 0);
+    }
+
+    /**
+     * Return the element type of the phrase.
+     *
+     * @return string Phrase type if the phrase comes from a structured query
+     *                or empty string otherwise.
+     */
+    public function getPhraseType()
+    {
+        return $this->sPhraseType;
+    }
+
+    /**
+     * Return the array of possible segmentations of the phrase.
+     *
+     * @return string[][] Array of segmentations, each consisting of an
+     *                    array of terms.
+     */
+    public function getWordSets()
+    {
+        return $this->aWordSets;
+    }
+
+    /**
+     * Add the tokens from this phrase to the given list of tokens.
+     *
+     * @param string[] $aTokens List of tokens to append to.
+     *
+     * @return void
+     */
+    public function addTokens(&$aTokens)
+    {
+        foreach ($this->aWordSets as $aSet) {
+            foreach ($aSet as $sWord) {
+                $aTokens[' '.$sWord] = ' '.$sWord;
+                $aTokens[$sWord] = $sWord;
+            }
+        }
+    }
+
+    /**
+     * Invert the set of possible segmentations.
+     *
+     * @return void
+     */
+    public function invertWordSets()
+    {
+        $this->aWordSets = $this->createInverseWordSets($this->aWords, 0);
+    }
+
+    private function createWordSets($aWords, $iDepth)
+    {
+        $aResult = array(array(join(' ', $aWords)));
+        $sFirstToken = '';
+        if ($iDepth < Phrase::MAX_DEPTH) {
+            while (sizeof($aWords) > 1) {
+                $sWord = array_shift($aWords);
+                $sFirstToken .= ($sFirstToken?' ':'').$sWord;
+                $aRest = $this->createWordSets($aWords, $iDepth + 1);
+                foreach ($aRest as $aSet) {
+                    $aResult[] = array_merge(array($sFirstToken), $aSet);
+                }
+            }
+        }
+
+        return $aResult;
+    }
+
+    private function createInverseWordSets($aWords, $iDepth)
+    {
+        $aResult = array(array(join(' ', $aWords)));
+        $sFirstToken = '';
+        if ($iDepth < Phrase::MAX_DEPTH) {
+            while (sizeof($aWords) > 1) {
+                $sWord = array_pop($aWords);
+                $sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken;
+                $aRest = $this->createInverseWordSets($aWords, $iDepth + 1);
+                foreach ($aRest as $aSet) {
+                    $aResult[] = array_merge(array($sFirstToken), $aSet);
+                }
+            }
+        }
+
+        return $aResult;
+    }
+}
diff --git a/lib/SearchDescription.php b/lib/SearchDescription.php
index 1f3765ab..eba5f6a9 100644
--- a/lib/SearchDescription.php
+++ b/lib/SearchDescription.php
@@ -155,22 +155,17 @@ class SearchDescription
     /**
      * Check if the combination of parameters is sensible.
      *
-     * @param string[] $aCountryCodes List of country codes.
-     *
      * @return bool True, if the search looks valid.
      */
-    public function isValidSearch(&$aCountryCodes)
+    public function isValidSearch()
     {
         if (!sizeof($this->aName)) {
             if ($this->sHouseNumber) {
                 return false;
             }
-        }
-        if ($aCountryCodes
-            && $this->sCountryCode
-            && !in_array($this->sCountryCode, $aCountryCodes)
-        ) {
-            return false;
+            if (!$this->sClass && !$this->sCountryCode) {
+                return false;
+            }
         }
 
         return true;
@@ -183,8 +178,6 @@ class SearchDescription
      * Derive new searches by adding a full term to the existing search.
      *
      * @param mixed[] $aSearchTerm  Description of the token.
-     * @param bool    $bWordInQuery True, if the normalised version of the word
-     *                              is contained in the query.
      * @param bool    $bHasPartial  True if there are also tokens of partial terms
      *                              with the same name.
      * @param string  $sPhraseType  Type of phrase the token is contained in.
@@ -198,7 +191,7 @@ class SearchDescription
      *
      * @return SearchDescription[] List of derived search descriptions.
      */
-    public function extendWithFullTerm($aSearchTerm, $bWordInQuery, $bHasPartial, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken, &$iGlobalRank)
+    public function extendWithFullTerm($aSearchTerm, $bHasPartial, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken, &$iGlobalRank)
     {
         $aNewSearches = array();
 
@@ -229,7 +222,8 @@ class SearchDescription
             // We need to try the case where the postal code is the primary element
             // (i.e. no way to tell if it is (postalcode, city) OR (city, postalcode)
             // so try both.
-            if (!$this->sPostcode && $bWordInQuery
+            if (!$this->sPostcode
+                && $aSearchTerm['word']
                 && pg_escape_string($aSearchTerm['word']) == $aSearchTerm['word']
             ) {
                 // If we have structured search or this is the first term,
@@ -278,16 +272,8 @@ class SearchDescription
                 }
                 $aNewSearches[] = $oSearch;
             }
-        } elseif ($sPhraseType == ''
-                  && $aSearchTerm['class'] !== '' && $aSearchTerm['class'] !== null
-        ) {
-            // require a normalized exact match of the term
-            // if we have the normalizer version of the query
-            // available
-            if ($this->iOperator == Operator::NONE
-                && (isset($aSearchTerm['word']) && $aSearchTerm['word'])
-                && $bWordInQuery
-            ) {
+        } elseif ($sPhraseType == '' && $aSearchTerm['class']) {
+            if ($this->iOperator == Operator::NONE) {
                 $oSearch = clone $this;
                 $oSearch->iSearchRank++;
 
@@ -302,7 +288,10 @@ class SearchDescription
                 $oSearch->setPoiSearch($iOp, $aSearchTerm['class'], $aSearchTerm['type']);
                 $aNewSearches[] = $oSearch;
             }
-        } elseif (isset($aSearchTerm['word_id']) && $aSearchTerm['word_id']) {
+        } elseif (isset($aSearchTerm['word_id'])
+                  && $aSearchTerm['word_id']
+                  && $sPhraseType != 'country'
+        ) {
             $iWordID = $aSearchTerm['word_id'];
             if (sizeof($this->aName)) {
                 if (($sPhraseType == '' || !$bFirstPhrase)
@@ -330,17 +319,15 @@ class SearchDescription
     /**
      * Derive new searches by adding a partial term to the existing search.
      *
-     * @param mixed[] $aSearchTerm          Description of the token.
-     * @param bool    $bStructuredPhrases   True if the search is structured.
-     * @param integer $iPhrase              Number of the phrase the token is in.
-     * @param mixed[] $aWordFrequencyScores Number of times tokens appears
-     *                                      overall in a planet database.
-     * @param array[] $aFullTokens          List of full term tokens with the
-     *                                      same name.
+     * @param mixed[] $aSearchTerm        Description of the token.
+     * @param bool    $bStructuredPhrases True if the search is structured.
+     * @param integer $iPhrase            Number of the phrase the token is in.
+     * @param array[] $aFullTokens        List of full term tokens with the
+     *                                    same name.
      *
      * @return SearchDescription[] List of derived search descriptions.
      */
-    public function extendWithPartialTerm($aSearchTerm, $bStructuredPhrases, $iPhrase, &$aWordFrequencyScores, $aFullTokens)
+    public function extendWithPartialTerm($aSearchTerm, $bStructuredPhrases, $iPhrase, $aFullTokens)
     {
         // Only allow name terms.
         if (!(isset($aSearchTerm['word_id']) && $aSearchTerm['word_id'])) {
@@ -354,7 +341,7 @@ class SearchDescription
             && sizeof($this->aName)
             && strpos($aSearchTerm['word_token'], ' ') === false
         ) {
-            if ($aWordFrequencyScores[$iWordID] < CONST_Max_Word_Frequency) {
+            if ($aSearchTerm['search_name_count'] + 1 < CONST_Max_Word_Frequency) {
                 $oSearch = clone $this;
                 $oSearch->iSearchRank++;
                 $oSearch->aAddress[$iWordID] = $iWordID;
@@ -397,7 +384,7 @@ class SearchDescription
             if (preg_match('#^[0-9]+$#', $aSearchTerm['word_token'])) {
                 $oSearch->iSearchRank += 2;
             }
-            if ($aWordFrequencyScores[$iWordID] < CONST_Max_Word_Frequency) {
+            if ($aSearchTerm['search_name_count'] + 1 < CONST_Max_Word_Frequency) {
                 $oSearch->aName[$iWordID] = $iWordID;
             } else {
                 $oSearch->aNameNonSearch[$iWordID] = $iWordID;
diff --git a/lib/lib.php b/lib/lib.php
index b5fbee3e..76775d6c 100644
--- a/lib/lib.php
+++ b/lib/lib.php
@@ -60,54 +60,6 @@ function byImportance($a, $b)
 }
 
 
-function getWordSets($aWords, $iDepth)
-{
-    $aResult = array(array(join(' ', $aWords)));
-    $sFirstToken = '';
-    if ($iDepth < 7) {
-        while (sizeof($aWords) > 1) {
-            $sWord = array_shift($aWords);
-            $sFirstToken .= ($sFirstToken?' ':'').$sWord;
-            $aRest = getWordSets($aWords, $iDepth+1);
-            foreach ($aRest as $aSet) {
-                $aResult[] = array_merge(array($sFirstToken), $aSet);
-            }
-        }
-    }
-    return $aResult;
-}
-
-function getInverseWordSets($aWords, $iDepth)
-{
-    $aResult = array(array(join(' ', $aWords)));
-    $sFirstToken = '';
-    if ($iDepth < 8) {
-        while (sizeof($aWords) > 1) {
-            $sWord = array_pop($aWords);
-            $sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken;
-            $aRest = getInverseWordSets($aWords, $iDepth+1);
-            foreach ($aRest as $aSet) {
-                $aResult[] = array_merge(array($sFirstToken), $aSet);
-            }
-        }
-    }
-    return $aResult;
-}
-
-
-function getTokensFromSets($aSets)
-{
-    $aTokens = array();
-    foreach ($aSets as $aSet) {
-        foreach ($aSet as $sWord) {
-            $aTokens[' '.$sWord] = ' '.$sWord;
-            $aTokens[$sWord] = $sWord;
-        }
-    }
-    return $aTokens;
-}
-
-
 function getClassTypes()
 {
     return array(
diff --git a/test/php/Nominatim/NominatimTest.php b/test/php/Nominatim/NominatimTest.php
index 33bb6d32..cae3ebb8 100644
--- a/test/php/Nominatim/NominatimTest.php
+++ b/test/php/Nominatim/NominatimTest.php
@@ -66,76 +66,6 @@ class NominatimTest extends \PHPUnit_Framework_TestCase
     }
 
 
-    public function testGetWordSets()
-    {
-        // given an array of arrays like
-        // array( array('a','b'), array('c','d') )
-        // returns a summary as string: '(a|b),(c|d)'
-
-
-        function serializeSets($aSets)
-        {
-            $aParts = array();
-            foreach ($aSets as $aSet) {
-                $aParts[] = '(' . join('|', $aSet) . ')';
-            }
-            return join(',', $aParts);
-        }
-
-        $this->assertEquals(
-            array(array('')),
-            getWordSets(array(), 0)
-        );
-
-        $this->assertEquals(
-            '(a)',
-            serializeSets(getWordSets(array("a"), 0))
-        );
-
-        $this->assertEquals(
-            '(a b),(a|b)',
-            serializeSets(getWordSets(array('a', 'b'), 0))
-        );
-
-        $this->assertEquals(
-            '(a b c),(a|b c),(a|b|c),(a b|c)',
-            serializeSets(getWordSets(array('a', 'b', 'c'), 0))
-        );
-
-        $this->assertEquals(
-            '(a b c d),(a|b c d),(a|b|c d),(a|b|c|d),(a|b c|d),(a b|c d),(a b|c|d),(a b c|d)',
-            serializeSets(getWordSets(array('a', 'b', 'c', 'd'), 0))
-        );
-
-
-        // Inverse
-        $this->assertEquals(
-            '(a b c),(c|a b),(c|b|a),(b c|a)',
-            serializeSets(getInverseWordSets(array('a', 'b', 'c'), 0))
-        );
-
-
-        // make sure we don't create too many sets
-        // 4 words => 8 sets
-        // 10 words => 511 sets
-        // 15 words => 12911 sets
-        // 18 words => 65536 sets
-        // 20 words => 169766 sets
-        // 22 words => 401930 sets
-        // 28 words => 3505699 sets (needs more than 4GB via 'phpunit -d memory_limit=' to run)
-        $this->assertEquals(
-            8,
-            count(getWordSets(array_fill(0, 4, 'a'), 0))
-        );
-
-
-        $this->assertEquals(
-            41226,
-            count(getWordSets(array_fill(0, 18, 'a'), 0))
-        );
-    }
-
-
     public function testCreatePointsAroundCenter()
     {
         // you might say we're creating a circle
diff --git a/test/php/Nominatim/PhraseTest.php b/test/php/Nominatim/PhraseTest.php
new file mode 100644
index 00000000..db8d8b50
--- /dev/null
+++ b/test/php/Nominatim/PhraseTest.php
@@ -0,0 +1,87 @@
+<?php
+
+namespace Nominatim;
+
+require_once '../../lib/Phrase.php';
+
+class PhraseTest extends \PHPUnit_Framework_TestCase
+{
+    private function serializeSets($aSets)
+    {
+        $aParts = array();
+        foreach ($aSets as $aSet) {
+            $aParts[] = '(' . join('|', $aSet) . ')';
+        }
+        return join(',', $aParts);
+    }
+
+
+    public function testEmptyPhrase()
+    {
+        $oPhrase = new Phrase('', '');
+
+        $this->assertEquals(
+            array(array('')),
+            $oPhrase->getWordSets()
+        );
+    }
+
+
+    public function testSingleWordPhrase()
+    {
+        $oPhrase = new Phrase('a', '');
+
+        $this->assertEquals(
+            '(a)',
+            $this->serializeSets($oPhrase->getWordSets())
+        );
+    }
+
+
+    public function testMultiWordPhrase()
+    {
+        $oPhrase = new Phrase('a b', '');
+        $this->assertEquals(
+            '(a b),(a|b)',
+            $this->serializeSets($oPhrase->getWordSets())
+        );
+
+        $oPhrase = new Phrase('a b c', '');
+        $this->assertEquals(
+            '(a b c),(a|b c),(a|b|c),(a b|c)',
+            $this->serializeSets($oPhrase->getWordSets())
+        );
+
+        $oPhrase = new Phrase('a b c d', '');
+        $this->assertEquals(
+            '(a b c d),(a|b c d),(a|b|c d),(a|b|c|d),(a|b c|d),(a b|c d),(a b|c|d),(a b c|d)',
+            $this->serializeSets($oPhrase->getWordSets())
+        );
+    }
+
+
+    public function testInverseWordSets()
+    {
+        $oPhrase = new Phrase('a b c', '');
+        $oPhrase->invertWordSets();
+
+        $this->assertEquals(
+            '(a b c),(c|a b),(c|b|a),(b c|a)',
+            $this->serializeSets($oPhrase->getWordSets())
+        );
+    }
+
+
+    public function testMaxDepth()
+    {
+        $oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), '');
+        $this->assertEquals(8, count($oPhrase->getWordSets()));
+        $oPhrase->invertWordSets();
+        $this->assertEquals(8, count($oPhrase->getWordSets()));
+
+        $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
+        $this->assertEquals(41226, count($oPhrase->getWordSets()));
+        $oPhrase->invertWordSets();
+        $this->assertEquals(41226, count($oPhrase->getWordSets()));
+    }
+}