search.php

Go to the documentation of this file.
00001 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
00002 <html><head><meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1">
00003 <title>Search</title>
00004 <link href="doxygen.css" rel="stylesheet" type="text/css">
00005 </head><body>
00006 <!-- Generated by Doxygen 1.4.0 -->
00007 <div class="qindex">  <form class="search" action="search.php" method="get">
00008 <a class="qindex" href="main.html">Main&nbsp;Page</a> | <a class="qindex" href="namespaces.html">Namespace List</a> | <a class="qindex" href="hierarchy.html">Class&nbsp;Hierarchy</a> | <a class="qindex" href="annotated.html">Class&nbsp;List</a> | <a class="qindex" href="dirs.html">Directories</a> | <a class="qindex" href="files.html">File&nbsp;List</a> | <a class="qindex" href="functions.html">Class&nbsp;Members</a> | <a class="qindex" href="globals.html">File&nbsp;Members</a>  | <span class="searchHL"><u>S</u>earch&nbsp;for&nbsp;
00009 <?php
00010 
/**
 * Returns the heading text displayed above the list of search hits.
 */
function search_results()
{
  $heading = "Search Results";
  return $heading;
}
00015 
/**
 * Builds the sentence that summarises how many documents matched.
 *
 * $num is the number of matching documents. Zero and one get their own
 * wording; any other value is interpolated into the plural sentence.
 */
function matches_text($num)
{
  // nothing matched at all
  if ($num==0)
  {
    return "Sorry, no documents matching your query.";
  }
  // exactly one hit: singular wording
  if ($num==1)
  {
    return "Found <b>1</b> document matching your query.";
  }
  // everything else is reported with the count filled in
  return "Found <b>$num</b> documents matching your query. Showing best matches first.";
}
00031 
/**
 * Returns the label that precedes the per-document list of matched words.
 */
function report_matches()
{
  $label = "Matches: ";
  return $label;
}
00036 
/**
 * Reads four bytes from the current position of $file and combines them
 * into one integer, first byte most significant (big-endian).
 *
 * $file is an open file handle; its position advances by the bytes read.
 */
function readInt($file)
{
  $value = 0;
  for ($i=0; $i<4; $i++)
  {
    // make room for the next byte and append it at the low end
    $value = ($value<<8) | ord(fgetc($file));
  }
  return $value;
}
00043 
/**
 * Reads a zero-terminated string from the current position of $file.
 *
 * Characters are collected until a NUL byte or the end of the file is
 * reached; the terminator is consumed but not included in the result.
 */
function readString($file)
{
  $text = "";
  for (;;)
  {
    $ch = fgetc($file);
    // stop at end of file or at the terminating NUL byte
    if ($ch === false || $ch === "\0")
    {
      break;
    }
    $text .= $ch;
  }
  return $text;
}
00050 
/**
 * Reads the next four characters from $file and returns them as a string.
 *
 * main() compares the result against the index file's "DOXS" signature.
 * Fewer than four characters are returned if the file ends early.
 */
function readHeader($file)
{
  $header = "";
  for ($i=0; $i<4; $i++)
  {
    $header .= fgetc($file);
  }
  return $header;
}
00057 
/**
 * Computes the slot number of $word in the index file's lookup table.
 *
 * The slot is derived from the first two bytes of the word
 * (first byte * 256 + second byte).
 *
 * Returns -1 when the word is shorter than two bytes or when either of
 * its first two bytes is a NUL byte; otherwise the slot number.
 */
function computeIndex($word)
{
  if (strlen($word)<2) return -1;
  // high char of the index
  // (square brackets: the {} string offset syntax no longer parses on PHP 8)
  $hi = ord($word[0]);
  if ($hi==0) return -1;
  // low char of the index
  $lo = ord($word[1]);
  if ($lo==0) return -1;
  // return index
  return $hi*256+$lo;
}
00070 
/**
 * Looks up $word in the index file and appends one entry to $statsList for
 * every indexed word that starts with $word.
 *
 * $file       open handle of the index file
 * $word       the search term (a prefix match is performed)
 * $statsList  list the matches are appended to; each new entry has the keys
 *             "word" (the search term), "match" (the indexed word),
 *             "index" (file offset of the word's statistics),
 *             "full" (true if the indexed word equals the search term) and
 *             "docs" (list of arrays with "idx", "freq", "rank", "hi",
 *             "name" and "url")
 *
 * Returns $statsList (which is also modified in place).
 */
function search($file,$word,&$statsList)
{
  $index = computeIndex($word);
  if ($index==-1) return $statsList; // no valid table slot for this word

  fseek($file,$index*4+4); // 4 bytes per entry, skip header
  $listOffset = readInt($file);
  if (!$listOffset) return $statsList; // no words matching first two characters

  // pass 1: walk the word list of this slot and remember every indexed
  // word that has $word as a prefix
  $start = sizeof($statsList);
  $count = $start;
  $wordLen = strlen($word);
  fseek($file,$listOffset);
  $w = readString($file);
  while ($w)
  {
    $statIdx = readInt($file);
    // byte-wise prefix test; a loose == on two numeric-looking strings
    // would compare them as numbers instead of as text
    if (strncmp($w,$word,$wordLen)==0)
    { // found word that matches (as substring)
      $statsList[$count++]=array(
          "word"=>$word,
          "match"=>$w,
          "index"=>$statIdx,
          "full"=>strlen($w)==$wordLen,
          "docs"=>array()
          );
    }
    $w = readString($file);
  }
  $end = sizeof($statsList);

  // pass 2: load the document statistics of every new match and
  // accumulate the totals used to normalise the ranks
  // NOTE(review): the totals below add up the raw value read from the file
  // (priority flag still in bit 0), whereas the per-document rank uses the
  // shifted "freq" value; kept as in the original - confirm this is intended.
  $totalHi=0;
  $totalFreqHi=0;
  $totalFreqLo=0;
  for ($count=$start;$count<$end;$count++)
  {
    // whole word matches have a double weight
    $multiplier = $statsList[$count]["full"] ? 2 : 1;
    fseek($file,$statsList[$count]["index"]);
    $numDocs = readInt($file);
    $docInfo = array();
    // read docs info + occurrence frequency of the word
    for ($i=0;$i<$numDocs;$i++)
    {
      $idx=readInt($file);
      $freq=readInt($file);
      $docInfo[$i]=array("idx"  => $idx,
                         "freq" => $freq>>1,
                         "rank" => 0.0,
                         "hi"   => $freq&1
                        );
      if ($freq&1) // word occurs in high priority doc
      {
        $totalHi++;
        $totalFreqHi+=$freq*$multiplier;
      }
      else // word occurs in low priority doc
      {
        $totalFreqLo+=$freq*$multiplier;
      }
    }
    // read name and url info for the doc
    for ($i=0;$i<$numDocs;$i++)
    {
      fseek($file,$docInfo[$i]["idx"]);
      $docInfo[$i]["name"]=readString($file);
      $docInfo[$i]["url"]=readString($file);
    }
    $statsList[$count]["docs"]=$docInfo;
  }

  $totalFreq=($totalHi+1)*$totalFreqLo + $totalFreqHi;
  // nothing to normalise by: leave every rank at its initial 0.0 instead of
  // dividing by zero
  if ($totalFreq==0) return $statsList;

  // pass 3: compute frequency rank of the word in each doc
  for ($count=$start;$count<$end;$count++)
  {
    // whole word matches have a double weight
    $multiplier = $statsList[$count]["full"] ? 2 : 1;
    $numDocs = sizeof($statsList[$count]["docs"]);
    for ($i=0;$i<$numDocs;$i++)
    {
      $weighted = $statsList[$count]["docs"][$i]["freq"]*$multiplier;
      if ($statsList[$count]["docs"][$i]["hi"])
      {
        // high priority docs get the low priority total added on top
        $weighted += $totalFreqLo;
      }
      $statsList[$count]["docs"][$i]["rank"]=(float)($weighted)/$totalFreq;
    }
  }
  return $statsList;
}
00168 
/**
 * Merges the per-word search results into one entry per document.
 *
 * $results  list of word entries as produced by search(); each has the keys
 *           "word", "match" and "docs"
 * $docs     map keyed by document url that is filled in place; each entry
 *           holds "url", "name", the summed "rank" over all matching words
 *           and "words", a list of arrays with "word", "match" and "freq"
 *
 * Returns $docs (which is also modified in place).
 */
function combine_results($results,&$docs)
{
  foreach ($results as $wordInfo)
  {
    foreach ($wordInfo["docs"] as $di)
    {
      $key=$di["url"];
      $rank=$di["rank"];
      // direct key lookup instead of scanning the list of all keys for
      // every document
      if (array_key_exists($key,$docs))
      {
        $docs[$key]["rank"]+=$rank;
      }
      else
      {
        $docs[$key] = array("url"=>$key,
            "name"=>$di["name"],
            "rank"=>$rank,
            "words"=>array()
            );
      }
      $docs[$key]["words"][] = array(
               "word"=>$wordInfo["word"],
               "match"=>$wordInfo["match"],
               "freq"=>$di["freq"]
               );
    }
  }
  return $docs;
}
00198 
/**
 * Removes documents that violate the required/forbidden word constraints.
 *
 * $docs            map of documents as built by combine_results(); each
 *                  entry has a "words" list whose items carry a "word" key
 * $requiredWords   search terms that every returned document must contain
 * $forbiddenWords  search terms that no returned document may contain
 *
 * Returns a new map with the same keys and values as $docs, restricted to
 * the documents that pass both checks.
 */
function filter_results($docs,&$requiredWords,&$forbiddenWords)
{
  $filteredDocs=array();
  // foreach rather than each(), which no longer exists in PHP 8
  foreach ($docs as $key => $doc)
  {
    $words = $doc["words"];
    $copy=1; // copy entry by default
    foreach ($requiredWords as $reqWord)
    {
      $found=0;
      foreach ($words as $wordInfo)
      {
        // strict comparison so that numeric-looking terms are compared as
        // text and not as numbers
        if ($wordInfo["word"]===$reqWord)
        {
          $found=1;
          break;
        }
      }
      if (!$found)
      {
        $copy=0; // document lacks at least one of the required words
        break;
      }
    }
    if ($copy && sizeof($forbiddenWords)>0)
    {
      foreach ($words as $wordInfo)
      {
        if (in_array($wordInfo["word"],$forbiddenWords,true))
        {
          $copy=0; // document contains a forbidden word
          break;
        }
      }
    }
    if ($copy) $filteredDocs[$key]=$doc;
  }
  return $filteredDocs;
}
00238 
/**
 * Comparison callback that orders documents by descending "rank".
 *
 * Returns -1 if $a ranks higher than $b, 0 if both ranks are equal and
 * 1 otherwise.
 */
function compare_rank($a,$b)
{
  $left  = $a["rank"];
  $right = $b["rank"];
  // higher rank sorts first
  if ($left > $right)
  {
    return -1;
  }
  return ($left == $right) ? 0 : 1;
}
00247 
/**
 * Orders the documents with compare_rank() (highest rank first).
 *
 * $docs    documents to sort; left untouched
 * $sorted  receives the sorted, numerically re-indexed list
 *
 * Returns the same sorted list that is stored in $sorted.
 */
function sort_results($docs,&$sorted)
{
  $ranked = $docs;
  usort($ranked,"compare_rank");
  $sorted = $ranked;
  return $ranked;
}
00254 
/**
 * Prints the search results as an HTML table.
 *
 * $docs  list of documents to report, in the order they should appear;
 *        each entry has the keys "url", "name" and "words" (a list of
 *        arrays with "word", "match" and "freq")
 *
 * Every document produces two table rows: a numbered link to the document
 * and a line listing the matched words with their frequencies.
 */
function report_results(&$docs)
{
  echo "<table cellspacing=\"2\">\n";
  echo "  <tr>\n";
  echo "    <td colspan=\"2\"><h2>".search_results()."</h2></td>\n";
  echo "  </tr>\n";
  $numDocs = sizeof($docs);
  if ($numDocs==0)
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text(0)."</td>\n";
    echo "  </tr>\n";
  }
  else
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text($numDocs);
    echo "\n";
    echo "    </td>\n";
    echo "  </tr>\n";
    $num=1;
    foreach ($docs as $doc)
    {
      // NOTE(review): url and name are emitted exactly as stored in the
      // index file; confirm whether the index already holds them escaped
      echo "  <tr>\n";
      echo "    <td align=\"right\">$num.</td>";
      echo     "<td><a class=\"el\" href=\"".$doc["url"]."\">".$doc["name"]."</a></td>\n";
      echo "  </tr>\n"; // close the link row before starting the matches row
      echo "  <tr>\n";
      echo "    <td></td><td class=\"tiny\">".report_matches()." ";
      foreach ($doc["words"] as $wordInfo)
      {
        $word = $wordInfo["word"];
        // part of the indexed word that follows the search term
        $matchRight = (string)substr($wordInfo["match"],strlen($word));
        // the word originates from the query string, so escape it before
        // it is written into the page (the page charset is iso-8859-1)
        $wordHtml  = htmlspecialchars($word,ENT_QUOTES,'ISO-8859-1');
        $rightHtml = htmlspecialchars($matchRight,ENT_QUOTES,'ISO-8859-1');
        echo "<b>$wordHtml</b>$rightHtml(".$wordInfo["freq"].") ";
      }
      echo "    </td>\n";
      echo "  </tr>\n";
      $num++;
    }
  }
  echo "</table>\n";
}
00296 
/**
 * Entry point of the search page.
 *
 * Opens the index file "search.idx", prints the remainder of the search
 * form (pre-filled with the "query" request parameter), looks up every
 * word of the query and prints the ranked results.
 *
 * Query syntax: words are separated by spaces; a leading '+' marks a word
 * as required, a leading '-' marks it as forbidden.
 *
 * Terminates the script with an error message if the PHP version is too
 * old or the index file cannot be opened or has an invalid header.
 */
function main()
{
  // version_compare() compares the numeric components; a plain string
  // comparison would rate e.g. "10.0.0" lower than "4.1.0"
  if (version_compare(phpversion(),'4.1.0') < 0)
  {
    die("Error: PHP version 4.1.0 or above required!");
  }
  if (!($file=fopen("search.idx","rb")))
  {
    die("Error: Search index file could NOT be opened!");
  }
  if (readHeader($file)!="DOXS")
  {
    die("Error: Header of index file is invalid!");
  }
  $query="";
  // only accept a plain string; "query[]=..." would deliver an array
  if (array_key_exists("query", $_GET) && is_string($_GET["query"]))
  {
    $query=$_GET["query"];
  }
  // the query is user input: escape it before echoing it into the attribute
  // (the page charset is iso-8859-1)
  $queryHtml = htmlspecialchars($query,ENT_QUOTES,'ISO-8859-1');
  echo "<input class=\"search\" type=\"text\" name=\"query\" value=\"$queryHtml\" size=\"20\" accesskey=\"s\"/>\n";
  echo "</span>\n";
  echo "</form>\n";
  echo "</div>\n";
  $results = array();
  $requiredWords = array();
  $forbiddenWords = array();
  $foundWords = array();
  $word=strtok($query," ");
  // strtok() returns false when no tokens are left; testing for that
  // explicitly keeps a literal "0" in the query from ending the loop
  while ($word!==false) // for each word in the search query
  {
    if (substr($word,0,1)=='+')
    {
      $word=(string)substr($word,1);
      // a lone "+" carries no word and is ignored
      if ($word!=="") $requiredWords[]=$word;
    }
    if (substr($word,0,1)=='-')
    {
      $word=(string)substr($word,1);
      // a lone "-" carries no word and is ignored
      if ($word!=="") $forbiddenWords[]=$word;
    }
    if ($word!=="" && !in_array($word,$foundWords,true))
    {
      $foundWords[]=$word;
      search($file,$word,$results);
    }
    $word=strtok(" ");
  }
  $docs = array();
  combine_results($results,$docs);
  // filter out documents with forbidden word or that do not contain
  // required words
  $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);
  // sort the results based on rank
  $sorted = array();
  sort_results($filteredDocs,$sorted);
  // report results to the user
  report_results($sorted);
  fclose($file);
}

main();
00350 
00351 
00352 ?>
00353 <hr size="1"><address style="align: right;"><small>Generated on Sat Jan 15 16:00:33 2005 for GXSM by&nbsp;
00354 <a href="http://www.doxygen.org/index.html">
00355 <img src="doxygen.png" alt="doxygen" align="middle" border="0"></a> 1.4.0 </small></address>
00356 </body>
00357 </html>

Generated on Sat Apr 1 09:03:44 2006 for GXSM by  doxygen 1.4.6