[ PHPXref.com ] [ Generated: Thu Aug 19 03:35:06 2010 ] [ FluxBB 1.4.2 ]
[ Index ]     [ Variables ]     [ Functions ]     [ Classes ]     [ Constants ]     [ Statistics ]

title

Body

[close]

/include/ -> search_idx.php (source)

   1  <?php
   2  
   3  /**
   4   * Copyright (C) 2008-2010 FluxBB
   5   * based on code by Rickard Andersson copyright (C) 2002-2008 PunBB
   6   * License: http://www.gnu.org/licenses/gpl.html GPL version 2 or higher
   7   */
   8  
   9  // The contents of this file are very much inspired by the file functions_search.php
  10  // from the phpBB Group forum software phpBB2 (http://www.phpbb.com)
  11  
  12  
  13  // Make sure no one attempts to run this script "directly"
  14  if (!defined('PUN'))
  15      exit;
  16  
  17  
  18  // Make a regex that will match CJK or Hangul characters
  19  define('PUN_CJK_HANGUL_REGEX', '['.
  20      '\x{1100}-\x{11FF}'.        // Hangul Jamo                            1100-11FF        (http://www.fileformat.info/info/unicode/block/hangul_jamo/index.htm)
  21      '\x{3130}-\x{318F}'.        // Hangul Compatibility Jamo            3130-318F        (http://www.fileformat.info/info/unicode/block/hangul_compatibility_jamo/index.htm)
  22      '\x{AC00}-\x{D7AF}'.        // Hangul Syllables                        AC00-D7AF        (http://www.fileformat.info/info/unicode/block/hangul_syllables/index.htm)
  23  
  24      // Hiragana
  25      '\x{3040}-\x{309F}'.        // Hiragana                                3040-309F        (http://www.fileformat.info/info/unicode/block/hiragana/index.htm)
  26  
  27      // Katakana
  28      '\x{30A0}-\x{30FF}'.        // Katakana                                30A0-30FF        (http://www.fileformat.info/info/unicode/block/katakana/index.htm)
  29      '\x{31F0}-\x{31FF}'.        // Katakana Phonetic Extensions            31F0-31FF        (http://www.fileformat.info/info/unicode/block/katakana_phonetic_extensions/index.htm)
  30  
  31      // CJK Unified Ideographs    (http://en.wikipedia.org/wiki/CJK_Unified_Ideographs)
  32      '\x{2E80}-\x{2EFF}'.        // CJK Radicals Supplement                2E80-2EFF        (http://www.fileformat.info/info/unicode/block/cjk_radicals_supplement/index.htm)
  33      '\x{2F00}-\x{2FDF}'.        // Kangxi Radicals                        2F00-2FDF        (http://www.fileformat.info/info/unicode/block/kangxi_radicals/index.htm)
  34      '\x{2FF0}-\x{2FFF}'.        // Ideographic Description Characters    2FF0-2FFF        (http://www.fileformat.info/info/unicode/block/ideographic_description_characters/index.htm)
  35      '\x{3000}-\x{303F}'.        // CJK Symbols and Punctuation            3000-303F        (http://www.fileformat.info/info/unicode/block/cjk_symbols_and_punctuation/index.htm)
  36      '\x{31C0}-\x{31EF}'.        // CJK Strokes                            31C0-31EF        (http://www.fileformat.info/info/unicode/block/cjk_strokes/index.htm)
  37      '\x{3200}-\x{32FF}'.        // Enclosed CJK Letters and Months        3200-32FF        (http://www.fileformat.info/info/unicode/block/enclosed_cjk_letters_and_months/index.htm)
  38      '\x{3400}-\x{4DBF}'.        // CJK Unified Ideographs Extension A    3400-4DBF        (http://www.fileformat.info/info/unicode/block/cjk_unified_ideographs_extension_a/index.htm)
  39      '\x{4E00}-\x{9FFF}'.        // CJK Unified Ideographs                4E00-9FFF        (http://www.fileformat.info/info/unicode/block/cjk_unified_ideographs/index.htm)
  40      '\x{20000}-\x{2A6DF}'.        // CJK Unified Ideographs Extension B    20000-2A6DF        (http://www.fileformat.info/info/unicode/block/cjk_unified_ideographs_extension_b/index.htm)
  41  ']');
  42  
  43  
  44  //
  45  // "Cleans up" a text string and returns an array of unique words
  46  // This function depends on the current locale setting
  47  //
  48  function split_words($text, $idx)
  49  {
  50      // Remove BBCode
  51      $text = preg_replace('/\[\/?(b|u|s|ins|del|em|i|h|colou?r|quote|code|img|url|email|list)(?:\=[^\]]*)?\]/', ' ', $text);
  52  
  53      // Remove any apostrophes or dashes which aren't part of words
  54      $text = substr(preg_replace('/((?<=\W)[\'\-]|[\'\-](?=\W))/', '', ' '.$text.' '), 1, -1);
  55  
  56      // Remove symbols and multiple whitespace, allow % and * if we aren't indexing
  57      $text = preg_replace('/[\^\$&\(\)<>`"„\|,@_\?~\+\[\]{}:=\/#\\\\;!\.…\s•'.($idx ? '%\*' : '').']+/u', ' ', $text);
  58  
  59      // Replace multiple dashes with just one
  60      $text = preg_replace('/-{2,}/', '-', $text);
  61  
  62      // Fill an array with all the words
  63      $words = array_unique(explode(' ', $text));
  64  
  65      // Remove any words that should not be indexed
  66      foreach ($words as $key => $value)
  67      {
  68          // If the word shouldn't be indexed, remove it
  69          if (!validate_search_word($value, $idx))
  70              unset($words[$key]);
  71      }
  72  
  73      return $words;
  74  }
  75  
  76  
  77  //
  78  // Checks if a word is a valid searchable word
  79  //
  80  function validate_search_word($word, $idx)
  81  {
  82      global $pun_user, $pun_config;
  83      static $stopwords;
  84  
  85      // If the word is a keyword we don't want to index it, but we do want to be allowed to search it
  86      if (is_keyword($word))
  87          return !$idx;
  88  
  89      $language = isset($pun_user['language']) ? $pun_user['language'] : $pun_config['o_default_lang'];
  90      if (!isset($stopwords))
  91      {
  92          if (file_exists(PUN_ROOT.'lang/'.$language.'/stopwords.txt'))
  93          {
  94              $stopwords = file(PUN_ROOT.'lang/'.$language.'/stopwords.txt');
  95              $stopwords = array_map('pun_trim', $stopwords);
  96              $stopwords = array_filter($stopwords);
  97          }
  98          else
  99              $stopwords = array();
 100      }
 101  
 102      // If it is a stopword it isn't valid
 103      if (in_array($word, $stopwords))
 104          return false;
 105  
 106      // If the word if CJK we don't want to index it, but we do want to be allowed to search it
 107      if (is_cjk($word))
 108          return !$idx;
 109  
 110      // Check the word is within the min/max length
 111      $num_chars = pun_strlen($word);
 112      return $num_chars >= PUN_SEARCH_MIN_WORD && $num_chars <= PUN_SEARCH_MAX_WORD;
 113  }
 114  
 115  
 116  //
 117  // Check a given word is a search keyword.
 118  //
 119  function is_keyword($word)
 120  {
 121      return $word == 'and' || $word == 'or' || $word == 'not';
 122  }
 123  
 124  
 125  //
 126  // Check if a given word is CJK or Hangul.
 127  //
 128  function is_cjk($word)
 129  {
 130      return preg_match('/^'.PUN_CJK_HANGUL_REGEX.'+$/u', $word) ? true : false;
 131  }
 132  
 133  
 134  //
 135  // Strip [img] [url] and [email] out of the message so we don't index their contents
 136  //
 137  function strip_bbcode($text)
 138  {
 139      static $patterns;
 140  
 141      if (!isset($patterns))
 142      {
 143          $patterns = array(
 144              '%\[img=([^\]]*+)\][^[]*+\[/img\]%'                                    =>    '$1',    // Keep the alt description
 145              '%\[(url|email)=[^\]]*+\]([^[]*+(?:(?!\[/\1\])\[[^[]*+)*)\[/\1\]%' =>    '$2',    // Keep the text
 146              '%\[(img|url|email)\]([^[]*+(?:(?!\[/\1\])\[[^[]*+)*)\[/\1\]%'        =>    '',        // Remove the whole thing
 147          );
 148      }
 149  
 150      return preg_replace(array_keys($patterns), array_values($patterns), $text);
 151  }
 152  
 153  
 154  //
 155  // Updates the search index with the contents of $post_id (and $subject)
 156  //
 157  function update_search_index($mode, $post_id, $message, $subject = null)
 158  {
 159      global $db_type, $db;
 160  
 161      $message = utf8_strtolower($message);
 162      $subject = utf8_strtolower($subject);
 163  
 164      // Remove any bbcode that we shouldn't index
 165      $message = strip_bbcode($message);
 166  
 167      // Split old and new post/subject to obtain array of 'words'
 168      $words_message = split_words($message, true);
 169      $words_subject = ($subject) ? split_words($subject, true) : array();
 170  
 171      if ($mode == 'edit')
 172      {
 173          $result = $db->query('SELECT w.id, w.word, m.subject_match FROM '.$db->prefix.'search_words AS w INNER JOIN '.$db->prefix.'search_matches AS m ON w.id=m.word_id WHERE m.post_id='.$post_id, true) or error('Unable to fetch search index words', __FILE__, __LINE__, $db->error());
 174  
 175          // Declare here to stop array_keys() and array_diff() from complaining if not set
 176          $cur_words['post'] = array();
 177          $cur_words['subject'] = array();
 178  
 179          while ($row = $db->fetch_row($result))
 180          {
 181              $match_in = ($row[2]) ? 'subject' : 'post';
 182              $cur_words[$match_in][$row[1]] = $row[0];
 183          }
 184  
 185          $db->free_result($result);
 186  
 187          $words['add']['post'] = array_diff($words_message, array_keys($cur_words['post']));
 188          $words['add']['subject'] = array_diff($words_subject, array_keys($cur_words['subject']));
 189          $words['del']['post'] = array_diff(array_keys($cur_words['post']), $words_message);
 190          $words['del']['subject'] = array_diff(array_keys($cur_words['subject']), $words_subject);
 191      }
 192      else
 193      {
 194          $words['add']['post'] = $words_message;
 195          $words['add']['subject'] = $words_subject;
 196          $words['del']['post'] = array();
 197          $words['del']['subject'] = array();
 198      }
 199  
 200      unset($words_message);
 201      unset($words_subject);
 202  
 203      // Get unique words from the above arrays
 204      $unique_words = array_unique(array_merge($words['add']['post'], $words['add']['subject']));
 205  
 206      if (!empty($unique_words))
 207      {
 208          $result = $db->query('SELECT id, word FROM '.$db->prefix.'search_words WHERE word IN(\''.implode('\',\'', array_map(array($db, 'escape'), $unique_words)).'\')', true) or error('Unable to fetch search index words', __FILE__, __LINE__, $db->error());
 209  
 210          $word_ids = array();
 211          while ($row = $db->fetch_row($result))
 212              $word_ids[$row[1]] = $row[0];
 213  
 214          $db->free_result($result);
 215  
 216          $new_words = array_diff($unique_words, array_keys($word_ids));
 217          unset($unique_words);
 218  
 219          if (!empty($new_words))
 220          {
 221              switch ($db_type)
 222              {
 223                  case 'mysql':
 224                  case 'mysqli':
 225                  case 'mysql_innodb':
 226                  case 'mysqli_innodb':
 227                      $db->query('INSERT INTO '.$db->prefix.'search_words (word) VALUES(\''.implode('\'),(\'', array_map(array($db, 'escape'), $new_words)).'\')');
 228                      break;
 229  
 230                  default:
 231                      foreach ($new_words as $word)
 232                          $db->query('INSERT INTO '.$db->prefix.'search_words (word) VALUES(\''.$db->escape($word).'\')');
 233                      break;
 234              }
 235          }
 236  
 237          unset($new_words);
 238      }
 239  
 240      // Delete matches (only if editing a post)
 241      foreach ($words['del'] as $match_in => $wordlist)
 242      {
 243          $subject_match = ($match_in == 'subject') ? 1 : 0;
 244  
 245          if (!empty($wordlist))
 246          {
 247              $sql = '';
 248              foreach ($wordlist as $word)
 249                  $sql .= (($sql != '') ? ',' : '').$cur_words[$match_in][$word];
 250  
 251              $db->query('DELETE FROM '.$db->prefix.'search_matches WHERE word_id IN('.$sql.') AND post_id='.$post_id.' AND subject_match='.$subject_match) or error('Unable to delete search index word matches', __FILE__, __LINE__, $db->error());
 252          }
 253      }
 254  
 255      // Add new matches
 256      foreach ($words['add'] as $match_in => $wordlist)
 257      {
 258          $subject_match = ($match_in == 'subject') ? 1 : 0;
 259  
 260          if (!empty($wordlist))
 261              $db->query('INSERT INTO '.$db->prefix.'search_matches (post_id, word_id, subject_match) SELECT '.$post_id.', id, '.$subject_match.' FROM '.$db->prefix.'search_words WHERE word IN(\''.implode('\',\'', array_map(array($db, 'escape'), $wordlist)).'\')') or error('Unable to insert search index word matches', __FILE__, __LINE__, $db->error());
 262      }
 263  
 264      unset($words);
 265  }
 266  
 267  
 268  //
 269  // Strip search index of indexed words in $post_ids
 270  //
 271  function strip_search_index($post_ids)
 272  {
 273      global $db_type, $db;
 274  
 275      switch ($db_type)
 276      {
 277          case 'mysql':
 278          case 'mysqli':
 279          case 'mysql_innodb':
 280          case 'mysqli_innodb':
 281          {
 282              $result = $db->query('SELECT word_id FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.') GROUP BY word_id') or error('Unable to fetch search index word match', __FILE__, __LINE__, $db->error());
 283  
 284              if ($db->num_rows($result))
 285              {
 286                  $word_ids = '';
 287                  while ($row = $db->fetch_row($result))
 288                      $word_ids .= ($word_ids != '') ? ','.$row[0] : $row[0];
 289  
 290                  $result = $db->query('SELECT word_id FROM '.$db->prefix.'search_matches WHERE word_id IN('.$word_ids.') GROUP BY word_id HAVING COUNT(word_id)=1') or error('Unable to fetch search index word match', __FILE__, __LINE__, $db->error());
 291  
 292                  if ($db->num_rows($result))
 293                  {
 294                      $word_ids = '';
 295                      while ($row = $db->fetch_row($result))
 296                          $word_ids .= ($word_ids != '') ? ','.$row[0] : $row[0];
 297  
 298                      $db->query('DELETE FROM '.$db->prefix.'search_words WHERE id IN('.$word_ids.')') or error('Unable to delete search index word', __FILE__, __LINE__, $db->error());
 299                  }
 300              }
 301  
 302              break;
 303          }
 304  
 305          default:
 306              $db->query('DELETE FROM '.$db->prefix.'search_words WHERE id IN(SELECT word_id FROM '.$db->prefix.'search_matches WHERE word_id IN(SELECT word_id FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.') GROUP BY word_id) GROUP BY word_id HAVING COUNT(word_id)=1)') or error('Unable to delete from search index', __FILE__, __LINE__, $db->error());
 307              break;
 308      }
 309  
 310      $db->query('DELETE FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.')') or error('Unable to delete search index word match', __FILE__, __LINE__, $db->error());
 311  }


[ Powered by PHPXref - Served by Debian GNU/Linux ]