| [ PHPXref.com ] | [ Generated: Thu Aug 19 03:35:06 2010 ] | [ FluxBB 1.4.2 ] |
| [ Index ] [ Variables ] [ Functions ] [ Classes ] [ Constants ] [ Statistics ] | ||
[Summary view] [Print] [Text view]
<?php

/**
 * Copyright (C) 2008-2010 FluxBB
 * based on code by Rickard Andersson copyright (C) 2002-2008 PunBB
 * License: http://www.gnu.org/licenses/gpl.html GPL version 2 or higher
 */

// The contents of this file are very much inspired by the file functions_search.php
// from the phpBB Group forum software phpBB2 (http://www.phpbb.com)


// Make sure no one attempts to run this script "directly"
if (!defined('PUN'))
	exit;


// Make a regex that will match CJK or Hangul characters.
// The value is a single bracketed character class; is_cjk() wraps it in anchors and a quantifier.
define('PUN_CJK_HANGUL_REGEX', '['.
	'\x{1100}-\x{11FF}'.		// Hangul Jamo							1100-11FF		(http://www.fileformat.info/info/unicode/block/hangul_jamo/index.htm)
	'\x{3130}-\x{318F}'.		// Hangul Compatibility Jamo			3130-318F		(http://www.fileformat.info/info/unicode/block/hangul_compatibility_jamo/index.htm)
	'\x{AC00}-\x{D7AF}'.		// Hangul Syllables						AC00-D7AF		(http://www.fileformat.info/info/unicode/block/hangul_syllables/index.htm)

	// Hiragana
	'\x{3040}-\x{309F}'.		// Hiragana								3040-309F		(http://www.fileformat.info/info/unicode/block/hiragana/index.htm)

	// Katakana
	'\x{30A0}-\x{30FF}'.		// Katakana								30A0-30FF		(http://www.fileformat.info/info/unicode/block/katakana/index.htm)
	'\x{31F0}-\x{31FF}'.		// Katakana Phonetic Extensions			31F0-31FF		(http://www.fileformat.info/info/unicode/block/katakana_phonetic_extensions/index.htm)

	// CJK Unified Ideographs	(http://en.wikipedia.org/wiki/CJK_Unified_Ideographs)
	'\x{2E80}-\x{2EFF}'.		// CJK Radicals Supplement				2E80-2EFF		(http://www.fileformat.info/info/unicode/block/cjk_radicals_supplement/index.htm)
	'\x{2F00}-\x{2FDF}'.		// Kangxi Radicals						2F00-2FDF		(http://www.fileformat.info/info/unicode/block/kangxi_radicals/index.htm)
	'\x{2FF0}-\x{2FFF}'.		// Ideographic Description Characters	2FF0-2FFF		(http://www.fileformat.info/info/unicode/block/ideographic_description_characters/index.htm)
	'\x{3000}-\x{303F}'.		// CJK Symbols and Punctuation			3000-303F		(http://www.fileformat.info/info/unicode/block/cjk_symbols_and_punctuation/index.htm)
	'\x{31C0}-\x{31EF}'.		// CJK Strokes							31C0-31EF		(http://www.fileformat.info/info/unicode/block/cjk_strokes/index.htm)
	'\x{3200}-\x{32FF}'.		// Enclosed CJK Letters and Months		3200-32FF		(http://www.fileformat.info/info/unicode/block/enclosed_cjk_letters_and_months/index.htm)
	'\x{3400}-\x{4DBF}'.		// CJK Unified Ideographs Extension A	3400-4DBF		(http://www.fileformat.info/info/unicode/block/cjk_unified_ideographs_extension_a/index.htm)
	'\x{4E00}-\x{9FFF}'.		// CJK Unified Ideographs				4E00-9FFF		(http://www.fileformat.info/info/unicode/block/cjk_unified_ideographs/index.htm)
	'\x{20000}-\x{2A6DF}'.		// CJK Unified Ideographs Extension B	20000-2A6DF		(http://www.fileformat.info/info/unicode/block/cjk_unified_ideographs_extension_b/index.htm)
']');


//
// "Cleans up" a text string and returns an array of unique words
// This function depends on the current locale setting
//
// $text is the text to split. $idx is true when the words are going to be
// indexed and false when they are going to be searched for; it decides whether
// % and * are stripped and is passed on to validate_search_word().
//
// The returned array keeps the keys produced by array_unique(), minus the
// entries that were rejected, so it is not necessarily a 0..n-1 list.
//
function split_words($text, $idx)
{
	// Remove BBCode
	$text = preg_replace('/\[\/?(b|u|s|ins|del|em|i|h|colou?r|quote|code|img|url|email|list)(?:\=[^\]]*)?\]/', ' ', $text);

	// Remove any apostrophes or dashes which aren't part of words
	// The text is padded with a space on each side so the lookarounds also work at the very
	// start and end; substr() strips the padding again.
	// NOTE(review): \W is evaluated without the /u modifier here, so what counts as a word
	// character next to non-ASCII text depends on the locale (see the header comment).
	$text = substr(preg_replace('/((?<=\W)[\'\-]|[\'\-](?=\W))/', '', ' '.$text.' '), 1, -1);

	// Remove symbols and multiple whitespace, allow % and * if we aren't indexing
	$text = preg_replace('/[\^\$&\(\)<>`"„\|,@_\?~\+\[\]{}:=\/#\\\\;!\.…\s•'.($idx ? '%\*' : '').']+/u', ' ', $text);

	// Replace multiple dashes with just one
	$text = preg_replace('/-{2,}/', '-', $text);

	// Fill an array with all the words
	$words = array_unique(explode(' ', $text));

	// Remove any words that should not be indexed
	foreach ($words as $key => $value)
	{
		// If the word shouldn't be indexed, remove it
		if (!validate_search_word($value, $idx))
			unset($words[$key]);
	}

	return $words;
}


//
// Checks if a word is a valid searchable word
//
// $word is the candidate word and $idx is true when indexing, false when
// searching. Search keywords and CJK/Hangul words are accepted when searching
// but rejected when indexing; stopwords are always rejected; everything else
// must have a length between PUN_SEARCH_MIN_WORD and PUN_SEARCH_MAX_WORD.
//
function validate_search_word($word, $idx)
{
	global $pun_user, $pun_config;

	// The stopword list is loaded once and kept for later calls, using whichever
	// language was in effect on the first call that got this far
	static $stopwords;

	// If the word is a keyword we don't want to index it, but we do want to be allowed to search it
	if (is_keyword($word))
		return !$idx;

	$language = isset($pun_user['language']) ? $pun_user['language'] : $pun_config['o_default_lang'];
	if (!isset($stopwords))
	{
		if (file_exists(PUN_ROOT.'lang/'.$language.'/stopwords.txt'))
		{
			$stopwords = file(PUN_ROOT.'lang/'.$language.'/stopwords.txt');
			$stopwords = array_map('pun_trim', $stopwords);
			$stopwords = array_filter($stopwords);
		}
		else
			$stopwords = array();
	}

	// If it is a stopword it isn't valid
	// Strict comparison, so a numeric-looking word is only rejected when it is spelled exactly
	// like a stopword (a loose comparison treats e.g. '1000' and '1e3' as equal)
	if (in_array($word, $stopwords, true))
		return false;

	// If the word is CJK we don't want to index it, but we do want to be allowed to search it
	if (is_cjk($word))
		return !$idx;

	// Check the word is within the min/max length
	$num_chars = pun_strlen($word);
	return $num_chars >= PUN_SEARCH_MIN_WORD && $num_chars <= PUN_SEARCH_MAX_WORD;
}


//
// Check a given word is a search keyword.
//
// Returns true for exactly 'and', 'or' and 'not'. The comparison is
// case-sensitive; update_search_index() lower-cases its input before splitting.
//
function is_keyword($word)
{
	return $word === 'and' || $word === 'or' || $word === 'not';
}


//
// Check if a given word is CJK or Hangul.
//
// Returns true only when every character of $word falls inside
// PUN_CJK_HANGUL_REGEX; an empty string or a failed match yields false.
//
function is_cjk($word)
{
	return preg_match('/^'.PUN_CJK_HANGUL_REGEX.'+$/u', $word) ? true : false;
}


//
// Strip [img] [url] and [email] out of the message so we don't index their contents
//
// Returns $text with the tags replaced as described next to each pattern below.
//
function strip_bbcode($text)
{
	// Built on the first call only and reused afterwards
	static $patterns;

	if (!isset($patterns))
	{
		$patterns = array(
			'%\[img=([^\]]*+)\][^[]*+\[/img\]%'									=>	'$1',	// Keep the alt description
			'%\[(url|email)=[^\]]*+\]([^[]*+(?:(?!\[/\1\])\[[^[]*+)*)\[/\1\]%'	=>	'$2',	// Keep the text
			'%\[(img|url|email)\]([^[]*+(?:(?!\[/\1\])\[[^[]*+)*)\[/\1\]%'		=>	'',		// Remove the whole thing
		);
	}

	return preg_replace(array_keys($patterns), array_values($patterns), $text);
}


//
// Updates the search index with the contents of $post_id (and $subject)
//
// $mode    'edit' compares the new words with the matches already stored for
//          the post and only adds/removes the difference; any other value
//          treats every word as new.
// $post_id is concatenated into the SQL as-is, so it must already be a safe
//          integer (presumably guaranteed by the callers - verify).
// $message is the post body; $subject is the optional topic subject.
//
function update_search_index($mode, $post_id, $message, $subject = null)
{
	global $db_type, $db;

	$message = utf8_strtolower($message);
	$subject = utf8_strtolower($subject);

	// Remove any bbcode that we shouldn't index
	$message = strip_bbcode($message);

	// Split old and new post/subject to obtain array of 'words'
	$words_message = split_words($message, true);
	$words_subject = ($subject) ? split_words($subject, true) : array();

	if ($mode == 'edit')
	{
		$result = $db->query('SELECT w.id, w.word, m.subject_match FROM '.$db->prefix.'search_words AS w INNER JOIN '.$db->prefix.'search_matches AS m ON w.id=m.word_id WHERE m.post_id='.$post_id, true) or error('Unable to fetch search index words', __FILE__, __LINE__, $db->error());

		// Declare here to stop array_keys() and array_diff() from complaining if not set
		$cur_words['post'] = array();
		$cur_words['subject'] = array();

		// Map word => word id, separately for subject matches and post matches
		while ($row = $db->fetch_row($result))
		{
			$match_in = ($row[2]) ? 'subject' : 'post';
			$cur_words[$match_in][$row[1]] = $row[0];
		}

		$db->free_result($result);

		$words['add']['post'] = array_diff($words_message, array_keys($cur_words['post']));
		$words['add']['subject'] = array_diff($words_subject, array_keys($cur_words['subject']));
		$words['del']['post'] = array_diff(array_keys($cur_words['post']), $words_message);
		$words['del']['subject'] = array_diff(array_keys($cur_words['subject']), $words_subject);
	}
	else
	{
		$words['add']['post'] = $words_message;
		$words['add']['subject'] = $words_subject;
		$words['del']['post'] = array();
		$words['del']['subject'] = array();
	}

	unset($words_message);
	unset($words_subject);

	// Get unique words from the above arrays
	$unique_words = array_unique(array_merge($words['add']['post'], $words['add']['subject']));

	if (!empty($unique_words))
	{
		// Find out which of the words already have a row in search_words
		$result = $db->query('SELECT id, word FROM '.$db->prefix.'search_words WHERE word IN(\''.implode('\',\'', array_map(array($db, 'escape'), $unique_words)).'\')', true) or error('Unable to fetch search index words', __FILE__, __LINE__, $db->error());

		$word_ids = array();
		while ($row = $db->fetch_row($result))
			$word_ids[$row[1]] = $row[0];

		$db->free_result($result);

		$new_words = array_diff($unique_words, array_keys($word_ids));
		unset($unique_words);

		if (!empty($new_words))
		{
			// A failed insert is reported like every other query in this file; otherwise the
			// match insert further down would silently skip the missing words
			switch ($db_type)
			{
				case 'mysql':
				case 'mysqli':
				case 'mysql_innodb':
				case 'mysqli_innodb':
					// One multi-row INSERT for the MySQL variants
					$db->query('INSERT INTO '.$db->prefix.'search_words (word) VALUES(\''.implode('\'),(\'', array_map(array($db, 'escape'), $new_words)).'\')') or error('Unable to insert search index words', __FILE__, __LINE__, $db->error());
					break;

				default:
					// One INSERT per word for every other database type
					foreach ($new_words as $word)
						$db->query('INSERT INTO '.$db->prefix.'search_words (word) VALUES(\''.$db->escape($word).'\')') or error('Unable to insert search index words', __FILE__, __LINE__, $db->error());
					break;
			}
		}

		unset($new_words);
	}

	// Delete matches (only if editing a post)
	foreach ($words['del'] as $match_in => $wordlist)
	{
		$subject_match = ($match_in == 'subject') ? 1 : 0;

		if (!empty($wordlist))
		{
			// Build a comma-separated list of the word ids fetched in the 'edit' branch above
			$sql = '';
			foreach ($wordlist as $word)
				$sql .= (($sql != '') ? ',' : '').$cur_words[$match_in][$word];

			$db->query('DELETE FROM '.$db->prefix.'search_matches WHERE word_id IN('.$sql.') AND post_id='.$post_id.' AND subject_match='.$subject_match) or error('Unable to delete search index word matches', __FILE__, __LINE__, $db->error());
		}
	}

	// Add new matches
	foreach ($words['add'] as $match_in => $wordlist)
	{
		$subject_match = ($match_in == 'subject') ? 1 : 0;

		// The word ids are looked up by the INSERT ... SELECT itself
		if (!empty($wordlist))
			$db->query('INSERT INTO '.$db->prefix.'search_matches (post_id, word_id, subject_match) SELECT '.$post_id.', id, '.$subject_match.' FROM '.$db->prefix.'search_words WHERE word IN(\''.implode('\',\'', array_map(array($db, 'escape'), $wordlist)).'\')') or error('Unable to insert search index word matches', __FILE__, __LINE__, $db->error());
	}

	unset($words);
}


//
// Strip search index of indexed words in $post_ids
//
// $post_ids is placed unescaped inside IN(...), so it must already be a safe
// list of post ids (presumably a single id or comma-separated integers built by
// the callers - verify).
//
// Words that have exactly one row in search_matches, and whose row belongs to
// one of the given posts, are removed from search_words; then every match row
// of the given posts is removed.
//
function strip_search_index($post_ids)
{
	global $db_type, $db;

	switch ($db_type)
	{
		case 'mysql':
		case 'mysqli':
		case 'mysql_innodb':
		case 'mysqli_innodb':
		{
			// The MySQL variants run the two lookups as separate queries and build the id lists in PHP
			$result = $db->query('SELECT word_id FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.') GROUP BY word_id') or error('Unable to fetch search index word match', __FILE__, __LINE__, $db->error());

			if ($db->num_rows($result))
			{
				$word_ids = '';
				while ($row = $db->fetch_row($result))
					$word_ids .= ($word_ids != '') ? ','.$row[0] : $row[0];

				// Of those words, keep only the ones with a single match row overall
				$result = $db->query('SELECT word_id FROM '.$db->prefix.'search_matches WHERE word_id IN('.$word_ids.') GROUP BY word_id HAVING COUNT(word_id)=1') or error('Unable to fetch search index word match', __FILE__, __LINE__, $db->error());

				if ($db->num_rows($result))
				{
					$word_ids = '';
					while ($row = $db->fetch_row($result))
						$word_ids .= ($word_ids != '') ? ','.$row[0] : $row[0];

					$db->query('DELETE FROM '.$db->prefix.'search_words WHERE id IN('.$word_ids.')') or error('Unable to delete search index word', __FILE__, __LINE__, $db->error());
				}
			}

			break;
		}

		default:
			// Same selection as above, expressed as nested subqueries in a single statement
			$db->query('DELETE FROM '.$db->prefix.'search_words WHERE id IN(SELECT word_id FROM '.$db->prefix.'search_matches WHERE word_id IN(SELECT word_id FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.') GROUP BY word_id) GROUP BY word_id HAVING COUNT(word_id)=1)') or error('Unable to delete from search index', __FILE__, __LINE__, $db->error());
			break;
	}

	$db->query('DELETE FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.')') or error('Unable to delete search index word match', __FILE__, __LINE__, $db->error());
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| [ Powered by PHPXref - Served by Debian GNU/Linux ] |