#||
||
Compatible with: !!(green)**R6.1**!!
Current version: 0.1 !!Draft!!
Credits: ((user:WikiAdmin WikiAdmin))
|{{toc numerate=1}} ||
||#
Binary search lets users run a global, byte-exact (case-sensitive) string search across all content pages of a wiki, using either **exact** or **regex** matching.
1. **exact** - binary search, include only results that match the exact search term (##BINARY LIKE##)
1. **regexp** - regex search, interpret the search term as a regular expression (##BINARY REGEXP##)
1. **union** - union search, include results that match any of the whitespace separated keywords (##AGAINST IN BOOLEAN MODE## & ##LIKE lower##)
##~{{search_binary}}##
**Work in progress**
This moves the binary search functionality of ((/Doc/English/Actions/AdminReplace admin_replace)) into a separate action so that it is available to all users.
file:binary_search_example.png
TODO
* merge into search action (?)
* preview and context are different
* --add message sets for-- DONE
* %%
'SearchInPages' => 'Search in page contents',
'SearchInComments' => 'Search in comments',
'SearchInPageTitles' => 'Search in page titles',
%%
action/search_binary.php
%%(php)
<?php
// Stop right away unless the IN_WACKO constant is defined
// (guards against this file being invoked directly).
defined('IN_WACKO') || exit;

/*
TODO:
- put target with regex data in session
- improve search, possibly merge with search action
- allow binary search only for registered users (?)
*/

// Help text of the action; handed to $this->help() when the help option is set.
$info = <<<'EOD'
Description:
binary search allows users to do a global string search on all the content pages of a wiki.
Usage:
{{search_binary}}
Options:
[page="PageName"]
[options=1]
[lang="en"]
[max=Number]
EOD;
// functions
/**
 * Run the binary (byte-wise) search and return one page of results.
 *
 * @param string $target    search term, or the regular expression when $use_regex is set
 * @param string $tag       namespace root; matches the page "tag" and everything below "tag/"
 *                          (also applied to the parent page of comments); empty = no restriction
 * @param bool   $use_regex true: BINARY ... REGEXP, false: BINARY ... LIKE '%target%'
 * @param int    $limit     results per page, handed to $this->pagination()
 * @param array  $filter    optional keys: category_ids, lang, comments, pages, titles
 * @param bool   $deleted   true to skip the "not deleted" condition
 *
 * @return array [result rows, pagination array, total number of matches];
 *               [[], [], 0] when neither body nor title search was requested
 */
$search_text = function ($target, $tag, $use_regex, $limit, $filter = [], $deleted = false)
{
	$prefix = $this->prefix;

	// Read the supported filter keys explicitly instead of extract()ing them,
	// so a stray key in $filter can never overwrite $target, $tag or $prefix.
	$category_ids = $filter['category_ids'] ?? null;
	$lang         = $filter['lang']         ?? null;
	$comments     = $filter['comments']     ?? null;
	$pages        = $filter['pages']        ?? null;
	$titles       = $filter['titles']       ?? null;

	// choose match scope: page and comment text both live in a.body
	$columns = [];

	if ($pages || $comments)
	{
		$columns[] = 'a.body';
	}

	if ($titles)
	{
		$columns[] = 'a.title';
	}

	if (!$columns)
	{
		// nothing to search in; avoid building a broken "WHERE AND ..." query
		return [[], [], 0];
	}

	$operator   = $use_regex ? 'REGEXP' : 'LIKE';
	$needle     = $this->db->q($use_regex ? $target : '%' . $target . '%');
	$conditions = [];

	foreach ($columns as $column)
	{
		$conditions[] = 'BINARY ' . $column . ' ' . $operator . ' ' . $needle;
	}

	$match = '(' . implode(' OR ', $conditions) . ') ';

	// namespace: include tag and tag/%, not tag%
	$selector =
		($category_ids
			? 'LEFT JOIN ' . $prefix . 'category_assignment ca ON (a.page_id = ca.object_id) '
			: '') .
		($tag
			? 'LEFT JOIN ' . $prefix . 'page b ON (a.comment_on_id = b.page_id) '
			: '') .
		'WHERE ' .
		$match .
		($tag
			? 'AND (a.tag = ' . $this->db->q($tag) . ' ' .
				'OR a.tag LIKE ' . $this->db->q($tag . '/%') . ' ' .
				'OR b.tag = ' . $this->db->q($tag) . ' ' .
				'OR b.tag LIKE ' . $this->db->q($tag . '/%') . ') '
			: '') .
		// comment rows are the ones with a non-zero comment_on_id
		($comments
			? ($pages
				? ''
				: 'AND a.comment_on_id <> 0 ')
			: 'AND a.comment_on_id = 0 ') .
		(!empty($this->sess->replace_unset)
			? 'AND a.page_id NOT IN (' . $this->ids_string($this->sess->replace_unset) . ') '
			: '') .
		($lang
			? 'AND a.page_lang = ' . $this->db->q($lang) . ' '
			: '') .
		($category_ids
			? 'AND ca.category_id IN (' . $this->ids_string($category_ids) . ') ' .
				'AND ca.object_type_id = ' . (int) OBJECT_PAGE . ' '
			: '') .
		($deleted
			? ''
			: ($tag
				? 'AND (a.deleted <> 1 OR b.deleted <> 1) '
				: 'AND a.deleted <> 1 ')) .
		' ';

	$count = $this->db->load_single(
		'SELECT COUNT(a.page_id) AS n ' .
		'FROM ' . $prefix . 'page a ' .
		$selector, true);

	// Parameters carried by the pagination links. The category filter is sent
	// as the comma separated "categories" value, which is the name and format
	// the action reads back from the request.
	$params = ['target' => $target];

	if (!empty($lang))
	{
		$params['lang'] = $lang;
	}

	if (!empty($category_ids))
	{
		$params['categories'] = implode(',', (array) $category_ids);
	}

	if (!empty($comments))
	{
		$params['comments'] = $comments;
	}

	if (!empty($titles))
	{
		$params['titles'] = $titles;
	}

	if (!empty($pages))
	{
		$params['pages'] = $pages;
	}

	if (!empty($use_regex))
	{
		$params['use_regex'] = $use_regex;
	}

	$pagination = $this->pagination($count['n'], $limit, 'p', $params);

	// load search results; $pagination['limit'] is appended as the final clause
	$results = $this->db->load_all(
		'SELECT a.page_id, a.owner_id, a.user_id, a.tag, a.title, a.created, a.modified, a.body, a.comment_on_id, a.page_lang, a.page_size, a.comments,
		u.user_name, o.user_name as owner_name ' .
		'FROM ' . $prefix . 'page a ' .
			'LEFT JOIN ' . $prefix . 'user u ON (a.user_id = u.user_id) ' .
			'LEFT JOIN ' . $prefix . 'user o ON (a.owner_id = o.user_id) ' .
		$selector .
		'ORDER BY a.tag ' .
		$pagination['limit']);

	$page_ids = [];

	foreach ($results as $result)
	{
		$this->cache_page($result, true);
		$page_ids[] = $result['page_id'];
		$this->page_id_cache[$result['tag']] = $result['page_id'];
	}

	if ($page_ids)
	{
		$this->preload_acl($page_ids);
		$this->preload_categories($page_ids);
	}

	return [$results, $pagination, $count['n']];
};
/**
 * Escape a string with Ut::html() and protect its significant spaces with
 * NBSP so they stay visible in the rendered preview.
 *
 * Only a space at the start or end of a line and runs of two spaces are
 * rewritten. Single spaces between words are left alone; rewriting every
 * space would double all word gaps in the output.
 *
 * @param string $message raw text
 *
 * @return string escaped text with NBSP inserted
 */
$space_to_nbsp = function ($message)
{
	$msg = Ut::html($message);
	$msg = preg_replace('/^ /m', NBSP . ' ', $msg);
	$msg = preg_replace('/ $/m', ' ' . NBSP, $msg);
	// " {2}" = exactly two consecutive spaces, written with a quantifier so
	// the pattern survives whitespace normalisation of this file
	$msg = preg_replace('/ {2}/', NBSP . ' ', $msg);

	return $msg;
};
/**
 * Strip a truncated multibyte sequence from the end of a string.
 *
 * Used after a byte-wise cut: if the cut went through a UTF-8 character,
 * the leftover lead/continuation bytes at the tail are dropped.
 */
$remove_bad_char_last = function ($string)
{
	if ($string == '')
	{
		return $string;
	}

	$last_byte = ord($string[strlen($string) - 1]);

	if ($last_byte >= 0xc0)
	{
		// the tail is a lone lead byte of a multibyte char; drop it
		return substr($string, 0, -1);
	}

	if ($last_byte < 0x80)
	{
		// plain single-byte character at the end, nothing to repair
		return $string;
	}

	// The tail is a continuation byte: look for a 3- or 4-byte sequence that
	// lacks its final byte(s). The /s modifier lets (.*) span newlines.
	$incomplete = '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|[\xf0-\xf7][\x80-\xbf]{1,2})$/s';
	$parts      = [];

	if (preg_match($incomplete, $string, $parts))
	{
		// cut in the middle of a character; keep only what precedes it
		return $parts[1];
	}

	return $string;
};
/**
 * Strip a truncated multibyte sequence from the start of a string.
 *
 * Used after a byte-wise cut: leading UTF-8 continuation bytes whose lead
 * byte was cut away are dropped.
 */
$remove_bad_char_first = function ($string)
{
	if ($string == '')
	{
		return $string;
	}

	$first_byte      = ord($string[0]);
	$is_continuation = $first_byte >= 0x80 && $first_byte < 0xc0;

	// starts in the middle of a character: remove all orphaned continuation bytes
	return $is_continuation
		? preg_replace('/^[\x80-\xbf]+/', '', $string)
		: $string;
};
/**
 * Shorten a string to at most |$length| bytes and mark the cut with $ellipsis.
 *
 * A positive $length keeps the beginning ("text..."), a negative $length
 * keeps the end ("...text"). Strings that already fit are returned as is.
 * Bytes of a character split by the cut are removed.
 */
$truncate_string = function ($string, $length = 80, $ellipsis = '...') use ($remove_bad_char_first, $remove_bad_char_last)
{
	$fits = strlen($string) <= abs($length);

	if ($fits)
	{
		return $string;
	}

	if ($length == 0)
	{
		return $ellipsis;
	}

	if ($length > 0)
	{
		// text...
		$head = $remove_bad_char_last(substr($string, 0, $length));

		return rtrim($head) . $ellipsis;
	}

	// ...text
	$tail = $remove_bad_char_first(substr($string, $length));

	return $ellipsis . ltrim($tail);
};
/**
 * Build the HTML preview for one search hit: every match in $text, wrapped
 * in <code> and surrounded by up to $padding bytes of context on each side.
 *
 * @param string $text      page body to take the context from
 * @param string $target    search term, or regular expression when $use_regex is set
 * @param bool   $use_regex interpret $target as a regular expression
 * @param int    $padding   context length in bytes before and after a match
 *
 * @return string escaped preview; line breaks are shown as "↵"
 */
$extract_context = function ($text, $target, $use_regex = false, $padding = 40) use ($space_to_nbsp, $truncate_string)
{
	// pattern used to locate the matches in the raw text
	if ($use_regex)
	{
		$target_q   = str_replace('/', "\\/", $target);
		$target_str = "/$target_q/Uu";
	}
	else
	{
		$target_q   = preg_quote($target, '/');
		$target_str = "/$target_q/u";
	}

	// get all indexes; $matches stays empty when the pattern does not compile
	$matches = [];
	preg_match_all($target_str, $text, $matches, PREG_OFFSET_CAPTURE);

	$pos   = $matches[0] ?? [];
	$total = count($pos);
	$cuts  = [];

	// [0] => matched string, [1] => byte offset
	for ($i = 0; $i < $total; $i++)
	{
		$index = $pos[$i][1];
		$len   = strlen($pos[$i][0]);

		// Merge following matches that start within reach of the current cut.
		// The cut must end where the LAST merged match ends. Matches of a
		// regex can differ in length, so the end is taken from each merged
		// match itself rather than from the length of the first one.
		while (isset($pos[$i + 1][1]) && $pos[$i + 1][1] < $index + $len + $padding * 2)
		{
			$i++;
			$len = max($len, $pos[$i][1] + strlen($pos[$i][0]) - $index);
		}

		$cuts[] = [$index, $len];
	}

	if (!$use_regex)
	{
		// the snippet is escaped before highlighting, so escape the literal
		// target the same way to keep both sides comparable
		$target_q   = preg_quote($space_to_nbsp($target), '/');
		$target_str = "/$target_q/u";
	}

	$context = '';

	foreach ($cuts as [$index, $len])
	{
		$context_before = $truncate_string(substr($text, 0, $index), -$padding);
		$context_after  = $truncate_string(substr($text, $index + $len), $padding);
		$snippet        = $space_to_nbsp(substr($text, $index, $len));

		$context .= $space_to_nbsp($context_before);
		// preg_replace() returns null on a PCRE error; show the plain snippet then
		$context .= preg_replace($target_str, '<code>\0</code>', $snippet) ?? $snippet;
		$context .= $space_to_nbsp($context_after);
	}

	$context = str_replace("\n", '↵', $context);

	return $context;
};
/**
 * Fill the "search_" template section: the search form with the current
 * request values and, when enabled, the optional category / language filters.
 *
 * @param array $o option array built by the action (target, use_regex, page,
 *                 pages, titles, comments, options, lang, filter)
 */
$search_form = function (array $o) use ($tpl)
{
	$selected_categories = $o['filter']['category_ids'];

	$tpl->enter('search_');

	// template slot => key in $o, assigned in this order
	$slots = [
		'target'   => 'target',
		'regex'    => 'use_regex',
		'tag'      => 'page',
		'pages'    => 'pages',
		'titles'   => 'titles',
		'comments' => 'comments',
	];

	foreach ($slots as $slot => $key)
	{
		$tpl->$slot = $o[$key];
	}

	$tpl->cancel = true;

	if ($o['options'])
	{
		$tpl->options = true;
		$tpl->enter('options_');

		$tpl->c_categories = $this->show_category_form($this->page_lang, null, OBJECT_PAGE, false, false, $selected_categories);

		if ($this->db->multilanguage)
		{
			$language_names = $this->_t('LanguageArray');
			$available      = $this->http->available_languages();

			// the "any language" entry is preselected while no language filter is set
			$tpl->l_selected = $o['lang'] ? null : ' selected';

			foreach ($available as $iso)
			{
				$tpl->l_o_iso  = $iso;
				$tpl->l_o_lang = $language_names[$iso];

				if ($iso == $o['lang'])
				{
					$tpl->l_o_selected = ' selected';
				}
			}
		}

		$tpl->leave(); // options_
	}

	$tpl->leave(); // search_
};
/**
 * Fill the "matches_" template section with the result list.
 *
 * @param array $pages      result rows as returned by $search_text
 * @param array $pagination pagination array; 'offset' and 'text' are used here
 * @param int   $tcount     total number of matches over all result pages
 * @param int   $max        results per page (not used inside this function)
 * @param array $o          option array built by the action
 */
$show_matches = function ($pages, $pagination, $tcount, $max, array $o) use ($tpl, $search_text, $extract_context)
{
	$tpl->enter('matches_');

	$tpl->replace = true;
	$tpl->cancel  = true;

	if (count($pages) > 5)
	{
		$tpl->invert = true;
	}

	$tpl->offset          = $pagination['offset'] + 1;
	$tpl->pagination_text = $pagination['text'];

	// number of entries actually written to the list
	$shown = 0;

	$tpl->enter('l_');

	foreach ($pages as $n => $p)
	{
		if (!$this->db->hide_locked || $this->has_access('read', $p['page_id']))
		{
			$tpl->delim = $n;
			$shown++;
			$preview = '';

			// generate preview; the body is only exposed to users with read access
			if (($o['pages'] || $o['comments']) && $this->has_access('read', $p['page_id']))
			{
				$preview = $extract_context($p['body'], $o['target'], $o['use_regex'], $o['padding']);
			}

			// NOTE(review): session bookkeeping carried over from admin_replace;
			// the action unsets replace_set again before it finishes - confirm it is still needed
			$this->sess->replace_set[] = $p['page_id'];

			$tpl->enter('l_');

			$tpl->pageid   = $p['page_id'];
			$tpl->link     = $this->link('/' . $p['tag'], '', ($o['title'] ? $p['title'] : $p['tag']), '', false);
			$tpl->userlink = $this->user_link($p['user_name'], false, false);
			$tpl->mtime    = $p['modified'];
			$tpl->psize    = $this->factor_multiples($p['page_size'], 'binary', true, true);

			if ($this->db->multilanguage)
			{
				$tpl->lang = '- ' . $p['page_lang'];
			}

			$tpl->category = $this->get_categories($p['page_id'], OBJECT_PAGE);
			$tpl->preview  = $preview;

			if ($p['comments'])
			{
				$tpl->comments_n = $p['comments'];
			}

			$tpl->leave(); // l_
		}
	}

	$tpl->leave(); // l_

	// the result header and its closing part are only emitted when the list is not empty
	if ($shown)
	{
		$tpl->mark_diag   = $this->_t('SearchResults');
		$tpl->mark_phrase = $o['msg_target'];
		$tpl->mark_count  = $tcount; // TODO: count only accessible results
		$tpl->emark       = true;
	}

	$tpl->leave(); // matches_
};
// --------------------------------------------------------------------------------

// set defaults for the action parameters
$help    ??= 0;
$lang    ??= '';
$max     ??= 50; // (10 ... 100)
$options ??= 0;
$padding ??= 40;
$page    ??= '/';
$title   ??= 1;

if ($help)
{
	$tpl->help = $this->help($info, 'search_binary');
	return;
}

/*
if (!$this->is_admin())
{
	return;
}
*/

// Request lookup: POST wins over GET, otherwise the default is used.
// Non-scalar values (e.g. "target[]=x") are rejected so they never reach
// trim() or a string cast.
$param = static function (string $key, $default)
{
	$value = $_POST[$key] ?? ($_GET[$key] ?? $default);

	return is_scalar($value) ? $value : $default;
};

$categories = (string) $param('categories', '');
$comments   = (bool)   $param('comments', 0);
$pages      = (bool)   $param('pages', 0);
$titles     = (bool)   $param('titles', 0);
$lang       = (string) $param('lang', $lang);
$page       = (string) $param('page', $page);
$target     = trim((string) $param('target', ''));
$use_regex  = (bool)   $param('use_regex', 0);

$show_search_form = true;

// remove \r (body contains only \n)
$target = str_replace("\r\n", "\n", $target);

// visualize line breaks in message sets
$msg_target = str_replace("\n", '↵', $target);

if ($lang && !$this->known_language($lang))
{
	$lang = '';
	$this->set_message($this->_t('FilterLangNotAvailable'));
}

// empty page parameter, use root not page context
$page = $page ?: '/';
$tag  = $this->unwrap_link($page);

// category filter: comma separated list plus the "category<ID>=set" form fields
$category_ids = [];

if ($categories)
{
	$category_ids = explode(',', $categories);
}

foreach ($_POST as $key => $val)
{
	if (preg_match('/^category(\d+)$/', (string) $key, $ids) && $val == 'set')
	{
		$category_ids[] = $ids[1];
	}
}

// the IDs come straight from the request: keep only distinct positive integers
$category_ids = array_values(array_unique(array_filter(
	array_map('intval', $category_ids),
	static fn (int $id): bool => $id > 0
)));

$o = [
	'comments'   => $comments,
	'pages'      => $pages,
	'titles'     => $titles,
	'filter'     => [
		'category_ids' => $category_ids,
		'comments'     => $comments,
		'lang'         => $lang,
		'pages'        => $pages,
		'titles'       => $titles,
	],
	'lang'       => $lang,
	'msg_target' => $msg_target,
	'options'    => (bool) $options,
	'padding'    => (int) $padding,
	'page'       => $page,
	'tag'        => $tag,
	'target'     => $target,
	'title'      => $title,
	'use_regex'  => $use_regex,
];

// [B] show search matches
if ($target)
{
	if (!$pages && !$comments && !$titles)
	{
		// no search scope selected
		$tpl->message = $this->show_message($this->_t('ReplaceTextNoOption'), 'hint', false);
	}
	// targets shorter than three characters are ignored without a message
	else if (mb_strlen($target) >= 3)
	{
		// Search for target matches. The rows go into $results so the $pages
		// scope flag stays intact for choosing the message below.
		[$results, $pagination, $tcount] = $search_text($target, $tag, $use_regex, $max, $o['filter']);

		if ($results)
		{
			$show_matches($results, $pagination, $tcount, $max, $o);
		}
		else
		{
			// body searched (with or without titles) vs. titles only
			$msg = ($pages || $comments)
				? 'ReplaceTextNoMatch'
				: 'ReplaceTextNoTitleMatch';

			$tpl->message = $this->show_message(
				Ut::perc_replace(
					$this->_t($msg),
					'<code>' . Ut::html($msg_target) . '</code>'),
				'note', false);
		}
	}
}

// [A] search form
if ($show_search_form)
{
	unset(
		$this->sess->replace_set,
		$this->sess->replace_unset);

	$search_form($o);
}
%%
action/template/search_binary.tpl
%%(php)
[ === main === ]
[ ' help ' ]
[ ' message ' ]<br>
[= search _ =
<form action="[ ' href: ' ]" method="post" name="select_pages">
[ ' csrf: select_pages ' ]
<label for="text_target">[ ' _t: SearchFor ' ]</label><br>
<textarea id="text_target" name="target" class="cols-100" cols="100" rows="5" title="[ ' _t: ReplaceTextGiveTarget ' ]" required>[ ' target | pre ' ]</textarea><br>
<input type="checkbox" id="use_regex" name="use_regex"[ ' regex | format ' checked' ' ]>
<label for="use_regex">[ ' _t: ReplaceTextRegex ' ]</label><br><br>
<label for="cluster" title="[ ' _t: ReplaceTextCluster ' ]">[ ' _t: Namespace ' ]</label><br>
<input type="text" id="cluster" name="page" value="[ ' tag | e attr ' ]" size="80" maxlength="255"><br>
<input type="checkbox" id="pages" name="pages"[ ' pages | format ' checked' ' ]>
<label for="pages">[ ' _t: SearchInPages ' ]</label><br>
<input type="checkbox" id="comments" name="comments"[ ' comments | format ' checked' ' ]>
<label for="comments">[ ' _t: SearchInComments ' ]</label><br>
<input type="checkbox" id="titles" name="titles"[ ' titles | format ' checked' ' ]>
<label for="titles">[ ' _t: SearchInPageTitles ' ]</label><br><br>
[= options _ =
<details open>
<summary>[ ' _t: OptionalFilters ' ]</summary>
<div class="form-options">
[= l _ =
<label for="language">[ ' _t: AccountLanguage ' ]</label><br>
<select id="language" name="lang">
<option value=""[ ' selected ' ]>[ ' _t: Any ' ]</option>
[= o _ =
<option value="[ ' iso ' ]"[ ' selected ' ]>[ ' lang ' ] ([ ' iso ' ])</option>
=]
</select><br>
=]
[= c _ =
<br>[ ' _t: Categories ' ]:
[ ' categories ' ]
=]
</div>
</details><br>
=]
<br>
<button type="submit" class="btn-ok">[ ' _t: SearchButton ' ]</button>
[ '' cancel '' ]<br>
</form>
<br>
=]
[= matches _ =
[= warning _ =
<p class="msg warning">[ ' msg ' ]</p><br>
=]
<br><br>
[ '' pagination '' ]
[= mark _ =
<div class="layout-box">
<p>
<span>[ ' diag ' ] «<strong>[ ' phrase | e ' ]</strong>» ([ ' count | e ' ]):</span>
</p>
=]
<ol id="search-results" start="[ ' offset ' ]">
[= l _ =
[ ' delim | void ' ]
<li>
[ '' l SearchItem '' ]
</li>
=]
</ol>
[= emark _ =
[ ' nonstatic ' ]
</div>
=]
[ '' pagination '' ]
<br>
<br>
=]
[= cancel =]
<a href="[ ' href: ' ]" class="btn-link">
<button type="button" class="btn-cancel">[ ' _t: CancelButton ' ]</button>
</a>
[= SearchItem =]
<h3>
[ ' link ' ]
</h3>
<span class="search-meta">[ ' mtime | time_format ' ] - [ ' userlink ' ] - [ ' psize ' ] [ ' lang ' ]
[= comments =
- <img src="[ ' db: theme_url ' ]icon/spacer.png" class="btn-comment btn-sm">[ ' n ' ]
=]
</span><br>
[ ' preview | nl2br ' ]
[ ' category ' ]
[= pagination =]
<nav class="pagination">[ ' text ' ]</nav>
%%
===Using regular expressions===
See ((/Doc/English/Actions/AdminReplace AdminReplace)) action
===See also===
* ((/Doc/English/Actions/AdminReplace AdminReplace)) - allows administrators to do a global string find-and-replace on all the content pages
* ((/Dev/PatchesHacks/MassRegexReplace MassRegexReplace)) - Mass edit using regular expressions